Пример #1
0
 def slice_catalog(catalog: ConfiguredAirbyteCatalog,
                   streams: Set[str]) -> ConfiguredAirbyteCatalog:
     """Return a new catalog containing only the streams whose names are in *streams*."""
     kept = [s for s in catalog.streams if s.stream.name in streams]
     return ConfiguredAirbyteCatalog(streams=kept)
Пример #2
0
    def test_streams_outputs_records(self, catalog_path, stream_config):
        """Reading the configured catalog must produce records; incremental syncs must also produce states."""
        catalog = ConfiguredAirbyteCatalog.parse_file(catalog_path)
        records, states = self._read_records(stream_config, catalog)

        assert records, "should have some records returned"
        # Only the first stream's sync mode is inspected, mirroring the fixture layout.
        is_incremental = catalog.streams[0].sync_mode == SyncMode.incremental
        if is_incremental:
            assert states, "should have some states returned"
Пример #3
0
def configured_catalog() -> ConfiguredAirbyteCatalog:
    """Load the sample configured catalog, failing fast when the file is missing."""
    catalog_filename = HERE.parent / "sample_files" / "configured_catalog.json"
    if catalog_filename.exists():
        return ConfiguredAirbyteCatalog.parse_file(catalog_filename)
    raise RuntimeError(
        f"Please provide configured catalog in {catalog_filename}")
Пример #4
0
    def test_parse_sheet_and_column_names_from_catalog(self):
        """The helper should map each sheet name to the frozenset of its column names."""
        expected = {
            "soccer_team": frozenset(["arsenal", "chelsea", "manutd", "liverpool"]),
            "basketball_teams": frozenset(["gsw", "lakers"]),
        }

        # Build one full-refresh stream per sheet, with a string property per column.
        streams = [
            ConfiguredAirbyteStream(
                stream=AirbyteStream(
                    name=sheet_name,
                    json_schema={"properties": {c: {"type": "string"} for c in columns}},
                ),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            )
            for sheet_name, columns in expected.items()
        ]
        catalog = ConfiguredAirbyteCatalog(streams=streams)

        actual = Helpers.parse_sheet_and_column_names_from_catalog(catalog)

        self.assertEqual(actual, expected)
Пример #5
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state_path: str = None) -> Generator[AirbyteMessage, None, None]:
        """Sync every stream in the configured catalog, yielding Airbyte messages.

        :param logger: logger used for progress reporting
        :param config_container: holds the connector configuration
        :param catalog_path: path to the configured catalog JSON file
        :param state_path: optional path to a previously saved state file
        :return: generator of messages produced by ``_read_record``
        """
        client = self._client(config_container)

        if state_path:
            logger.info("Starting sync with provided state file")
            # Fix: use a context manager so the state file handle is closed
            # deterministically (the original left it open until GC).
            with open(state_path, "r") as state_file:
                state_obj = json.load(state_file)
        else:
            logger.info("No state provided, starting fresh sync")
            state_obj = {}

        # defaultdict lets per-stream state entries be created lazily.
        state = defaultdict(dict, state_obj)
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            for record in self._read_record(client=client,
                                            stream=stream.name,
                                            state=state):
                yield record

        logger.info("Finished syncing mailchimp")
Пример #6
0
def configured_catalog_from_client(
        client: BaseClient) -> ConfiguredAirbyteCatalog:
    """Helper to generate configured catalog for testing"""
    # Wrap every client stream in a ConfiguredAirbyteStream, keeping order.
    configured_streams = []
    for stream in client.streams:
        configured_streams.append(ConfiguredAirbyteStream(stream=stream))
    return ConfiguredAirbyteCatalog(streams=configured_streams)
Пример #7
0
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        """Read the configured file source and yield one RECORD message per data row.

        :param logger: logger used for progress and error reporting
        :param config_container: holds the rendered connector config (provider
            storage, url, optional ``format`` key)
        :param catalog_path: path to the configured catalog JSON file
        :param state_path: not used for reading here; only included in the log line
        :return: generator of AirbyteMessage RECORD messages
        :raises Exception: re-raises any error from loading/parsing after logging it
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger,
                                                config["provider"]["storage"],
                                                config["url"])
        url = SourceFile.get_simple_url(config["url"])
        name = SourceFile.get_stream_name(config)
        logger.info(
            f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})..."
        )
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))
        # Columns selected in the catalog; an empty selection means "emit all columns".
        selection = SourceFile.parse_catalog(catalog)
        try:
            if "format" in config and config["format"] == "json":
                # JSON path: each element of the nested-JSON payload becomes a record.
                data_list = SourceFile.load_nested_json(config, logger)
                for data in data_list:
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=name,
                            data=data,
                            # NOTE(review): int(...) truncates to whole seconds before
                            # scaling to ms, so sub-second precision is lost — confirm intended.
                            emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
            else:
                # Tabular path: one or more dataframes, each row becomes a record.
                df_list = SourceFile.load_dataframes(config, logger)
                for df in df_list:
                    if len(selection) > 0:
                        # Keep only the selected columns that actually exist in this frame.
                        columns = selection.intersection(set(df.columns))
                    else:
                        columns = df.columns
                    # Replace NaN with the string "NaN" so rows serialize cleanly to JSON.
                    df = df.replace(np.nan, "NaN", regex=True)
                    for data in df[columns].to_dict(orient="records"):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
        except Exception as err:
            # Log the full traceback before re-raising so the failure is visible in sync logs.
            reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Пример #8
0
def incremental_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Incremental sync (when possible)"""
    incremental_streams = []
    for configured_stream in configured_catalog.streams:
        # Skip streams that cannot sync incrementally.
        if SyncMode.incremental not in configured_stream.stream.supported_sync_modes:
            continue
        configured_stream.sync_mode = SyncMode.incremental
        incremental_streams.append(configured_stream)

    configured_catalog.streams = incremental_streams
    return configured_catalog
Пример #9
0
def full_refresh_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Full Refresh sync (when possible)"""
    # Keep only streams that support full refresh, then force that mode on each.
    kept = [cs for cs in configured_catalog.streams
            if SyncMode.full_refresh in cs.stream.supported_sync_modes]
    for cs in kept:
        cs.sync_mode = SyncMode.full_refresh

    configured_catalog.streams = kept
    return configured_catalog
Пример #10
0
    def read(
        self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path: str, state_path: str = None
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Implements the parent class read method.
        """
        # Re-discover the singer catalog, then narrow it to the streams selected
        # in the Airbyte configured catalog before launching the singer tap.
        discovered = self._discover_internal(logger, config_container.config_path)
        airbyte_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
        singer_catalog_path = SingerHelper.create_singer_catalog_with_selection(airbyte_catalog, discovered.singer_catalog)

        cmd = self.read_cmd(logger, config_container.config_path, singer_catalog_path, state_path)
        return SingerHelper.read(logger, cmd)
Пример #11
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield one RECORD message per record of every stream in the configured catalog."""
        client = self._get_client(config_container)
        catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            records = client.read_stream(configured_stream.stream)
            for record in records:
                yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
        logger.info(f"Finished syncing {self.__class__.__name__}")
Пример #12
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Sync every stream in the configured catalog, yielding RECORD messages."""
        client = self._client(config_container)
        catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream_name = configured_stream.stream.name
            for record in self._read_record(client=client, stream=stream_name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing mailchimp")
Пример #13
0
    def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
        """Sync the catalog-selected sheets of a Google spreadsheet as RECORD messages.

        Rows are fetched in batches of ROW_BATCH_SIZE per sheet, starting past the
        header row; a sheet's sync stops at the first fully blank row.

        :param logger: logger used for progress reporting
        :param config_container: holds ``credentials_json`` and ``spreadsheet_id``
        :param catalog_path: path to the configured catalog JSON file
        :param state: unused; kept for interface parity
        :return: generator of AirbyteMessage RECORD messages
        """
        config = config_container.rendered_config
        client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))

        catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            encountered_blank_row = False
            while not encountered_blank_row:
                # Fix: renamed local from `range` so the builtin is not shadowed.
                row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {row_range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS").execute()
                )
                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                # Covers both a missing and an empty values list (the original had a
                # redundant second `len(...) == 0` check after this test).
                row_values = value_ranges.values
                if not row_values:
                    break

                for row in row_values:
                    if Helpers.is_row_empty(row):
                        encountered_blank_row = True
                        break
                    elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                        yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Пример #14
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Sync every recognized Recurly stream in the catalog, yielding RECORD messages.

        Streams not present in ``client.ENTITIES`` are skipped with a warning.

        :param logger: logger used for progress reporting
        :param config_container: holds the connector configuration
        :param catalog_path: path to the configured catalog JSON file
        :param state: unused; kept for interface parity
        """
        client = self._client(config_container)

        config = self.read_config(catalog_path)
        catalog = ConfiguredAirbyteCatalog.parse_obj(config)

        logger.info("Starting syncing recurly")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITIES:
                # Fix: log the stream *name*; the original interpolated the whole
                # stream object, producing an unreadable repr in the warning.
                logger.warn(
                    f"Stream '{stream.name}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing recurly")
Пример #15
0
 def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
     """Parse the configured catalog and lazily populate ``reports_to_read``.

     Fix: the original returned ``catalog_path`` (a str), contradicting the
     declared ``ConfiguredAirbyteCatalog`` return type and leaving the parsed
     catalog unused; return the parsed catalog instead.
     """
     catalog = ConfiguredAirbyteCatalog.parse_obj(
         self.read_config(catalog_path))
     if not self.reports_to_read:
         # Default to reading every stream named in the catalog.
         self.reports_to_read = [i.stream.name for i in catalog.streams]
     return catalog
Пример #16
0
 def get_catalog(self) -> ConfiguredAirbyteCatalog:
     """Load the configured catalog bundled inside this connector's package."""
     package = self.__class__.__module__.split(".")[0]
     raw_catalog = pkgutil.get_data(package, self.CONFIGURED_CATALOG_FILENAME)
     return ConfiguredAirbyteCatalog.parse_obj(json.loads(raw_catalog))
Пример #17
0
 def get_catalog(self) -> ConfiguredAirbyteCatalog:
     """Parse the packaged 'catalog_subset.json' into a configured catalog."""
     package = self.__class__.__module__.split(".")[0]
     raw = pkgutil.get_data(package, "catalog_subset.json")
     return ConfiguredAirbyteCatalog.parse_raw(raw)
Пример #18
0
 def get_catalog(self) -> ConfiguredAirbyteCatalog:
     """Load 'configured_catalog.json' bundled with this connector's package."""
     package = self.__class__.__module__.split(".")[0]
     data = pkgutil.get_data(package, "configured_catalog.json")
     return ConfiguredAirbyteCatalog.parse_obj(json.loads(data))
Пример #19
0
def configured_catalog_fixture(
        configured_catalog_path) -> Optional[ConfiguredAirbyteCatalog]:
    """Parse the configured catalog when a path is given; otherwise return None."""
    if not configured_catalog_path:
        return None
    return ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
Пример #20
0
 def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
     """Parse the configured catalog JSON found at *catalog_path*."""
     raw = self.read_config(catalog_path)
     return ConfiguredAirbyteCatalog.parse_obj(raw)
Пример #21
0
def configured_catalog_fixture():
    """Fixture: the sample configured catalog shipped alongside the tests."""
    path = "sample_files/configured_catalog.json"
    return ConfiguredAirbyteCatalog.parse_file(path)
Пример #22
0
def configured_catalog_fixture():
    """Fixture: the activities-overview sample configured catalog."""
    path = BASE_DIRECTORY / "sample_files/configured_catalog_activities_overview.json"
    return ConfiguredAirbyteCatalog.parse_file(path)
Пример #23
0
 def _read_catalog(path):
     """Parse a configured catalog from the JSON file at *path*.

     Fix: use a context manager so the file handle is closed deterministically
     (the original left it open until garbage collection).
     """
     with open(path, "r") as f:
         return ConfiguredAirbyteCatalog.parse_raw(f.read())