def test_sequential_reads(self, connector_config, configured_catalog, docker_runner: ConnectorRunner):
    configured_catalog = full_refresh_only_catalog(configured_catalog)
    output = docker_runner.call_read(connector_config, configured_catalog)
    records_1 = [message.record.data for message in output if message.type == Type.RECORD]

    output = docker_runner.call_read(connector_config, configured_catalog)
    records_2 = [message.record.data for message in output if message.type == Type.RECORD]

    serialize = partial(json.dumps, sort_keys=True)
    assert not (
        set(map(serialize, records_1)) - set(map(serialize, records_2))
    ), "The two sequential reads should produce either an identical set of records, or the records of the first read should be a subset of those of the second"
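# NOTE: `full_refresh_only_catalog` is used above but not defined in this listing. The sketch
# below is an illustrative assumption (based on airbyte_cdk's protocol models), not the
# canonical implementation: it keeps only streams that support full refresh and forces that
# sync mode on them.
from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode


def full_refresh_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Keep only streams that support full_refresh and configure them to use that sync mode."""
    streams = []
    for configured_stream in configured_catalog.streams:
        if SyncMode.full_refresh in configured_stream.stream.supported_sync_modes:
            configured_stream.sync_mode = SyncMode.full_refresh
            streams.append(configured_stream)
    configured_catalog.streams = streams
    return configured_catalog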
def test_sequential_reads(
    self,
    inputs: ConnectionTestConfig,
    connector_config: SecretDict,
    configured_catalog: ConfiguredAirbyteCatalog,
    docker_runner: ConnectorRunner,
    detailed_logger: Logger,
):
    ignored_fields = getattr(inputs, "ignored_fields") or {}
    configured_catalog = full_refresh_only_catalog(configured_catalog)

    output = docker_runner.call_read(connector_config, configured_catalog)
    records_1 = [message.record for message in output if message.type == Type.RECORD]
    records_by_stream_1 = defaultdict(list)
    for record in records_1:
        records_by_stream_1[record.stream].append(record.data)

    output = docker_runner.call_read(connector_config, configured_catalog)
    records_2 = [message.record for message in output if message.type == Type.RECORD]
    records_by_stream_2 = defaultdict(list)
    for record in records_2:
        records_by_stream_2[record.stream].append(record.data)

    pks_by_stream = primary_keys_by_stream(configured_catalog)

    for stream in records_by_stream_1:
        if pks_by_stream.get(stream):
            serializer = partial(primary_keys_only, pks=pks_by_stream.get(stream))
        else:
            serializer = partial(make_hashable, exclude_fields=ignored_fields.get(stream))
        stream_records_1 = records_by_stream_1.get(stream)
        stream_records_2 = records_by_stream_2.get(stream)
        # Using a symmetric difference of the serialized record sets to detect any mismatch between the two reads
        output_diff = set(map(serializer, stream_records_1)).symmetric_difference(set(map(serializer, stream_records_2)))
        if output_diff:
            msg = f"{stream}: the two sequential reads should produce the same set of records"
            detailed_logger.info(msg)
            detailed_logger.info("First read")
            detailed_logger.log_json_list(stream_records_1)
            detailed_logger.info("Second read")
            detailed_logger.log_json_list(stream_records_2)
            detailed_logger.info("Difference")
            detailed_logger.log_json_list(output_diff)
            pytest.fail(msg)
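# NOTE: `primary_keys_only` and `make_hashable` are referenced above but not shown. The sketch
# below is an illustrative assumption of how such serializers could work on plain-dict record
# payloads: the first reduces a record to its primary-key values, the second produces a stable
# JSON string with ignored top-level fields removed, so records can be collected into sets and
# compared across the two reads.
import json
from typing import Any, Iterable, List, Mapping, Optional


def primary_keys_only(record: Mapping[str, Any], pks: Iterable[List[str]]) -> str:
    """Serialize only the primary-key values of a record (each pk is a path of keys)."""
    pk_values = []
    for pk_path in pks:
        value: Any = record
        for key in pk_path:
            value = value.get(key) if isinstance(value, Mapping) else None
        pk_values.append(value)
    return json.dumps(pk_values, sort_keys=True, default=str)


def make_hashable(record: Mapping[str, Any], exclude_fields: Optional[Iterable[str]] = None) -> str:
    """Serialize a record deterministically, dropping top-level fields that are allowed to differ."""
    exclude_fields = set(exclude_fields or [])
    return json.dumps({k: v for k, v in record.items() if k not in exclude_fields}, sort_keys=True, default=str)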
def test_airbyte_trace_message_on_failure(self, connector_config, inputs: BasicReadTestConfig, docker_runner: ConnectorRunner):
    if not inputs.expect_trace_message_on_failure:
        pytest.skip("Skipping `test_airbyte_trace_message_on_failure` because `inputs.expect_trace_message_on_failure=False`")
        return

    invalid_configured_catalog = ConfiguredAirbyteCatalog(
        streams=[
            # create ConfiguredAirbyteStream without validation
            ConfiguredAirbyteStream.construct(
                stream=AirbyteStream(
                    name="__AIRBYTE__stream_that_does_not_exist",
                    json_schema={"type": "object", "properties": {"f1": {"type": "string"}}},
                    supported_sync_modes=[SyncMode.full_refresh],
                ),
                sync_mode="INVALID",
                destination_sync_mode="INVALID",
            )
        ]
    )

    output = docker_runner.call_read(connector_config, invalid_configured_catalog, raise_container_error=False)
    trace_messages = filter_output(output, Type.TRACE)
    error_trace_messages = list(filter(lambda m: m.trace.type == TraceType.ERROR, trace_messages))

    assert len(error_trace_messages) >= 1, "Connector should emit at least one error trace message"
def test_read(
    self,
    connector_config,
    configured_catalog,
    inputs: BasicReadTestConfig,
    expected_records: List[AirbyteRecordMessage],
    docker_runner: ConnectorRunner,
    detailed_logger,
):
    output = docker_runner.call_read(connector_config, configured_catalog)
    records = [message.record for message in filter_output(output, Type.RECORD)]

    assert records, "At least one record should be read using provided catalog"

    if inputs.validate_schema:
        self._validate_schema(records=records, configured_catalog=configured_catalog)

    self._validate_empty_streams(records=records, configured_catalog=configured_catalog, allowed_empty_streams=inputs.empty_streams)
    for pks, record in primary_keys_for_records(streams=configured_catalog.streams, records=records):
        for pk_path, pk_value in pks.items():
            assert (
                pk_value is not None
            ), f"Primary key subkeys {repr(pk_path)} have null values or are not present in {record.stream} stream records."

    # TODO: remove this condition after https://github.com/airbytehq/airbyte/issues/8312 is done
    if inputs.validate_data_points:
        self._validate_field_appears_at_least_once(records=records, configured_catalog=configured_catalog)

    if expected_records:
        self._validate_expected_records(
            records=records, expected_records=expected_records, flags=inputs.expect_records, detailed_logger=detailed_logger
        )
def test_two_sequential_reads(self, connector_config, configured_catalog_for_incremental, cursor_paths, docker_runner: ConnectorRunner):
    stream_mapping = {stream.stream.name: stream for stream in configured_catalog_for_incremental.streams}

    output = docker_runner.call_read(connector_config, configured_catalog_for_incremental)
    records_1 = filter_output(output, type_=Type.RECORD)
    states_1 = filter_output(output, type_=Type.STATE)

    assert states_1, "Should produce at least one state"
    assert records_1, "Should produce at least one record"

    latest_state = states_1[-1].state.data
    for record_value, state_value in records_with_state(records_1, latest_state, stream_mapping, cursor_paths):
        assert (
            record_value <= state_value
        ), "First incremental sync should only produce records with a cursor value at or before the latest state cursor value"

    output = docker_runner.call_read_with_state(connector_config, configured_catalog_for_incremental, state=latest_state)
    records_2 = filter_output(output, type_=Type.RECORD)

    for record_value, state_value in records_with_state(records_2, latest_state, stream_mapping, cursor_paths):
        assert (
            record_value >= state_value
        ), "Second incremental sync should only produce records with a cursor value at or after the state cursor value"
def test_read(
    self,
    connector_config,
    configured_catalog,
    inputs: BasicReadTestConfig,
    expected_records: List[AirbyteMessage],
    docker_runner: ConnectorRunner,
    detailed_logger,
):
    output = docker_runner.call_read(connector_config, configured_catalog)
    records = [message.record for message in filter_output(output, Type.RECORD)]

    assert records, "At least one record should be read using provided catalog"

    if inputs.validate_schema:
        self._validate_schema(records=records, configured_catalog=configured_catalog)

    self._validate_empty_streams(records=records, configured_catalog=configured_catalog, allowed_empty_streams=inputs.empty_streams)
    for pks, record in primary_keys_for_records(streams=configured_catalog.streams, records=records):
        for pk_path, pk_value in pks.items():
            assert pk_value is not None, (
                f"Primary key subkeys {repr(pk_path)} "
                f"have null values or are not present in {record.stream} stream records."
            )

    if expected_records:
        self._validate_expected_records(
            records=records, expected_records=expected_records, flags=inputs.expect_records, detailed_logger=detailed_logger
        )
def test_read(
    self,
    connector_config,
    configured_catalog,
    inputs: BasicReadTestConfig,
    expected_records: List[AirbyteMessage],
    docker_runner: ConnectorRunner,
):
    output = docker_runner.call_read(connector_config, configured_catalog)
    records = [message.record for message in output if message.type == Type.RECORD]
    counter = Counter(record.stream for record in records)

    all_streams = set(stream.stream.name for stream in configured_catalog.streams)
    streams_with_records = set(counter.keys())
    streams_without_records = all_streams - streams_with_records

    assert records, "At least one record should be read using provided catalog"

    for pks, record in primary_keys_for_records(streams=configured_catalog.streams, records=records):
        for pk_path, pk_value in pks.items():
            assert pk_value is not None, (
                f"Primary key subkeys {repr(pk_path)} "
                f"have null values or are not present in {record.stream} stream records."
            )

    if inputs.validate_output_from_all_streams:
        assert (
            not streams_without_records
        ), f"All streams should return some records, streams without records: {streams_without_records}"

    if expected_records:
        actual_by_stream = self.group_by_stream(records)
        expected_by_stream = self.group_by_stream(expected_records)
        for stream_name, expected in expected_by_stream.items():
            actual = actual_by_stream.get(stream_name, [])
            self.compare_records(
                stream_name=stream_name,
                actual=actual,
                expected=expected,
                extra_fields=inputs.expect_records.extra_fields,
                exact_order=inputs.expect_records.exact_order,
                extra_records=inputs.expect_records.extra_records,
            )
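# NOTE: `group_by_stream` and `compare_records` are helpers on the test class that are not shown
# in this listing. A minimal sketch of the grouping step, assuming the inputs are
# AirbyteRecordMessage-like objects with `stream` and `data` attributes, could look like the
# function below (it would live on the class as a @staticmethod).
from collections import defaultdict
from typing import Any, Dict, Iterable, List


def group_by_stream(records: Iterable[Any]) -> Dict[str, List[Any]]:
    """Group record payloads by stream name, preserving the order they were read in."""
    result: Dict[str, List[Any]] = defaultdict(list)
    for record in records:
        result[record.stream].append(record.data)
    return dict(result)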
def test_read(self, connector_config, configured_catalog, inputs: BasicReadTestConfig, docker_runner: ConnectorRunner):
    output = docker_runner.call_read(connector_config, configured_catalog)
    records = [message.record for message in output if message.type == Type.RECORD]
    counter = Counter(record.stream for record in records)

    all_streams = set(stream.stream.name for stream in configured_catalog.streams)
    streams_with_records = set(counter.keys())
    streams_without_records = all_streams - streams_with_records

    assert records, "At least one record should be read using provided catalog"

    if inputs.validate_output_from_all_streams:
        assert (
            not streams_without_records
        ), f"All streams should return some records, streams without records: {streams_without_records}"
def test_two_sequential_reads(
    self,
    inputs: IncrementalConfig,
    connector_config: SecretDict,
    configured_catalog_for_incremental: ConfiguredAirbyteCatalog,
    cursor_paths: dict[str, list[str]],
    docker_runner: ConnectorRunner,
):
    threshold_days = getattr(inputs, "threshold_days") or 0
    stream_mapping = {stream.stream.name: stream for stream in configured_catalog_for_incremental.streams}

    output = docker_runner.call_read(connector_config, configured_catalog_for_incremental)
    records_1 = filter_output(output, type_=Type.RECORD)
    states_1 = filter_output(output, type_=Type.STATE)

    assert states_1, "Should produce at least one state"
    assert records_1, "Should produce at least one record"

    latest_state = states_1[-1].state.data
    for record_value, state_value, stream_name in records_with_state(records_1, latest_state, stream_mapping, cursor_paths):
        assert (
            record_value <= state_value
        ), f"First incremental sync should only produce records with a cursor value at or before the latest state cursor value. Stream: {stream_name}"

    output = docker_runner.call_read_with_state(connector_config, configured_catalog_for_incremental, state=latest_state)
    records_2 = filter_output(output, type_=Type.RECORD)

    for record_value, state_value, stream_name in records_with_state(records_2, latest_state, stream_mapping, cursor_paths):
        assert compare_cursor_with_threshold(
            record_value, state_value, threshold_days
        ), f"Second incremental sync should only produce records with a cursor value at or after the state cursor value (within threshold_days). Stream: {stream_name}"
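# NOTE: `compare_cursor_with_threshold` is referenced above but not defined in this listing.
# The sketch below is an illustrative assumption: cursors are either datetimes, ISO-8601
# strings (parsed here with python-dateutil), or plainly comparable values, and `threshold_days`
# grants a grace window for APIs that keep mutating recently-synced records.
from datetime import datetime, timedelta
from typing import Any

from dateutil import parser


def compare_cursor_with_threshold(record_value: Any, state_value: Any, threshold_days: int) -> bool:
    """Return True if the record cursor is at or after the state cursor, minus the allowed threshold."""
    if threshold_days:
        record_dt = record_value if isinstance(record_value, datetime) else parser.parse(str(record_value))
        state_dt = state_value if isinstance(state_value, datetime) else parser.parse(str(state_value))
        return record_dt >= state_dt - timedelta(days=threshold_days)
    return record_value >= state_value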
def test_read_sequential_slices(
    self, inputs: IncrementalConfig, connector_config, configured_catalog_for_incremental, cursor_paths, docker_runner: ConnectorRunner
):
    """
    Incremental test that calls the read method without a state checkpoint. The results are then partitioned
    by stream and slice checkpoints, producing batches of messages that look like:
    <state message>
    <record message>
    ...
    <record message>

    Using these batches, we make additional read calls seeded with each state message and verify the
    correctness of the records in the response.
    """
    if inputs.skip_comprehensive_incremental_tests:
        pytest.skip("Skipping new incremental test based on acceptance-test-config.yml")
        return

    threshold_days = getattr(inputs, "threshold_days") or 0
    stream_mapping = {stream.stream.name: stream for stream in configured_catalog_for_incremental.streams}

    output = docker_runner.call_read(connector_config, configured_catalog_for_incremental)
    records_1 = filter_output(output, type_=Type.RECORD)
    states_1 = filter_output(output, type_=Type.STATE)

    assert states_1, "Should produce at least one state"
    assert records_1, "Should produce at least one record"

    latest_state = states_1[-1].state.data
    for record_value, state_value, stream_name in records_with_state(records_1, latest_state, stream_mapping, cursor_paths):
        assert (
            record_value <= state_value
        ), f"First incremental sync should only produce records with a cursor value at or before the latest state cursor value. Stream: {stream_name}"

    # Create partitions made up of one state message followed by any records that come before the next state
    filtered_messages = [message for message in output if message.type == Type.STATE or message.type == Type.RECORD]
    right_index = len(filtered_messages)
    checkpoint_messages = []
    for index, message in reversed(list(enumerate(filtered_messages))):
        if message.type == Type.STATE:
            message_group = (filtered_messages[index], filtered_messages[index + 1 : right_index])
            checkpoint_messages.insert(0, message_group)
            right_index = index

    # We sometimes have duplicate identical state messages in a stream which we can filter out to speed things up
    checkpoint_messages = [message for index, message in enumerate(checkpoint_messages) if message not in checkpoint_messages[:index]]

    # To avoid spamming APIs we only test a fraction of the slices (the value below is used as a stride)
    num_slices_to_test = 1 if len(checkpoint_messages) <= 5 else len(checkpoint_messages) // 5
    for message_batch in checkpoint_messages[::num_slices_to_test]:
        assert len(message_batch) > 0 and message_batch[0].type == Type.STATE
        current_state = message_batch[0]
        output = docker_runner.call_read_with_state(connector_config, configured_catalog_for_incremental, current_state.state.data)
        records = filter_output(output, type_=Type.RECORD)

        for record_value, state_value, stream_name in records_with_state(records, current_state.state.data, stream_mapping, cursor_paths):
            assert compare_cursor_with_threshold(
                record_value, state_value, threshold_days
            ), f"Second incremental sync should only produce records with a cursor value at or after the state cursor value (within threshold_days). Stream: {stream_name}"
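# NOTE: a self-contained, illustrative walk-through of the checkpoint-partitioning step above
# (not part of the test suite). `Msg` is a stand-in for AirbyteMessage; the loop groups each
# STATE message with the records emitted after it, and records emitted before the first state
# are not attached to any checkpoint, mirroring test_read_sequential_slices.
from collections import namedtuple

Msg = namedtuple("Msg", ["type", "payload"])
STATE, RECORD = "STATE", "RECORD"

messages = [
    Msg(RECORD, "r1"), Msg(RECORD, "r2"), Msg(STATE, "s1"),
    Msg(RECORD, "r3"), Msg(STATE, "s2"),
]

right_index = len(messages)
checkpoints = []
for index, message in reversed(list(enumerate(messages))):
    if message.type == STATE:
        checkpoints.insert(0, (message, messages[index + 1 : right_index]))
        right_index = index

# checkpoints == [(Msg(STATE, "s1"), [Msg(RECORD, "r3")]), (Msg(STATE, "s2"), [])]
print(checkpoints)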
def test_read(
    self,
    connector_config,
    configured_catalog,
    inputs: BasicReadTestConfig,
    expected_records: List[AirbyteMessage],
    docker_runner: ConnectorRunner,
):
    output = docker_runner.call_read(connector_config, configured_catalog)
    records = [message.record for message in output if message.type == Type.RECORD]
    counter = Counter(record.stream for record in records)

    if inputs.validate_schema:
        bar = "-" * 80
        streams_errors = verify_records_schema(records, configured_catalog)
        for stream_name, errors in streams_errors.items():
            errors = map(str, errors.values())
            str_errors = f"\n{bar}\n".join(errors)
            logging.error(f"The {stream_name} stream has the following schema errors:\n{str_errors}")

        if streams_errors:
            pytest.fail(f"Please check your json_schema in selected streams {streams_errors.keys()}.")

    all_streams = set(stream.stream.name for stream in configured_catalog.streams)
    streams_with_records = set(counter.keys())
    streams_without_records = all_streams - streams_with_records

    assert records, "At least one record should be read using provided catalog"

    for pks, record in primary_keys_for_records(streams=configured_catalog.streams, records=records):
        for pk_path, pk_value in pks.items():
            assert pk_value is not None, (
                f"Primary key subkeys {repr(pk_path)} "
                f"have null values or are not present in {record.stream} stream records."
            )

    if inputs.validate_output_from_all_streams:
        assert (
            not streams_without_records
        ), f"All streams should return some records, streams without records: {streams_without_records}"

    if expected_records:
        actual_by_stream = self.group_by_stream(records)
        expected_by_stream = self.group_by_stream(expected_records)
        for stream_name, expected in expected_by_stream.items():
            actual = actual_by_stream.get(stream_name, [])
            self.compare_records(
                stream_name=stream_name,
                actual=actual,
                expected=expected,
                extra_fields=inputs.expect_records.extra_fields,
                exact_order=inputs.expect_records.exact_order,
                extra_records=inputs.expect_records.extra_records,
            )