Example #1
def test_configurations_data_format_log(sdc_executor, sdc_builder, aws, data_format, log_format):
    """Check whether S3 origin can parse different log format or not. A log file is being created in s3 bucket
    mentioned below .S3 origin reads the log file and parse the same.

    The pipeline looks like:
    s3_origin >> trash
    s3_origin >= pipeline_finisher_executor
    """
    if log_format == 'GROK':
        file_content = data_format_content['APACHE_CUSTOM_LOG_FORMAT']
    else:
        file_content = data_format_content[log_format]
    client = aws.s3
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}'
    attributes = {'bucket': aws.s3_bucket_name,
                  'prefix_pattern': f'{s3_key}/*',
                  'read_order': 'LEXICOGRAPHICAL',
                  'data_format': data_format,
                  'log_format': log_format,
                  'custom_log_format': '%h %l %u [%t] "%r" %>s %b',
                  'regular_expression': REGULAR_EXPRESSION,
                  'field_path_to_regex_group_mapping': LOG_FIELD_MAPPING
                  }
    if Version(sdc_builder.version) >= Version('3.7.0'):
        attributes['number_of_threads'] = 1
    pipeline, wiretap = get_aws_origin_to_trash_pipeline(sdc_builder, attributes, aws)
    try:
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/{get_random_string()}.log', Body=file_content)
        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        assert wiretap.output_records[0].field == get_data_to_verify_output[log_format]
    finally:
        # cleaning up s3 bucket
        delete_aws_objects(client, aws, s3_key)
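
The delete_aws_objects helper called in the finally block is not defined in this snippet. A minimal sketch of what such a cleanup helper might look like, assuming client is a boto3 S3 client and that all test objects live under the generated s3_key prefix:

def delete_aws_objects(client, aws, s3_key):
    # Hypothetical sketch: list everything under the test prefix and delete it in a single call.
    listed = client.list_objects_v2(Bucket=aws.s3_bucket_name, Prefix=s3_key)
    keys_to_delete = [{'Key': obj['Key']} for obj in listed.get('Contents', [])]
    if keys_to_delete:
        client.delete_objects(Bucket=aws.s3_bucket_name, Delete={'Objects': keys_to_delete})
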
def test_kafka_origin_batch_max_size(sdc_builder, sdc_executor, cluster):
    """Check that retrieving messages from Kafka using Kafka Multitopic Consumer respects both the Batch Max Wait Time
    and the Max Batch Size. Batches are sent when the first of the two conditions is met. This test is checking that
    the Max Batch Size condition is first met.

    Kafka Multitopic Consumer Origin pipeline with standalone mode:
        kafka_multitopic_consumer >> wiretap
    """

    messages = [f'message{i}' for i in range(1, 21)]
    expected = [f'message{i}' for i in range(1, 21)]

    num_batches = 2
    kafka_consumer_group = get_random_string(string.ascii_letters, 10)

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(
        builder, cluster)

    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                cluster, messages, 'TEXT')

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    kafka_multitopic_consumer.set_attributes(
        consumer_group=kafka_consumer_group,
        max_batch_size_in_records=10,
        batch_wait_time_in_ms=30000)

    wiretap = builder.add_wiretap()
    kafka_multitopic_consumer >> wiretap.destination
    kafka_consumer_pipeline = builder.build(title='Kafka Multitopic pipeline Maximum batch size threshold') \
        .configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration['shouldRetry'] = False
    kafka_consumer_pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(kafka_consumer_pipeline)

    # Check that the Max Batch Size condition is reached: the messages were already published above,
    # so start the pipeline and verify via wiretap that all of them are received.

    sdc_executor.start_pipeline(kafka_consumer_pipeline)
    sdc_executor.wait_for_pipeline_metric(kafka_consumer_pipeline,
                                          'input_record_count',
                                          num_batches * 10,
                                          timeout_sec=60)
    sdc_executor.stop_pipeline(kafka_consumer_pipeline)

    assert expected == sorted(
        [str(record.field['text']) for record in wiretap.output_records])
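
The helpers get_kafka_multitopic_consumer_stage and produce_kafka_messages_list used above are not shown in these snippets. Sketches of what they might look like, modeled on the inline stage setup and producer calls in the test_topic_list examples further down and relying on the module's existing get_random_string import (the real helpers in the suite may differ):

def get_kafka_multitopic_consumer_stage(pipeline_builder, cluster):
    # Hypothetical sketch: add a Kafka Multitopic Consumer that reads a fresh random topic as TEXT.
    kafka_multitopic_consumer = pipeline_builder.add_stage(
        'Kafka Multitopic Consumer',
        library=cluster.kafka.standalone_stage_lib)
    kafka_multitopic_consumer.set_attributes(data_format='TEXT',
                                             batch_wait_time_in_ms=20000,
                                             topic_list=[get_random_string()])
    return kafka_multitopic_consumer


def produce_kafka_messages_list(topic, cluster, messages, data_format):
    # Hypothetical sketch: publish each message to the topic; only the TEXT case is handled here.
    assert data_format == 'TEXT'
    producer = cluster.kafka.producer()
    for message in messages:
        producer.send(topic, message.encode())
    producer.flush()
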
Example #3
def test_status_code(sdc_builder, sdc_executor):
    try:
        pipeline_builder = sdc_builder.get_pipeline_builder()

        rest_service = pipeline_builder.add_stage('REST Service')

        if Version(sdc_builder.version) < Version('3.16.0'):
            rest_service.application_id = APPLICATION_ID
        else:
            rest_service.list_of_application_ids = [{
                "credential":
                APPLICATION_ID
            }]

        rest_service.http_listening_port = HTTP_LISTENING_PORT

        send_response_to_origin = pipeline_builder.add_stage(
            'Send Response to Origin')
        send_response_to_origin.status_code = STATUS_CODE

        rest_service >> send_response_to_origin
        pipeline = pipeline_builder.build()

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline)

        protocol = 'https' if sdc_executor.https else 'http'
        rest_service_url = f'{protocol}://{sdc_executor.server_host}:{HTTP_LISTENING_PORT}'
        assert requests.get(rest_service_url,
                            headers={
                                'X-SDC-APPLICATION-ID': APPLICATION_ID
                            }).status_code == STATUS_CODE
    finally:
        sdc_executor.stop_pipeline(pipeline)
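
The constants APPLICATION_ID, HTTP_LISTENING_PORT and STATUS_CODE used by this test are defined elsewhere in the module; they presumably look roughly like the following (the values here are illustrative assumptions, not the suite's real ones):

# Assumed module-level constants for the REST Service tests (illustrative values only).
APPLICATION_ID = 'test-rest-service-app-id'
HTTP_LISTENING_PORT = 8234
STATUS_CODE = 201
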
def test_control_hub_api_processor_invalid_credentials(sdc_builder, sdc_executor):
    """Test Control Hub API Processor. The pipeline would look like:

        dev_raw_data_source >> control_hub_api_processor >> wiretap

    With invalid Control Hub credentials, the Control Hub API Processor sends the record to the error records list.
    """

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.stop_after_first_batch = True

    control_hub_api_processor = pipeline_builder.add_stage('Control Hub API')
    control_hub_api_processor.control_hub_api_url = 'https://cloud.streamsets.com/security/rest/v1/currentUser'
    control_hub_api_processor.output_field = "/output"
    control_hub_api_processor.control_hub_user_name = "invalid user"
    control_hub_api_processor.password = "******"
    if Version(sdc_builder.version) >= Version('4.0.0'):
        control_hub_api_processor.authentication_type = 'USER_PASSWORD'

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> control_hub_api_processor >> wiretap.destination

    pipeline = pipeline_builder.build('Control Hub API Processor Sample Pipeline')
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.validate_pipeline(pipeline)

    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # With invalid credentials, the record should land in the error records rather than the output
    assert len(wiretap.output_records) == 0
    assert len(wiretap.error_records) == 1
Example #5
def test_use_security(sdc_builder, sdc_executor, elasticsearch, stage_attributes):
    """
    To test the use security configuration we create a pipeline as follows:

    Elasticsearch >> Wiretap

    Since the Elasticsearch server requires a username and password,
    an error should occur if the use security property is false.
    Otherwise we should be able to find a document we previously put into an index.
    """

    index = get_random_string(string.ascii_lowercase)
    doc_id = get_random_string(string.ascii_lowercase)

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Elasticsearch', type='origin')
    origin.query = '{"query": {"match_all": {}}}'
    origin.index = index

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(elasticsearch)

    configured_origin = pipeline.stages.get(label=origin.label)
    configured_origin.use_security = stage_attributes['use_security']
    if not stage_attributes['use_security']:
        if Version(sdc_builder.version) < Version('3.17.0'):
            configured_origin.configuration['conf.securityConfig.securityUser'] = ':'
        else:
            configured_origin.user_name = ''
            configured_origin.password = ''

    sdc_executor.add_pipeline(pipeline)

    elasticsearch.client.create_document(index=index, id=doc_id, body={"number": 1})

    try:
        if stage_attributes['use_security']:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.output_records) == 1

            record = wiretap.output_records[0]
            assert record.field['_id'] == doc_id
            assert record.field['_index'] == index
            assert record.field['_source'] == {"number": 1}

        else:
            with pytest.raises(ValidationError) as e:
                sdc_executor.validate_pipeline(pipeline)

            assert e.value.issues['issueCount'] == 1
            assert e.value.issues['stageIssues'][origin.instance_name][0]['message'].find('ELASTICSEARCH_47') != -1

    finally:
        elasticsearch.client.delete_index(index)
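
The stage_attributes fixture that drives test_use_security is not shown; it is most likely supplied by a parametrized decorator along these lines (an assumption about the harness, not its actual code):

import pytest

# Hypothetical parametrization: run the test once with security enabled and once with it disabled.
@pytest.mark.parametrize('stage_attributes', [
    {'use_security': True},
    {'use_security': False},
])
def test_use_security(sdc_builder, sdc_executor, elasticsearch, stage_attributes):
    ...
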
def hive_check(cluster, sdc_builder):
    # based on SDC-13915
    if (isinstance(cluster, AmbariCluster)
            and Version(cluster.version) == Version('3.1')
            and Version(sdc_builder.version) < Version('3.8.1')):
        pytest.skip(
            'Hive stages not available on HDP 3.1.0.0 for SDC versions before 3.8.1'
        )
def test_field_decrypt(sdc_builder, sdc_executor, aws):
    """Basic test to verify Encrypt and Decrypt Fields processor can decrypt a field.
    An encrypted field is sent and after pipeline is run, verification of decryption is done using wiretap.

    ciphertext is a byte array, but raw data source provides no way to specify a byte array.
    Hence a base64 encoded string of the ciphertext is used.
    Once it has been loaded by the raw data source, it needs to be decoded back into a byte array
    for input to the encryption processor.
    The base64 decode processor requires a byte array to decode instead of a string,
    hence the field type converter.
    (https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Base64Decoder.html#concept_ujj_spy_kv)

    The pipeline looks like:
        dev_raw_data_source >> field_type_converter >> base64_decoder >> field_decrypt >> wiretap
    """
    expected_plaintext = MESSAGE_TEXT.encode()

    ciphertext, _ = aws.encrypt(expected_plaintext)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=json.dumps({'message': base64.b64encode(ciphertext).decode()}),
                                       stop_after_first_batch=True)

    field_type_converter = pipeline_builder.add_stage('Field Type Converter', type='processor')
    field_type_converter_configs = [{'fields': ['/message'], 'targetType': 'BYTE_ARRAY'}]
    field_type_converter.set_attributes(conversion_method='BY_FIELD',
                                        field_type_converter_configs=field_type_converter_configs)

    base64_decoder = pipeline_builder.add_stage('Base64 Field Decoder', type='processor')
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_decoder.set_attributes(field_to_decode='/message', target_field='/message')
    else:
        base64_decoder.set_attributes(
            fields_to_decode=[{'originFieldPath': '/message', 'resultFieldPath': '/message'}]
        )

    field_decrypt = pipeline_builder.add_stage('Encrypt and Decrypt Fields', type='processor')
    field_decrypt.set_attributes(cipher='ALG_AES_256_GCM_IV12_TAG16_HKDF_SHA384_ECDSA_P384',
                                 fields=['/message'],
                                 frame_size=4096,
                                 mode='DECRYPT')

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_decoder >> field_decrypt >> wiretap.destination
    pipeline = pipeline_builder.build('Field Decryption Pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    actual_value = wiretap.output_records[0].get_field_data('/message')
    assert actual_value == expected_plaintext
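
The docstring's point about shuttling the ciphertext through a string field can be shown with plain standard-library calls: the encrypted bytes must become a Base64 string to fit into the JSON raw data, and the Base64 Field Decoder later restores the original bytes. A minimal sketch, independent of any SDC API:

import base64
import json

ciphertext = b'\x01\x02\xfe\xff'                    # stand-in for the AWS-encrypted bytes
as_text = base64.b64encode(ciphertext).decode()     # safe to embed in the JSON raw data
payload = json.dumps({'message': as_text})          # what dev_raw_data_source is given
restored = base64.b64decode(json.loads(payload)['message'])
assert restored == ciphertext                       # what the Base64 Field Decoder recovers
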
def test_topic_list(sdc_builder, sdc_executor, cluster):
    MESSAGE = 'Hello World from SDC & DPM!'
    EXPECTED = {'text': 'Hello World from SDC & DPM!'}

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()

    topic_name = get_random_string()
    kafka_multitopic_consumer = builder.add_stage(
        'Kafka Multitopic Consumer',
        library=cluster.kafka.standalone_stage_lib)

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.set_attributes(batch_wait_time_in_ms=2000,
                                                 data_format='TEXT',
                                                 topic_list=[topic_name])
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.set_attributes(auto_offset_reset='EARLIEST',
                                                 batch_wait_time_in_ms=2000,
                                                 data_format='TEXT',
                                                 topic_list=[topic_name])

    wiretap = builder.add_wiretap()
    kafka_multitopic_consumer >> wiretap.destination
    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish messages to Kafka and verify using wiretap if the same messages are received.
        producer = cluster.kafka.producer()
        producer.send(topic_name, MESSAGE.encode())

        # Start Pipeline.
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              1,
                                              timeout_sec=120)
        sdc_executor.stop_pipeline(pipeline)

        # Verify wiretap data.
        records = [record.field for record in wiretap.output_records]
        assert [EXPECTED] == records
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_topic_list(sdc_builder, sdc_executor, cluster):
    MESSAGE = 'Hello World from SDC & DPM!'
    EXPECTED = {'text': 'Hello World from SDC & DPM!'}

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()

    topic_name = get_random_string()
    kafka_multitopic_consumer = builder.add_stage(
        'Kafka Multitopic Consumer',
        library=cluster.kafka.standalone_stage_lib)

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.set_attributes(batch_wait_time_in_ms=2000,
                                                 data_format='TEXT',
                                                 topic_list=[topic_name])
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.set_attributes(auto_offset_reset='EARLIEST',
                                                 batch_wait_time_in_ms=2000,
                                                 data_format='TEXT',
                                                 topic_list=[topic_name])

    trash = builder.add_stage(label='Trash')
    kafka_multitopic_consumer >> trash
    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish messages to Kafka and verify using snapshot if the same messages are received.
        producer = cluster.kafka.producer()
        producer.send(topic_name, MESSAGE.encode())
        # Start Pipeline.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot

        # Verify snapshot data.
        records = [
            record.field
            for record in snapshot[kafka_multitopic_consumer].output
        ]
        assert [EXPECTED] == records
    finally:
        sdc_executor.stop_pipeline(pipeline)
Example #10
def test_keystore_file(sdc_builder, sdc_executor, stage_attributes):
    """Test "KeyStore path" config parameter. It is tested with two values, one pointing to a real KeyStore file
    and the other to an unexisting file. We check a TLS_01 error is raised for the unexisting file and that
    the pipeline successfully transitions to RUNNING state if the file exists.

    Pipeline:
      rest_srv >> trash

    """
    builder = sdc_builder.get_pipeline_builder()
    rest_srv = builder.add_stage('REST Service')

    if Version('3.16.0') <= Version(sdc_builder.version) < Version('3.17.0'):
        list_of_application_ids = [{"appId": 'admin'}]
        rest_srv.set_attributes(
            list_of_application_ids=list_of_application_ids)
    elif Version(sdc_builder.version) >= Version('3.17.0'):
        list_of_application_ids = [{"credential": 'admin'}]
        rest_srv.set_attributes(
            list_of_application_ids=list_of_application_ids)
    else:
        app_id = 'admin'
        rest_srv.set_attributes(application_id=app_id)

    rest_srv.set_attributes(keystore_type=KEYSTORE_TYPE,
                            keystore_password=KEYSTORE_PASSWORD,
                            **stage_attributes)

    trash = builder.add_stage('Trash')
    rest_srv >> trash

    pipeline = builder.build()
    sdc_executor.add_pipeline(pipeline)

    if stage_attributes['keystore_file'] == KEYSTORE_FILE_PATH:
        # Expecting SDC to load the KeyStore and successfully start running the pipeline.
        sdc_executor.start_pipeline(pipeline).wait_for_status(status='RUNNING')
        sdc_executor.stop_pipeline(pipeline)
    else:
        # Expecting a StartError from SDC due to the nonexistent KeyStore file (TLS_01 error).
        with pytest.raises(StartError) as e:
            sdc_executor.start_pipeline(pipeline).wait_for_status(
                status='RUNNING')
        assert e.value.message.startswith('TLS_01')
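
The constants and the stage_attributes parametrization backing test_keystore_file live elsewhere in the module; they presumably look roughly like this (paths, values and the use_tls flag are placeholder assumptions):

import pytest

# Assumed supporting definitions for test_keystore_file (illustrative values only).
KEYSTORE_TYPE = 'JKS'
KEYSTORE_PASSWORD = 'password'
KEYSTORE_FILE_PATH = 'resources/tls/keystore.jks'

@pytest.mark.parametrize('stage_attributes', [
    {'use_tls': True, 'keystore_file': KEYSTORE_FILE_PATH},         # existing file: pipeline reaches RUNNING
    {'use_tls': True, 'keystore_file': 'no/such/keystore.jks'},     # missing file: StartError with TLS_01
])
def test_keystore_file(sdc_builder, sdc_executor, stage_attributes):
    ...
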
Example #11
def test_invalid_execution_mode(sdc_executor, pipeline):
    """Set executionMode to invalid value for a pipeline,
       try starting it and confirm that it raises expected exception."""
    pipeline.configuration['executionMode'] = 'Invalid_Execution_Mode'
    pipeline.id = 'Invalid_Execution_Mode Pipeline'

    try:
        sdc_executor.add_pipeline(pipeline)
        # Do a version check since execution_mode handling changed starting in the 2.7.0.0 version.
        if Version(sdc_executor.version) >= Version('2.7.0.0'):
            with pytest.raises(ValidationError):
                sdc_executor.dump_log_on_error = False
                sdc_executor.start_pipeline(pipeline)
        else:
            with pytest.raises(sdc_api.StartError):
                sdc_executor.dump_log_on_error = False
                sdc_executor.start_pipeline(pipeline)
    finally:
        sdc_executor.dump_log_on_error = True
Example #12
def test_base64_field_encoder(sdc_builder, sdc_executor):
    """Test Base64 Field Encoder processor. Since this processor accepts a byte array, we use a Field Type
    Converter processor which will help convert the raw input string to byte array.
    The pipeline would look like:

        dev_raw_data_source >> field_type_converter >> base64_field_encoder >> wiretap
    """
    raw_data = 'hello there!'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data, stop_after_first_batch=True)
    field_type_converter = pipeline_builder.add_stage('Field Type Converter')
    field_type_converter.set_attributes(conversion_method='BY_FIELD',
                                        field_type_converter_configs=[
                                            {'fields': ['/text'], 'targetType': 'BYTE_ARRAY'}
                                        ])
    base64_field_encoder = pipeline_builder.add_stage('Base64 Field Encoder', type='processor')
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_field_encoder.set_attributes(field_to_encode='/text', target_field='/result', url_safe=True)
    else:
        base64_field_encoder.set_attributes(
            fields_to_encode=[{'originFieldPath': '/text', 'resultFieldPath': '/result'}],
            url_safe=True
        )

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_field_encoder >> wiretap.destination
    pipeline = pipeline_builder.build('Base64 Encoder pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    result_data = wiretap.output_records[0].field['result'].value
    # result_data holds the bytes produced by the Base64 Field Encoder stage, i.e. the Base64 encoding of raw_data,
    # so we compare it against our own Base64 encoding of the raw input
    assert base64.b64encode(raw_data.encode()) == result_data
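
The assertion above uses base64.b64encode even though the stage is configured with url_safe=True. That is harmless for this input, because the URL-safe alphabet only differs in the '+' and '/' characters and neither occurs in the encoding of 'hello there!'. A quick standard-library check:

import base64

raw = 'hello there!'.encode()
# Both alphabets produce the same output here, so comparing against b64encode is valid.
assert base64.urlsafe_b64encode(raw) == base64.b64encode(raw) == b'aGVsbG8gdGhlcmUh'
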
def test_control_hub_api_processor(sdc_builder, sdc_executor):
    """Test Control Hub API Processor. The pipeline would look like:

        dev_raw_data_source >> control_hub_api_processor >> wiretap

    Call the Control Hub API "https://cloud.streamsets.com/public-rest/v1/health" using the Control Hub API Processor
    and update the output field with the response.
    """

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.stop_after_first_batch = True

    control_hub_api_processor = pipeline_builder.add_stage('Control Hub API')
    control_hub_api_processor.control_hub_api_url = 'https://cloud.streamsets.com/public-rest/v1/health'
    control_hub_api_processor.output_field = "/"
    control_hub_api_processor.control_hub_user_name = "user"
    control_hub_api_processor.password = "******"
    if Version(sdc_builder.version) >= Version('4.0.0'):
        control_hub_api_processor.authentication_type = 'USER_PASSWORD'

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> control_hub_api_processor >> wiretap.destination

    pipeline = pipeline_builder.build('Control Hub API Processor Sample Pipeline')
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.validate_pipeline(pipeline)

    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # Assert the Control Hub API response captured in the output record
    assert len(wiretap.output_records) == 1
    assert len(wiretap.error_records) == 0
    assert wiretap.output_records[0].field['alive'].value is True
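
The assertion on field['alive'] implies that the health endpoint returns a JSON body containing an alive flag, so for this test the processor call is roughly equivalent to the direct HTTP request below (a sketch that assumes the endpoint is reachable and keeps that response shape):

import requests

# Roughly what the Control Hub API processor does here (response shape assumed from the assertion above).
response = requests.get('https://cloud.streamsets.com/public-rest/v1/health')
assert response.status_code == 200
assert response.json()['alive'] is True
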
Example #14
def test_base64_field_decoder(sdc_builder, sdc_executor):
    """Test Base64 Field Decoder processor. Since this processor accepts a Base64 encoded byte array, we use
    intermediate Field Type Converter processor for converting our Base64 string to byte array.
    The pipeline would look like:

        dev_raw_data_source >> field_type_converter >> base64_field_decoder >> wiretap
    """
    # input raw_data is a Base64 encoded string
    normal_string = 'hello there!'.encode()
    raw_data = base64.b64encode(normal_string).decode()

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data, stop_after_first_batch=True)
    field_type_converter = pipeline_builder.add_stage('Field Type Converter')
    field_type_converter.set_attributes(conversion_method='BY_FIELD',
                                        field_type_converter_configs=[{'fields': ['/text'],
                                                                       'targetType': 'BYTE_ARRAY'}])
    base64_field_decoder = pipeline_builder.add_stage('Base64 Field Decoder', type='processor')
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_field_decoder.set_attributes(field_to_decode='/text', target_field='/result')
    else:
        base64_field_decoder.set_attributes(
            fields_to_decode=[{'originFieldPath': '/text', 'resultFieldPath': '/result'}]
        )

    wiretap = pipeline_builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_field_decoder >> wiretap.destination
    pipeline = pipeline_builder.build('Base64 Decoder pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    result_data = wiretap.output_records[0].field['result'].value
    # the decoder stage restores the original bytes, so we compare against the value from before encoding
    assert normal_string == result_data
Example #15
def test_topic(sdc_builder, sdc_executor, cluster, stage_attributes):
    topic = get_random_string()
    logger.debug('Kafka topic name: %s', topic)

    DATA = ['Hello World!' for _ in range(7)]
    raw_data = '\n'.join(DATA)

    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              raw_data=raw_data)
    kafka_destination = builder.add_stage(
        'Kafka Producer', library=cluster.kafka.standalone_stage_lib)
    if Version(sdc_builder.version) >= Version('3.19'):
        if 'provide_keytab' in stage_attributes:
            stage_attributes[
                'provide_keytab_at_runtime'] = stage_attributes.pop(
                    'provide_keytab')
    kafka_destination.set_attributes(topic=topic,
                                     data_format='TEXT',
                                     **stage_attributes)
    pipeline_finisher = builder.add_stage('Pipeline Finisher Executor')
    dev_raw_data_source >> [kafka_destination, pipeline_finisher]
    pipeline = builder.build().configure_for_environment(cluster)

    # Specify timeout so that iteration of consumer is stopped after that time and
    # specify auto_offset_reset to get messages from beginning.
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000,
                                      auto_offset_reset='earliest')
    consumer.subscribe([topic])

    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline)

    messages = [message.value.decode().strip() for message in consumer]
    assert messages == DATA
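
cluster.kafka.consumer(...) wraps a standard Kafka consumer; outside the test environment the same read-back would look roughly like this with kafka-python (the broker address and the choice of library are assumptions):

from kafka import KafkaConsumer

# Hypothetical standalone equivalent of cluster.kafka.consumer(...): stop iterating after 5 s of silence
# and start from the earliest offset so messages produced before subscribing are still seen.
consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                         consumer_timeout_ms=5000,
                         auto_offset_reset='earliest')
consumer.subscribe(['some_topic'])
messages = [message.value.decode().strip() for message in consumer]
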
Example #16
def test_principal(sdc_builder,
                   sdc_executor,
                   cluster,
                   stage_attributes,
                   keytab_format=ENCODED_KEYTAB_CONTENTS):
    if not cluster.kafka.is_kerberized:
        pytest.skip('Test runs only if Kafka is kerberized')
    cloudera_streamsets = getattr(cluster, 'streamsets')
    if keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        if not cloudera_streamsets.credential_stores:
            pytest.skip(
                'Test with credential function runs only if credential store was enabled'
            )

    if keytab_format in [CREDENTIAL_FUNCTION_WITH_GROUP]:
        azure_keyvault = cloudera_streamsets.credential_stores.get('azure')
        if not azure_keyvault or not azure_keyvault.enforce_entry_group:
            pytest.skip(
                'Test with credential function with enforce group runs only'
                ' if enforceEntryGroup was set to True')

    encoded_keytabs_for_stages = getattr(cluster.kafka,
                                         'encoded_keytabs_for_stages', None)
    keytab_for_stage = (encoded_keytabs_for_stages.get('Kafka Producer')
                        if encoded_keytabs_for_stages else None)
    if not keytab_for_stage:
        pytest.skip(
            'Test runs only if --stage-keytab argument is provided for `Kafka Producer` stage'
        )
    if keytab_format == ENCODED_KEYTAB_CONTENTS:
        keytab_value = keytab_for_stage.base64_encoded_keytab_contents
    elif keytab_format in [
            CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP
    ]:
        keytab_value = keytab_for_stage.credential_function_for_keytab

    # Run the pipeline and verify it works as expected.
    topic = get_random_string()
    logger.debug('Kafka topic name: %s', topic)

    DATA = ['Hello World!' for _ in range(7)]
    raw_data = '\n'.join(DATA)

    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              raw_data=raw_data)
    kafka_destination = builder.add_stage(
        'Kafka Producer', library=cluster.kafka.standalone_stage_lib)

    if Version(sdc_builder.version) < Version('3.19'):
        stage_attributes.update({
            'keytab': keytab_value,
            'principal': keytab_for_stage.principal
        })
    else:
        if 'provide_keytab' in stage_attributes:
            stage_attributes[
                'provide_keytab_at_runtime'] = stage_attributes.pop(
                    'provide_keytab')
        stage_attributes.update({
            'runtime_keytab': keytab_value,
            'runtime_principal': keytab_for_stage.principal
        })
    kafka_destination.set_attributes(data_format='TEXT',
                                     topic=topic,
                                     **stage_attributes)
    pipeline_finisher = builder.add_stage('Pipeline Finisher Executor')
    dev_raw_data_source >> [kafka_destination, pipeline_finisher]
    pipeline = builder.build().configure_for_environment(cluster)

    # Specify timeout so that iteration of consumer is stopped after that time and
    # specify auto_offset_reset to get messages from beginning.
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000,
                                      auto_offset_reset='earliest')
    consumer.subscribe([topic])

    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline)

    messages = [message.value.decode().strip() for message in consumer]
    assert messages == DATA
Example #17
def test_kafka_origin_batch_max_size(sdc_builder, sdc_executor, cluster):
    """Check that retrieving messages from Kafka using Kafka Multitopic Consumer respects both the Batch Max Wait Time
    and the Max Batch Size. Batches are sent when the first of the two conditions is met. This test is checking that
    the Max Batch Size condition is first met.

    Kafka Multitopic Consumer Origin pipeline with standalone mode:
        kafka_multitopic_consumer >> trash
    """

    messages = [f'message{i}' for i in range(1, 21)]
    expected = [f'message{i}' for i in range(1, 21)]

    num_batches = 2
    kafka_consumer_group = get_random_string(string.ascii_letters, 10)

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(
        builder, cluster)

    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                cluster, messages, 'TEXT')

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    kafka_multitopic_consumer.set_attributes(
        consumer_group=kafka_consumer_group,
        max_batch_size_in_records=10,
        batch_wait_time_in_ms=30000)

    trash = builder.add_stage(label='Trash')
    kafka_multitopic_consumer >> trash
    kafka_consumer_pipeline = builder.build(title='Kafka Multitopic pipeline Maximum batch size threshold')\
        .configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration['shouldRetry'] = False
    kafka_consumer_pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(kafka_consumer_pipeline)

    try:
        # Check that the Max Batch Size condition is reached: the messages were already published above,
        # so start the pipeline and verify via snapshot that all of them are received.
        snapshot = sdc_executor.capture_snapshot(kafka_consumer_pipeline,
                                                 start_pipeline=True,
                                                 batches=num_batches,
                                                 batch_size=10).snapshot
        records_fields = []

        for snapshot_batch in snapshot.snapshot_batches:
            for value in snapshot_batch[kafka_consumer_pipeline[0].
                                        instance_name].output_lanes.values():
                for record in value:
                    records_fields.append(str(record.field['text']))

        assert expected == records_fields

    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline, force=True)
def test_security_username_and_password(sdc_builder, sdc_executor,
                                        elasticsearch, stage_attributes):
    """
    To test the username and password configurations we create a pipeline as follows:

    Elasticsearch >> Wiretap

    Then we check different combinations of valid/invalid/empty username/password configuration values.
    We expect no errors when the username and password are not empty and are valid.
    We verify that an appropriate error happens when an invalid/empty username and/or password are set.
    """

    if stage_attributes['with_valid_password'] is None:
        password = ''
    elif stage_attributes['with_valid_password']:
        password = elasticsearch.password
    else:
        password = get_random_string()

    if stage_attributes['with_valid_username'] is None:
        username = ''
    elif stage_attributes['with_valid_username']:
        username = elasticsearch.username
    else:
        username = get_random_string()

    index = get_random_string(string.ascii_lowercase)
    doc_id = get_random_string(string.ascii_lowercase)

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Elasticsearch', type='origin')
    origin.query = '{"query": {"match_all": {}}}'
    origin.index = index

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(elasticsearch)

    configured_origin = pipeline.stages.get(label=origin.label)

    if Version(sdc_builder.version) < Version('3.17.0'):
        configured_origin.configuration[
            'conf.securityConfig.securityUser'] = f'{username}:{password}'
    else:
        configured_origin.user_name = username
        configured_origin.password = password

    sdc_executor.add_pipeline(pipeline)

    if stage_attributes['error_code'] is None:

        elasticsearch.client.create_document(index=index,
                                             id=doc_id,
                                             body={"number": 1})

        try:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.output_records) == 1

            record = wiretap.output_records[0]
            assert record.field['_index'] == index
            assert record.field['_id'] == doc_id
            assert record.field['_source'] == {"number": 1}

        finally:
            elasticsearch.client.delete_index(index)

    else:
        with pytest.raises(ValidationError) as e:
            sdc_executor.validate_pipeline(pipeline)

        assert e.value.issues['issueCount'] == 1
        assert e.value.issues['stageIssues'][
            origin.instance_name][0]['message'].find(
                stage_attributes['error_code']) != -1
Example #19
def test_jdbc_multitable_consumer_to_hive(sdc_builder, sdc_executor, database,
                                          cluster, table_name_characters,
                                          table_name_length):
    """Validate an end to end case of reading Multi-tables from JDBC source and making sure they are
    written to Hadoop FS. We use Hive Metadata processor for drift synchronization. The pipeline looks like:

        jdbc_multitable_consumer >= pipeline_finisher_executor
        jdbc_multitable_consumer >> expression_evaluator >> field_remover >> hive_metadata
        hive_metadata >> hadoop_fs
        hive_metadata >> hive_metastore

    Note: The numeric fixture of this test fails until SDC-6766 is addressed.
    """
    # based on SDC-13915
    if (isinstance(cluster, AmbariCluster)
            and Version(cluster.version) == Version('3.1')
            and Version(sdc_builder.version) < Version('3.8.1')):
        pytest.skip(
            'Hive stages not available on HDP 3.1.0.0 for SDC versions before 3.8.1'
        )

    # Generate two random strings to use when naming the DB tables at the source.
    src_table_suffix = get_random_string(
        string.ascii_lowercase,
        6)  # lowercase for db compatibility (e.g. PostgreSQL)
    random_table_name_1 = '{}_{}'.format(
        get_random_string(table_name_characters, table_name_length),
        src_table_suffix)
    random_table_name_2 = '{}_{}'.format(
        get_random_string(table_name_characters, table_name_length),
        src_table_suffix)

    # build the pipeline
    pipeline_builder = sdc_builder.get_pipeline_builder()
    jdbc_multitable_consumer = pipeline_builder.add_stage(
        'JDBC Multitable Consumer')
    jdbc_multitable_consumer.set_attributes(
        table_configuration=[{
            'tablePattern': f'%{src_table_suffix}'
        }])
    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet':
        'database',
        'headerAttributeExpression':
        f'{database.database}'
    }, {
        'attributeToSet':
        'dt',
        'headerAttributeExpression':
        "${record:value('/dt')}"
    }, {
        'attributeToSet':
        'table_name',
        'headerAttributeExpression':
        "${record:attribute('jdbc.tables')}"
    }]
    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.fields = ["/dt"]
    hive_metadata = pipeline_builder.add_stage('Hive Metadata')
    hive_metadata.set_attributes(
        data_format='AVRO',
        database_expression="${record:attribute('database')}",
        decimal_precision_expression=(
            "${record:attribute(str:concat(str:concat("
            "'jdbc.', field:field()), '.precision'))}"),
        decimal_scale_expression=("${record:attribute(str:concat(str:concat("
                                  "'jdbc.', field:field()), '.scale'))}"),
        table_name="${record:attribute('table_name')}")
    hadoop_fs = pipeline_builder.add_stage('Hadoop FS', type='destination')
    hadoop_fs.set_attributes(avro_schema_location='HEADER',
                             data_format='AVRO',
                             directory_in_header=True,
                             file_type='TEXT',
                             files_prefix='sdc-${sdc:id()}',
                             files_suffix='avro',
                             max_file_size=0,
                             max_records_in_file=0,
                             roll_attribute_name='roll',
                             use_roll_attribute=True)
    hive_metastore = pipeline_builder.add_stage('Hive Metastore',
                                                type='destination')
    hive_metastore.set_attributes(stored_as_avro=True)
    pipeline_finisher_executor = pipeline_builder.add_stage(
        'Pipeline Finisher Executor')

    jdbc_multitable_consumer >= pipeline_finisher_executor
    jdbc_multitable_consumer >> expression_evaluator >> field_remover >> hive_metadata
    hive_metadata >> hadoop_fs
    hive_metadata >> hive_metastore
    pipeline = pipeline_builder.build(
        title='Multi-table consumer to Hive').configure_for_environment(
            cluster, database)
    sdc_executor.add_pipeline(pipeline)

    tables = []
    try:
        # create table and load data in the JDBC database
        for table_name in (random_table_name_1, random_table_name_2):
            logger.info('Creating table %s in %s database ...', table_name,
                        database.type)
            table = sqlalchemy.Table(
                table_name, sqlalchemy.MetaData(),
                sqlalchemy.Column('event_id',
                                  sqlalchemy.Integer,
                                  primary_key=True),
                sqlalchemy.Column('order_id', sqlalchemy.Integer),
                sqlalchemy.Column('event_type', sqlalchemy.String(32)),
                sqlalchemy.Column('dt', sqlalchemy.String(20)))
            table.create(database.engine)
            tables.append(table)
            rows = [{
                'event_id': 1,
                'order_id': 123,
                'event_type': 'SHIPPED',
                'dt': '2017-07-13'
            }, {
                'event_id': 2,
                'order_id': 234,
                'event_type': 'ARRIVED',
                'dt': '2017-07-13'
            }, {
                'event_id': 3,
                'order_id': 345,
                'event_type': 'READY',
                'dt': '2017-07-13'
            }]
            logger.info('Adding %s rows to %s of %s database ...', len(rows),
                        table_name, database.type)
            connection = database.engine.connect()
            connection.execute(table.insert(), rows)

        # run the pipeline and wait for it to finish (the Pipeline Finisher stops it once the tables are read)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # Check that the data shows up in Hive.
        hive_cursor = cluster.hive.client.cursor()
        for table in tables:
            table_name = table.name if not database.type == 'Oracle' else table.name.upper()
            logger.info('Asserting table %s', table_name)
            hive_cursor.execute(f'SELECT * from `{table_name}`')
            hive_values = [list(row) for row in hive_cursor.fetchall()]
            raw_values = [list(row.values()) for row in rows]
            assert sorted(hive_values) == sorted(raw_values)
    finally:
        for table in tables:
            table_name = table.name if not database.type == 'Oracle' else table.name.upper()
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            table.drop(database.engine)
            logger.info('Dropping table %s in Hive ...', table_name)
            hive_cursor.execute(f'DROP TABLE `{table_name}`')
def test_start_pipeline_processor(sdc_builder, sdc_executor):
    """Test Start Pipeline Origin/Processor. The pipeline would look like:

        start_pipeline1 >> start_pipeline2 >> pipeline_finisher

    Chain pipeline execution using the Start Pipeline orchestrator stages. The start_pipeline1 origin starts
    pipeline1, waits until it completes execution, and then starts pipeline2.
    """

    start_pipeline_stage_label = 'Start Pipeline'
    metrics_output_generated = False
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline_stage_label = 'Start Pipelines'
        metrics_output_generated = True

    # Pipeline - pipeline1
    pipeline1 = _create_batch_pipeline(sdc_builder,
                                       'test_start_pipeline_processor1')
    sdc_executor.add_pipeline(pipeline1)

    # Pipeline - pipeline2
    unique_title = str(uuid.uuid4())
    pipeline2 = _create_batch_pipeline(sdc_builder, unique_title)
    sdc_executor.add_pipeline(pipeline2)

    # Chain Pipeline Execution Sample (start_pipeline1 >> start_pipeline2 >> pipeline_finisher)
    pipeline_builder = sdc_builder.get_pipeline_builder()
    start_pipeline1 = pipeline_builder.add_stage(start_pipeline_stage_label,
                                                 type='origin')
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline1.task_name = 'task1'
    else:
        start_pipeline1.unique_task_name = 'task1'
    start_pipeline1.pipelines = [{
        'pipelineIdType': 'ID',
        'pipelineId': pipeline1.id
    }]

    start_pipeline2 = pipeline_builder.add_stage(start_pipeline_stage_label,
                                                 type='processor')
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline2.task_name = 'task2'
    else:
        start_pipeline2.unique_task_name = 'task2'
    start_pipeline2.pipelines = [{
        'pipelineIdType': 'TITLE',
        'pipelineId': unique_title
    }]

    pipeline_finisher = pipeline_builder.add_stage(
        'Pipeline Finisher Executor')

    start_pipeline1 >> start_pipeline2 >> pipeline_finisher
    pipeline = pipeline_builder.build('Chain Pipeline Execution Sample')
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.validate_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline,
                                             start_pipeline=True).snapshot

    # Assert start_pipeline1 record output
    start_pipeline1_output = snapshot[start_pipeline1.instance_name].output
    assert len(start_pipeline1_output) == 1
    _validate_start_pipeline_output(
        start_pipeline1_output[0].field['orchestratorTasks'], 'task1',
        pipeline1, True, metrics_output_generated)

    # Assert start_pipeline2 record output - start_pipeline2 output should contain output of both pipelines
    start_pipeline2_output = snapshot[start_pipeline2.instance_name].output
    assert len(start_pipeline2_output) == 1
    _validate_start_pipeline_output(
        start_pipeline2_output[0].field['orchestratorTasks'], 'task1',
        pipeline1, True, metrics_output_generated)
    _validate_start_pipeline_output(
        start_pipeline2_output[0].field['orchestratorTasks'], 'task2',
        pipeline2, True, metrics_output_generated)
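
The _create_batch_pipeline helper used by this orchestration test is not included in the snippet. A sketch of what a single-batch pipeline builder might look like, following the Dev Raw Data Source pattern used throughout these examples (the stage wiring and raw data are assumptions):

def _create_batch_pipeline(sdc_builder, title):
    # Hypothetical sketch: a pipeline that emits one small batch and then finishes on its own.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data='Hello World!',
                                       stop_after_first_batch=True)
    trash = pipeline_builder.add_stage('Trash')
    dev_raw_data_source >> trash
    return pipeline_builder.build(title)
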
def test_principal(sdc_builder,
                   sdc_executor,
                   cluster,
                   stage_attributes,
                   keytab_format=ENCODED_KEYTAB_CONTENTS):
    if not cluster.kafka.is_kerberized:
        pytest.skip('Test runs only if Kafka is kerberized')
    cloudera_streamsets = getattr(cluster, 'streamsets')
    if keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        if not cloudera_streamsets.credential_stores:
            pytest.skip(
                'Test with credential function runs only if credential store was enabled'
            )

    if keytab_format in [CREDENTIAL_FUNCTION_WITH_GROUP]:
        azure_keyvault = cloudera_streamsets.credential_stores.get('azure')
        if not azure_keyvault or not azure_keyvault.enforce_entry_group:
            pytest.skip(
                'Test with credential function with enforce group runs only'
                ' if enforceEntryGroup was set to True')

    encoded_keytabs_for_stages = getattr(cluster.kafka,
                                         'encoded_keytabs_for_stages', None)
    keytab_for_stage = (encoded_keytabs_for_stages.get('Kafka Consumer')
                        if encoded_keytabs_for_stages else None)
    if not keytab_for_stage:
        pytest.skip(
            'Test runs only if --stage-keytab argument is provided for `Kafka Consumer` stage'
        )

    if keytab_format == ENCODED_KEYTAB_CONTENTS:
        keytab_value = keytab_for_stage.base64_encoded_keytab_contents
    elif keytab_format in [
            CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP
    ]:
        keytab_value = keytab_for_stage.credential_function_for_keytab

    MESSAGE = 'Hello World from SDC & DPM!'
    EXPECTED = {'text': 'Hello World from SDC & DPM!'}

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    topic_name = get_random_string()
    kafka_consumer = builder.add_stage(
        'Kafka Consumer', library=cluster.kafka.standalone_stage_lib)

    if Version(sdc_builder.version) < Version('3.19'):
        stage_attributes.update({
            'keytab': keytab_value,
            'principal': keytab_for_stage.principal
        })
    else:
        if 'provide_keytab' in stage_attributes:
            stage_attributes[
                'provide_keytab_at_runtime'] = stage_attributes.pop(
                    'provide_keytab')
        stage_attributes.update({
            'runtime_keytab': keytab_value,
            'runtime_principal': keytab_for_stage.principal
        })
    # Default stage configuration.
    kafka_consumer.set_attributes(auto_offset_reset='EARLIEST',
                                  batch_wait_time_in_ms=20000,
                                  data_format='TEXT',
                                  topic=topic_name,
                                  **stage_attributes)

    wiretap = builder.add_wiretap()
    kafka_consumer >> wiretap.destination
    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish messages to Kafka and verify using wiretap if the same messages are received.
        producer = cluster.kafka.producer()
        producer.send(topic_name, MESSAGE.encode())

        # Start Pipeline.
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              1,
                                              timeout_sec=120)
        sdc_executor.stop_pipeline(pipeline)

        # Verify wiretap data.
        records = [record.field for record in wiretap.output_records]
        assert [EXPECTED] == records
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_kafka_origin_not_saving_offset(sdc_builder, sdc_executor, cluster):
    """Ensure that we read all the data, even when a pipeline fails - thus no records are "auto committed". The test
       runs the same pipeline twice - once with failure and second time with success and ensures that the second run
       see all the records.

       The pipeline reads from Kafka and uses delay processor to model longer processing time (so that Kafka's auto
       commit takes place) and then jython processor to generate pipeline failure (1/0).
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    origin = get_kafka_multitopic_consumer_stage(builder, cluster)
    origin.topic_list = [topic]
    origin.consumer_group = get_random_string(string.ascii_letters, 10)
    origin.batch_wait_time_in_ms = 100

    if Version(sdc_builder.version) < Version('3.7.0'):
        origin.configuration_properties = [{
            'key': 'auto.offset.reset',
            'value': 'earliest'
        }]
    else:
        origin.auto_offset_reset = 'EARLIEST'

    delay = builder.add_stage('Delay')
    delay.delay_between_batches = 5 * 1000

    script = builder.add_stage('Jython Evaluator', type='processor')
    script.script = """1/${DIVISOR}
for record in sdc.records:
  try:
      sdc.output.write(record)
  except Exception as e:
      sdc.error.write(record, str(e))
"""

    wiretap = builder.add_wiretap()

    origin >> delay >> script >> wiretap.destination

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'
    pipeline.add_parameters(DIVISOR='0')

    sdc_executor.add_pipeline(pipeline)

    # Produce one message
    producer = cluster.kafka.producer()
    producer.send(topic, 'Super Secret Message'.encode())
    producer.flush()

    try:
        # Start our pipeline - it should fail
        sdc_executor.start_pipeline(pipeline,
                                    runtime_parameters={
                                        'DIVISOR': 0
                                    }).wait_for_status('RUN_ERROR',
                                                       ignore_errors=True)

        # Add a second message so that the topic has at least one new message; this way reading from an older
        # offset won't time out but returns immediately.
        producer = cluster.kafka.producer()
        producer.send(topic, 'Not So Super Secret Message'.encode())
        producer.flush()

        # Now run the pipeline second time and it should succeed
        sdc_executor.start_pipeline(pipeline,
                                    runtime_parameters={'DIVISOR': 1})
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              2)

        # Now this should still read both records
        records = wiretap.output_records
        assert len(records) == 2
        assert records[0].field['text'] == 'Super Secret Message'
        assert records[1].field['text'] == 'Not So Super Secret Message'

    finally:
        sdc_executor.stop_pipeline(pipeline)
def test_kafka_origin_save_offset(sdc_builder, sdc_executor, cluster):
    """ Above SDC-10501 introduced a bug which does not commit offset when the number of records
    is less than the max batch size. This process 5 records for the 1st run, stop pipeline, and
    run again to process 3 records for the 2nd run. 2nd run should process 3 records as the offset
    should be saved after the 1st run.

    Kafka Multitopic Origin >> Trash (Run twice)
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(
        builder, cluster)
    kafka_multitopic_consumer.topic_list = [topic]
    kafka_multitopic_consumer.consumer_group = get_random_string(
        string.ascii_letters, 10)
    kafka_multitopic_consumer.batch_wait_time_in_ms = 100

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    wiretap = builder.add_wiretap()

    kafka_multitopic_consumer >> wiretap.destination

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(pipeline)

    # Produce 5 messages
    messages = [f'message{i}' for i in range(0, 5)]
    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                cluster, messages, 'TEXT')

    try:
        # Start the pipeline, read one batch and stop.
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count',
                                              5)
        sdc_executor.stop_pipeline(pipeline)

        # Check if the pipeline processed 5 records
        records = [
            f'{record.field["text"]}' for record in wiretap.output_records
        ]
        assert len(records) == 5
        assert sorted(messages) == sorted(records)

        # Produce another 3 messages
        messages2 = [f'message{i}' for i in range(5, 8)]
        produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                    cluster, messages2, 'TEXT')

        # Resetting wiretap to clean up data from previous runs
        wiretap.reset()

        # Run the pipeline second time
        sdc_executor.start_pipeline(pipeline)
        time.sleep(10)
        sdc_executor.stop_pipeline(pipeline)

        # The 2nd run should process only the 3 new records
        records2 = [
            f'{record.field["text"]}' for record in wiretap.output_records
        ]
        assert len(records2) == 3
        assert sorted(messages2) == sorted(records2)

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
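# Helper note: `produce_kafka_messages_list`, used throughout the Kafka tests above, is defined
# elsewhere in the test module. A minimal sketch of what such a helper might look like, assuming
# the `cluster` fixture exposes a kafka-python style `cluster.kafka.producer()` factory (the
# factory name and signature are assumptions, not the suite's actual helper):
def produce_kafka_messages_list(topic, cluster, message_list, data_format):
    """Publish every message in message_list to the given Kafka topic as UTF-8 encoded bytes."""
    # data_format is accepted for API parity; this sketch only covers the 'TEXT' case.
    producer = cluster.kafka.producer()
    for message in message_list:
        producer.send(topic, message.encode())
    producer.flush()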
def test_kudu_lookup_apply_default(sdc_builder, sdc_executor, cluster):
    """
    Test the case where a row matching the primary key is found, but the column the lookup processor
    needs to return has no value. When a default value is configured, it should be applied.

    dev_raw_data_source >> record_deduplicator >> kudu >> trash
    record_deduplicator >> to_error
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    tour_de_france_contenders = [dict(favorite_rank=1),
                                 dict(favorite_rank=2)]

    raw_data = ''.join([json.dumps(contender) for contender in tour_de_france_contenders])
    key_columns_mapping = [dict(field='/favorite_rank', columnName='rank')]
    column_to_output_field_mapping = [dict(columnName='name', field='/name', defaultValue=None),
                                      dict(columnName='wins', field='/wins', defaultValue='0')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu Lookup',
                             type='processor').set_attributes(kudu_masters=kudu_master_address,
                                                              kudu_table_name='{}.{}'.format('impala::default',
                                                                                             kudu_table_name),
                                                              key_columns_mapping=key_columns_mapping,
                                                              column_to_output_field_mapping=column_to_output_field_mapping,
                                                              case_sensitive=True,
                                                              ignore_missing_value=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')
    trash = builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> kudu >> trash
    record_deduplicator >> to_error

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('name', sqlalchemy.String),
                                            sqlalchemy.Column('wins', sqlalchemy.Integer),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        engine = cluster.kudu.engine
        tdf_contenders_table.create(engine)
        conn = engine.connect()
        conn.execute(tdf_contenders_table.insert(), [
            {'rank': 1, 'name': None, 'wins': None},
            {'rank': 2, 'name': None, 'wins': None}])

        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)
        for result in snapshot[kudu.instance_name].output:
            if Version(sdc_executor.version) >= Version('3.2.0.0'):
                assert 'name' not in result.field
            else:
                assert result.field['name'].value == 'None'
            assert int(result.field['wins'].value) == 0

    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
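# Module setup note: the Kudu tests above reference module-level names (logger, DEFAULT_KUDU_PORT,
# and imports such as json, string, sqlalchemy, pytest, Decimal) that live at the top of the test
# module. A minimal sketch of that assumed setup; 7051 is Kudu's default master RPC port, and the
# exact import paths for the test-framework helpers may differ between framework versions:
import json
import logging
import string
import time
from datetime import datetime
from decimal import Decimal

import pytest
import sqlalchemy
from streamsets.sdk.utils import Version  # assumed import path
from streamsets.testframework.utils import get_random_string  # assumed import path

logger = logging.getLogger(__name__)
DEFAULT_KUDU_PORT = 7051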
def test_kudu_lookup_decimal_type(sdc_builder, sdc_executor, cluster):
    """
    After inserting rows into a Kudu table that contains a decimal column, check that the decimal
    values are correctly retrieved by the Kudu Lookup processor.

    dev_raw_data_source >> kudu >> trash
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if Version(cluster.kudu.version) < Version('1.7.0'):
        pytest.skip(f'Test only designed to run on Kudu version >= 1.7.0, but found {cluster.kudu.version}')

    tour_de_france_contenders = [dict(rank=1, weight=150.58),
                                 dict(rank=2, weight=140.11)]

    raw_data = ''.join([json.dumps(contender) for contender in tour_de_france_contenders])
    key_columns_mapping = [dict(field='/rank', columnName='rank')]
    column_to_output_field_mapping = [dict(columnName='rank', field='/rank'),
                                      dict(columnName='weight', field='/weight', defaultValue='0')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu Lookup',
                             type='processor').set_attributes(kudu_masters=kudu_master_address,
                                                              kudu_table_name='{}.{}'.format('impala::default',
                                                                                             kudu_table_name),
                                                              key_columns_mapping=key_columns_mapping,
                                                              column_to_output_field_mapping=column_to_output_field_mapping,
                                                              case_sensitive=True,
                                                              ignore_missing_value=True)

    trash = builder.add_stage('Trash')

    dev_raw_data_source >> kudu >> trash

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('weight', sqlalchemy.DECIMAL(5,2)),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        engine = cluster.kudu.engine
        tdf_contenders_table.create(engine)
        conn = engine.connect()
        conn.execute(tdf_contenders_table.insert(), tour_de_france_contenders)

        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)
        for i, result in enumerate(snapshot[kudu.instance_name].output):
            assert result.field['weight'].value == round(Decimal(tour_de_france_contenders[i]['weight']), 2)

    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
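# Assertion note: in the decimal lookup test above the expected values are built with
# round(Decimal(<float>), 2). Constructing a Decimal from a float literal keeps the binary
# floating-point noise, and round() quantizes it back to two places, which is what the field
# read back from Kudu is compared against. A small self-contained illustration of that behaviour:
from decimal import Decimal

assert Decimal(150.58) != Decimal('150.58')             # float noise survives the Decimal() call
assert round(Decimal(150.58), 2) == Decimal('150.58')   # round() quantizes it to two places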
def test_kudu_destination_decimal_type(sdc_builder, sdc_executor, cluster):
    """Simple Dev Raw Data Source to Kudu pipeline inserting column of decimal type and checking later on
    decimal type is correctly stored by querying Kudu database

    dev_raw_data_source >> kudu
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if Version(cluster.kudu.version) < Version('1.7.0'):
        pytest.skip(f'Test only designed to run on Kudu version >= 1.7.0, but found {cluster.kudu.version}')

    # Generate some data.
    tour_de_france_contenders = [dict(favorite_rank=1, name='Chris Froome', wins=3, weight=153.22),
                                 dict(favorite_rank=2, name='Greg LeMond', wins=3, weight=158.73),
                                 dict(favorite_rank=4, name='Vincenzo Nibali', wins=1, weight=144),
                                 dict(favorite_rank=3, name='Nairo Quintana', wins=0, weight=165.34)]
    raw_data = '\n'.join([json.dumps(contender) for contender in tour_de_france_contenders])

    field_to_column_mapping = [dict(field='/favorite_rank', columnName='rank'),
                               dict(field='/name', columnName='name'),
                               dict(field='/wins', columnName='wins'),
                               dict(field='/weight', columnName='weight')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu',
                             type='destination').set_attributes(table_name='{}.{}'.format('impala::default',
                                                                                          kudu_table_name),
                                                                default_operation='INSERT',
                                                                field_to_column_mapping=field_to_column_mapping)
    dev_raw_data_source >> kudu

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.delivery_guarantee = 'AT_MOST_ONCE'
    # We want to write data once and then stop, but Dev Raw Data Source will keep looping, so we set the rate limit to
    # a low value and will rely upon pipeline metrics to know when to stop the pipeline.
    pipeline.rate_limit = 4

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('name', sqlalchemy.String),
                                            sqlalchemy.Column('wins', sqlalchemy.Integer),
                                            sqlalchemy.Column('weight', sqlalchemy.DECIMAL(5, 2)),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        engine = cluster.kudu.engine
        tdf_contenders_table.create(engine)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(tour_de_france_contenders))
        sdc_executor.stop_pipeline(pipeline)

        connection = engine.connect()
        result = connection.execute(sqlalchemy.sql.select([tdf_contenders_table]).order_by('rank'))
        result_list = list(result)

        sorted_tour_de_france_contenders = [tuple([item['favorite_rank'], item['name'], item['wins'],
                                                   round(Decimal(item['weight']), 2)])
                                            for item in sorted(tour_de_france_contenders,
                                                               key=lambda key: key['favorite_rank'])]

        assert result_list == sorted_tour_de_france_contenders

    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
def test_kudu_destination_unixtime_micro_datatype(sdc_builder, sdc_executor, cluster):
    """
    Test Kudu's UNIXTIME_MICRO data type support.
    dev_raw_data_source >> kudu
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if Version(cluster.version) < Version('cdh5.12.0'):
        pytest.skip('Test requires CDH 5.12.0+ to run')

    # Generate some data. The conversion below drops sub-second precision, so zero out the microseconds.
    now = datetime.now().replace(microsecond=0)
    now_millisecond = time.mktime(now.timetuple()) * 1000
    input_data = [dict(id=1, time=now_millisecond)]

    raw_data = ''.join([json.dumps(contender) for contender in input_data])
    field_to_column_mapping = [dict(field='/id', columnName='id'),
                               dict(field='/time', columnName='unixtime_micro')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = f'{cluster.server_host}:{DEFAULT_KUDU_PORT}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu',
                             type='destination').set_attributes(table_name='{}.{}'.format('impala::default',
                                                                                          kudu_table_name),
                                                                default_operation='INSERT',
                                                                field_to_column_mapping=field_to_column_mapping)
    dev_raw_data_source >> kudu

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.delivery_guarantee = 'AT_MOST_ONCE'
    # We want to write data once and then stop, but Dev Raw Data Source will keep looping, so we set the rate limit to
    # a low value and will rely upon pipeline metrics to know when to stop the pipeline.
    pipeline.rate_limit = 4

    metadata = sqlalchemy.MetaData()
    test_table = sqlalchemy.Table(kudu_table_name,
                                  metadata,
                                  sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                                  sqlalchemy.Column('unixtime_micro', sqlalchemy.TIMESTAMP),
                                  impala_partition_by='HASH PARTITIONS 16',
                                  impala_stored_as='KUDU',
                                  impala_table_properties={
                                      'kudu.master_addresses': kudu_master_address,
                                      'kudu.num_tablet_replicas': '1'
                                  })

    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        engine = cluster.kudu.engine
        test_table.create(engine)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(input_data))
        sdc_executor.stop_pipeline(pipeline)

        connection = engine.connect()
        result = connection.execute(sqlalchemy.sql.select([test_table])).fetchone()
        assert list(result) == [1, now]
    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        test_table.drop(engine)
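# Conversion note: the UNIXTIME_MICRO test above hands the Kudu destination milliseconds since the
# epoch and expects the value read back through SQLAlchemy to equal the original microsecond-free
# datetime. A minimal round-trip check of that arithmetic (local-time semantics, as in the test):
import time
from datetime import datetime

now = datetime.now().replace(microsecond=0)
now_millisecond = time.mktime(now.timetuple()) * 1000
assert datetime.fromtimestamp(now_millisecond / 1000) == now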
def test_wait_for_completion_processor(sdc_builder, sdc_executor):
    """Test Wait For Pipeline Completion Processor."""

    start_pipeline_stage_label = 'Start Pipeline'
    wait_for_completion_stage_label = 'Wait for Pipeline Completion'
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline_stage_label = 'Start Pipelines'
        wait_for_completion_stage_label = 'Wait for Pipelines'

    # Pipeline - pipeline1
    pipeline1 = _create_batch_pipeline(sdc_builder,
                                       'test_wait_for_completion_processor')
    sdc_executor.add_pipeline(pipeline1)

    # Pipeline - pipeline2
    pipeline2 = _create_batch_pipeline(sdc_builder,
                                       'test_wait_for_completion_processor2')
    sdc_executor.add_pipeline(pipeline2)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source1 = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source1.stop_after_first_batch = True

    start_pipeline1 = pipeline_builder.add_stage(start_pipeline_stage_label,
                                                 type='processor')
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline1.task_name = 'task1'
    else:
        start_pipeline1.unique_task_name = 'task1'
    start_pipeline1.run_in_background = True
    start_pipeline1.pipelines = [{
        'pipelineIdType': 'ID',
        'pipelineId': pipeline1.id
    }]

    start_pipeline2 = pipeline_builder.add_stage(start_pipeline_stage_label,
                                                 type='processor')
    if Version(sdc_builder.version) >= Version('3.17.0'):
        start_pipeline2.task_name = 'task2'
    else:
        start_pipeline2.unique_task_name = 'task2'
    start_pipeline2.run_in_background = True
    start_pipeline2.pipelines = [{
        'pipelineIdType': 'ID',
        'pipelineId': pipeline2.id
    }]

    wait_for_pipeline_completion = pipeline_builder.add_stage(
        wait_for_completion_stage_label)
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source1 >> [start_pipeline1, start_pipeline2]
    start_pipeline1 >> wait_for_pipeline_completion
    start_pipeline2 >> wait_for_pipeline_completion
    wait_for_pipeline_completion >> trash

    pipeline = pipeline_builder.build('Chain Pipeline Execution Sample2')
    sdc_executor.add_pipeline(pipeline)

    sdc_executor.validate_pipeline(pipeline)

    snapshot = sdc_executor.capture_snapshot(pipeline,
                                             start_pipeline=True).snapshot

    # Assert start_pipeline1 record output
    start_pipeline1_output = snapshot[start_pipeline1.instance_name].output
    assert len(start_pipeline1_output) == 1
    _validate_start_pipeline_output(
        start_pipeline1_output[0].field['orchestratorTasks'], 'task1',
        pipeline1, False)

    # Assert start_pipeline2 record output
    start_pipeline2_output = snapshot[start_pipeline2.instance_name].output
    assert len(start_pipeline2_output) == 1
    _validate_start_pipeline_output(
        start_pipeline2_output[0].field['orchestratorTasks'], 'task2',
        pipeline2, False)

    # Assert wait_for_pipeline_completion record output
    wait_for_pipeline_completion_output = snapshot[
        wait_for_pipeline_completion.instance_name].output
    assert len(wait_for_pipeline_completion_output) == 1
    _validate_start_pipeline_output(
        wait_for_pipeline_completion_output[0].field['orchestratorTasks'],
        'task1', pipeline1, True)
    _validate_start_pipeline_output(
        wait_for_pipeline_completion_output[0].field['orchestratorTasks'],
        'task2', pipeline2, True)
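# Helper note: `_create_batch_pipeline` and `_validate_start_pipeline_output`, used by the
# orchestration test above, are defined elsewhere in the module. A minimal sketch of what they
# might look like; the record-field key names inside 'orchestratorTasks' are assumptions:
def _create_batch_pipeline(sdc_builder, title):
    """Build a trivial single-batch pipeline (Dev Raw Data Source >> Trash) to be started remotely."""
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data='hello',
                                       stop_after_first_batch=True)
    trash = pipeline_builder.add_stage('Trash')
    dev_raw_data_source >> trash
    return pipeline_builder.build(title)


def _validate_start_pipeline_output(orchestrator_tasks_field, task_name, started_pipeline, finished):
    """Check that the orchestrator record references the task/pipeline and its completion state."""
    task = orchestrator_tasks_field[task_name]
    assert started_pipeline.id in str(task)  # the started pipeline should be referenced by the task
    if finished:
        assert task['success']  # 'success' is an assumed key holding the completion flag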
def test_data_types(sdc_builder, sdc_executor, mongodb, input, converter_type,
                    improve_types, expected):

    if Version(sdc_builder.version) <= Version('4.0.2') and improve_types:
        pytest.skip(
            'Improved Type Conversion is not available in that SDC version')

    database = get_random_string(string.ascii_letters, 5)
    collection = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    origin = pipeline_builder.add_stage('Dev Raw Data Source')
    origin.set_attributes(data_format='JSON',
                          stop_after_first_batch=True,
                          raw_data=json.dumps({"value": input}))

    converter = pipeline_builder.add_stage('Field Type Converter')
    converter.set_attributes(conversion_method='BY_FIELD',
                             field_type_converter_configs=[{
                                 'fields': ['/value'],
                                 'targetType':
                                 converter_type,
                                 'dataLocale':
                                 'en,US',
                                 'dateFormat':
                                 'YYYY_MM_DD_HH_MM_SS',
                                 'zonedDateTimeFormat':
                                 'ISO_OFFSET_DATE_TIME',
                                 'scale':
                                 2
                             }])

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    # The MongoDB destination uses the CRUD operation in the sdc.operation.type record header attribute when
    # writing to MongoDB. The value 1 set below corresponds to INSERT.
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet':
        'sdc.operation.type',
        'headerAttributeExpression':
        '1'
    }]

    mongodb_dest = pipeline_builder.add_stage('MongoDB', type='destination')

    if Version(sdc_builder.version) > Version('4.0.2'):
        mongodb_dest.set_attributes(improve_type_conversion=improve_types)

    mongodb_dest.set_attributes(database=database, collection=collection)

    origin >> converter >> expression_evaluator >> mongodb_dest

    pipeline = pipeline_builder.build().configure_for_environment(mongodb)
    pipeline.configuration["shouldRetry"] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # Read the documents back from MongoDB and assert on what was written
        mongodb_documents = [
            doc for doc in mongodb.engine[mongodb_dest.database][
                mongodb_dest.collection].find()
        ]

        assert len(mongodb_documents) == 1
        doc = mongodb_documents[0]

        if converter_type == 'FLOAT' and improve_types:
            assert pytest.approx(doc['value']) == expected
        elif converter_type == 'DECIMAL' and improve_types:
            assert pytest.approx(Decimal(str(doc['value']))) == expected
        else:
            assert doc['value'] == expected

    finally:
        logger.info('Dropping %s database...', mongodb_dest.database)
        mongodb.engine.drop_database(mongodb_dest.database)
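# Parametrization note: test_data_types above receives (input, converter_type, improve_types,
# expected) from a pytest parametrization defined elsewhere in the module. A minimal, illustrative
# sketch of what such a matrix might look like; the concrete rows are assumptions, not the suite's
# actual test data:
@pytest.mark.parametrize('input,converter_type,improve_types,expected', [
    ('120', 'INTEGER', False, 120),
    ('120.45', 'FLOAT', True, 120.45),
    ('120.45', 'DECIMAL', True, Decimal('120.45')),
])
def test_data_types_example(input, converter_type, improve_types, expected):
    # Placeholder body: in the real module the decorator sits directly above test_data_types.
    assert converter_type in ('INTEGER', 'FLOAT', 'DECIMAL')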
Example #30
0
def test_kafka_origin_save_offset(sdc_builder, sdc_executor, cluster):
    """ Above SDC-10501 introduced a bug which does not commit offset when the number of records
    is less than the max batch size. This process 5 records for the 1st run, stop pipeline, and
    run again to process 3 records for the 2nd run. 2nd run should process 3 records as the offset
    should be saved after the 1st run.

    Kafka Multitopic Origin >> Trash (Run twice)
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(
        builder, cluster)
    kafka_multitopic_consumer.topic_list = [topic]
    kafka_multitopic_consumer.consumer_group = get_random_string(
        string.ascii_letters, 10)
    kafka_multitopic_consumer.batch_wait_time_in_ms = 100

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.configuration_properties = [{
            'key':
            'auto.offset.reset',
            'value':
            'earliest'
        }]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    trash = builder.add_stage(label='Trash')

    kafka_multitopic_consumer >> trash

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(pipeline)

    # Produce 5 messages
    messages = [f'message{i}' for i in range(0, 5)]
    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                cluster, messages, 'TEXT')

    try:
        # Start the pipeline, read one batch and stop.
        snapshot1 = sdc_executor.capture_snapshot(pipeline,
                                                  batches=1,
                                                  start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)
        # Check if the pipeline processed 5 records
        records = snapshot1[kafka_multitopic_consumer].output
        assert len(records) == 5

        # Produce another 3 messages
        messages2 = [f'message{i}' for i in range(5, 8)]
        produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0],
                                    cluster, messages2, 'TEXT')

        # Run the pipeline second time
        snapshot2 = sdc_executor.capture_snapshot(pipeline,
                                                  batches=1,
                                                  start_pipeline=True).snapshot
        # The 2nd run should process only 3 records
        records2 = snapshot2[kafka_multitopic_consumer].output
        assert len(records2) == 3

    finally:
        sdc_executor.stop_pipeline(pipeline)
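# Helper note: `get_kafka_multitopic_consumer_stage`, used by both Kafka examples above, is defined
# elsewhere in the test module. A minimal sketch of such a helper; the use of
# `cluster.kafka.standalone_stage_lib` and the default batch wait time are assumptions:
def get_kafka_multitopic_consumer_stage(pipeline_builder, cluster):
    """Add a Kafka Multitopic Consumer stage preconfigured with a fresh random topic."""
    kafka_multitopic_consumer = pipeline_builder.add_stage('Kafka Multitopic Consumer',
                                                           library=cluster.kafka.standalone_stage_lib)
    kafka_multitopic_consumer.set_attributes(data_format='TEXT',
                                             topic_list=[get_random_string(string.ascii_letters, 10)],
                                             batch_wait_time_in_ms=20000)
    return kafka_multitopic_consumer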