def test_keystore_file(sdc_builder, sdc_executor, stage_attributes):
    """Test "KeyStore path" config parameter. It is tested with two values, one pointing to a real KeyStore file
    and the other to an unexisting file. We check a TLS_01 error is raised for the unexisting file and that
    the pipeline successfully transitions to RUNNING state if the file exists.

    Pipeline:
      http_server >> trash

    """
    builder = sdc_builder.get_pipeline_builder()
    http_server = builder.add_stage('HTTP Server')
    http_server.set_attributes(keystore_type=KEYSTORE_TYPE,
                               keystore_password=KEYSTORE_PASSWORD,
                               data_format='JSON',
                               **stage_attributes)
    if Version('3.14.0') <= Version(sdc_builder.version) < Version('3.17.0'):
        http_server.list_of_application_ids = [{"appId": 'admin'}]
    elif Version(sdc_builder.version) >= Version('3.17.0'):
        http_server.list_of_application_ids = [{"credential": 'admin'}]
    else:
        http_server.application_id = 'admin'
    trash = builder.add_stage('Trash')
    http_server >> trash

    pipeline = builder.build()
    sdc_executor.add_pipeline(pipeline)

    if stage_attributes['keystore_file'] == KEYSTORE_FILE_PATH:
        # Expecting SDC to load the KeyStore and successfully start running the pipeline.
        sdc_executor.start_pipeline(pipeline).wait_for_status(status='RUNNING')
        sdc_executor.stop_pipeline(pipeline)
    else:
        # Expecting a StartError from SDC due to the nonexistent KeyStore file (TLS_01 error).
        with pytest.raises(StartError) as e:
            sdc_executor.start_pipeline(pipeline).wait_for_status(
                status='RUNNING')
        assert e.value.message.startswith('TLS_01')
def json_test(sdc_builder, sdc_executor, cluster, message, expected):
    """Generic method to tests using JSON format"""

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only runs against a cluster with '
            f'non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)
    kafka_consumer.set_attributes(data_format='JSON')

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka JSON pipeline').configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(
        title='Cluster kafka JSON Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster,
                               json.dumps(message).encode(), 'JSON')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected, 'JSON')
    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
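# A minimal sketch, under stated assumptions, of the produce_kafka_messages helper referenced by the
# tests in this module (the real helper, which handles the SYSLOG/NETFLOW/COLLECTD specifics, is
# defined elsewhere). It assumes the test framework's cluster.kafka.producer() returns a
# kafka-python style producer.
def produce_kafka_messages(topic, cluster, message, data_format):
    """Publish a single pre-encoded message to the given Kafka topic (sketch)."""
    # data_format is accepted for signature parity; this simple sketch sends the bytes as-is.
    producer = cluster.kafka.producer()
    producer.send(topic, message)
    producer.flush()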
def get_kafka_consumer_stage(sdc_version, pipeline_builder, cluster):
    """Create and return a Kafka origin stage depending on execution mode for the pipeline."""
    pipeline_builder.add_error_stage('Discard')

    if Version(sdc_version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB:
        kafka_cluster_stage_lib = cluster.kafka.cluster_stage_lib_spark1
    else:
        kafka_cluster_stage_lib = cluster.kafka.cluster_stage_lib_spark2

    kafka_consumer = pipeline_builder.add_stage(
        'Kafka Consumer', type='origin', library=kafka_cluster_stage_lib)
    kafka_consumer.set_attributes(
        data_format='TEXT',
        batch_wait_time_in_ms=20000,
        max_batch_size_in_records=10,
        rate_limit_per_partition_in_kafka_messages=10,
        topic=get_random_string(string.ascii_letters, 10),
        kafka_configuration=[{
            'key': 'auto.offset.reset',
            'value': 'earliest'
        }])

    return kafka_consumer
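# A hedged sketch of the SDC RPC helpers used throughout this module (the real helpers are defined
# elsewhere); it mirrors the inline SDC RPC wiring shown in test_mapr_cluster_streams below and
# assumes the module constants SDC_RPC_PORT and MAX_BATCH_WAIT_TIME declared with the other settings.
def get_rpc_destination(pipeline_builder, sdc_executor):
    """Create an SDC RPC destination pointing at the given executor (sketch)."""
    destination = pipeline_builder.add_stage(
        name='com_streamsets_pipeline_stage_destination_sdcipc_SdcIpcDTarget')
    destination.sdc_rpc_connection.append(f'{sdc_executor.server_host}:{SDC_RPC_PORT}')
    destination.sdc_rpc_id = get_random_string(string.ascii_letters, 10)
    return destination


def get_rpc_origin(pipeline_builder, sdc_rpc_destination):
    """Create an SDC RPC origin paired with the given destination (sketch)."""
    origin = pipeline_builder.add_stage(
        name='com_streamsets_pipeline_stage_origin_sdcipc_SdcIpcDSource')
    origin.sdc_rpc_listening_port = SDC_RPC_PORT
    origin.sdc_rpc_id = sdc_rpc_destination.sdc_rpc_id
    origin.batch_wait_time_in_secs = MAX_BATCH_WAIT_TIME
    return origin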
def test_kafka_origin_syslog_message(sdc_builder, sdc_executor, cluster):
    """Write a text message using UDP datagram mode SYSLOG
    into Kafka multiple partitions with the schema in the records
    and confirm that Kafka successfully reads them.
    Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Kafka Consumer pipeline.
    Specifically, this would look like:

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """
    msg64packet = (
        "rO0ABXeOAAAAAQAAAAEAAAAAAAAAAQAJMTI3LjAuMC4xAAALuAAJMTI3LjAuMC4xAAAH0AAAAFw8MzQ+MSAyMDEz"
        "LTA2LTI4VDA2OjE0OjU2LjAwMCswMjowMCBteW1hY2hpbmUgc3U6ICdzdSByb290JyBmYWlsZWQgZm9yIGxvbnZpY"
        "2sgb24gL2Rldi9wdHMvOA==")

    expected = (
        '{\'severity\': 2, \'senderPort\': 3000, \'receiverAddr\': 127.0.0.1:2000, \'host\': mymachine, \'raw\': '
        '<34>1 2013-06-28T06:14:56.000+02:00 mymachine su: \'su root\' failed for lonvick on /dev/pts/8, '
        '\'senderAddr\': 127.0.0.1:3000, \'priority\': 34, \'facility\': 4, \'version\': 1, \'receiverPort\': 2000, '
        '\'remaining\': su: \'su root\' failed for lonvick on /dev/pts/8, \'timestamp\': 1372392896000}'
    )

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only runs against a cluster with '
            f'non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    # Override default configuration.
    kafka_consumer.set_attributes(data_format='DATAGRAM',
                                  datagram_packet_format='SYSLOG')

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka SYSLOG pipeline').configure_for_environment(
            cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Cluster Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster,
                               base64.b64decode(msg64packet), 'SYSLOG')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected, 'SYSLOG')

    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
def test_kafka_origin_netflow_message(sdc_builder, sdc_executor, cluster):
    """Write a text message using UDP datagram mode NETFLOW
    into Kafka multiple partitions with the schema in the records
    and confirm that Kafka successfully reads them.
    Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Kafka Consumer pipeline.
    Specifically, this would look like:

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """

    msg64packet = (
        'rO0ABXoAAAIqAAAAAQAAAAIAAAAAAAAAAQAJMTI3LjAuMC4xAAALuAAJMTI3LjAuMC4xAAAH0AAAAfgABQAKAAAAAFVFcOIBWL'
        'IwAAAAAAAAAAD3waSb49Wa8QAAAAAAAAAAAAAAAQAAAFlnyqItZ8qiLQA1JA8AABEAAAAAAAAAAAD3waSb49Wa8QAAAAAAAAAA'
        'AAAAAQAAAFlnyqItZ8qiLQA1+ioAABEAAAAAAAAAAAD3waSb49Wa8QAAAAAAAAAAAAAAAQAAAFlnyqItZ8qiLQA1SWAAABEAAA'
        'AAAAAAAAD55boV49Wa8QAAAAAAAAAAAAAAAQAAAFlnyqIvZ8qiLwA1q94AABEAAAAAAAAAAAB/472549Wa8QAAAAAAAAAAAAAA'
        'AQAAAFlnyqIvZ8qiLwA1IlYAABEAAAAAAAAAAAB/472549Wa8QAAAAAAAAAAAAAAAQAAAFlnyqIvZ8qiLwA1l5sAABEAAAAAAA'
        'AAAAB/472549Wa8QAAAAAAAAAAAAAAAQAAAFlnyqIvZ8qiLwA1u4EAABEAAAAAAAAAAAD55boV49Wa8QAAAAAAAAAAAAAAAQAA'
        'AFlnyqIvZ8qiLwA14OQAABEAAAAAAAAAAAAtZyl349Wa8QAAAAAAAAAAAAAAAQAAArhnyqIxZ8qiMQA11FQAABEAAAAAAAAAAA'
        'B5SzUv49Wa8QAAAAAAAAAAAAAAAQAAAfhnyqIyZ8qiMgA1FbUAABEAAAAAAAAAAAA=')

    expected = ['\'srcaddr\': -138304357', '\'first\': 1432355575064']

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only runs against a cluster with '
            f'non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    # Override default configuration.
    kafka_consumer.set_attributes(data_format='DATAGRAM',
                                  datagram_data_format='NETFLOW')

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka NETFLOW pipeline').configure_for_environment(
            cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Cluster Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster,
                               base64.b64decode(msg64packet), 'NETFLOW')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected, 'NETFLOW')

    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
def test_mapr_standalone_streams(sdc_builder, sdc_executor, cluster):
    """This test will start MapR Streams producer and consumer pipelines which check for integrity of data
    from a MapR Streams producer to MapR Streams consumer. Both the pipelines run as standalone. Specifically, this
    would look like:

    MapR Streams producer pipeline:
        dev_raw_data_source >> mapr_streams_producer

    MapR Streams consumer pipeline:
        mapr_streams_consumer >> trash
    """
    if cluster.mep_version != '6.0':
        pytest.skip(
            'MapR Streams are currently only supported on the latest MEP version (e.g. MEP 6)'
        )
    # MapR Stream name has to be pre-created in MapR cluster. Clusterdock MapR image has this already.
    stream_name = '/sample-stream'
    stream_topic_name = stream_name + ':' + get_random_string(
        string.ascii_letters, 10)

    # Build the MapR Stream producer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'TEXT'
    dev_raw_data_source.raw_data = 'Hello World!'

    mapr_streams_producer = builder.add_stage('MapR Streams Producer')
    mapr_streams_producer.data_format = 'TEXT'
    # Runtime topic resolution is explicitly supported from 3.4.0
    if Version(sdc_executor.version) >= Version('3.4.0'):
        mapr_streams_producer.runtime_topic_resolution = True
        mapr_streams_producer.topic_expression = stream_topic_name
    else:
        mapr_streams_producer.topic = stream_topic_name

    dev_raw_data_source >> mapr_streams_producer
    producer_pipeline = builder.build(
        'MapR Streams producer pipeline - standalone'
    ).configure_for_environment(cluster)
    producer_pipeline.rate_limit = 1

    # Build the MapR Stream consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    mapr_streams_consumer = builder.add_stage('MapR Streams Consumer')
    mapr_streams_consumer.topic = stream_topic_name
    mapr_streams_consumer.data_format = 'TEXT'

    trash = builder.add_stage('Trash')

    mapr_streams_consumer >> trash
    consumer_pipeline = builder.build(
        'MapR Streams consumer pipeline - standalone'
    ).configure_for_environment(cluster)
    consumer_pipeline.rate_limit = 1

    sdc_executor.add_pipeline(producer_pipeline, consumer_pipeline)

    # Run the pipelines and assert the data flow. The sequence of steps is as follows:
    # 1. Start the MapR Streams producer and wait until it generates some batches.
    # 2. Start the MapR Streams consumer via the capture-snapshot feature so the data flow can be captured.
    # 3. Capture a snapshot on the MapR Streams consumer.
    # 4. Compare the snapshot result to the data injected at the producer.
    try:
        sdc_executor.start_pipeline(
            producer_pipeline).wait_for_pipeline_batch_count(5)
        snapshot_pipeline_command = sdc_executor.capture_snapshot(
            consumer_pipeline, start_pipeline=True, wait=False)
        snapshot = snapshot_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        snapshot_data = snapshot[
            consumer_pipeline[0].instance_name].output[0].field['text'].value
        assert dev_raw_data_source.raw_data == snapshot_data
    finally:
        sdc_executor.stop_pipeline(consumer_pipeline)
        sdc_executor.stop_pipeline(producer_pipeline)
def test_data_types_sqlserver(sdc_builder, sdc_executor, database, sql_type, insert_fragment, expected_type, expected_value, keep_data):
    """Test all feasible SQL Server types."""
    table_name = get_random_string(string.ascii_lowercase, 20)
    connection = database.engine.connect()
    try:
        # Create table
        connection.execute(f"""
            CREATE TABLE {table_name}(
                id int primary key,
                value {sql_type} NULL
            )
        """)

        # And insert a row with actual value
        connection.execute(f"INSERT INTO {table_name} VALUES(1, {insert_fragment})")
        # And a null
        connection.execute(f"INSERT INTO {table_name} VALUES(2, NULL)")

        builder = sdc_builder.get_pipeline_builder()

        origin = builder.add_stage('Dev Raw Data Source')
        origin.data_format = 'JSON'
        origin.raw_data = '{"id": 1}\n{"id": 2}'
        origin.stop_after_first_batch = True

        lookup = builder.add_stage('JDBC Lookup')
        lookup.sql_query = 'SELECT value FROM {0} WHERE '.format(table_name) + 'id = ${record:value("/id")}'
        lookup.column_mappings = [dict(dataType='USE_COLUMN_TYPE', columnName='value', field='/value')]

        wiretap = builder.add_wiretap()

        # As a part of SDC-10125, DATETIMEOFFSET is natively supported in SDC, and is converted into ZONED_DATETIME
        if sql_type == 'DATETIMEOFFSET':
            if Version(sdc_executor.version) >= Version('3.14.0'):
                expected_type = 'ZONED_DATETIME'
                expected_value = '2004-05-23T14:25:10.3456-08:00'
            else:
                expected_type = 'STRING'
                expected_value = '2004-05-23 14:25:10.3456 -08:00'
                # This unknown_type_action setting is required, otherwise DATETIMEOFFSET tests for SDC < 3.14 will fail.
                lookup.on_unknown_type = 'CONVERT_TO_STRING'

        origin >> lookup >> wiretap.destination

        pipeline = builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        records = wiretap.output_records

        assert len(records) == 2
        record = records[0]
        null_record = records[1]

        # Since we are controlling types, we want to check explicit values inside the record rather than
        # the Python wrappers.
        # TLKT-177: Add ability for field to return raw value

        assert record.field['value'].type == expected_type
        assert null_record.field['value'].type == expected_type

        assert record.field['value']._data['value'] == expected_value
        assert null_record.field['value'] == None
    finally:
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name, database.type)
            connection.execute(f"DROP TABLE {table_name}")
def test_oracle_cdc_client_basic(sdc_builder, sdc_executor, database,
                                 buffer_locally, use_pattern):
    """Basic test that reads inserts/updates/deletes to an Oracle table,
    and validates that they are read in the same order.
    Runs oracle_cdc_client >> trash
    """
    db_engine = database.engine
    pipeline = None
    table = None

    try:
        src_table_name = get_random_string(string.ascii_uppercase, 9)

        # If use_pattern is True, run the test if and only if sdc_builder >= 3.1.0.0
        if use_pattern:
            if Version(sdc_builder.version) >= Version('3.1.0.0'):
                src_table_pattern = _get_table_pattern(src_table_name)
            else:
                pytest.skip('Skipping test as SDC Builder version < 3.1.0.0')
        else:
            src_table_pattern = src_table_name

        connection = database.engine.connect()
        table = _setup_table(database=database, table_name=src_table_name)

        logger.info('Using table pattern %s', src_table_pattern)

        pipeline_builder = sdc_builder.get_pipeline_builder()

        oracle_cdc_client = _get_oracle_cdc_client_origin(
            connection=connection,
            database=database,
            sdc_builder=sdc_builder,
            pipeline_builder=pipeline_builder,
            buffer_locally=buffer_locally,
            src_table_name=src_table_pattern)

        inserts = _insert(connection=connection, table=table)

        rows = inserts.rows
        cdc_op_types = inserts.cdc_op_types
        sdc_op_types = inserts.sdc_op_types
        change_count = inserts.change_count

        updates = _update(connection=connection, table=table)

        rows += updates.rows
        cdc_op_types += updates.cdc_op_types
        sdc_op_types += updates.sdc_op_types
        change_count += updates.change_count

        deletes = _delete(connection=connection, table=table)

        # Deletes should carry the last state of the row, so the expected values are what comes from the updates.
        rows += updates.rows
        cdc_op_types += deletes.cdc_op_types
        sdc_op_types += deletes.sdc_op_types
        change_count += deletes.change_count

        logger.info('Expected number of records is %s.', change_count)

        trash = pipeline_builder.add_stage('Trash')

        # Why do we need to wait?
        # The time at the DB might differ from here. If the DB is behind, we are ok, and we will get all the data.
        # If the DB is ahead, the batch end time used by the origin may not be after all the changes were written to the DB.
        # So we wait until the time here is past the time at which all data was written out to the DB (current time).
        _wait_until_time(_get_current_oracle_time(connection=connection))

        oracle_cdc_client >> trash
        pipeline = pipeline_builder.build(
            'Oracle CDC Client Pipeline').configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        snapshot = sdc_executor.capture_snapshot(
            pipeline, start_pipeline=True).wait_for_finished(60).snapshot

        row_index = 0
        op_index = 0
        # Assert that the captured records match the expected rows and operation types.
        for record in snapshot.snapshot_batches[0][
                oracle_cdc_client.instance_name].output:
            assert row_index == int(record.field['ID'].value)
            assert rows[op_index]['NAME'] == record.field['NAME'].value
            assert int(record.header['values']
                       ['sdc.operation.type']) == sdc_op_types[op_index]
            assert record.header['values'][
                'oracle.cdc.operation'] == cdc_op_types[op_index]
            row_index = (row_index + 1) % 3
            op_index += 1

        assert op_index == change_count

    finally:
        if pipeline is not None:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if table is not None:
            table.drop(db_engine)
            logger.info('Table: %s dropped.', src_table_name)
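# A minimal sketch, under stated assumptions, of the two time helpers referenced in
# test_oracle_cdc_client_basic above (the real implementations live elsewhere in this module). It
# assumes SYSDATE and the local clock are directly comparable, which is exactly the concern the
# in-test comment about waiting addresses.
import datetime
import time as time_module

import sqlalchemy


def _get_current_oracle_time(connection):
    """Return the Oracle database's current time as a datetime (sketch)."""
    return connection.execute(sqlalchemy.text('SELECT SYSDATE FROM DUAL')).fetchone()[0]


def _wait_until_time(time):
    """Block until the local clock is past the given database time (sketch)."""
    while datetime.datetime.now() < time:
        time_module.sleep(1)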
def test_oracle_cdc_to_jdbc_producer(sdc_builder, sdc_executor, database,
                                     buffer_locally, use_pattern):
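    """Read inserts/updates/deletes from an Oracle table through the Oracle CDC Client origin and
    replicate them into a destination table via the JDBC Producer, verifying the destination table
    contents after each batch.
    """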
    db_engine = database.engine
    pipeline = None
    src_table = None
    dest_table = None

    try:
        src_table_name = get_random_string(string.ascii_uppercase, 9)
        # If use_pattern is True, run the test if and only if sdc_builder >= 3.1.0.0
        if use_pattern:
            if Version(sdc_builder.version) >= Version('3.1.0.0'):
                src_table_pattern = _get_table_pattern(src_table_name)
            else:
                pytest.skip('Skipping test as SDC Builder version < 3.1.0.0')
        else:
            src_table_pattern = src_table_name

        connection = database.engine.connect()
        src_table = _setup_table(database, src_table_name)

        pipeline_builder = sdc_builder.get_pipeline_builder()

        logger.info('Using table pattern %s', src_table_pattern)
        batch_size = 10

        oracle_cdc_client = _get_oracle_cdc_client_origin(
            connection=connection,
            database=database,
            sdc_builder=sdc_builder,
            pipeline_builder=pipeline_builder,
            buffer_locally=buffer_locally,
            src_table_name=src_table_pattern,
            batch_size=batch_size)

        dest_table_name = get_random_string(string.ascii_uppercase, 9)

        dest_table = _setup_table(database, dest_table_name)
        jdbc_producer = pipeline_builder.add_stage('JDBC Producer')

        jdbc_producer.set_attributes(
            table_name=dest_table_name,
            default_operation='INSERT',
            # A framework bug creates a 1-element array, so remove the entry
            field_to_column_mapping=[])

        oracle_cdc_client >> jdbc_producer

        pipeline = pipeline_builder.build('Oracle CDC Client to JDBC Producer'
                                          ).configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        inserts = _insert(connection=connection,
                          table=src_table,
                          count=batch_size).rows

        start_pipeline_cmd = sdc_executor.start_pipeline(pipeline)
        start_pipeline_cmd.wait_for_pipeline_batch_count(1)

        assert [tuple(row.values()) for row in inserts
                ] == _select_from_table(db_engine=db_engine,
                                        dest_table=dest_table)

        updates = _update(connection=connection,
                          table=src_table,
                          count=batch_size).rows
        start_pipeline_cmd.wait_for_pipeline_batch_count(2)

        assert [tuple(row.values()) for row in updates
                ] == _select_from_table(db_engine=db_engine,
                                        dest_table=dest_table)

        _delete(connection=connection, table=src_table, count=batch_size)
        start_pipeline_cmd.wait_for_pipeline_batch_count(3)

        assert len(
            _select_from_table(db_engine=db_engine,
                               dest_table=dest_table)) == 0

    finally:
        if pipeline is not None:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if src_table is not None:
            src_table.drop(db_engine)
        if dest_table is not None:
            dest_table.drop(db_engine)
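# A hedged sketch of the _select_from_table helper used above (the real helper is defined elsewhere
# in this module); it assumes the destination table has an 'id' column to order by and that
# sqlalchemy result rows compare equal to plain tuples, as the assertions above rely on.
def _select_from_table(db_engine, dest_table):
    """Return all rows of dest_table ordered by id (sketch)."""
    result = db_engine.execute(dest_table.select().order_by(dest_table.c.id))
    rows = result.fetchall()
    result.close()
    return rows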
import base64
import json
import logging
import string
from string import ascii_letters

import avro
import pytest
from avro.datafile import DataFileWriter
from streamsets.sdk.utils import Version
from streamsets.testframework.markers import cluster
from streamsets.testframework.utils import get_random_string

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Specify a port for SDC RPC stages to use.
SDC_RPC_PORT = 20000
SNAPSHOT_TIMEOUT_SEC = 150
MAX_BATCH_WAIT_TIME = 30

MIN_SDC_VERSION_WITH_SPARK_2_LIB = Version('3.3.0')

SCHEMA = {
    'namespace':
    'example.avro',
    'type':
    'record',
    'name':
    'Employee',
    'fields': [{
        'name': 'name',
        'type': 'string'
    }, {
        'name': 'age',
        'type': 'int'
    }, {
def test_data_types_sqlserver(sdc_builder, sdc_executor, database, sql_type, insert_fragment, expected_type, expected_value, keep_data):
    """Test all feasible SQL Server types."""
    table_name = get_random_string(string.ascii_lowercase, 20)
    connection = database.engine.connect()
    try:
        # Create table
        connection.execute(f"""
            CREATE TABLE {table_name}(
                id int primary key,
                data_column {sql_type} NULL
            )
        """)

        # And insert a row with actual value
        connection.execute(f"INSERT INTO {table_name} VALUES(1, {insert_fragment})")
        # And a null
        connection.execute(f"INSERT INTO {table_name} VALUES(2, NULL)")

        builder = sdc_builder.get_pipeline_builder()

        origin = builder.add_stage('JDBC Query Consumer')
        origin.sql_query = 'SELECT * FROM {0}'.format(table_name)
        origin.incremental_mode = False

        # As a part of SDC-10125, DATETIMEOFFSET is natively supported in SDC, and is converted into ZONED_DATETIME
        if sql_type == 'DATETIMEOFFSET':
            if Version(sdc_executor.version) >= Version('3.14.0'):
                expected_type = 'ZONED_DATETIME'
                expected_value = '2004-05-23T14:25:10.3456-08:00'
            else:
                expected_type = 'STRING'
                expected_value = '2004-05-23 14:25:10.3456 -08:00'
                # This unknown_type_action setting is required, otherwise DATETIMEOFFSET tests for SDC < 3.14 will fail.
                origin.on_unknown_type = 'CONVERT_TO_STRING'

        wiretap = builder.add_wiretap()

        origin >> wiretap.destination

        pipeline = builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 2)
        sdc_executor.stop_pipeline(pipeline)

        assert len(wiretap.output_records) == 2
        record = wiretap.output_records[0]
        null_record = wiretap.output_records[1]

        # Since we are controlling types, we want to check explicit values inside the record rather than
        # the Python wrappers.
        # TLKT-177: Add ability for field to return raw value

        assert record.field['data_column'].type == expected_type
        assert null_record.field['data_column'].type == expected_type

        assert record.field['data_column']._data['value'] == expected_value
        assert null_record.field['data_column'] == None
    finally:
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name, database.type)
            connection.execute(f"DROP TABLE {table_name}")
def test_mongodb_destination_update_on_nested_key(sdc_builder, sdc_executor,
                                                  mongodb):
    """Ensure that an update on a document with a nested unique field is correctly executed"""
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')
    record = {"f1": {"f2": "a"}, "f3": "b"}

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=json.dumps(record))

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet':
        'sdc.operation.type',
        'headerAttributeExpression':
        '3'
    }]

    mongodb_dest = pipeline_builder.add_stage('MongoDB', type='destination')
    unique_key_field = '/f1/f2' if Version(
        sdc_builder.version) <= Version('3.5.0') else ['/f1/f2']
    mongodb_dest.set_attributes(database=get_random_string(ascii_letters, 5),
                                collection=get_random_string(
                                    ascii_letters, 10),
                                unique_key_field=unique_key_field)

    if Version(sdc_builder.version) >= Version('4.1.0'):
        mongodb_dest.set_attributes(improve_type_conversion=False)

    dev_raw_data_source >> expression_evaluator >> mongodb_dest

    pipeline = pipeline_builder.build().configure_for_environment(mongodb)

    try:
        # Change value of field which will be updated
        record["f3"] = "c"

        # Create document in MongoDB using PyMongo.
        # First a database is created. Then a collection is created inside that database.
        # Then document is created in that collection.
        logger.info('Adding document into %s collection using PyMongo...',
                    mongodb_dest.collection)
        mongodb_database = mongodb.engine[mongodb_dest.database]
        mongodb_collection = mongodb_database[mongodb_dest.collection]
        inserted_doc = mongodb_collection.insert_one(record)
        assert inserted_doc is not None

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(1)
        sdc_executor.stop_pipeline(pipeline)

        logger.info('Verifying docs updated with PyMongo...')
        mongodb_documents = [
            doc for doc in mongodb.engine[mongodb_dest.database][
                mongodb_dest.collection].find()
        ]
        assert len(mongodb_documents) == 1
        assert mongodb_documents[0]["f3"] == "b"
    finally:
        logger.info('Dropping %s database...', mongodb_dest.database)
        mongodb.engine.drop_database(mongodb_dest.database)
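# For reference, an illustrative PyMongo equivalent (an assumption, not what the destination
# literally executes) of the update verified above: match on the nested unique key and set the
# remaining fields of the record.
def _update_on_nested_key_with_pymongo(collection, record):
    """Apply the same logical update the MongoDB destination performs (sketch)."""
    collection.update_one({'f1.f2': record['f1']['f2']}, {'$set': {'f3': record['f3']}})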
def test_data_types(sdc_builder, sdc_executor, database, sql_type,
                    insert_fragment, expected_type, expected_value, keep_data,
                    fetch_size):
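    """Test SQL Server data types as read by the SQL Server CDC Client origin."""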
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against SQL Server with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)
    connection = database.engine.connect()
    try:
        # Create table
        connection.execute(f"""
            CREATE TABLE {table_name}(
                id int primary key,
                data_column {sql_type} NULL
            )
        """)
        _enable_cdc(connection, DEFAULT_SCHEMA_NAME, table_name)

        # And insert a row with actual value
        connection.execute(
            f"INSERT INTO {table_name} VALUES(1, {insert_fragment})")
        # And a null
        connection.execute(f"INSERT INTO {table_name} VALUES(2, NULL)")

        builder = sdc_builder.get_pipeline_builder()

        origin = builder.add_stage('SQL Server CDC Client')
        origin.fetch_size = fetch_size
        origin.table_configs = [{
            'capture_instance':
            f"{DEFAULT_SCHEMA_NAME}_{table_name}"
        }]

        wiretap = builder.add_wiretap()

        # As a part of SDC-10125, DATETIMEOFFSET is natively supported in SDC, and is converted into ZONED_DATETIME
        if sql_type == 'DATETIMEOFFSET':
            if Version(sdc_executor.version) >= Version('3.14.0'):
                expected_type = 'ZONED_DATETIME'
                expected_value = '2004-05-23T14:25:10.3456-08:00'
            else:
                expected_type = 'STRING'
                expected_value = '2004-05-23 14:25:10.3456 -08:00'
                # This unknown_type_action setting is required, otherwise DATETIMEOFFSET tests for SDC < 3.14 will fail.
                origin.on_unknown_type = 'CONVERT_TO_STRING'

        origin >> wiretap.destination

        pipeline = builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(2)
        sdc_executor.stop_pipeline(pipeline)

        records = wiretap.output_records
        assert len(records) == 2

        record = records[0]
        null_record = records[1]

        # Since we are controlling types, we want to check explicit values inside the record rather than
        # the Python wrappers.
        # TLKT-177: Add ability for field to return raw value

        assert record.field['data_column'].type == expected_type
        assert null_record.field['data_column'].type == expected_type

        assert record.field['data_column']._data['value'] == expected_value
        assert null_record.field['data_column'] == None
    finally:
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            connection.execute(f"DROP TABLE {table_name}")

        if connection is not None:
            connection.close()
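# A hedged sketch of the _enable_cdc helper used above (the real helper is defined elsewhere in this
# module); it relies on the standard SQL Server sys.sp_cdc_enable_table procedure and assumes the
# '<schema>_<table>' capture instance name that the origin's table_configs above expects.
def _enable_cdc(connection, schema_name, table_name):
    """Enable CDC capture for the given table (sketch)."""
    connection.execute(f"""
        EXEC sys.sp_cdc_enable_table
            @source_schema = N'{schema_name}',
            @source_name = N'{table_name}',
            @role_name = NULL,
            @capture_instance = N'{schema_name}_{table_name}'
    """)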
def test_mongodb_destination(sdc_builder, sdc_executor, mongodb):
    """
    Send simple text into MongoDB destination from Dev Raw Data Source and
        confirm that MongoDB correctly received them using PyMongo.

    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> expression_evaluator >> mongodb_dest
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data='\n'.join(DATA))

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    # MongoDB destination uses the CRUD operation in the sdc.operation.type record header attribute when writing
    # to MongoDB. Value 1 specified below is for INSERT.
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet':
        'sdc.operation.type',
        'headerAttributeExpression':
        '1'
    }]

    mongodb_dest = pipeline_builder.add_stage('MongoDB', type='destination')
    mongodb_dest.set_attributes(database=get_random_string(ascii_letters, 5),
                                collection=get_random_string(
                                    ascii_letters, 10))

    if Version(sdc_builder.version) >= Version('4.1.0'):
        mongodb_dest.set_attributes(improve_type_conversion=False)

    # From 3.6.0, the unique key field is a list; for older versions it is a single string.
    mongodb_dest.unique_key_field = [
        '/text'
    ] if Version(sdc_builder.version) >= Version('3.6.0') else '/text'

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')
    dev_raw_data_source >> record_deduplicator >> expression_evaluator >> mongodb_dest
    record_deduplicator >> trash
    pipeline = pipeline_builder.build().configure_for_environment(mongodb)

    try:
        # Data is generated in dev_raw_data_source and sent to MongoDB using pipeline.
        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(
            pipeline).wait_for_pipeline_output_records_count(len(DATA))
        sdc_executor.stop_pipeline(pipeline)

        # Verify data is received correctly using PyMongo.
        # Similar to writing, while reading data, we specify MongoDB database and the collection inside it.
        logger.info('Verifying docs received with PyMongo...')
        assert [
            item['text'] for item in mongodb.engine[mongodb_dest.database][
                mongodb_dest.collection].find()
        ] == DATA

    finally:
        logger.info('Dropping %s database...', mongodb_dest.database)
        mongodb.engine.drop_database(mongodb_dest.database)
def test_jdbc_query_executor_parallel_query_execution(sdc_builder, sdc_executor, database, enable_parallel_execution):
    """Test JDBC Query Executor's parallel query execution mode.

    The pipeline inserts records into the database and then updates them.
    Using sqlalchemy, we verify that the correct data was inserted (and updated) in the database.

    Pipeline configuration:
        dev_raw_data_source >> jdbc_query_executor
    """


    table_name = get_random_string(string.ascii_uppercase, 20)
    table = _create_table(table_name, database)

    # Make sure that we properly escape the table name. Ideally we would escape it for all databases, but since we
    # know that all except PostgreSQL are passing, we only escape it for PostgreSQL for now.
    enclosed_table = f'"{table_name}"' if type(database) == PostgreSqlDatabase else table_name

    # First the inserts - they will run in parallel - then all the updates will run sequentially.
    # The net result is that every record should end up updated to the last new value;
    # otherwise the test fails.
    statements = []
    for rec in ROWS_IN_DATABASE:
        statements.extend([f"INSERT INTO {enclosed_table} (name, id) VALUES ('{rec['name']}', {rec['id']})",
                           f"UPDATE {enclosed_table} SET name = 'bob' WHERE id = {rec['id']}",
                           f"UPDATE {enclosed_table} SET name = 'MERRICK' WHERE id = {rec['id']}"])
    # convert to string - Dev Raw Data Source Data Format tab does not seem
    # to "unroll" the array into newline-terminated records.
    statements = "\n".join(statements)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=statements)

    jdbc_query_executor = pipeline_builder.add_stage('JDBC Query', type='executor')

    query_str = "${record:value('/text')}"

    jdbc_query_executor.set_attributes(enable_parallel_queries=enable_parallel_execution,
                                       maximum_pool_size=2,
                                       minimum_idle_connections=2)

    if Version(sdc_builder.version) < Version('3.14.0'):
        jdbc_query_executor.set_attributes(sql_query=query_str)
    else:
        jdbc_query_executor.set_attributes(sql_queries=[query_str])

    dev_raw_data_source >> jdbc_query_executor

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(len(ROWS_IN_DATABASE)*3)
        sdc_executor.stop_pipeline(pipeline)

        result = database.engine.execute(table.select())
        data_from_database = sorted(result.fetchall(), key=lambda row: row[1])  # order by id
        result.close()
        assert data_from_database == [('MERRICK', record['id']) for record in ROWS_IN_DATABASE]
    finally:
        logger.info('Dropping table %s in %s database ...', table_name, database.type)
        table.drop(database.engine)
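# A hedged sketch of the _create_table helper used by the JDBC Query Executor tests above and below
# (the real helper is defined elsewhere in this module); the column names and types are inferred
# from the INSERT/UPDATE statements in those tests.
import sqlalchemy


def _create_table(table_name, database):
    """Create a simple (id, name) table and return its sqlalchemy.Table (sketch)."""
    metadata = sqlalchemy.MetaData()
    table = sqlalchemy.Table(table_name,
                             metadata,
                             sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column('name', sqlalchemy.String(32)))
    table.create(database.engine)
    return table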
def test_jdbc_query_executor_select_query_result_count(sdc_builder, sdc_executor, database):
    """Simple JDBC Query Executor test for successful-query event type and query result count enabled.
    Pipeline will insert records into database and then using sqlalchemy, the verification will happen
    that correct data is inserted into database and then the same data is queried. Event records are
    verified for successful-query event type and query-result field for the select query.

    This is achieved by using a deduplicator which assures us that there is only one ingest to database.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> jdbc_query_executor1 >= jdbc_query_executor2 >= wiretap
                               record_deduplicator >> trash2
    """
    table_name = get_random_string(string.ascii_lowercase, 20)
    table = _create_table(table_name, database)

    DATA = ['id,name'] + [','.join(str(item) for item in rec.values()) for rec in ROWS_IN_DATABASE]
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(DATA),
                                       stop_after_first_batch=True)

    query_str1 = f"INSERT INTO {table_name} (name, id) VALUES ('${{record:value('/name')}}', '${{record:value('/id')}}')"
    query_str2 = f"SELECT * FROM {table_name}"

    jdbc_query_executor1 = pipeline_builder.add_stage('JDBC Query', type='executor')
    if Version(sdc_builder.version) < Version('3.14.0'):
        jdbc_query_executor1.set_attributes(sql_query=query_str1)
    else:
        jdbc_query_executor1.set_attributes(sql_queries=[query_str1])

    jdbc_query_executor2 = pipeline_builder.add_stage('JDBC Query', type='executor')

    jdbc_query_executor2.set_attributes(include_query_result_count_in_events=True)

    if Version(sdc_builder.version) < Version('3.14.0'):
        jdbc_query_executor2.set_attributes(sql_query=query_str2)
    else:
        jdbc_query_executor2.set_attributes(sql_queries=[query_str2])

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    wiretap = pipeline_builder.add_wiretap()
    trash2 = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> jdbc_query_executor1 >= jdbc_query_executor2 >= wiretap.destination
    record_deduplicator >> trash2
    pipeline = pipeline_builder.build(title='JDBC Query Executor').configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        event_records = wiretap.output_records
        assert len(event_records) == 3
        assert 'successful-query' == event_records[0].header['values']['sdc.event.type']
        assert 'successful-query' == event_records[1].header['values']['sdc.event.type']
        assert 'successful-query' == event_records[2].header['values']['sdc.event.type']

        assert '3 row(s) returned' == event_records[0].value['value']['query-result']['value']
        assert '3 row(s) returned' == event_records[1].value['value']['query-result']['value']
        assert '3 row(s) returned' == event_records[2].value['value']['query-result']['value']

        result = database.engine.execute(table.select())
        result.close()
    finally:
        logger.info('Dropping table %s in %s database ...', table_name, database.type)
        table.drop(database.engine)
def test_kafka_origin_collecd_message(sdc_builder, sdc_executor, cluster):
    """Write a text message using UDP datagram mode COLLECTD
    into Kafka multiple partitions with the schema in the records
    and confirm that Kafka successfully reads them.
    Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Kafka Consumer pipeline.
    Specifically, this would look like:

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """

    msg64packet = (
        'rO0ABXoAAAQAAAAAAQAAAAMAAAAAAAAAAQAJMTI3LjAuMC4xAAALuAAJMTI3LjAuMC4xAAAH0AAABVkCAAAoLmo9Of+LakZDcogiJUJa2iIO1'
        '+Fl9GzuT86v9yB0HXN1c2VyAAAAMWlwLTE5Mi0xNjgtNDItMjM4LnVzLXdlc3QtMi5jb21wdXRlLmludGVybmFsAAAIAAwVa65L6bcTJwAJAA'
        'wAAAACgAAAAAACAA5pbnRlcmZhY2UAAAMACGxvMAAABAAOaWZfZXJyb3JzAAAGABgAAgICAAAAAAAAAAAAAAAAAAAAAAAIAAwVa65L6bZ8KAA'
        'CAAlsb2FkAAADAAUAAAQACWxvYWQAAAYAIQADAQEBAAAAAAA2BkAAAAAAAMcOQAAAAAAALA5AAAgADBVrrkvptwrDAAIADmludGVyZmFjZQAA'
        'AwAIbG8wAAAEAA9pZl9wYWNrZXRzAAAGABgAAgICAAAAAAAR1/AAAAAAABHX8AAIAAwVa65L6bb5/AAEAA5pZl9vY3RldHMAAAYAGAACAgIAA'
        'AAAISMkFAAAAAAhIyQUAAgADBVrrkvptzCDAAMACWdpZjAAAAYAGAACAgIAAAAAAAAAAAAAAAAAAAAAAAgADBVrrkvptwaRAAIAC21lbW9yeQ'
        'AAAwAFAAAEAAttZW1vcnkAAAUACndpcmVkAAAGAA8AAQEAAAAABA7yQQAIAAwVa65L6bfHggACAA5pbnRlcmZhY2UAAAMACWdpZjAAAAQAD2l'
        'mX3BhY2tldHMAAAUABQAABgAYAAICAgAAAAAAAAAAAAAAAAAAAAAACAAMFWuuS+m3BpEAAgALbWVtb3J5AAADAAUAAAQAC21lbW9yeQAABQAN'
        'aW5hY3RpdmUAAAYADwABAQAAAADW3OlBAAUAC2FjdGl2ZQAABgAPAAEBAAAAAPI17kEACAAMFWuuS+m4Cp0AAgAOaW50ZXJmYWNlAAADAAlna'
        'WYwAAAEAA5pZl9lcnJvcnMAAAUABQAABgAYAAICAgAAAAAAAAAAAAAAAAAAAAAACAAMFWuuS+m3BpEAAgALbWVtb3J5AAADAAUAAAQAC21lbW'
        '9yeQAABQAJZnJlZQAABgAPAAEBAAAAAECHnUEACAAMFWuuS+m4kNUAAgAOaW50ZXJmYWNlAAADAAlzdGYwAAAEAA5pZl9vY3RldHMAAAUABQA'
        'ABgAYAAICAgAAAAAAAAAAAAAAAAAAAAAACAAMFWuuS+m4mTkABAAOaWZfZXJyb3JzAAAGABgAAgICAAAAAAAAAAAAAAAAAAAAAAAIAAwVa65L'
        '6bidagADAAhlbjAAAAQADmlmX29jdGV0cwAABgAYAAICAgAAAABFC4cKAAAAAAhjPdIACHoAAAGLAAwVa65L6biVBwADAAlzdGYwAAAEAA9pZ'
        'l9wYWNrZXRzAAAGABgAAgICAAAAAAAAAAAAAAAAAAAAAAAIAAwVa65L6bi2lQADAAhlbjAAAAYAGAACAgIAAAAAABJhDgAAAAAADMIoAAgADB'
        'VrrkvpuLrHAAQADmlmX2Vycm9ycwAABgAYAAICAgAAAAAAAAAAAAAAAAAAAAAACAAMFWuuS+m4vvgAAwAIZW4xAAAEAA5pZl9vY3RldHMAAAY'
        'AGAACAgIAAAAAAAAAAAAAAAAAAAAAAAQAD2lmX3BhY2tldHMAAAYAGAACAgIAAAAAAAAAAAAAAAAAAAAAAAgADBVrrkvpuMMqAAQADmlmX2Vy'
        'cm9ycwAABgAYAAICAgAAAAAAAAAAAAAAAAAAAAAAAwAIZW4yAAAEAA5pZl9vY3RldHMAAAYAGAACAgIAAAAAAAAAAAAAAAAAAAAAAAgADBVrr'
        'kvpuMdcAAQADmlmX2Vycm9ycwAABgAYAAICAgAAAAAAAAAAAAAAAAAAAAA=')

    expected = (
        '{\'plugin_instance\': lo0, \'plugin\': interface, \'tx\': 0, \'rx\': 0, \'host\': ip-192-168-42-238.us-west-2.'
        'compute.internal, \'time_hires\': 1543518938371396391, \'type\': if_errors}'
    )

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only runs against a cluster with '
            f'non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    # Override default configuration.
    kafka_consumer.set_attributes(data_format='DATAGRAM',
                                  datagram_data_format='COLLECTD')

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka COLLECTD pipeline').configure_for_environment(
            cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Cluster Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster,
                               base64.b64decode(msg64packet), 'COLLECTD')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected, 'COLLECTD')

    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
def test_kafka_log_record_cluster(sdc_builder, sdc_executor, cluster):
    """Write simple log messages into Kafka and confirm that Kafka successfully reads them.

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """

    message = (
        '+20150320 [15:53:31,161] DEBUG PipelineConfigurationValidator - Pipeline \'test:preview\' validation. '
        'valid=true, canPreview=true, issuesCount=0 - ')

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only runs against a cluster with '
            f'non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    # Override default configuration.
    kafka_consumer.set_attributes(data_format='LOG',
                                  log_format='LOG4J',
                                  retain_original_line=True,
                                  on_parse_error='INCLUDE_AS_STACK_TRACE')

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka BINARY pipeline').configure_for_environment(
            cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Cluster kafka BINARY snapshot')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster, message.encode(),
                               'LOG')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, message, 'LOG')
    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
def test_mapr_cluster_streams(sdc_builder, sdc_executor, cluster):
    """This test will start MapR Streams producer and consumer pipelines which check for integrity of data flow
    from a MapR Streams producer to MapR Streams consumer. Producer pipeline runs as standalone while the consumer
    one runs on cluster. Since cluster pipeline cannot be snapshot, we use RPC stage to snapshot the data.
    The pipeline would look like:

    MapR Streams producer pipeline:
        dev_raw_data_source >> mapr_streams_producer

    MapR Streams consumer pipeline:
        mapr_streams_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """
    # MapR Stream name has to be pre-created in MapR cluster. Clusterdock MapR image has this already.
    stream_name = '/sample-stream'
    stream_topic_name = stream_name + ':' + get_random_string(
        string.ascii_letters, 10)
    sdc_rpc_id = get_random_string(string.ascii_letters, 10)

    # Build the MapR Stream producer pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'TEXT'
    dev_raw_data_source.raw_data = 'Hello World!'

    mapr_streams_producer = builder.add_stage('MapR Streams Producer')
    mapr_streams_producer.data_format = 'TEXT'
    # Runtime topic resolution is explicitly supported from 3.4.0
    if Version(sdc_executor.version) >= Version('3.4.0'):
        mapr_streams_producer.runtime_topic_resolution = True
        mapr_streams_producer.topic_expression = stream_topic_name
    else:
        mapr_streams_producer.topic = stream_topic_name

    dev_raw_data_source >> mapr_streams_producer
    producer_pipeline = builder.build(
        'Streams Producer - cluster').configure_for_environment(cluster)
    producer_pipeline.rate_limit = 1

    # Build the MapR Stream consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()

    mapr_streams_consumer = builder.add_stage('MapR Streams Consumer')
    mapr_streams_consumer.topic = stream_topic_name
    mapr_streams_consumer.data_format = 'TEXT'

    sdc_rpc_destination = builder.add_stage(
        name='com_streamsets_pipeline_stage_destination_sdcipc_SdcIpcDTarget')
    sdc_rpc_destination.sdc_rpc_connection.append('{}:{}'.format(
        sdc_executor.server_host, SDC_RPC_LISTENING_PORT))
    sdc_rpc_destination.sdc_rpc_id = sdc_rpc_id

    mapr_streams_consumer >> sdc_rpc_destination
    consumer_pipeline = builder.build(
        'Streams Consumer - cluster').configure_for_environment(cluster)
    consumer_pipeline.configuration['executionMode'] = 'CLUSTER_YARN_STREAMING'
    consumer_pipeline.rate_limit = 1

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()

    sdc_rpc_origin = builder.add_stage(
        name='com_streamsets_pipeline_stage_origin_sdcipc_SdcIpcDSource')
    sdc_rpc_origin.sdc_rpc_listening_port = SDC_RPC_LISTENING_PORT
    sdc_rpc_origin.sdc_rpc_id = sdc_rpc_id
    # Since YARN jobs take a while to get going, set RPC origin batch wait time to 5 min. to avoid
    # getting an empty batch in the snapshot.
    sdc_rpc_origin.batch_wait_time_in_secs = 300

    trash = builder.add_stage('Trash')

    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build('Snapshot pipeline - cluster')

    sdc_executor.add_pipeline(producer_pipeline, consumer_pipeline,
                              snapshot_pipeline)

    # Run the pipelines and assert the data flow. The sequence of steps is as follows:
    # 1. Start the MapR Streams producer and wait until it generates some output - this ensures topic creation.
    # 2. Start the RPC origin (snapshot_pipeline) so that a snapshot can be captured.
    # 3. Start the MapR Streams consumer and wait until it generates some output - this ensures cluster streaming.
    # 4. Initiate and capture a snapshot on the RPC origin pipeline.
    # 5. Compare the snapshot result to the data injected at the MapR Streams producer.
    try:
        sdc_executor.start_pipeline(
            producer_pipeline).wait_for_pipeline_output_records_count(5)
        # RUNNING ensures RPC origin is started
        sdc_executor.start_pipeline(snapshot_pipeline)

        consumer_start_cmd = sdc_executor.start_pipeline(consumer_pipeline)
        consumer_start_cmd.wait_for_pipeline_output_records_count(5)

        snapshot_pipeline_command = sdc_executor.capture_snapshot(
            snapshot_pipeline, start_pipeline=False, wait=False)
        snapshot = snapshot_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        snapshot_data = snapshot[snapshot_pipeline[0].instance_name].output[
            0].value['value']['text']['value']

        assert dev_raw_data_source.raw_data == snapshot_data
    finally:
        # Force stop the pipeline to avoid hanging until the SDC RPC stage's max batch wait time is reached.
        sdc_executor.stop_pipeline(pipeline=snapshot_pipeline, force=True)
        sdc_executor.stop_pipeline(producer_pipeline)
        sdc_executor.stop_pipeline(consumer_pipeline)
def test_kafka_origin_cluster(sdc_builder, sdc_executor, cluster):
    """Write simple text messages into Kafka and confirm that Kafka successfully reads them.
    Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Kafka Consumer pipeline.
    Specifically, this would look like:

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """

    message = 'Hello World from SDC & DPM!'
    expected = '{\'text\': Hello World from SDC & DPM!}'

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only '
            f'runs against cluster with the non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka String pipeline').configure_for_environment(
            cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash

    snapshot_pipeline = builder.build(
        title='Cluster kafka String Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster, message.encode(),
                               'TEXT')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected, 'TEXT')
    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)
def version_check(sdc_builder, cluster):
    if cluster.version.startswith('cdh6.0') and Version(
            sdc_builder.version) < Version('3.7.0'):
        pytest.skip(
            'HBase destination is not included in streamsets-datacollector-cdh_6_0-lib until SDC 3.7.0 (SDC-9976)'
        )
def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for S3 executor stage. We do so by running a dev raw data source generator to S3 destination
    sandbox bucket and then reading S3 bucket using STF client to assert data between the client to what has
    been created by the pipeline. We use a record deduplicator processor in between dev raw data source origin
    and S3 destination in order to limit number of objects to one.

    For recent SDC versions we also check that the corresponding 'file-changed' event is generated.

    S3 Destination pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                                                   >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=raw_str,
                                              stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(
                                   property_key='key',
                                   company='${record:value("/company")}'))

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(
        title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        tags = client.get_object_tagging(Bucket=s3_bucket,
                                         Key=s3_key)['TagSet']
        assert len(tags) == 1

        # Check if the 'file-changed' event was generated (only for recent sdc versions).
        if Version(
                sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values[
                'sdc.event.type'] == 'file-changed'

    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        delete_keys = {
            'Objects': [{
                'Key': k['Key']
            } for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
                        ['Contents']]
        }
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
def test_oracle_cdc_client_string_null_values(sdc_builder, sdc_executor,
                                              database, buffer_locally,
                                              use_pattern):
    """Basic test that tests for SDC-8340. This test ensures that Strings with value 'NULL'/'null' is treated correctly,
    and null is not returned.
    Runs oracle_cdc_client >> trash
    """
    db_engine = database.engine
    pipeline = None
    table = None

    try:
        src_table_name = get_random_string(string.ascii_uppercase, 9)

        # If use_pattern is True, run the test if and only if sdc_builder >= 3.1.0.0
        if use_pattern:
            if Version(sdc_builder.version) >= Version('3.1.0.0'):
                src_table_pattern = _get_table_pattern(src_table_name)
            else:
                pytest.skip('Skipping test as SDC Builder version < 3.1.0.0')
        else:
            src_table_pattern = src_table_name

        connection = database.engine.connect()
        table = _setup_table(database=database,
                             table_name=src_table_name,
                             create_primary_key=False)

        logger.info('Using table pattern %s', src_table_pattern)

        pipeline_builder = sdc_builder.get_pipeline_builder()

        oracle_cdc_client = _get_oracle_cdc_client_origin(
            connection=connection,
            database=database,
            sdc_builder=sdc_builder,
            pipeline_builder=pipeline_builder,
            buffer_locally=buffer_locally,
            src_table_name=src_table_pattern)
        rows = [{
            'ID': 100,
            'NAME': 'NULL'
        }, {
            'ID': None,
            'NAME': 'Whose Name?'
        }, {
            'ID': 123,
            'NAME': None
        }, {
            'ID': None,
            'NAME': None
        }]
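        # All four rows are inserted in one transaction; the updates below are appended to `rows`
        # in the same order the CDC origin is expected to emit the corresponding records.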
        txn = connection.begin()

        connection.execute(table.insert(), rows)

        try:

            def update_table_where_id(tbl_row):
                connection.execute(table.update().where(
                    table.c.ID == tbl_row['ID']).values(NAME=tbl_row['NAME']))

            # Note: the WHERE clause compares with '==' rather than 'is None', because 'is' is evaluated by
            # Python right away and would not produce a valid SQL NULL comparison.
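            # For example, SQLAlchemy compiles `table.c.ID == None` to "ID IS NULL", whereas
            # `table.c.ID is None` is just the Python boolean False and never reaches the database.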
            row = {'ID': None, 'NAME': 'New Name'}
            update_table_where_id(row)
            # The above statement will update 2 rows, so the change generates 2 records.
            rows += [row for _ in range(0, 2)]

            row = {'ID': 100, 'NAME': None}
            update_table_where_id(row)
            rows.append(row)

            row = {'ID': 123, 'NAME': 'NULL'}
            update_table_where_id(row)
            rows.append(row)

            row = {'ID': None, 'NAME': 'New Name'}
            connection.execute(table.update().where(
                table.c.NAME == row['NAME']).values(ID=row['ID']))
            rows += [row for _ in range(0, 2)]

            txn.commit()
        except:
            txn.rollback()
            raise

        trash = pipeline_builder.add_stage('Trash')

        # Why do we need to wait?
        # The clock on the DB might differ from the clock here. If the DB is behind, we are fine and will
        # still get all the data. If the DB is ahead, the batch end time used by the origin may not fall
        # after the time at which all the changes were written to the DB. So we wait until the local time
        # is past the DB's current time, i.e. the time at which all data was written out to the DB.
        _wait_until_time(_get_current_oracle_time(connection=connection))

        oracle_cdc_client >> trash
        pipeline = pipeline_builder.build(
            'Oracle CDC Client Pipeline').configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        snapshot = sdc_executor.capture_snapshot(
            pipeline, start_pipeline=True).wait_for_finished(60).snapshot

        # Assert that the records captured in the snapshot match the rows inserted and updated above.
        output = snapshot.snapshot_batches[0][
            oracle_cdc_client.instance_name].output
        for i, record in enumerate(output):
            # In update records, values with NULLs in the row are not returned
            if 'ID' in record.field:
                id_val = record.field['ID'].value
                assert rows[i]['ID'] == (None if id_val is None else int(id_val))
            assert rows[i]['NAME'] == record.field['NAME']

        assert len(output) == len(rows)
    finally:
        if pipeline is not None:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if table is not None:
            table.drop(db_engine)
            logger.info('Table: %s dropped.', src_table_name)
def mongodbLookupResultFieldName(sdc_builder):
    """Resolve proper name for the "Result Field" in lookup - it will differ based on SDC version."""
    if Version(sdc_builder.version) >= Version("3.7.0"):
        return 'result_field'
    else:
        return 'new_field_to_save_lookup_result'
def test_rollback_to_savepoint(sdc_builder, sdc_executor, database,
                               buffer_locally, use_pattern):
    """Test that writes some data, then creates a save point, writes some more data and then rolls back to savepoint,
    and validates that only the data that is before the save point and after the rollback is read
    Runs oracle_cdc_client >> trash
    """
    db_engine = database.engine
    pipeline = None
    table = None

    try:
        src_table_name = get_random_string(string.ascii_uppercase, 9)

        # If use_pattern is True, run the test if and only if sdc_builder >= 3.1.0.0
        if use_pattern:
            if Version(sdc_builder.version) >= Version('3.1.0.0'):
                src_table_pattern = _get_table_pattern(src_table_name)
            else:
                pytest.skip('Skipping test as SDC Builder version < 3.1.0.0')
        else:
            src_table_pattern = src_table_name

        connection = database.engine.connect()
        table = _setup_table(database=database, table_name=src_table_name)

        logger.info('Using table pattern %s', src_table_pattern)

        pipeline_builder = sdc_builder.get_pipeline_builder()

        oracle_cdc_client = _get_oracle_cdc_client_origin(
            connection=connection,
            database=database,
            sdc_builder=sdc_builder,
            pipeline_builder=pipeline_builder,
            buffer_locally=buffer_locally,
            src_table_name=src_table_pattern)
        trash = pipeline_builder.add_stage('Trash')
        lines = [
            f"INSERT INTO {src_table_name} VALUES (1, 'MORDOR')",
            f"INSERT INTO {src_table_name} VALUES (2, 'GONDOR')",
            f"UPDATE {src_table_name} SET {OTHER_COLUMN} = 'MINAS MORGUL' WHERE {PRIMARY_KEY} = 1",
            'SAVEPOINT stf_test_savepoint',
            f"INSERT INTO {src_table_name} VALUES(3, 'ROHAN')",
            f"UPDATE {src_table_name} SET {OTHER_COLUMN} = 'SHIRE' WHERE {PRIMARY_KEY} = 1",
            f"DELETE FROM {src_table_name} WHERE {PRIMARY_KEY} = 1",
            'ROLLBACK TO stf_test_savepoint',
            f"UPDATE {src_table_name} SET {OTHER_COLUMN} = 'HOBBITON' WHERE {PRIMARY_KEY} = 2",
            f"INSERT INTO {src_table_name} VALUES (3, 'GONDOR')", 'COMMIT'
        ]
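        # Everything between SAVEPOINT and ROLLBACK TO is undone, so the CDC origin should emit only the
        # two initial inserts, the MINAS MORGUL update, and the post-rollback HOBBITON update and GONDOR insert.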
        txn = connection.begin()
        for line in lines:
            transaction_text = text(line)
            connection.execute(transaction_text)
        txn.commit()

        # Why do we need to wait?
        # The clock on the DB might differ from the clock here. If the DB is behind, we are fine and will
        # still get all the data. If the DB is ahead, the batch end time used by the origin may not fall
        # after the time at which all the changes were written to the DB. So we wait until the local time
        # is past the DB's current time, i.e. the time at which all data was written out to the DB.
        _wait_until_time(_get_current_oracle_time(connection=connection))

        oracle_cdc_client >> trash
        pipeline = pipeline_builder.build(
            'Oracle CDC Client Pipeline').configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        snapshot = sdc_executor.capture_snapshot(
            pipeline, start_pipeline=True).wait_for_finished(60).snapshot
        # Assert that only the changes made before the savepoint and after the rollback were captured.
        output_records = snapshot.snapshot_batches[0][
            oracle_cdc_client.instance_name].output
        assert len(output_records) == 5
        assert output_records[0].field[PRIMARY_KEY] == 1
        assert output_records[0].field[OTHER_COLUMN] == 'MORDOR'
        assert output_records[0].header['values']['sdc.operation.type'] == '1'
        assert output_records[1].field[PRIMARY_KEY] == 2
        assert output_records[1].field[OTHER_COLUMN] == 'GONDOR'
        assert output_records[1].header['values']['sdc.operation.type'] == '1'
        assert output_records[2].field[PRIMARY_KEY] == 1
        assert output_records[2].field[OTHER_COLUMN] == 'MINAS MORGUL'
        assert output_records[2].header['values']['sdc.operation.type'] == '3'
        assert output_records[3].field[PRIMARY_KEY] == 2
        assert output_records[3].field[OTHER_COLUMN] == 'HOBBITON'
        assert output_records[3].header['values']['sdc.operation.type'] == '3'
        assert output_records[4].field[PRIMARY_KEY] == 3
        assert output_records[4].field[OTHER_COLUMN] == 'GONDOR'
        assert output_records[4].header['values']['sdc.operation.type'] == '1'

    finally:
        if pipeline is not None:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if table is not None:
            table.drop(db_engine)
            logger.info('Table: %s dropped.', src_table_name)
import logging
import string

from streamsets.sdk.models import Configuration
from streamsets.sdk.utils import Version
from streamsets.testframework.markers import aws, sdc_min_version
from streamsets.testframework.utils import get_random_string

from .utils.utils_aws import allow_public_access, restore_public_access, configure_stage_for_anonymous, \
    create_anonymous_client

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Sandbox prefix for S3 bucket
S3_SANDBOX_PREFIX = 'sandbox'

MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS = Version('3.4.0')
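# From this version onward the S3 executor emits events ('file-created', 'file-changed'), which the
# tests below verify via wiretap.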


@aws('s3')
@sdc_min_version('2.6.0.1-0002')
def test_s3_executor_create_object(sdc_builder, sdc_executor, aws):
    """Test for S3 executor stage. We do so by running a dev raw data source generator to S3 executor
    sandbox bucket and then reading S3 bucket using STF client to assert data between the client to what has
    been created by the pipeline. We use a record deduplicator processor in between dev raw data source origin
    and S3 destination in order to limit number of objects to one.

    For recent SDC versions we also check that the corresponding 'file-created' event is generated.

    S3 Destination pipeline:
    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                           record_deduplicator >> to_error
def test_jdbc_multitable_consumer_to_jdbc(sdc_builder, sdc_executor, database,
                                          table_name_characters,
                                          table_name_length, no_of_tables,
                                          number_of_threads,
                                          per_batch_strategy,
                                          partitioning_mode, non_incremental):
    """Tests Multithreaded Multi-table JDBC source. Replicates a set of tables with prefix 'src' to a another
    set of tables with 'target' prefix. Also leveraging the NO_MORE_DATA EVENT by the Multi-table JDBC source after
    no data in tables. On Event path, the pipeline will execute the pipeline finisher if the event type is seen to
    be no-more-data
    The pipeline would look like:

            jdbc_multitable_consumer >> jdbc_query_dest
                                     >= stream_selector >> finisher
    """

    if non_incremental and Version(sdc_builder.version) < Version('3.0.0.0'):
        # non-incremental support was only added as of SDC 3.0.0.0
        pytest.skip(
            f'Skipping because SDC builder version {sdc_builder.version} is less than 3.0.0.0'
        )

    event_table_name = get_random_string(string.ascii_lowercase, 10)
    update_event_table_statement = f'UPDATE {event_table_name} set {EVENT_COLUMN_NAME} = 1'

    pipeline_builder = sdc_builder.get_pipeline_builder()

    jdbc_multitable_consumer = pipeline_builder.add_stage(
        'JDBC Multitable Consumer')

    table_configs = [{
        'tablePattern': f'{SRC_TABLE_PREFIX}%',
        'partitioningMode': partitioning_mode,
        'partitionSize': PARTITION_SIZE
    }]
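    # Non-incremental reads (full reads of tables without a suitable offset column) are configured per
    # table and only exist from SDC 3.0.0.0 onward, hence the version guard below.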
    if Version(sdc_builder.version) >= Version('3.0.0.0'):
        table_configs[0]['enableNonIncremental'] = non_incremental
    jdbc_multitable_consumer.set_attributes(
        number_of_threads=number_of_threads,
        per_batch_strategy=per_batch_strategy,
        maximum_pool_size=number_of_threads,
        minimum_idle_connections=number_of_threads,
        table_configs=table_configs)

    if partitioning_mode == 'BEST_EFFORT' and Version(
            sdc_builder.version) < Version('3.0.0.0'):
        # When a pre-3.0 pipeline uses partitioning, the queriesPerSecond value derived on upgrade across
        # the 3.0 boundary is unacceptably slow, so set the query interval to 0 instead.
        jdbc_multitable_consumer.query_interval = 0

    # The target used to replicate is JDBCQueryExecutor.
    # After SDC-5757 is resolved, we can use JDBCProducer.
    jdbc_query_dest = pipeline_builder.add_stage('JDBC Query', type='executor')
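    # The target table name is derived from the 'jdbc.tables' record attribute by swapping the source
    # prefix for the target prefix; Oracle reports identifiers in upper case, hence the conditionals.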
    table_name = (
        f"${{str:replace(record:attribute('jdbc.tables'),"
        f"'{SRC_TABLE_PREFIX if not database.type == 'Oracle' else SRC_TABLE_PREFIX.upper()}',"
        f"'{TGT_TABLE_PREFIX}')}}")
    query = (
        f"INSERT into {table_name} values "
        f"(${{record:value('/{FIRST_COLUMN if not database.type == 'Oracle' else FIRST_COLUMN.upper()}')}}"
        f", '${{record:value('/{OTHER_COLUMN if not database.type == 'Oracle' else OTHER_COLUMN.upper()}')}}')"
    )

    jdbc_query_dest.set_attributes(sql_query=query)

    finisher = pipeline_builder.add_stage('Pipeline Finisher Executor')
    finisher.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    jdbc_multitable_consumer >> jdbc_query_dest
    jdbc_multitable_consumer >= finisher

    non_inc = ', non-incremental' if non_incremental else ''
    pipeline_name = (
        f'JDBC multitable consumer pipeline - {per_batch_strategy} batch strategy, '
        f'{number_of_threads} threads, {partitioning_mode} partitioning{non_inc}'
    )
    pipeline = pipeline_builder.build(pipeline_name).configure_for_environment(
        database)
    sdc_executor.add_pipeline(pipeline)

    # Generate random table names.
    table_names = [
        f'{get_random_string(table_name_characters, table_name_length).lower()}_{table_no}'
        for table_no in range(no_of_tables)
    ]

    random.shuffle(table_names)

    # when using non-incremental mode, give only half the tables primary keys
    pk_tables = table_names[:len(table_names) //
                            2] if non_incremental else table_names

    # build tuples with table name, and whether to use a primary key
    src_tables = [
        TableInfo(name=TABLE_PREFIX_NAME_FMT.format(
            table_prefix=SRC_TABLE_PREFIX, table_name=table_name),
                  use_primary_key=table_name in pk_tables)
        for table_name in table_names
    ]
    target_tables = [
        TableInfo(name=TABLE_PREFIX_NAME_FMT.format(
            table_prefix=TGT_TABLE_PREFIX, table_name=table_name),
                  use_primary_key=table_name in pk_tables)
        for table_name in table_names
    ]
    try:
        setup_tables(database, src_tables, target_tables, event_table_name)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        assert_tables_replicated(database, src_tables)
    finally:
        logger.info('Dropping test related tables in %s database...',
                    database.type)
        teardown_tables(database,
                        [table.name for table in src_tables + target_tables] +
                        [event_table_name])
def _run_test_s3_executor_create_object(sdc_builder, sdc_executor, aws,
                                        anonymous):
    # Setup test static.
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "company": "StreamSets Inc."}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=raw_str,
                                              stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CREATE_NEW_OBJECT',
                               object=s3_key,
                               content='${record:value("/company")}')
    if anonymous:
        configure_stage_for_anonymous(s3_executor)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(
        title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    public_access_block = None
    bucket_policy = None
    try:
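        # For the anonymous variant, temporarily open up the bucket's public access so the unauthenticated
        # client created below can read the object; the original settings are restored in the finally block.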
        if anonymous:
            public_access_block, bucket_policy = allow_public_access(
                client, s3_bucket, True, True)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        # Assert that the number of objects created under the S3 key matches what the pipeline put (exactly one).
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # Read data from S3 to assert it is what got ingested into the pipeline.
        client_to_read = create_anonymous_client() if anonymous else client
        s3_contents = [
            client_to_read.get_object(
                Bucket=s3_bucket,
                Key=s3_content['Key'])['Body'].read().decode().strip()
            for s3_content in list_s3_objs['Contents']
        ]

        assert s3_contents[0] == 'StreamSets Inc.'

        # Check if the 'file-created' event was generated (only for recent sdc versions).
        if Version(
                sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values[
                'sdc.event.type'] == 'file-created'

    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        restore_public_access(client, s3_bucket, public_access_block,
                              bucket_policy)
        delete_keys = {
            'Objects': [{
                'Key': k['Key']
            } for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
                        ['Contents']]
        }
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
def mongodbLookupMappingName(sdc_builder):
    """Resolve proper name for the "Document to SDC Field Mappings" in lookup - it will differ based on SDC version."""
    if Version(sdc_builder.version) >= Version("3.7.0"):
        return 'document_to_sdc_field_mappings'
    else:
        return 'sdc_field_to_document_field_mapping'
def test_produce_avro_records_without_schema(sdc_builder, sdc_executor,
                                             cluster):
    """Write avro text messages into Kafka multiple partitions with the schema in the records
    and confirm that Kafka successfully reads them.
    Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Kafka Consumer pipeline.
    Specifically, this would look like:

    Kafka Consumer Origin pipeline with cluster mode:
        kafka_consumer >> sdc_rpc_destination

    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """

    msg = {
        'name': 'boss',
        'age': 60,
        'emails': ['boss@company.com', 'boss2@company.com'],
        'boss': None
    }
    expected = (
        'OrderedDict([(\'name\', boss), (\'age\', 60), (\'emails\', [boss@company.com, boss2@company.com]),'
        ' (\'boss\', None)])')

    if (Version(sdc_builder.version) < MIN_SDC_VERSION_WITH_SPARK_2_LIB
            and ('kafka' in cluster.kerberized_services
                 or cluster.kafka.is_ssl_enabled)):
        pytest.skip(
            'Kafka cluster mode test only '
            f'runs against cluster with the non-secured Kafka for SDC version {sdc_builder.version}.'
        )

    # Build the Kafka consumer pipeline.
    builder = sdc_builder.get_pipeline_builder()
    kafka_consumer = get_kafka_consumer_stage(sdc_builder.version, builder,
                                              cluster)

    kafka_consumer.set_attributes(data_format='AVRO',
                                  avro_schema_location='SOURCE')
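    # avro_schema_location='SOURCE' means the consumer reads the Avro schema embedded in each message
    # rather than from the stage configuration or a schema registry.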

    sdc_rpc_destination = get_rpc_destination(builder, sdc_executor)

    kafka_consumer >> sdc_rpc_destination
    kafka_consumer_pipeline = builder.build(
        title='Cluster kafka AVRO pipeline').configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration[
        'executionMode'] = 'CLUSTER_YARN_STREAMING'
    kafka_consumer_pipeline.configuration['shouldRetry'] = False

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = get_rpc_origin(builder, sdc_rpc_destination)
    trash = builder.add_stage(label='Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Cluster Snapshot pipeline')

    sdc_executor.add_pipeline(kafka_consumer_pipeline, snapshot_pipeline)

    try:
        # Publish messages to Kafka and verify via snapshot that the same messages are received.
        produce_kafka_messages(kafka_consumer.topic, cluster, msg,
                               'AVRO_WITHOUT_SCHEMA')
        verify_kafka_origin_results(kafka_consumer_pipeline, snapshot_pipeline,
                                    sdc_executor, expected,
                                    'AVRO_WITHOUT_SCHEMA')
    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)
        sdc_executor.stop_pipeline(snapshot_pipeline)