Example #1
def test_s3_executor_non_existing_object(sdc_builder, sdc_executor, aws):
    """Variant of S3 executor testing focusing on what happens when we try to apply tags on non existing object."""
    # setup test static
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str)

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    dev_raw_data_source >> s3_executor

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    # Capture a snapshot of the pipeline run.
    snapshot = sdc_executor.capture_snapshot(s3_exec_pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(s3_exec_pipeline)

    # All records should go to the error stream.
    input_records = snapshot[dev_raw_data_source.instance_name].output
    stage = snapshot[s3_executor.instance_name]
    assert len(stage.error_records) == len(input_records)
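The snippets on this page reference several names that are defined at module level in the original test file rather than in the excerpts themselves (string, get_random_string, Configuration, S3_SANDBOX_PREFIX, Version, MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS). A minimal sketch of that module-level setup is given below; the import paths and the concrete values are assumptions based on typical StreamSets Test Framework (STF) test modules, not part of the excerpt.

# Assumed module-level setup for the examples on this page (paths and values are assumptions).
import string

from streamsets.sdk.models import Configuration  # config helper used for 'tags' / 'environment_variables'
from streamsets.sdk.utils import Version  # import path assumed
from streamsets.testframework.utils import get_random_string

S3_SANDBOX_PREFIX = 'sandbox'  # assumed prefix for objects created in the test bucket
MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS = Version('3.4.0')  # placeholder value, used in Example #6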
Example #2
def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for S3 executor stage. We do so by running a dev raw data source generator to S3 destination
    sandbox bucket and then reading S3 bucket using STF client to assert data between the client to what has
    been created by the pipeline. We use a record deduplicator processor in between dev raw data source origin
    and S3 destination in order to limit number of objects to one. The pipeline looks like the following:

    S3 Destination pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor
                                                   >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    dev_raw_data_source >> record_deduplicator >> s3_executor
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        # Run the pipeline for at least one record (the rest will be removed by the deduplicator)
        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_pipeline_output_records_count(1)
        sdc_executor.stop_pipeline(s3_exec_pipeline)

        tags = client.get_object_tagging(Bucket=s3_bucket, Key=s3_key)['TagSet']
        assert len(tags) == 1

    finally:
        delete_keys = {'Objects': [{'Key': k['Key']}
                                   for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)['Contents']]}
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
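One caveat with the cleanup in the finally block above: boto3's list_objects_v2 omits the 'Contents' key entirely when nothing matches the prefix, so the cleanup raises a KeyError if the object was never created. A slightly more defensive variant, sketched here rather than taken from the source, would guard against an empty listing:

        # Defensive cleanup sketch (assumption, not from the original test).
        objects = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key).get('Contents', [])
        if objects:
            client.delete_objects(Bucket=s3_bucket,
                                  Delete={'Objects': [{'Key': obj['Key']} for obj in objects]})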
Example #3
    def shell_executor_(script, environment_variables=None):
        builder = sdc_executor.get_pipeline_builder()
        dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
        dev_raw_data_source.set_attributes(data_format='TEXT', raw_data='noop', stop_after_first_batch=True)
        shell = builder.add_stage('Shell')
        shell.set_attributes(script=script,
                             environment_variables=(Configuration(**environment_variables)._data
                                                    if environment_variables
                                                    else []))
        trash = builder.add_stage('Trash')
        dev_raw_data_source >> [trash, shell]
        pipeline = builder.build('Shell executor pipeline')

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        sdc_executor.remove_pipeline(pipeline)
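shell_executor_ above is an inner helper, which is why the snippet starts indented; in the original module it is returned from a pytest fixture so that tests can invoke it with a script and optional environment variables. A plausible wrapper, shown here as an assumption since the enclosing fixture is not part of the excerpt:

import pytest


@pytest.fixture
def shell_executor(sdc_executor):
    def shell_executor_(script, environment_variables=None):
        ...  # body as in the shell_executor_ helper above: build, run and remove the one-shot Shell pipeline
    return shell_executor_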
Example #4
def pipeline_shell_generator(sdc_executor):
    builder = sdc_executor.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'JSON'
    dev_raw_data_source.raw_data = '{}'

    shell_executor = builder.add_stage('Shell')
    shell_executor.environment_variables = Configuration(property_key='key',
                                                         file='${FILE}')
    shell_executor.script = 'echo `whoami` > $file'

    dev_raw_data_source >> shell_executor

    executor_pipeline = builder.build()
    executor_pipeline.add_parameters(FILE='/')
    sdc_executor.add_pipeline(executor_pipeline)

    yield executor_pipeline
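pipeline_shell_generator builds a parametrized Shell executor pipeline and yields it, so in the original module it is presumably decorated with @pytest.fixture and consumed by a test that supplies the FILE runtime parameter. A hypothetical consumer is sketched below; the test name, the target path, and passing runtime parameters as the second argument to start_pipeline are assumptions rather than code from the source.

def test_shell_executor_writes_current_user(sdc_executor, pipeline_shell_generator):
    # Resolve the ${FILE} parameter to a concrete path so the script writes the `whoami` output there.
    # Passing runtime parameters positionally to start_pipeline() is an assumed STF API usage.
    runtime_parameters = {'FILE': '/tmp/shell_executor_whoami.txt'}
    sdc_executor.start_pipeline(pipeline_shell_generator, runtime_parameters).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(pipeline_shell_generator)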
Example #5
def test_s3_executor_non_existing_object(sdc_builder, sdc_executor, aws):
    """Variant of S3 executor testing focusing on what happens when we try to apply tags on non existing object."""
    # setup test static
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=raw_str,
                                              stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(
                                   property_key='key',
                                   company='${record:value("/company")}'))

    wiretap1 = builder.add_wiretap()
    wiretap2 = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap2.destination
    record_deduplicator >> wiretap1.destination

    s3_exec_pipeline = builder.build(
        title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

    # All records should go to the error stream.
    assert len(wiretap1.output_records) == len(wiretap2.output_records)
Example #6
def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for S3 executor stage. We do so by running a dev raw data source generator to S3 destination
    sandbox bucket and then reading S3 bucket using STF client to assert data between the client to what has
    been created by the pipeline. We use a record deduplicator processor in between dev raw data source origin
    and S3 destination in order to limit number of objects to one.

    For recent SDC versions we also check that the corresponding 'file-changed' event is generated.

    S3 executor pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                                                   >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=raw_str,
                                              stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(
                                   property_key='key',
                                   company='${record:value("/company")}'))

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(
        title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        tags = client.get_object_tagging(Bucket=s3_bucket,
                                         Key=s3_key)['TagSet']
        assert len(tags) == 1

        # Check that the 'file-changed' event was generated (only for recent SDC versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-changed'

    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        delete_keys = {'Objects': [{'Key': k['Key']}
                                   for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)['Contents']]}
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
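Example #6 relies on a module-level helper, _ensure_pipeline_is_stopped, that is not shown in the excerpt. A minimal sketch of what it presumably does, stopping the pipeline only if it is still running so the finally block stays safe after wait_for_finished():

def _ensure_pipeline_is_stopped(sdc_executor, pipeline):
    # Stop only if the pipeline is still running; after wait_for_finished() it
    # is normally already in the FINISHED state.
    if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
        sdc_executor.stop_pipeline(pipeline)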