def test_s3_executor_non_existing_object(sdc_builder, sdc_executor, aws):
    """Variant of the S3 executor test focusing on what happens when we try to apply tags to a non-existing object."""
    # Setup test static.
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str)

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    dev_raw_data_source >> s3_executor

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    # Read a snapshot of the pipeline.
    snapshot = sdc_executor.capture_snapshot(s3_exec_pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(s3_exec_pipeline)

    # All records should go to the error stream.
    input_records = snapshot[dev_raw_data_source.instance_name].output
    stage = snapshot[s3_executor.instance_name]
    assert len(stage.error_records) == len(input_records)
def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for the S3 executor stage. We run a Dev Raw Data Source that points the executor at an object in the
    S3 sandbox bucket and then read that bucket with the STF client to assert that the pipeline applied the tags.
    A Record Deduplicator between the Dev Raw Data Source origin and the S3 executor limits the number of
    processed records to one. The pipeline looks like the following:

    S3 executor pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor
                               record_deduplicator >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str)
    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    dev_raw_data_source >> record_deduplicator >> s3_executor
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        # And run the pipeline for at least one record (the rest will be removed by the de-dup).
        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_pipeline_output_records_count(1)
        sdc_executor.stop_pipeline(s3_exec_pipeline)

        tags = client.get_object_tagging(Bucket=s3_bucket, Key=s3_key)['TagSet']
        assert len(tags) == 1
    finally:
        delete_keys = {'Objects': [{'Key': k['Key']}
                                   for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)['Contents']]}
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
def shell_executor_(script, environment_variables=None):
    """Build and run a single-batch pipeline that executes the given script in a Shell executor stage."""
    builder = sdc_executor.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data='noop', stop_after_first_batch=True)

    shell = builder.add_stage('Shell')
    shell.set_attributes(script=script,
                         environment_variables=(Configuration(**environment_variables)._data
                                                if environment_variables else []))

    trash = builder.add_stage('Trash')

    dev_raw_data_source >> [trash, shell]
    pipeline = builder.build('Shell executor pipeline')

    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()
    sdc_executor.remove_pipeline(pipeline)
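# Note: shell_executor_ above refers to 'sdc_executor' from an enclosing scope rather than taking it as an
# argument. In the original module it is presumably defined inside (and returned by) a pytest fixture, roughly
# along these lines; this is only a sketch under that assumption, and the fixture name and decorator are not
# part of this excerpt:
#
#     @pytest.fixture
#     def shell_executor(sdc_executor):
#         def shell_executor_(script, environment_variables=None):
#             ...  # body as above
#         return shell_executor_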
def pipeline_shell_generator(sdc_executor):
    """Yield a prebuilt pipeline whose Shell executor writes the output of `whoami` to the file given by the
    FILE pipeline parameter."""
    builder = sdc_executor.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'JSON'
    dev_raw_data_source.raw_data = '{}'

    shell_executor = builder.add_stage('Shell')
    shell_executor.environment_variables = Configuration(property_key='key', file='${FILE}')
    shell_executor.script = 'echo `whoami` > $file'

    dev_raw_data_source >> shell_executor
    executor_pipeline = builder.build()
    executor_pipeline.add_parameters(FILE='/')

    sdc_executor.add_pipeline(executor_pipeline)

    yield executor_pipeline
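# Note: pipeline_shell_generator yields rather than returns, which suggests it is registered as a pytest
# fixture in the original module; a consuming test would receive the prebuilt pipeline, point the FILE
# parameter at a real output path, and start it. A rough sketch of the registration under that assumption
# (the decorator and scope are illustrative, not part of this excerpt):
#
#     @pytest.fixture(scope='module')
#     def pipeline_shell_generator(sdc_executor):
#         ...  # body as above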
def test_s3_executor_non_existing_object(sdc_builder, sdc_executor, aws):
    """Variant of the S3 executor test focusing on what happens when we try to apply tags to a non-existing object."""
    # Setup test static.
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)
    record_deduplicator = builder.add_stage('Record Deduplicator')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    wiretap1 = builder.add_wiretap()
    wiretap2 = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap2.destination
    record_deduplicator >> wiretap1.destination

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

    # All records should go to the error stream.
    assert len(wiretap1.output_records) == len(wiretap2.output_records)
def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for the S3 executor stage. We run a Dev Raw Data Source that points the executor at an object in the
    S3 sandbox bucket and then read that bucket with the STF client to assert that the pipeline applied the tags.
    A Record Deduplicator between the Dev Raw Data Source origin and the S3 executor limits the number of
    processed records to one. For recent SDC versions we also check that the corresponding 'file-changed' event
    is generated.

    S3 executor pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                               record_deduplicator >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)
    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        tags = client.get_object_tagging(Bucket=s3_bucket, Key=s3_key)['TagSet']
        assert len(tags) == 1

        # Check that the 'file-changed' event was generated (only for recent SDC versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-changed'
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        delete_keys = {'Objects': [{'Key': k['Key']}
                                   for k in client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)['Contents']]}
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
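# _ensure_pipeline_is_stopped is referenced in the finally block above but is not defined in this excerpt.
# A minimal sketch of such a helper, assuming the usual STF pipeline-status API; the original implementation
# may differ:
def _ensure_pipeline_is_stopped(sdc_executor, pipeline):
    # Stop the pipeline only if it is still running (e.g. when the test failed before the pipeline finished).
    if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
        sdc_executor.stop_pipeline(pipeline)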