# NOTE: This module header is a sketch; the exact import paths, the sandbox
# prefix value and the minimum-version constant are assumptions based on
# typical StreamSets Test Framework (STF) stage tests.
import json
import logging
import string

import pytest

from streamsets.sdk.models import Configuration
from streamsets.testframework.utils import get_random_string, Version

logger = logging.getLogger(__name__)

S3_SANDBOX_PREFIX = 'sandbox'  # assumed sandbox prefix
MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS = Version('3.4.0')  # assumed version


def test_object_names_path(sdc_builder, sdc_executor, aws, test_name, path_name):
    """Verify that the origin can read objects under all the documented path names."""
    s3_key = path_name
    s3_bucket = aws.s3_bucket_name
    data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='JSON',
                             json_content='ARRAY_OBJECTS',
                             prefix_pattern=f'{s3_key}*')

    wiretap = builder.add_wiretap()

    pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finished_executor.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    s3_origin >> wiretap.destination
    s3_origin >= pipeline_finished_executor

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        output_records_values = [record.field for record in wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)
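
# `test_name` and `path_name` above come from pytest parametrization declared
# on the test. A minimal sketch of the idea with illustrative values -- the
# real module's list of documented path names may differ:
OBJECT_PATH_PARAMETERS = [
    ('underscore', f'{get_random_string()}_{get_random_string()}'),
    ('period', f'{get_random_string()}.{get_random_string()}'),
    ('hyphen', f'{get_random_string()}-{get_random_string()}'),
    ('forward_slash', f'{get_random_string()}/{get_random_string()}'),
]
# Applied as: @pytest.mark.parametrize('test_name, path_name', OBJECT_PATH_PARAMETERS)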

def test_s3_region_other(sdc_builder, sdc_executor, aws):
    """Test that using a specific region and explicitly specifying the endpoint works as expected.
    We write a record and verify that the object created in S3 contains the expected data.

    S3 Destination pipeline:
        dev_raw_data_source >> s3_destination
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    bucket_val = (s3_bucket if sdc_builder.version < '2.6.0.1-0002' else '${record:value("/bucket")}')
    s3_destination.set_attributes(bucket=bucket_val, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    # Set the region and endpoint after configure_for_environment so the
    # environment configuration does not override them.
    s3_destination.set_attributes(use_specific_region=True,
                                  region='OTHER',
                                  endpoint=f's3.{aws.region}.amazonaws.com')
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_multiple_batch(sdc_builder, sdc_executor, aws):
    """Test that a multithreaded origin pipeline reads many objects across multiple batches
    without duplicating or missing any records.
    """
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=aws.s3_bucket_name,
                             data_format='JSON',
                             prefix_pattern=f'{s3_key}/*',
                             max_batch_size_in_records=50)

    pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finished_executor.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    wiretap = builder.add_wiretap()

    s3_origin >> wiretap.destination
    s3_origin >= pipeline_finished_executor

    s3_origin_pipeline = builder.build(title='Amazon S3 origin multithreaded pipeline').configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False

    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # 100 objects of one record each, read with a batch size of 50, guarantees multiple batches.
        total_data = []
        for i in range(100):
            actual_data = dict(f1=get_random_string())
            total_data.append(actual_data)
            client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/{i}', Body=json.dumps(actual_data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        records = [dict(f1=record.field['f1']) for record in wiretap.output_records]

        assert len(records) == len(total_data)
        assert all(element in records for element in total_data)
        assert all(element in total_data for element in records)
    finally:
        # Clean up S3.
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

def test_data_format_delimited(sdc_builder, sdc_executor, aws, csv_parser):
    DATA = "A,B,C\n" \
           "1,2,3\n" \
           "10,20,30\n"
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='DELIMITED',
                             header_line='WITH_HEADER',
                             csv_parser=csv_parser,
                             prefix_pattern=f'{s3_key}*')

    wiretap = builder.add_wiretap()

    s3_origin >> wiretap.destination

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=DATA)
        sdc_executor.start_pipeline(s3_origin_pipeline)
        sdc_executor.wait_for_pipeline_metric(s3_origin_pipeline, 'output_record_count', 2, timeout_sec=120)
        sdc_executor.stop_pipeline(s3_origin_pipeline)

        records = wiretap.output_records
        assert len(records) == 2
        assert records[0].field['A'] == "1"
        assert records[0].field['B'] == "2"
        assert records[0].field['C'] == "3"
        assert records[1].field['A'] == "10"
        assert records[1].field['B'] == "20"
        assert records[1].field['C'] == "30"
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)
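
# `csv_parser` is parametrized: SDC's delimited data format can be read by two
# parser implementations, so the test presumably runs once per parser. A short
# sketch assuming the standard enum values:
CSV_PARSERS = ['LEGACY_PARSER', 'UNIVOCITY']
# Applied as: @pytest.mark.parametrize('csv_parser', CSV_PARSERS)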

def test_s3_whole_file_transfer(sdc_builder, sdc_executor, aws):
    """Test the simple scenario of moving files from source to target using the WHOLE_FILE format."""
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt')

    wiretap = builder.add_wiretap()

    origin >> target
    target >= wiretap.destination

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/input.txt', Body=data.encode('ascii'))
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'output_record_count', 1)

        # Validate event generation
        assert len(wiretap.output_records) == 1
        assert wiretap.output_records[0].get_field_data('/targetFileInfo/bucket') == aws.s3_bucket_name
        assert wiretap.output_records[0].get_field_data('/targetFileInfo/objectKey') == f'{s3_dest_key}sdc-output.txt'

        # We should have exactly one file on the destination side
        list_s3_objs = client.list_objects_v2(Bucket=aws.s3_bucket_name, Prefix=s3_dest_key)
        assert len(list_s3_objs['Contents']) == 1

        # With our secret message
        s3_obj_key = client.get_object(Bucket=aws.s3_bucket_name, Key=list_s3_objs['Contents'][0]['Key'])
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert s3_contents == data
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_s3_whole_file_transfer_existing_file(sdc_builder, sdc_executor, aws):
    """Test that a whole-file transfer whose target object already exists sends the record to error."""
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt',
                          object_name_prefix='',
                          file_exists='TO_ERROR')

    finisher = builder.add_stage('Pipeline Finisher Executor')
    finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    origin >> target
    origin >= finisher

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        # We create both the input file and the output file
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}input.txt', Body=data.encode('ascii'))
        logger.info(f"Pre-creating output file {s3_dest_key}output.txt")
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_dest_key}output.txt', Body=data.encode('ascii'))

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # The record reaches the destination but must go to error since the output object already exists.
        history = sdc_executor.get_pipeline_history(pipeline)
        assert history.latest.metrics.counter('stage.AmazonS3_02.inputRecords.counter').count == 1
        assert history.latest.metrics.counter('stage.AmazonS3_02.outputRecords.counter').count == 0
        assert history.latest.metrics.counter('stage.AmazonS3_02.errorRecords.counter').count == 1
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_dataflow_events(sdc_builder, sdc_executor, aws):
    """Write from Dev Raw Data Source to S3, using a wiretap to capture the destination's
    events and verify their content.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> s3_destination >= wiretap.destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # Validate event generation
        assert wiretap.output_records[0].get_field_data('/bucket') == aws.s3_bucket_name
        assert wiretap.output_records[0].get_field_data('/recordCount') == 1

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_multiple_batches(sdc_builder, sdc_executor, aws):
    """Test for the S3 destination stage. We verify that the destination works correctly
    with more than one batch.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=False)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_pipeline_output_records_count(20)
        sdc_executor.stop_pipeline(s3_dest_pipeline)

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        history = sdc_executor.get_pipeline_history(s3_dest_pipeline)
        history_records = history.latest.metrics.counter('stage.AmazonS3_01.outputRecords.counter').count
        assert len(list_s3_objs['Contents']) == history_records

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_object_names_path(sdc_builder, sdc_executor, aws, test_name, path_name):
    """Test for the S3 destination stage. We run a Dev Raw Data Source generator to the S3
    destination sandbox bucket and then read the S3 bucket using the STF client to assert that
    the data seen by the client matches what was ingested by the pipeline.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = path_name

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_push_pull(sdc_builder, sdc_executor, aws):
    """Verify that the connector works with both Dev Raw Data Source (pull) and Dev Data
    Generator (push) origins. Since Dev Raw Data Source is already covered by the other tests,
    we use Dev Data Generator here to complete the coverage.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=1,
                                      fields_to_generate=[
                                          {'field': 'stringField', 'type': 'STRING', 'precision': 10, 'scale': 2}])

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_data_generator >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_pipeline_output_records_count(25)
        sdc_executor.stop_pipeline(s3_dest_pipeline)

        history = sdc_executor.get_pipeline_history(s3_dest_pipeline)
        history_records = history.latest.metrics.counter('stage.AmazonS3_01.outputRecords.counter').count

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == history_records
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws):
    s3_bucket = aws.emr_s3_bucket_name
    s3_input_key = '{0}/{1}/input'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_output_key = '{0}/{1}/output'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    s3_staging_bucket = aws.emr_s3_staging_bucket_name
    s3_staging_key = '{0}/{1}/sdc_staging'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_logging_key = '{0}/{1}/sdc_logging'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    raw_str = 'Hello World!'
    s3_obj_count = 2  # keep it low so that the MR jobs don't spin up a lot and take a long time

    logger.info('%s S3 bucket used with input key: %s output key: %s and object count: %s',
                s3_bucket, s3_input_key, s3_output_key, s3_obj_count)
    logger.info('%s S3 staging bucket used with EMR staging key: %s and EMR logging key: %s',
                s3_staging_bucket, s3_staging_key, s3_logging_key)

    # build pipeline
    builder = sdc_builder.get_pipeline_builder()

    emr_origin = builder.add_stage('Hadoop FS', type='origin')
    emr_origin.set_attributes(
        hadoop_fs_uri=f's3a://{s3_bucket}',
        input_paths=[f'/{s3_input_key}'],
        data_format='TEXT'
    )

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='TEXT', partition_prefix=s3_output_key)

    emr_origin >> s3_destination

    pipeline = builder.build(title='Amazon EMR to S3 pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        logger.info('Creating input S3 data ...')
        # The input objects must live in the bucket the Hadoop FS origin reads from
        # (the cleanup and assertions below also use that bucket).
        for i in range(s3_obj_count):
            client.put_object(Bucket=s3_bucket, Key='{0}/{1}'.format(s3_input_key, i), Body=raw_str)

        # Don't wait for the pipeline start, as the transition from START to RUNNING takes more time
        sdc_executor.start_pipeline(pipeline, wait=False).wait_for_finished(timeout_sec=1800)

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_output_key)
        assert len(list_s3_objs['Contents']) == s3_obj_count

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_contents = [client.get_object(Bucket=s3_bucket, Key=s3_content['Key'])['Body'].read().decode().strip()
                       for s3_content in list_s3_objs['Contents']]
        assert s3_contents == [raw_str] * s3_obj_count
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', s3_bucket, s3_input_key)
        aws.delete_s3_data(s3_bucket, s3_input_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', s3_bucket, s3_output_key)
        aws.delete_s3_data(s3_bucket, s3_output_key)

        logger.info('Deleting staging S3 data from bucket %s with location %s ...', s3_staging_bucket, s3_staging_key)
        aws.delete_s3_data(s3_staging_bucket, s3_staging_key)

        logger.info('Deleting logging S3 data from bucket %s with location %s ...', s3_staging_bucket, s3_logging_key)
        aws.delete_s3_data(s3_staging_bucket, s3_logging_key)
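
# `_test_emr_origin_to_s3` is a shared test body; the module presumably exposes
# it through a thin wrapper similar to this sketch (the wrapper name and any
# EMR-specific markers are assumptions):
def test_emr_origin_to_s3(sdc_builder, sdc_executor, aws):
    _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws)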

def test_object_names_bucket(sdc_builder, sdc_executor, aws, test_name, bucket_generator):
    """Verify that the origin can read from buckets with all the documented bucket names."""
    client = aws.s3
    retry = 0
    s3_bucket = None

    # Since S3 buckets are globally unique, our usual randomization doesn't work well - there is
    # always a chance that the generated bucket already exists. That is why we have retry logic -
    # we generate several bucket names and see which one we manage to "claim".
    while s3_bucket is None and retry < 10:
        retry = retry + 1
        attempted_bucket = bucket_generator()
        logger.info(f"Retry {retry} with bucket name '{attempted_bucket}'")

        try:
            client.create_bucket(Bucket=attempted_bucket,
                                 CreateBucketConfiguration={'LocationConstraint': aws.region})
            s3_bucket = attempted_bucket
        except Exception as e:
            logger.error(f"Can't use bucket name '{attempted_bucket}': {e}")

    # We might not find a suitable bucket within the max number of retries, in which case we simply die
    assert s3_bucket is not None

    try:
        client.put_bucket_tagging(
            Bucket=s3_bucket,
            Tagging={
                'TagSet': [
                    {'Key': 'stf-env', 'Value': 'nightly-tests'},
                    {'Key': 'managed-by', 'Value': 'ep'},
                    {'Key': 'dept', 'Value': 'eng'},
                ]
            })
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'
        data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

        # Build pipeline.
        builder = sdc_builder.get_pipeline_builder()
        builder.add_error_stage('Discard')

        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='JSON',
                                 json_content='ARRAY_OBJECTS',
                                 prefix_pattern=f'{s3_key}*')

        wiretap = builder.add_wiretap()

        pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
        pipeline_finished_executor.set_attributes(
            stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

        s3_origin >> wiretap.destination
        s3_origin >= pipeline_finished_executor

        s3_origin_pipeline = builder.build().configure_for_environment(aws)
        s3_origin_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(s3_origin_pipeline)

        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        output_records_values = [record.field for record in wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data
    finally:
        # Clean up S3.
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        except Exception as e:
            logger.error(f"Can't remove files from bucket {s3_bucket}: {e}")
        finally:
            try:
                client.delete_bucket(Bucket=s3_bucket)
            except Exception as e:
                logger.error(f"Can't delete bucket: {e}")

        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
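
# `bucket_generator` is a fixture supplied elsewhere (presumably parametrized
# per documented bucket-name style). A minimal sketch of the shape it could
# have -- the naming scheme below is an illustrative assumption:
@pytest.fixture
def bucket_generator():
    def generator():
        # S3 bucket names must be 3-63 characters: lowercase letters, digits,
        # dots and hyphens only.
        return f'stf-{get_random_string(string.ascii_lowercase, 20)}'
    return generator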

def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for the S3 executor stage. We run a Dev Raw Data Source generator against the S3
    executor and then use the STF client to assert that the object in the sandbox bucket was
    tagged by the pipeline. We use a Record Deduplicator processor between the origin and the
    executor in order to limit the number of objects to one. For recent SDC versions we also
    check that the corresponding 'file-changed' event is generated.

    S3 executor pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                               record_deduplicator >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        tags = client.get_object_tagging(Bucket=s3_bucket, Key=s3_key)['TagSet']
        assert len(tags) == 1

        # Check if the 'file-changed' event was generated (only for recent sdc versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-changed'
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        aws.delete_s3_data(s3_bucket, s3_key)
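
# `_ensure_pipeline_is_stopped` is a small cleanup helper used by the executor
# tests; a plausible sketch consistent with the inline cleanup used by the
# origin tests above (an assumption -- the real helper may differ):
def _ensure_pipeline_is_stopped(sdc_executor, pipeline):
    # Only stop the pipeline if it is still running, so that cleanup does not
    # fail for pipelines that already finished on their own.
    if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
        sdc_executor.stop_pipeline(pipeline)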

def test_s3_whole_file_transfer_with_tags(sdc_builder, sdc_executor, aws):
    """Test for tags on the S3 destination using the WHOLE_FILE format.
    We create a file and verify that the tags are correctly propagated to the object created in S3.

    S3 pipeline:
        origin >> target
    """
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt',
                          add_tags=True,
                          tags=[{"key": "this-is-a-test-tag-key", "value": "this-is-a-test-tag-value"}])

    origin >> target

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/input.txt', Body=data.encode('ascii'))
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'output_record_count', 1, timeout_sec=120)

        # We should have exactly one file on the destination side
        list_s3_objs = client.list_objects_v2(Bucket=aws.s3_bucket_name, Prefix=s3_dest_key)
        assert len(list_s3_objs['Contents']) == 1

        # With our secret message
        s3_obj_key = client.get_object(Bucket=aws.s3_bucket_name, Key=list_s3_objs['Contents'][0]['Key'])
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert s3_contents == data

        object_tagging = client.get_object_tagging(Bucket=aws.s3_bucket_name,
                                                   Key=list_s3_objs['Contents'][0]['Key'])
        assert object_tagging['TagSet'] == [{"Key": "this-is-a-test-tag-key",
                                             "Value": "this-is-a-test-tag-value"}]
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_s3_multithreading_multiple_batches(sdc_builder, sdc_executor, aws):
    """Test for the S3 destination stage. A data-loss scenario used to happen when multiple
    threads tried writing within the same millisecond; the runner id is now added to the file
    name when the option is enabled (true by default):

    S3 Destination pipeline:
        dev_data_generator >> s3_destination
        dev_data_generator >> wiretap.destination
    """
    number_of_records = 100
    batch_size = 1
    delay_between_batches = 1
    number_of_threads = 10
    try:
        s3_bucket = aws.s3_bucket_name
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_data_generator = builder.add_stage('Dev Data Generator')
        dev_data_generator.fields_to_generate = [{'field': 'id', 'type': 'POKEMON'}]
        dev_data_generator.set_attributes(delay_between_batches=delay_between_batches,
                                          batch_size=batch_size,
                                          records_to_be_generated=number_of_records,
                                          number_of_threads=number_of_threads)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

        wiretap = builder.add_wiretap()

        dev_data_generator >> [s3_destination, wiretap.destination]

        s3_dest_pipeline = builder.build(title='Amazon S3 destination pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        client = aws.s3
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == number_of_records

        # We need every field inside each record
        records = [record.field for record in wiretap.output_records]

        file_names = []
        for i in range(number_of_records):
            # We check that each file in S3 contains the same data as the wiretap
            s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][i]['Key'])
            s3_contents = s3_obj_key['Body'].read().decode().strip()
            file_names.append(list_s3_objs['Contents'][i]['Key'])
            assert json.loads(s3_contents) in records

        for i in range(number_of_threads):
            # We also check that we have at least one file name (collected above) from each
            # thread (from 000 to 009)
            thread_number = '-' + str(i).zfill(3)
            assert any(thread_number in file_name for file_name in file_names)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def _run_test_s3_destination(sdc_builder, sdc_executor, aws, sse_kms, anonymous):
    try:
        if anonymous:
            s3_bucket = create_bucket(aws)
            logger.info(f'Bucket {s3_bucket} created')
        else:
            s3_bucket = aws.s3_bucket_name

        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Bucket name is inside the record itself
        raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                      raw_data=raw_str,
                                                                                      stop_after_first_batch=True)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        bucket_val = (s3_bucket if sdc_builder.version < '2.6.0.1-0002' else '${record:value("/bucket")}')
        s3_destination.set_attributes(bucket=bucket_val, data_format='JSON', partition_prefix=s3_key)
        if sse_kms:
            # Use SSE with KMS
            s3_destination.set_attributes(use_server_side_encryption=True,
                                          server_side_encryption_option='KMS',
                                          aws_kms_key_arn=aws.kms_key_arn)
        if anonymous:
            configure_stage_for_anonymous(s3_destination)

        wiretap = builder.add_wiretap()

        dev_raw_data_source >> s3_destination
        s3_destination >= wiretap.destination

        s3_dest_pipeline = builder.build(title='Amazon S3 destination pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        client = aws.s3
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # Validate event generation
        assert len(wiretap.output_records) == 1
        assert [record.field['bucket'] for record in wiretap.output_records][0] == s3_bucket
        assert [record.field['recordCount'] for record in wiretap.output_records][0] == 1

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        client_to_read = create_anonymous_client() if anonymous else client
        s3_obj_key = client_to_read.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)

        if sse_kms:
            # assert that the data was stored with SSE using KMS
            assert s3_obj_key['ServerSideEncryption'] == 'aws:kms'
            assert s3_obj_key['SSEKMSKeyId'] == aws.kms_key_arn
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
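
# `create_bucket` and `create_anonymous_client` are shared helpers for the
# anonymous-access variants; minimal sketches assuming they wrap boto3 directly
# (illustrative, not the module's exact code):
import boto3
from botocore import UNSIGNED
from botocore.client import Config

def create_bucket(aws):
    # Create a uniquely named sandbox bucket in the environment's region and
    # return its name; callers are responsible for deleting it afterwards.
    bucket_name = f'stf-{get_random_string(string.ascii_lowercase, 20)}'
    aws.s3.create_bucket(Bucket=bucket_name,
                         CreateBucketConfiguration={'LocationConstraint': aws.region})
    return bucket_name

def create_anonymous_client():
    # An unsigned client verifies that the object is readable without credentials.
    return boto3.client('s3', config=Config(signature_version=UNSIGNED))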

def test_dataflow_events_new_file(sdc_builder, sdc_executor, aws):
    """Test that we receive a new-file event whenever we start reading a file."""
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'
    data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='JSON',
                             json_content='ARRAY_OBJECTS',
                             prefix_pattern=f'{s3_key}*')

    events_wiretap = builder.add_wiretap()
    records_wiretap = builder.add_wiretap()

    s3_origin >> records_wiretap.destination
    s3_origin >= events_wiretap.destination

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline)
        sdc_executor.wait_for_pipeline_metric(s3_origin_pipeline, 'output_record_count', 1, timeout_sec=120)
        sdc_executor.stop_pipeline(s3_origin_pipeline)

        output_records_values = [record.field for record in records_wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data

        # We should have exactly one event record; check that it is a new-file event
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'new-file', 'Received %s as event type (expected new-file)' % event_type
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)

def _run_test_s3_executor_create_object(sdc_builder, sdc_executor, aws, anonymous):
    # Set up test data.
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "company": "StreamSets Inc."}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CREATE_NEW_OBJECT',
                               object=s3_key,
                               content='${record:value("/company")}')
    if anonymous:
        configure_stage_for_anonymous(s3_executor)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    public_access_block = None
    bucket_policy = None
    try:
        if anonymous:
            public_access_block, bucket_policy = allow_public_access(client, s3_bucket, True, True)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        # Assert that the number of objects put in S3 matches the record count.
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # Read data from S3 to assert it is what got ingested into the pipeline.
        client_to_read = create_anonymous_client() if anonymous else client
        s3_contents = [client_to_read.get_object(Bucket=s3_bucket,
                                                 Key=s3_content['Key'])['Body'].read().decode().strip()
                       for s3_content in list_s3_objs['Contents']]
        assert s3_contents[0] == 'StreamSets Inc.'

        # Check if the 'file-created' event was generated (only for recent sdc versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-created'
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        restore_public_access(client, s3_bucket, public_access_block, bucket_policy)
        aws.delete_s3_data(s3_bucket, s3_key)
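
# `allow_public_access` / `restore_public_access` temporarily lift the bucket's
# public-access restrictions so the anonymous client can operate, then put the
# previous configuration back. A hedged sketch of the idea using real boto3
# calls; the exact save/restore logic of the module's helpers may differ (the
# allow_read/allow_write flags would drive the generated bucket policy and are
# omitted here for brevity):
from botocore.exceptions import ClientError

def allow_public_access(client, bucket, allow_read, allow_write):
    # Remember the previous configuration (None when absent) so the caller can
    # restore it in its finally block.
    try:
        previous_block = client.get_public_access_block(Bucket=bucket)['PublicAccessBlockConfiguration']
    except ClientError:
        previous_block = None
    try:
        previous_policy = client.get_bucket_policy(Bucket=bucket)['Policy']
    except ClientError:
        previous_policy = None
    # Lift the public-access block for the duration of the test.
    client.delete_public_access_block(Bucket=bucket)
    return previous_block, previous_policy

def restore_public_access(client, bucket, public_access_block, bucket_policy):
    # Put back whatever configuration existed before the test started.
    if public_access_block is not None:
        client.put_public_access_block(Bucket=bucket,
                                       PublicAccessBlockConfiguration=public_access_block)
    if bucket_policy is not None:
        client.put_bucket_policy(Bucket=bucket, Policy=bucket_policy)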

def test_object_names_bucket(sdc_builder, sdc_executor, aws, test_name, bucket_generator):
    """Test for the S3 destination stage. We run a Dev Raw Data Source generator to an S3
    destination bucket and then read the bucket back using the STF client to assert that the
    data seen by the client matches what was ingested by the pipeline.
    """
    client = aws.s3
    retry = 0
    s3_bucket = None
    try:
        # Since S3 buckets are globally unique, our usual randomization doesn't work well - there is
        # always a chance that the generated bucket already exists. That is why we have retry logic -
        # we generate several bucket names and see which one we manage to "claim".
        while s3_bucket is None and retry < 10:
            retry = retry + 1
            attempted_bucket = bucket_generator()
            logger.info(f"Retry {retry} with bucket name '{attempted_bucket}'")

            try:
                client.create_bucket(Bucket=attempted_bucket,
                                     CreateBucketConfiguration={'LocationConstraint': aws.region})
                s3_bucket = attempted_bucket
            except Exception as e:
                logger.error(f"Can't use bucket name '{attempted_bucket}': {e}")

        # We might not find a suitable bucket within the max number of retries, in which case we simply die
        assert s3_bucket is not None

        client.put_bucket_tagging(
            Bucket=s3_bucket,
            Tagging={
                'TagSet': [
                    {'Key': 'stf-env', 'Value': 'nightly-tests'},
                    {'Key': 'managed-by', 'Value': 'ep'},
                    {'Key': 'dept', 'Value': 'eng'},
                ]
            }
        )
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Bucket name is inside the record itself
        raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                      raw_data=raw_str,
                                                                                      stop_after_first_batch=True)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

        dev_raw_data_source >> s3_destination

        s3_dest_pipeline = builder.build().configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        except Exception as e:
            logger.error(f"Can't remove files from bucket {s3_bucket}: {e}")
        finally:
            try:
                client.delete_bucket(Bucket=s3_bucket)
            except Exception as e:
                logger.error(f"Can't delete bucket: {e}")

def _run_test_s3_error_destination(sdc_builder, sdc_executor, aws, anonymous):
    try:
        if anonymous:
            s3_bucket = create_bucket(aws)
            logger.info(f'Bucket {s3_bucket} created')
        else:
            s3_bucket = aws.s3_bucket_name

        s3_key = f'{S3_SANDBOX_PREFIX}/errDest-{get_random_string()}/'
        random_string = get_random_string(string.ascii_letters, 10)
        random_raw_json_str = f'{{"text":"{random_string}"}}'

        # Build pipeline.
        builder = sdc_builder.get_pipeline_builder()
        s3_err = builder.add_error_stage('Write to Amazon S3')
        s3_err.set_attributes(bucket=s3_bucket, common_prefix=s3_key)
        if anonymous:
            configure_stage_for_anonymous(s3_err)

        origin = builder.add_stage('Dev Raw Data Source', type='origin')
        origin.set_attributes(data_format='JSON',
                              raw_data=random_raw_json_str,
                              stop_after_first_batch=True)

        target = builder.add_stage('To Error', type='destination')

        origin >> target
        pipeline = builder.build().configure_for_environment(aws)
        pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(pipeline)

        # Now we build and run another pipeline with an S3 origin to read the data back
        builder = sdc_builder.get_pipeline_builder()
        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='SDC_JSON',
                                 prefix_pattern=f'{s3_key}*',
                                 max_batch_size_in_records=100)
        if anonymous:
            configure_stage_for_anonymous(s3_origin)

        wiretap = builder.add_wiretap()
        finisher = builder.add_stage('Pipeline Finisher Executor')
        finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

        s3_origin >> wiretap.destination
        s3_origin >= finisher

        read_pipeline = builder.build().configure_for_environment(aws)
        read_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(read_pipeline)

        client = aws.s3
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # We should have exactly one file in the bucket
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert 'Contents' in list_s3_objs  # If no object was found, there is no 'Contents' key
        assert len(list_s3_objs['Contents']) == 1

        sdc_executor.start_pipeline(read_pipeline).wait_for_finished()
        assert len(wiretap.output_records) == 1
        assert [record.field['text'] for record in wiretap.output_records][0] == random_string
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
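
# `configure_stage_for_anonymous` switches a stage to unauthenticated access so
# the anonymous test variants can run. A one-line sketch; the exact attribute
# names used for anonymous access are an assumption about the stage library:
def configure_stage_for_anonymous(stage):
    # Clearing the credentials makes the stage fall back to anonymous access.
    stage.set_attributes(access_key_id='', secret_access_key='')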