# NOTE: This module header is a sketch; the exact import paths, the sandbox
# prefix value and the minimum-version constant are assumptions based on
# typical StreamSets Test Framework (STF) stage tests.
import json
import logging
import string

import pytest

from streamsets.sdk.models import Configuration
from streamsets.testframework.utils import get_random_string, Version

logger = logging.getLogger(__name__)

S3_SANDBOX_PREFIX = 'sandbox'  # assumed sandbox prefix
MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS = Version('3.4.0')  # assumed version


def test_object_names_path(sdc_builder, sdc_executor, aws, test_name, path_name):
    """Verify that the origin can read objects under all the documented path names."""
    s3_key = path_name
    s3_bucket = aws.s3_bucket_name
    data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='JSON',
                             json_content='ARRAY_OBJECTS',
                             prefix_pattern=f'{s3_key}*')

    wiretap = builder.add_wiretap()

    pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finished_executor.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    s3_origin >> wiretap.destination
    s3_origin >= pipeline_finished_executor

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        output_records_values = [record.field for record in wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)
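
# `test_name` and `path_name` above come from pytest parametrization declared
# on the test. A minimal sketch of the idea with illustrative values -- the
# real module's list of documented path names may differ:
OBJECT_PATH_PARAMETERS = [
    ('underscore', f'{get_random_string()}_{get_random_string()}'),
    ('period', f'{get_random_string()}.{get_random_string()}'),
    ('hyphen', f'{get_random_string()}-{get_random_string()}'),
    ('forward_slash', f'{get_random_string()}/{get_random_string()}'),
]
# Applied as: @pytest.mark.parametrize('test_name, path_name', OBJECT_PATH_PARAMETERS)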

def test_s3_region_other(sdc_builder, sdc_executor, aws):
    """Test that using a specific region and explicitly specifying the endpoint works as expected.
    We write a record and verify that the object created in S3 contains the expected data.

    S3 Destination pipeline:
        dev_raw_data_source >> s3_destination
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    bucket_val = (s3_bucket if sdc_builder.version < '2.6.0.1-0002' else '${record:value("/bucket")}')
    s3_destination.set_attributes(bucket=bucket_val, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    # Set the region and endpoint after configure_for_environment so the
    # environment configuration does not override them.
    s3_destination.set_attributes(use_specific_region=True,
                                  region='OTHER',
                                  endpoint=f's3.{aws.region}.amazonaws.com')
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_multiple_batch(sdc_builder, sdc_executor, aws):
    """Test that a multithreaded origin pipeline reads many objects across multiple batches
    without duplicating or missing any records.
    """
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=aws.s3_bucket_name,
                             data_format='JSON',
                             prefix_pattern=f'{s3_key}/*',
                             max_batch_size_in_records=50)

    pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
    pipeline_finished_executor.set_attributes(
        stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    wiretap = builder.add_wiretap()

    s3_origin >> wiretap.destination
    s3_origin >= pipeline_finished_executor

    s3_origin_pipeline = builder.build(title='Amazon S3 origin multithreaded pipeline').configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False

    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # 100 objects of one record each, read with a batch size of 50, guarantees multiple batches.
        total_data = []
        for i in range(100):
            actual_data = dict(f1=get_random_string())
            total_data.append(actual_data)
            client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/{i}', Body=json.dumps(actual_data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        records = [dict(f1=record.field['f1']) for record in wiretap.output_records]

        assert len(records) == len(total_data)
        assert all(element in records for element in total_data)
        assert all(element in total_data for element in records)
    finally:
        # Clean up S3.
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

def test_data_format_delimited(sdc_builder, sdc_executor, aws, csv_parser):
    DATA = "A,B,C\n" \
           "1,2,3\n" \
           "10,20,30\n"
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='DELIMITED',
                             header_line='WITH_HEADER',
                             csv_parser=csv_parser,
                             prefix_pattern=f'{s3_key}*')

    wiretap = builder.add_wiretap()

    s3_origin >> wiretap.destination

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=DATA)
        sdc_executor.start_pipeline(s3_origin_pipeline)
        sdc_executor.wait_for_pipeline_metric(s3_origin_pipeline, 'output_record_count', 2, timeout_sec=120)
        sdc_executor.stop_pipeline(s3_origin_pipeline)

        records = wiretap.output_records
        assert len(records) == 2
        assert records[0].field['A'] == "1"
        assert records[0].field['B'] == "2"
        assert records[0].field['C'] == "3"
        assert records[1].field['A'] == "10"
        assert records[1].field['B'] == "20"
        assert records[1].field['C'] == "30"
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)
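
# `csv_parser` is parametrized: SDC's delimited data format can be read by two
# parser implementations, so the test presumably runs once per parser. A short
# sketch assuming the standard enum values:
CSV_PARSERS = ['LEGACY_PARSER', 'UNIVOCITY']
# Applied as: @pytest.mark.parametrize('csv_parser', CSV_PARSERS)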

def test_s3_whole_file_transfer(sdc_builder, sdc_executor, aws):
    """Test the simple scenario of moving files from source to target using the WHOLE_FILE format."""
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt')

    wiretap = builder.add_wiretap()

    origin >> target
    target >= wiretap.destination

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/input.txt', Body=data.encode('ascii'))
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'output_record_count', 1)

        # Validate event generation
        assert len(wiretap.output_records) == 1
        assert wiretap.output_records[0].get_field_data('/targetFileInfo/bucket') == aws.s3_bucket_name
        assert wiretap.output_records[0].get_field_data('/targetFileInfo/objectKey') == f'{s3_dest_key}sdc-output.txt'

        # We should have exactly one file on the destination side
        list_s3_objs = client.list_objects_v2(Bucket=aws.s3_bucket_name, Prefix=s3_dest_key)
        assert len(list_s3_objs['Contents']) == 1

        # With our secret message
        s3_obj_key = client.get_object(Bucket=aws.s3_bucket_name, Key=list_s3_objs['Contents'][0]['Key'])
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert s3_contents == data
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_s3_whole_file_transfer_existing_file(sdc_builder, sdc_executor, aws):
    """Test that a whole-file transfer whose target object already exists sends the record to error."""
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt',
                          object_name_prefix='',
                          file_exists='TO_ERROR')

    finisher = builder.add_stage('Pipeline Finisher Executor')
    finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

    origin >> target
    origin >= finisher

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        # We create both the input file and the output file
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}input.txt', Body=data.encode('ascii'))
        logger.info(f"Pre-creating output file {s3_dest_key}output.txt")
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_dest_key}output.txt', Body=data.encode('ascii'))

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # The record reaches the destination but must go to error since the output object already exists.
        history = sdc_executor.get_pipeline_history(pipeline)
        assert history.latest.metrics.counter('stage.AmazonS3_02.inputRecords.counter').count == 1
        assert history.latest.metrics.counter('stage.AmazonS3_02.outputRecords.counter').count == 0
        assert history.latest.metrics.counter('stage.AmazonS3_02.errorRecords.counter').count == 1
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_dataflow_events(sdc_builder, sdc_executor, aws):
    """Write from Dev Raw Data Source to S3, using a wiretap to capture the destination's
    events and verify their content.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> s3_destination >= wiretap.destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # Validate event generation
        assert wiretap.output_records[0].get_field_data('/bucket') == aws.s3_bucket_name
        assert wiretap.output_records[0].get_field_data('/recordCount') == 1

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_multiple_batches(sdc_builder, sdc_executor, aws):
    """Test for the S3 destination stage. We verify that the destination works correctly
    with more than one batch.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=False)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_pipeline_output_records_count(20)
        sdc_executor.stop_pipeline(s3_dest_pipeline)

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        history = sdc_executor.get_pipeline_history(s3_dest_pipeline)
        history_records = history.latest.metrics.counter('stage.AmazonS3_01.outputRecords.counter').count
        assert len(list_s3_objs['Contents']) == history_records

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_object_names_path(sdc_builder, sdc_executor, aws, test_name, path_name):
    """Test for the S3 destination stage. We run a Dev Raw Data Source generator to the S3
    destination sandbox bucket and then read the S3 bucket using the STF client to assert that
    the data seen by the client matches what was ingested by the pipeline.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = path_name

    # Bucket name is inside the record itself
    raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_raw_data_source >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def test_push_pull(sdc_builder, sdc_executor, aws):
    """Verify that the connector works with both Dev Raw Data Source (pull) and Dev Data
    Generator (push) origins. Since Dev Raw Data Source is already covered by the other tests,
    we use Dev Data Generator here to complete the coverage.
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=1,
                                      fields_to_generate=[
                                          {'field': 'stringField', 'type': 'STRING', 'precision': 10, 'scale': 2}])

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

    dev_data_generator >> s3_destination

    s3_dest_pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_dest_pipeline)

    client = aws.s3
    try:
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_pipeline_output_records_count(25)
        sdc_executor.stop_pipeline(s3_dest_pipeline)

        history = sdc_executor.get_pipeline_history(s3_dest_pipeline)
        history_records = history.latest.metrics.counter('stage.AmazonS3_01.outputRecords.counter').count

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == history_records
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws):
    s3_bucket = aws.emr_s3_bucket_name
    s3_input_key = '{0}/{1}/input'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_output_key = '{0}/{1}/output'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    s3_staging_bucket = aws.emr_s3_staging_bucket_name
    s3_staging_key = '{0}/{1}/sdc_staging'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_logging_key = '{0}/{1}/sdc_logging'.format(S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    raw_str = 'Hello World!'
    s3_obj_count = 2  # keep it low so that the MR jobs don't spin up a lot and take a long time

    logger.info('%s S3 bucket used with input key: %s output key: %s and object count: %s',
                s3_bucket, s3_input_key, s3_output_key, s3_obj_count)
    logger.info('%s S3 staging bucket used with EMR staging key: %s and EMR logging key: %s',
                s3_staging_bucket, s3_staging_key, s3_logging_key)

    # build pipeline
    builder = sdc_builder.get_pipeline_builder()

    emr_origin = builder.add_stage('Hadoop FS', type='origin')
    emr_origin.set_attributes(
        hadoop_fs_uri=f's3a://{s3_bucket}',
        input_paths=[f'/{s3_input_key}'],
        data_format='TEXT'
    )

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket, data_format='TEXT', partition_prefix=s3_output_key)

    emr_origin >> s3_destination

    pipeline = builder.build(title='Amazon EMR to S3 pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        logger.info('Creating input S3 data ...')
        # The input objects must live in the bucket the Hadoop FS origin reads from
        # (the cleanup and assertions below also use that bucket).
        for i in range(s3_obj_count):
            client.put_object(Bucket=s3_bucket, Key='{0}/{1}'.format(s3_input_key, i), Body=raw_str)

        # Don't wait for the pipeline start, as the transition from START to RUNNING takes more time
        sdc_executor.start_pipeline(pipeline, wait=False).wait_for_finished(timeout_sec=1800)

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_output_key)
        assert len(list_s3_objs['Contents']) == s3_obj_count

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_contents = [client.get_object(Bucket=s3_bucket, Key=s3_content['Key'])['Body'].read().decode().strip()
                       for s3_content in list_s3_objs['Contents']]
        assert s3_contents == [raw_str] * s3_obj_count
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', s3_bucket, s3_input_key)
        aws.delete_s3_data(s3_bucket, s3_input_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', s3_bucket, s3_output_key)
        aws.delete_s3_data(s3_bucket, s3_output_key)

        logger.info('Deleting staging S3 data from bucket %s with location %s ...', s3_staging_bucket, s3_staging_key)
        aws.delete_s3_data(s3_staging_bucket, s3_staging_key)

        logger.info('Deleting logging S3 data from bucket %s with location %s ...', s3_staging_bucket, s3_logging_key)
        aws.delete_s3_data(s3_staging_bucket, s3_logging_key)
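
# `_test_emr_origin_to_s3` is a shared test body; the module presumably exposes
# it through a thin wrapper similar to this sketch (the wrapper name and any
# EMR-specific markers are assumptions):
def test_emr_origin_to_s3(sdc_builder, sdc_executor, aws):
    _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws)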

def test_object_names_bucket(sdc_builder, sdc_executor, aws, test_name, bucket_generator):
    """Verify that the origin can read from buckets with all the documented bucket names."""
    client = aws.s3
    retry = 0
    s3_bucket = None

    # Since S3 buckets are globally unique, our usual randomization doesn't work well - there is
    # always a chance that the generated bucket already exists. That is why we have retry logic -
    # we generate several bucket names and see which one we manage to "claim".
    while s3_bucket is None and retry < 10:
        retry = retry + 1
        attempted_bucket = bucket_generator()
        logger.info(f"Retry {retry} with bucket name '{attempted_bucket}'")

        try:
            client.create_bucket(Bucket=attempted_bucket,
                                 CreateBucketConfiguration={'LocationConstraint': aws.region})
            s3_bucket = attempted_bucket
        except Exception as e:
            logger.error(f"Can't use bucket name '{attempted_bucket}': {e}")

    # We might not find a suitable bucket within the max number of retries, in which case we simply die
    assert s3_bucket is not None

    try:
        client.put_bucket_tagging(
            Bucket=s3_bucket,
            Tagging={
                'TagSet': [
                    {'Key': 'stf-env', 'Value': 'nightly-tests'},
                    {'Key': 'managed-by', 'Value': 'ep'},
                    {'Key': 'dept', 'Value': 'eng'},
                ]
            })
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'
        data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

        # Build pipeline.
        builder = sdc_builder.get_pipeline_builder()
        builder.add_error_stage('Discard')

        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='JSON',
                                 json_content='ARRAY_OBJECTS',
                                 prefix_pattern=f'{s3_key}*')

        wiretap = builder.add_wiretap()

        pipeline_finished_executor = builder.add_stage('Pipeline Finisher Executor')
        pipeline_finished_executor.set_attributes(
            stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

        s3_origin >> wiretap.destination
        s3_origin >= pipeline_finished_executor

        s3_origin_pipeline = builder.build().configure_for_environment(aws)
        s3_origin_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(s3_origin_pipeline)

        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline).wait_for_finished()

        output_records_values = [record.field for record in wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data
    finally:
        # Clean up S3.
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        except Exception as e:
            logger.error(f"Can't remove files from bucket {s3_bucket}: {e}")
        finally:
            try:
                client.delete_bucket(Bucket=s3_bucket)
            except Exception as e:
                logger.error(f"Can't delete bucket: {e}")

        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
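
# `bucket_generator` is a fixture supplied elsewhere (presumably parametrized
# per documented bucket-name style). A minimal sketch of the shape it could
# have -- the naming scheme below is an illustrative assumption:
@pytest.fixture
def bucket_generator():
    def generator():
        # S3 bucket names must be 3-63 characters: lowercase letters, digits,
        # dots and hyphens only.
        return f'stf-{get_random_string(string.ascii_lowercase, 20)}'
    return generator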

def test_s3_executor_tag_object(sdc_builder, sdc_executor, aws):
    """Test for the S3 executor stage. We run a Dev Raw Data Source generator against the S3
    executor and then use the STF client to assert that the object in the sandbox bucket was
    tagged by the pipeline. We use a Record Deduplicator processor between the origin and the
    executor in order to limit the number of objects to one. For recent SDC versions we also
    check that the corresponding 'file-changed' event is generated.

    S3 executor pipeline:
        dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
                               record_deduplicator >> to_error
    """
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "key": "{s3_key}"}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CHANGE_EXISTING_OBJECT',
                               object='${record:value("/key")}',
                               tags=Configuration(property_key='key', company='${record:value("/company")}'))

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    try:
        # Pre-create the object so that it exists.
        client.put_object(Body='Secret Data', Bucket=s3_bucket, Key=s3_key)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        tags = client.get_object_tagging(Bucket=s3_bucket, Key=s3_key)['TagSet']
        assert len(tags) == 1

        # Check if the 'file-changed' event was generated (only for recent sdc versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-changed'
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        aws.delete_s3_data(s3_bucket, s3_key)
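
# `_ensure_pipeline_is_stopped` is a small cleanup helper used by the executor
# tests; a plausible sketch consistent with the inline cleanup used by the
# origin tests above (an assumption -- the real helper may differ):
def _ensure_pipeline_is_stopped(sdc_executor, pipeline):
    # Only stop the pipeline if it is still running, so that cleanup does not
    # fail for pipelines that already finished on their own.
    if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
        sdc_executor.stop_pipeline(pipeline)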

def test_s3_whole_file_transfer_with_tags(sdc_builder, sdc_executor, aws):
    """Test for tags on the S3 destination using the WHOLE_FILE format.
    We create a file and verify that the tags are correctly propagated to the object created in S3.

    S3 pipeline:
        origin >> target
    """
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    s3_dest_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/'
    data = 'Completely random string that is transferred as whole file format.'

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    origin = builder.add_stage('Amazon S3', type='origin')
    origin.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          prefix_pattern=f'{s3_key}/*',
                          max_batch_size_in_records=100)

    target = builder.add_stage('Amazon S3', type='destination')
    target.set_attributes(bucket=aws.s3_bucket_name,
                          data_format='WHOLE_FILE',
                          partition_prefix=s3_dest_key,
                          file_name_expression='output.txt',
                          add_tags=True,
                          tags=[{"key": "this-is-a-test-tag-key", "value": "this-is-a-test-tag-value"}])

    origin >> target

    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        client.put_object(Bucket=aws.s3_bucket_name, Key=f'{s3_key}/input.txt', Body=data.encode('ascii'))
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'output_record_count', 1, timeout_sec=120)

        # We should have exactly one file on the destination side
        list_s3_objs = client.list_objects_v2(Bucket=aws.s3_bucket_name, Prefix=s3_dest_key)
        assert len(list_s3_objs['Contents']) == 1

        # With our secret message
        s3_obj_key = client.get_object(Bucket=aws.s3_bucket_name, Key=list_s3_objs['Contents'][0]['Key'])
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert s3_contents == data

        object_tagging = client.get_object_tagging(Bucket=aws.s3_bucket_name,
                                                   Key=list_s3_objs['Contents'][0]['Key'])
        assert object_tagging['TagSet'] == [{"Key": "this-is-a-test-tag-key",
                                             "Value": "this-is-a-test-tag-value"}]
    finally:
        logger.info('Deleting input S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_key)

        logger.info('Deleting output S3 data from bucket %s with location %s ...', aws.s3_bucket_name, s3_dest_key)
        aws.delete_s3_data(aws.s3_bucket_name, s3_dest_key)

def test_s3_multithreading_multiple_batches(sdc_builder, sdc_executor, aws):
    """Test for the S3 destination stage. A data-loss scenario used to happen when multiple
    threads tried writing within the same millisecond; the runner id is now added to the file
    name when the option is enabled (true by default):

    S3 Destination pipeline:
        dev_data_generator >> s3_destination
        dev_data_generator >> wiretap.destination
    """
    number_of_records = 100
    batch_size = 1
    delay_between_batches = 1
    number_of_threads = 10
    try:
        s3_bucket = aws.s3_bucket_name
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_data_generator = builder.add_stage('Dev Data Generator')
        dev_data_generator.fields_to_generate = [{'field': 'id', 'type': 'POKEMON'}]
        dev_data_generator.set_attributes(delay_between_batches=delay_between_batches,
                                          batch_size=batch_size,
                                          records_to_be_generated=number_of_records,
                                          number_of_threads=number_of_threads)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

        wiretap = builder.add_wiretap()

        dev_data_generator >> [s3_destination, wiretap.destination]

        s3_dest_pipeline = builder.build(title='Amazon S3 destination pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        client = aws.s3
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == number_of_records

        # We need every field inside each record
        records = [record.field for record in wiretap.output_records]

        file_names = []
        for i in range(number_of_records):
            # We check that each file in S3 contains the same data as the wiretap
            s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][i]['Key'])
            s3_contents = s3_obj_key['Body'].read().decode().strip()
            file_names.append(list_s3_objs['Contents'][i]['Key'])
            assert json.loads(s3_contents) in records

        for i in range(number_of_threads):
            # We also check that we have at least one file name (collected above) from each
            # thread (from 000 to 009)
            thread_number = '-' + str(i).zfill(3)
            assert any(thread_number in file_name for file_name in file_names)
    finally:
        aws.delete_s3_data(s3_bucket, s3_key)

def _run_test_s3_destination(sdc_builder, sdc_executor, aws, sse_kms, anonymous):
    try:
        if anonymous:
            s3_bucket = create_bucket(aws)
            logger.info(f'Bucket {s3_bucket} created')
        else:
            s3_bucket = aws.s3_bucket_name

        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Bucket name is inside the record itself
        raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                      raw_data=raw_str,
                                                                                      stop_after_first_batch=True)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        bucket_val = (s3_bucket if sdc_builder.version < '2.6.0.1-0002' else '${record:value("/bucket")}')
        s3_destination.set_attributes(bucket=bucket_val, data_format='JSON', partition_prefix=s3_key)
        if sse_kms:
            # Use SSE with KMS
            s3_destination.set_attributes(use_server_side_encryption=True,
                                          server_side_encryption_option='KMS',
                                          aws_kms_key_arn=aws.kms_key_arn)
        if anonymous:
            configure_stage_for_anonymous(s3_destination)

        wiretap = builder.add_wiretap()

        dev_raw_data_source >> s3_destination
        s3_destination >= wiretap.destination

        s3_dest_pipeline = builder.build(title='Amazon S3 destination pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        client = aws.s3
        # start pipeline and capture pipeline messages to assert
        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # Validate event generation
        assert len(wiretap.output_records) == 1
        assert [record.field['bucket'] for record in wiretap.output_records][0] == s3_bucket
        assert [record.field['recordCount'] for record in wiretap.output_records][0] == 1

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        client_to_read = create_anonymous_client() if anonymous else client
        s3_obj_key = client_to_read.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)

        if sse_kms:
            # assert that the data was stored with SSE using KMS
            assert s3_obj_key['ServerSideEncryption'] == 'aws:kms'
            assert s3_obj_key['SSEKMSKeyId'] == aws.kms_key_arn
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
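
# `create_bucket` and `create_anonymous_client` are shared helpers for the
# anonymous-access variants; minimal sketches assuming they wrap boto3 directly
# (illustrative, not the module's exact code):
import boto3
from botocore import UNSIGNED
from botocore.client import Config

def create_bucket(aws):
    # Create a uniquely named sandbox bucket in the environment's region and
    # return its name; callers are responsible for deleting it afterwards.
    bucket_name = f'stf-{get_random_string(string.ascii_lowercase, 20)}'
    aws.s3.create_bucket(Bucket=bucket_name,
                         CreateBucketConfiguration={'LocationConstraint': aws.region})
    return bucket_name

def create_anonymous_client():
    # An unsigned client verifies that the object is readable without credentials.
    return boto3.client('s3', config=Config(signature_version=UNSIGNED))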

def test_dataflow_events_new_file(sdc_builder, sdc_executor, aws):
    """Test that we receive a new-file event whenever we start reading a file."""
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}/sdc'
    data = [dict(f1=get_random_string(), f2=get_random_string()) for _ in range(10)]

    # Build pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    s3_origin = builder.add_stage('Amazon S3', type='origin')
    s3_origin.set_attributes(bucket=s3_bucket,
                             data_format='JSON',
                             json_content='ARRAY_OBJECTS',
                             prefix_pattern=f'{s3_key}*')

    events_wiretap = builder.add_wiretap()
    records_wiretap = builder.add_wiretap()

    s3_origin >> records_wiretap.destination
    s3_origin >= events_wiretap.destination

    s3_origin_pipeline = builder.build().configure_for_environment(aws)
    s3_origin_pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(s3_origin_pipeline)

    client = aws.s3
    try:
        # Insert objects into S3.
        client.put_object(Bucket=s3_bucket, Key=s3_key, Body=json.dumps(data))

        sdc_executor.start_pipeline(s3_origin_pipeline)
        sdc_executor.wait_for_pipeline_metric(s3_origin_pipeline, 'output_record_count', 1, timeout_sec=120)
        sdc_executor.stop_pipeline(s3_origin_pipeline)

        output_records_values = [record.field for record in records_wiretap.output_records]
        assert len(output_records_values) == 10
        assert output_records_values == data

        # We should have exactly one event record; check that it is a new-file event
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'new-file', 'Received %s as event type (expected new-file)' % event_type
    finally:
        if sdc_executor.get_pipeline_status(s3_origin_pipeline).response.json().get('status') == 'RUNNING':
            logger.info('Stopping pipeline')
            sdc_executor.stop_pipeline(s3_origin_pipeline)
        # Clean up S3.
        aws.delete_s3_data(s3_bucket, s3_key)

def _run_test_s3_executor_create_object(sdc_builder, sdc_executor, aws, anonymous):
    # Set up test data.
    s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'
    raw_str = f'{{"bucket": "{s3_bucket}", "company": "StreamSets Inc."}}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_str,
                                                                                  stop_after_first_batch=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    s3_executor = builder.add_stage('Amazon S3', type='executor')
    s3_executor.set_attributes(bucket='${record:value("/bucket")}',
                               task='CREATE_NEW_OBJECT',
                               object=s3_key,
                               content='${record:value("/company")}')
    if anonymous:
        configure_stage_for_anonymous(s3_executor)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> record_deduplicator >> s3_executor >= wiretap.destination
    record_deduplicator >> to_error

    s3_exec_pipeline = builder.build(title='Amazon S3 executor pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(s3_exec_pipeline)

    client = aws.s3
    public_access_block = None
    bucket_policy = None
    try:
        if anonymous:
            public_access_block, bucket_policy = allow_public_access(client, s3_bucket, True, True)

        sdc_executor.start_pipeline(s3_exec_pipeline).wait_for_finished()

        # Assert that the number of objects put in S3 matches the record count.
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # Read data from S3 to assert it is what got ingested into the pipeline.
        client_to_read = create_anonymous_client() if anonymous else client
        s3_contents = [client_to_read.get_object(Bucket=s3_bucket,
                                                 Key=s3_content['Key'])['Body'].read().decode().strip()
                       for s3_content in list_s3_objs['Contents']]
        assert s3_contents[0] == 'StreamSets Inc.'

        # Check if the 'file-created' event was generated (only for recent sdc versions).
        if Version(sdc_builder.version) >= MIN_SDC_VERSION_WITH_EXECUTOR_EVENTS:
            assert len(wiretap.output_records) == 1
            assert wiretap.output_records[0].header.values['sdc.event.type'] == 'file-created'
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, s3_exec_pipeline)
        restore_public_access(client, s3_bucket, public_access_block, bucket_policy)
        aws.delete_s3_data(s3_bucket, s3_key)
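
# `allow_public_access` / `restore_public_access` temporarily lift the bucket's
# public-access restrictions so the anonymous client can operate, then put the
# previous configuration back. A hedged sketch of the idea using real boto3
# calls; the exact save/restore logic of the module's helpers may differ (the
# allow_read/allow_write flags would drive the generated bucket policy and are
# omitted here for brevity):
from botocore.exceptions import ClientError

def allow_public_access(client, bucket, allow_read, allow_write):
    # Remember the previous configuration (None when absent) so the caller can
    # restore it in its finally block.
    try:
        previous_block = client.get_public_access_block(Bucket=bucket)['PublicAccessBlockConfiguration']
    except ClientError:
        previous_block = None
    try:
        previous_policy = client.get_bucket_policy(Bucket=bucket)['Policy']
    except ClientError:
        previous_policy = None
    # Lift the public-access block for the duration of the test.
    client.delete_public_access_block(Bucket=bucket)
    return previous_block, previous_policy

def restore_public_access(client, bucket, public_access_block, bucket_policy):
    # Put back whatever configuration existed before the test started.
    if public_access_block is not None:
        client.put_public_access_block(Bucket=bucket,
                                       PublicAccessBlockConfiguration=public_access_block)
    if bucket_policy is not None:
        client.put_bucket_policy(Bucket=bucket, Policy=bucket_policy)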

def test_object_names_bucket(sdc_builder, sdc_executor, aws, test_name, bucket_generator):
    """Test for the S3 destination stage. We run a Dev Raw Data Source generator to an S3
    destination bucket and then read the bucket back using the STF client to assert that the
    data seen by the client matches what was ingested by the pipeline.
    """
    client = aws.s3
    retry = 0
    s3_bucket = None
    try:
        # Since S3 buckets are globally unique, our usual randomization doesn't work well - there is
        # always a chance that the generated bucket already exists. That is why we have retry logic -
        # we generate several bucket names and see which one we manage to "claim".
        while s3_bucket is None and retry < 10:
            retry = retry + 1
            attempted_bucket = bucket_generator()
            logger.info(f"Retry {retry} with bucket name '{attempted_bucket}'")

            try:
                client.create_bucket(Bucket=attempted_bucket,
                                     CreateBucketConfiguration={'LocationConstraint': aws.region})
                s3_bucket = attempted_bucket
            except Exception as e:
                logger.error(f"Can't use bucket name '{attempted_bucket}': {e}")

        # We might not find a suitable bucket within the max number of retries, in which case we simply die
        assert s3_bucket is not None

        client.put_bucket_tagging(
            Bucket=s3_bucket,
            Tagging={
                'TagSet': [
                    {'Key': 'stf-env', 'Value': 'nightly-tests'},
                    {'Key': 'managed-by', 'Value': 'ep'},
                    {'Key': 'dept', 'Value': 'eng'},
                ]
            }
        )
        s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}'

        # Bucket name is inside the record itself
        raw_str = f'{{ "bucket" : "{s3_bucket}", "company" : "StreamSets Inc."}}'

        # Build the pipeline
        builder = sdc_builder.get_pipeline_builder()

        dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                      raw_data=raw_str,
                                                                                      stop_after_first_batch=True)

        s3_destination = builder.add_stage('Amazon S3', type='destination')
        s3_destination.set_attributes(bucket=s3_bucket, data_format='JSON', partition_prefix=s3_key)

        dev_raw_data_source >> s3_destination

        s3_dest_pipeline = builder.build().configure_for_environment(aws)
        sdc_executor.add_pipeline(s3_dest_pipeline)

        sdc_executor.start_pipeline(s3_dest_pipeline).wait_for_finished()

        # assert that the number of objects put in S3 matches the record count
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert len(list_s3_objs['Contents']) == 1

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_obj_key = client.get_object(Bucket=s3_bucket, Key=list_s3_objs['Contents'][0]['Key'])

        # We're comparing the logical structure (JSON) rather than byte-to-byte to allow for different ordering, ...
        s3_contents = s3_obj_key['Body'].read().decode().strip()
        assert json.loads(s3_contents) == json.loads(raw_str)
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        except Exception as e:
            logger.error(f"Can't remove files from bucket {s3_bucket}: {e}")
        finally:
            try:
                client.delete_bucket(Bucket=s3_bucket)
            except Exception as e:
                logger.error(f"Can't delete bucket: {e}")

def _run_test_s3_error_destination(sdc_builder, sdc_executor, aws, anonymous):
    try:
        if anonymous:
            s3_bucket = create_bucket(aws)
            logger.info(f'Bucket {s3_bucket} created')
        else:
            s3_bucket = aws.s3_bucket_name

        s3_key = f'{S3_SANDBOX_PREFIX}/errDest-{get_random_string()}/'
        random_string = get_random_string(string.ascii_letters, 10)
        random_raw_json_str = f'{{"text":"{random_string}"}}'

        # Build pipeline.
        builder = sdc_builder.get_pipeline_builder()
        s3_err = builder.add_error_stage('Write to Amazon S3')
        s3_err.set_attributes(bucket=s3_bucket, common_prefix=s3_key)
        if anonymous:
            configure_stage_for_anonymous(s3_err)

        origin = builder.add_stage('Dev Raw Data Source', type='origin')
        origin.set_attributes(data_format='JSON',
                              raw_data=random_raw_json_str,
                              stop_after_first_batch=True)

        target = builder.add_stage('To Error', type='destination')

        origin >> target
        pipeline = builder.build().configure_for_environment(aws)
        pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(pipeline)

        # Now we build and run another pipeline with an S3 origin to read the data back
        builder = sdc_builder.get_pipeline_builder()
        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='SDC_JSON',
                                 prefix_pattern=f'{s3_key}*',
                                 max_batch_size_in_records=100)
        if anonymous:
            configure_stage_for_anonymous(s3_origin)

        wiretap = builder.add_wiretap()
        finisher = builder.add_stage('Pipeline Finisher Executor')
        finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

        s3_origin >> wiretap.destination
        s3_origin >= finisher

        read_pipeline = builder.build().configure_for_environment(aws)
        read_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(read_pipeline)

        client = aws.s3
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # We should have exactly one file in the bucket
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert 'Contents' in list_s3_objs  # If no object was found, there is no 'Contents' key
        assert len(list_s3_objs['Contents']) == 1

        sdc_executor.start_pipeline(read_pipeline).wait_for_finished()
        assert len(wiretap.output_records) == 1
        assert [record.field['text'] for record in wiretap.output_records][0] == random_string
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
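
# `configure_stage_for_anonymous` switches a stage to unauthenticated access so
# the anonymous test variants can run. A one-line sketch; the exact attribute
# names used for anonymous access are an assumption about the stage library:
def configure_stage_for_anonymous(stage):
    # Clearing the credentials makes the stage fall back to anonymous access.
    stage.set_attributes(access_key_id='', secret_access_key='')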