def test_script_processor_with_all_parameters(exists_mock, isfile_mock,
                                              sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)

    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
Example #2
def test_script_processor_with_no_inputs_or_outputs(sagemaker_session,
                                                    image_uri,
                                                    cpu_instance_type):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor-with-no-inputs-or-outputs",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(code=os.path.join(DATA_DIR, "dummy_script.py"),
                         arguments=["-v"],
                         wait=True,
                         logs=True)

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-script-processor-with-no-inputs")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.m4.xlarge",
            "VolumeSizeInGB": 100
        }
    }

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
def test_script_processor_with_all_parameters_via_run_args(
    exists_mock, isfile_mock, sagemaker_session
):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    run_args = processor.get_run_args(
        code="/local/path/to/processing_code.py",
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
    )

    processor.run(
        code=run_args.code,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        arguments=run_args.arguments,
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)

    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
def lambda_handler(event, context):
    timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    local_filename, headers = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv')
    os.rename(local_filename, '/tmp/abalone.csv')

    # Environment variables
    bucket = os.environ['bucket']
    role = os.environ['role']
    sagemaker_session = sagemaker.Session()
    spark_repository_uri = os.environ['spark_repository_uri']
    # Prefix constants
    prefix = 'sagemaker/spark-preprocess-demo/' + timestamp_prefix
    input_prefix = prefix + '/input/raw/abalone'
    input_preprocessed_prefix = prefix + '/input/preprocessed/abalone'
    mleap_model_prefix = prefix + '/mleap-model'
    # Store the value of the execution timestamp
    client = boto3.client('s3')
    client.put_object(Body=timestamp_prefix.encode('ascii'),
                        Bucket=bucket,
                        Key='execution.txt')
    # Upload data so it's present for training and inference
    
    print(sagemaker_session.upload_data(path='/tmp/abalone.csv', bucket=bucket, key_prefix=input_prefix))
    
    spark_processor = ScriptProcessor(base_job_name='spark-preprocessor',
                                  image_uri=spark_repository_uri,
                                  command=['/opt/program/submit'],
                                  role=role,
                                  instance_count=2,
                                  instance_type='ml.r5.xlarge',
                                  max_runtime_in_seconds=1200,
                                  env={'mode': 'python'})
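    # The custom Spark container exposes /opt/program/submit as its entrypoint
    # command (assumed to wrap spark-submit), so it is used here instead of python3.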

    spark_processor.run(code=f's3://{bucket}/sparkdemo/preprocess.py',
                    arguments=['s3_input_bucket', bucket,
                              's3_input_key_prefix', input_prefix,
                              's3_output_bucket', bucket,
                              's3_output_key_prefix', input_preprocessed_prefix,
                              's3_model_bucket', bucket,
                              's3_mleap_model_prefix', mleap_model_prefix],
                    logs=True)
    
    event['s3_output_path'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/xgboost_model'
    event['train_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/train/part'
    event['validation_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/validation/part'
    event['training_job'] = f'{timestamp_prefix}-job' 
    return event
def test_processing_job_inputs_and_output_config(sagemaker_session, image_uri,
                                                 cpu_instance_type,
                                                 output_kms_key):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=_get_processing_inputs_with_all_parameters(
            sagemaker_session.default_bucket()),
        outputs=_get_processing_outputs_with_all_parameters(),
        arguments=["-v"],
        wait=False,
    )

    job_description = script_processor.latest_job.describe()
    expected_inputs_and_outputs = _get_processing_job_inputs_and_outputs(
        sagemaker_session.default_bucket(), output_kms_key)
    assert (job_description["ProcessingInputs"][:-1] ==
            expected_inputs_and_outputs["ProcessingInputs"])
    assert (job_description["ProcessingOutputConfig"] ==
            expected_inputs_and_outputs["ProcessingOutputConfig"])
Example #6
def test_processing_step_with_script_processor(pipeline_session, processing_input, network_config):
    processor = ScriptProcessor(
        role=sagemaker.get_execution_role(),
        image_uri=IMAGE_URI,
        command=["python3"],
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="volume-kms-key",
        output_kms_key="output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network_config,
        sagemaker_session=pipeline_session,
    )

    step_args = processor.run(
        inputs=processing_input, code=DUMMY_S3_SCRIPT_PATH, job_name="my-processing-job"
    )

    step = ProcessingStep(
        name="MyProcessingStep",
        step_args=step_args,
    )

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": step_args,
    }
Example #7
def test_script_processor(sagemaker_session, image_uri, cpu_instance_type,
                          output_kms_key):
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[
            ProcessingInput(
                source=input_file_path,
                destination="/opt/ml/processing/input/container/path/",
                input_name="dummy_input",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"

    assert job_description["ProcessingInputs"][1]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-script-processor")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"][
        "KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0][
        "OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceCount"] == 1
    assert (job_description["ProcessingResources"]["ClusterConfig"]
            ["InstanceType"] == cpu_instance_type)
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "VolumeSizeInGB"] == 100

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
Example #8
script_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.xlarge",
)


script_processor.run(
    code="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/code/processor.py",
    inputs=[
        ProcessingInput(
            # Input data files; can be an S3 URI or a local path
            source="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/input",
            destination="/opt/ml/processing/input/data",
            input_name="parquet",
            s3_data_type="S3Prefix",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/output",
            output_name="output",
        ),
    ],
    # Arguments must be a list of strings
    arguments=["--option", "1"],
    wait=True,
)
Example #9
def process(
    session: SagemakerSession,
    role,
    script,
    # script_source,
    inputs=None,
    outputs=None,
    dependencies=None,
    requirements=None,
    configuration_script=None,
    configuration_command=None,
    base_job_name=PROCESSING_JOB_NAME,
    job_name=None,
    image=Images.PROCESSING.tag,
    image_path=Images.PROCESSING.path,
    image_accounts=",".join(Images.PROCESSING.accounts),
    instance=PROCESSING_INSTANCE,
    volume_size=30,
    runtime_seconds=PROCESSING_RUNTIME_SECONDS,
    output_mount=OUTPUT_MOUNT,
    input_mount=INPUT_MOUNT,
    module_mount=MODULE_MOUNT,
    python='python3',
    wait=True,
    logs=True,
    arguments=None,
    tags=None,
    output_json=None,
    env=None
):
    iam = session.boto_session.client('iam')

    image_uri = ecr_ensure_image(
        image=Image(
            path=image_path,
            tag=image,
            accounts=image_accounts.split(",")
        ),
        session=session.boto_session
    )
    role = ensure_processing_role(iam=iam, role_name=role)
    if inputs is None:
        inputs = {}
    if outputs is None:
        outputs = {}
    if dependencies is None:
        dependencies = {}
    if tags is None:
        tags = {}
    else:
        tags = tags.copy()
    if arguments is None:
        arguments = {}
    else:
        arguments = arguments.copy()
    # if module_mount is not None and len(module_mount)> 0:
    #    command = ["PYTHONPATH={module_mount};${{PYTHONPATH}}".format(module_mount=module_mount), "python3"]
    # else:
    #    command = ['python3']
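    # The job entrypoint is a shell: the bootstrap script passed as `code` below
    # reads the AWS_SAGEMAKER_REMOTE_* environment variables to invoke the user script.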
    command = ['sh']
    path_arguments = {}
    processing_inputs = []
    s3 = session.boto_session.client('s3')
    for name, source in inputs.items():
        processing_input, path_argument = make_processing_input(
            mount=input_mount,
            name=name,
            source=source.local,
            mode=source.mode,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument
    for name, source in dependencies.items():
        processing_input, path_argument = make_processing_input(
            mount=module_mount,
            name=name,
            source=source,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument

    script_remote = "{}/{}".format(module_mount, os.path.basename(script))
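    # Ship the user script itself as a ProcessingInput so it is available under
    # module_mount inside the container.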
    processing_inputs.append(
        ProcessingInput(
            source=script,
            destination=module_mount,
            input_name="aws_sagemaker_remote_script",
            # s3_data_type='S3Prefix',
            s3_input_mode='File',
            # s3_data_distribution_type='FullyReplicated',
            # s3_compression_type='None'
        )
    )
    if env:
        env = env.copy()
    else:
        env = {}
    env.update({
        "AWS_SAGEMAKER_REMOTE_MODULE_MOUNT": module_mount,
        "AWS_SAGEMAKER_REMOTE_PYTHON": python,
        "AWS_SAGEMAKER_REMOTE_SCRIPT": script_remote,
        "IS_SAGEMAKER": "1"
    })

    if requirements:
        requirements_remote = "{}/requirements_txt/{}".format(
            module_mount, 'requirements.txt')
        env['AWS_SAGEMAKER_REMOTE_REQUIREMENTS'] = requirements_remote
        processing_inputs.append(
            ProcessingInput(
                source=requirements,
                destination="{}/requirements_txt".format(module_mount),
                input_name="aws_sagemaker_remote_requirements",
                s3_input_mode='File',
            )
        )

    if configuration_script:
        configuration_script_remote = "{}/{}".format(
            module_mount, os.path.basename(configuration_script))
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_SCRIPT'] = configuration_script_remote
        processing_inputs.append(
            ProcessingInput(
                source=configuration_script,
                destination=module_mount,
                input_name="aws_sagemaker_remote_configuration_script",
                s3_input_mode='File'
            )
        )

    if configuration_command and len(configuration_command) > 0:
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_COMMAND'] = configuration_command

    tags["Source"] = 'aws-sagemaker-remote'
    tags["BaseJobName"] = base_job_name
    tags = make_tags(tags)
    print("Tags: {}".format(tags))
    processor = ScriptProcessor(
        role,
        image_uri=image_uri,
        instance_count=1,
        instance_type=instance,
        command=command,
        volume_size_in_gb=volume_size,
        # volume_kms_key=None,
        # output_kms_key=None,
        max_runtime_in_seconds=runtime_seconds,
        base_job_name=base_job_name,
        sagemaker_session=session,
        env=env,
        tags=tags
        # network_config=None
    )
    processing_outputs = []
    for name, dest in outputs.items():
        # todo: move into PathArgument class
        if not ((not dest.remote) or dest.remote == 'default' or dest.remote.startswith('s3://')):
            raise ValueError("Argument [{}] must be either `default` or an S3 url (`s3://...`). Value given was [{}].".format(
                variable_to_argparse("{}_s3".format(name)), dest.remote))
        source = "{}/{}".format(output_mount, name)
        if dest.mode:
            assert dest.mode in ['EndOfJob', 'Continuous']
        processing_outputs.append(
            ProcessingOutput(
                source=source,
                destination=dest.remote if dest.remote and dest.remote != 'default' else None,
                output_name=name,
                s3_upload_mode=dest.mode or 'EndOfJob'
            ))
        path_arguments[name] = source

    ensure_eol(PROCESSING_SCRIPT)
    code = Path(PROCESSING_SCRIPT).as_uri()
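    # PROCESSING_SCRIPT is the generic bootstrap shell script submitted as the job
    # code; it is assumed to use the AWS_SAGEMAKER_REMOTE_* variables set above to
    # locate and run the user script inside the container.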
    if job_name is None or len(str(job_name).strip()) == 0:
        job_name = None
    else:
        job_name = str(job_name).strip()

    arguments.update(path_arguments)
    processor.run(
        code=code,
        inputs=processing_inputs,
        outputs=processing_outputs,
        wait=False,
        logs=logs,
        job_name=job_name,
        arguments=sagemaker_arguments(vargs=arguments)
    )
    job = processor.latest_job
    if output_json:
        obj = job.describe()
        #print("Describe: {}".format(obj))
        os.makedirs(os.path.dirname(
            os.path.abspath(output_json)), exist_ok=True)
        with open(output_json, 'w') as f:
            json.dump(obj, f, default=json_converter, indent=4)

    if wait:
        job.wait(logs=logs)
    return processor
Example #10
INPUTS = [
    ProcessingInput(
        source=
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_raw',
        destination="/opt/ml/processing/input/data"),
]
OUTPUTS = [
    ProcessingOutput(
        source="/opt/ml/processing/output/train",
        destination=
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/train'
    ),
    ProcessingOutput(
        source="/opt/ml/processing/output/test",
        destination=
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/test'
    ),
]

# PROCESSOR BUILD AND RUN
processor = ScriptProcessor(base_job_name=JOB_NAME,
                            tags=TAGS,
                            role=ROLE_SAGEMAKER,
                            instance_type=INSTANCE_TYPE,
                            instance_count=INSTANCE_COUNT,
                            image_uri=IMAGE_URI,
                            command=['python3'])

processor.run(
    code=SCRIPT,
    arguments=ARGUMENTS,
    inputs=INPUTS,
    outputs=OUTPUTS,
)
def test_byo_container_with_script_processor(sagemaker_session):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with patch("os.path.isfile", return_value=True):
        script_processor.run(
            code="/local/path/to/sklearn_transformer.py",
            inputs=[
                ProcessingInput(source="/local/path/to/my/dataset/census.csv",
                                destination="/data/")
            ],
            experiment_config={"ExperimentName": "AnExperiment"},
        )

    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/data/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
        ],
        "output_config": {
            "Outputs": []
        },
        "job_name":
        script_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition":
        None,
        "app_specification": {
            "ImageUri":
            CUSTOM_IMAGE_URI,
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/sklearn_transformer.py",
            ],
        },
        "environment":
        None,
        "network_config":
        None,
        "role_arn":
        ROLE,
        "tags":
        None,
        "experiment_config": {
            "ExperimentName": "AnExperiment"
        },
    }
    sagemaker_session.process.assert_called_with(**expected_args)
Example #12
ROLE_ARN = sagemaker.get_execution_role()
## image uri code
ACCOUNT_ID = boto3.client('sts').get_caller_identity().get('Account')
REGION = boto3.Session().region_name
ECR_REPOSITORY = 'sagemaker-processing-container'
TAG = ':latest'
IMAGE_URI = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(ACCOUNT_ID, REGION,
                                                    ECR_REPOSITORY + TAG)
## call processing job
script_processor = ScriptProcessor(command=['python3'],
                                   image_uri=IMAGE_URI,
                                   role=ROLE_ARN,
                                   instance_count=1,
                                   instance_type='ml.m5.xlarge')

script_processor.run(
    code='train_val_test_split.py',
    inputs=[
        ProcessingInput(source=f's3://{BUCKET}/{INPUT_FOLDER}/',
                        destination='/opt/ml/processing/input')
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/output/train',
                         destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/train'),
        ProcessingOutput(
            source='/opt/ml/processing/output/validation',
            destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/validation'),
        ProcessingOutput(source='/opt/ml/processing/output/test',
                         destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/test')
    ])
# For local training a dummy role will be sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-scikit-learn-processing-local',
    role=role,
    instance_count=1,
    instance_type='local')
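# instance_type='local' runs the processing job in a local Docker container
# (SageMaker local mode) rather than on managed SageMaker instances.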

processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(source='./input_data/',
                                  destination='/opt/ml/processing/input_data/')
              ],
              outputs=[
                  ProcessingOutput(output_name='word_count_data',
                                   source='/opt/ml/processing/processed_data/')
              ],
              arguments=['job-type', 'word-count'])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'word_count_data':
        word_count_data_file = output['S3Output']['S3Uri']

print('Output file is located on: {}'.format(word_count_data_file))
# For local training a dummy role will be sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-delta-sharing-processing-local',
    role=role,
    instance_count=1,
    instance_type='local')

processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(source='./profile/',
                                  destination='/opt/ml/processing/profile/')
              ],
              outputs=[
                  ProcessingOutput(output_name='delta_lake_processed_data',
                                   source='/opt/ml/processing/processed_data/')
              ])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'delta_lake_processed_data':
        delta_lake_processed_data_file = output['S3Output']['S3Uri']
        bucket = delta_lake_processed_data_file.split("/")[:3][2]
        output_file_name = '/'.join(