Exemplo n.º 1
0
def test_sklearn_processor_with_required_parameters(exists_mock, isfile_mock,
                                                    botocore_resolver,
                                                    sagemaker_session,
                                                    sklearn_version):
    """Run a processor that was created with only the required arguments.

    The keyword arguments received by ``sagemaker_session.process`` must equal
    those built by ``_get_expected_args`` once the scikit-learn image URI for
    ``sklearn_version`` has been filled in.
    """
    # Make the mocked endpoint lookup report the ECR hostname.
    endpoint_lookup = botocore_resolver.return_value.construct_endpoint
    endpoint_lookup.return_value = {"hostname": ECR_HOSTNAME}

    required_kwargs = dict(
        role=ROLE,
        instance_type="ml.m4.xlarge",
        framework_version=sklearn_version,
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )
    processor = SKLearnProcessor(**required_kwargs)
    processor.run(code="/local/path/to/processing_code.py")

    # The expectation is keyed on the processor's current job name, read after run().
    expected_args = _get_expected_args(processor._current_job_name)
    image_uri_template = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = image_uri_template.format(
        sklearn_version)

    sagemaker_session.process.assert_called_with(**expected_args)
Exemplo n.º 2
0
def test_sklearn_with_network_config(sagemaker_session, sklearn_full_version,
                                     cpu_instance_type):
    """Start a job with network isolation and traffic encryption requested.

    Both flags must come back truthy in the ``NetworkConfig`` section of the
    job description.
    """
    isolated_network = NetworkConfig(enable_network_isolation=True,
                                     encrypt_inter_container_traffic=True)
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn-with-network-config",
        network_config=isolated_network,
    )

    dummy_input = ProcessingInput(
        source=os.path.join(DATA_DIR, "dummy_input.txt"),
        destination="/opt/ml/processing/inputs/",
    )
    # wait=False / logs=False: the description is read right after submission.
    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[dummy_input],
        wait=False,
        logs=False,
    )

    described_network = sklearn_processor.latest_job.describe()["NetworkConfig"]
    assert described_network["EnableInterContainerTrafficEncryption"]
    assert described_network["EnableNetworkIsolation"]
Exemplo n.º 3
0
def test_sklearn_with_no_inputs_or_outputs(sagemaker_session, image_uri,
                                           sklearn_full_version,
                                           cpu_instance_type):
    """Run a job that has a script but no explicit inputs or outputs.

    After the job finishes, the job description must show the uploaded code as
    the first processing input and must echo back the resources, entrypoint,
    arguments, image, environment, role and stopping condition that were
    configured on the processor.
    """
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        command=["python3"],
        instance_type=cpu_instance_type,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-sklearn-with-no-inputs-or-outputs",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    sklearn_processor.run(code=os.path.join(DATA_DIR, "dummy_script.py"),
                          arguments=["-v"],
                          wait=True,
                          logs=True)

    job_description = sklearn_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-sklearn-with-no-inputs")

    assert job_description["ProcessingJobStatus"] == "Completed"

    # Compare against the fixture that was used to create the job rather than
    # a hard-coded instance type, so the test holds for any fixture value.
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": cpu_instance_type,
            "VolumeSizeInGB": 100
        }
    }

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
def test_sklearn_with_no_inputs(sagemaker_session):
    """Run a processor with a script only and check the ``process`` arguments.

    The single expected input is the uploaded code; every optional setting
    (stopping condition, environment, network config, tags, experiment config)
    is expected to be passed as ``None``.
    """
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=ROLE,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )

    # os.path.isfile reports True while run() executes.
    with patch("os.path.isfile", return_value=True):
        sklearn_processor.run(code="/local/path/to/sklearn_transformer.py")

    code_input = {
        "InputName": "code",
        "S3Input": {
            "S3Uri": "mocked_s3_uri_from_upload_data",
            "LocalPath": "/opt/ml/processing/input/code",
            "S3DataType": "S3Prefix",
            "S3InputMode": "File",
            "S3DataDistributionType": "FullyReplicated",
            "S3CompressionType": "None",
        },
    }
    cluster_config = {
        "InstanceType": "ml.m4.xlarge",
        "InstanceCount": 1,
        "VolumeSizeInGB": 30,
    }
    app_specification = {
        "ImageUri":
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3",
        "ContainerEntrypoint": [
            "python3",
            "/opt/ml/processing/input/code/sklearn_transformer.py",
        ],
    }

    sagemaker_session.process.assert_called_with(
        inputs=[code_input],
        output_config={"Outputs": []},
        job_name=sklearn_processor._current_job_name,
        resources={"ClusterConfig": cluster_config},
        stopping_condition=None,
        app_specification=app_specification,
        environment=None,
        network_config=None,
        role_arn=ROLE,
        tags=None,
        experiment_config=None,
    )
def test_sklearn_with_all_parameters_via_run_args_called_twice(
    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
):
    """Call ``get_run_args`` twice, then run with the second result.

    The arguments received by ``sagemaker_session.process`` must equal those
    from ``_get_expected_args_all_parameters`` with the image URI for
    ``sklearn_version`` filled in.
    """
    endpoint_lookup = botocore_resolver.return_value.construct_endpoint
    endpoint_lookup.return_value = {"hostname": ECR_HOSTNAME}

    network = NetworkConfig(
        subnets=["my_subnet_id"],
        security_group_ids=["my_security_group_id"],
        enable_network_isolation=True,
        encrypt_inter_container_traffic=True,
    )
    processor = SKLearnProcessor(
        role=ROLE,
        framework_version=sklearn_version,
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network,
        sagemaker_session=sagemaker_session,
    )

    def request_run_args():
        # Fresh inputs/outputs are built on every call, as in two separate requests.
        return processor.get_run_args(
            code="/local/path/to/processing_code.py",
            inputs=_get_data_inputs_all_parameters(),
            outputs=_get_data_outputs_all_parameters(),
            arguments=["--drop-columns", "'SelfEmployed'"],
        )

    # The first result is deliberately superseded by the second call.
    run_args = request_run_args()
    run_args = request_run_args()

    processor.run(
        code=run_args.code,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        arguments=run_args.arguments,
        wait=True,
        logs=False,
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    image_uri_template = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = image_uri_template.format(
        sklearn_version
    )

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    """Run a fully configured processor and check the ``process`` arguments.

    The expectation comes from ``_get_expected_args_all_parameters`` with the
    0.20.0 scikit-learn image URI filled in.
    """
    network = NetworkConfig(
        subnets=["my_subnet_id"],
        security_group_ids=["my_security_group_id"],
        enable_network_isolation=True,
    )
    processor = SKLearnProcessor(
        role=ROLE,
        framework_version="0.20.0",
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network,
        sagemaker_session=sagemaker_session,
    )

    dataset_input = ProcessingInput(
        source="s3://path/to/my/dataset/census.csv",
        destination="/container/path/",
        input_name="my_dataset",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
        s3_compression_type="None",
    )
    job_output = ProcessingOutput(
        source="/container/path/",
        destination="s3://uri/",
        output_name="my_output",
        s3_upload_mode="EndOfJob",
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[dataset_input],
        outputs=[job_output],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    expected_args["app_specification"]["ImageUri"] = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    """Start a job with one data input and verify the job description.

    The description must list two processing inputs and must echo back the
    cluster configuration, stopping condition, container entrypoint and role
    configured on the processor.

    Unlike the earlier revision, this test no longer switches the root logger
    to DEBUG: that change was never undone and affected every test that ran
    afterwards in the same process.
    """
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        # Kept explicit because the StoppingCondition assertion below pins it.
        max_runtime_in_seconds=3600,
        base_job_name="test-sklearn",
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path,
                            destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    # Compare against the fixture used to create the job, not a literal type.
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": cpu_instance_type,
            "VolumeSizeInGB": 30
        }
    }
    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    # Containment check, matching the other job-description tests in this file;
    # it holds both when ROLE is a role name and when it is the full ARN.
    assert ROLE in job_description["RoleArn"]
def test_sklearn_processor_with_required_parameters(exists_mock, isfile_mock, sagemaker_session):
    """Run a processor created with only the required arguments.

    The arguments received by ``sagemaker_session.process`` must equal those
    from ``_get_expected_args`` with the 0.20.0 scikit-learn image URI.
    """
    required_kwargs = dict(
        role=ROLE,
        instance_type="ml.m4.xlarge",
        framework_version="0.20.0",
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )
    processor = SKLearnProcessor(**required_kwargs)
    processor.run(code="/local/path/to/processing_code.py")

    # The expectation is keyed on the processor's current job name, read after run().
    expected_args = _get_expected_args(processor._current_job_name)
    expected_args["app_specification"]["ImageUri"] = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )

    sagemaker_session.process.assert_called_with(**expected_args)
Exemplo n.º 9
0
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    """Start a job with one data input and verify the job description.

    Checks the number of processing inputs, the cluster configuration, the
    stopping condition (86400 seconds), the container entrypoint and the role.
    """
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    dummy_input = ProcessingInput(
        source=os.path.join(DATA_DIR, "dummy_input.txt"),
        destination="/opt/ml/processing/inputs/",
    )
    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[dummy_input],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2

    cluster_config = job_description["ProcessingResources"]["ClusterConfig"]
    assert cluster_config["InstanceCount"] == 1
    assert cluster_config["InstanceType"] == cpu_instance_type
    assert cluster_config["VolumeSizeInGB"] == 30

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 86400}

    expected_entrypoint = [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == expected_entrypoint
    assert ROLE in job_description["RoleArn"]
Exemplo n.º 10
0
def test_local_processing_sklearn(sagemaker_local_session_no_local_code,
                                  sklearn_latest_version):
    """Start a job with ``instance_type="local"`` and verify its description.

    Checks the number of processing inputs, the instance count and type, the
    container entrypoint, and that the role is reported as ``"<no_role>"``.
    """
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role="SageMakerRole",
        instance_type="local",
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_local_session_no_local_code,
    )

    dummy_input = ProcessingInput(
        source=os.path.join(DATA_DIR, "dummy_input.txt"),
        destination="/opt/ml/processing/inputs/",
    )
    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[dummy_input],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2

    cluster_config = job_description["ProcessingResources"]["ClusterConfig"]
    assert cluster_config["InstanceCount"] == 1
    assert cluster_config["InstanceType"] == "local"

    expected_entrypoint = [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == expected_entrypoint
    assert job_description["RoleArn"] == "<no_role>"
Exemplo n.º 11
0
def test_sklearn_with_custom_default_bucket(
    sagemaker_session_with_custom_bucket,
    custom_bucket_name,
    image_uri,
    sklearn_full_version,
    cpu_instance_type,
    output_kms_key,
):
    """Run a customized job through a session that uses a custom bucket.

    After the job finishes, both processing inputs (the data and the code) must
    have S3 URIs containing ``custom_bucket_name``, and the description must
    echo back the output KMS key, output name, resources, entrypoint,
    arguments, image, environment, role and stopping condition.
    """
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        command=["python3"],
        instance_type=cpu_instance_type,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-sklearn-with-customizations",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session_with_custom_bucket,
    )

    dummy_input = ProcessingInput(
        source=os.path.join(DATA_DIR, "dummy_input.txt"),
        destination="/opt/ml/processing/input/container/path/",
        input_name="dummy_input",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
        s3_compression_type="None",
    )
    dummy_output = ProcessingOutput(
        source="/opt/ml/processing/output/container/path/",
        output_name="dummy_output",
        s3_upload_mode="EndOfJob",
    )

    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[dummy_input],
        outputs=[dummy_output],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = sklearn_processor.latest_job.describe()

    # Inputs: the user-supplied data first, then the uploaded code.
    processing_inputs = job_description["ProcessingInputs"]
    assert processing_inputs[0]["InputName"] == "dummy_input"
    assert custom_bucket_name in processing_inputs[0]["S3Input"]["S3Uri"]

    assert processing_inputs[1]["InputName"] == "code"
    assert custom_bucket_name in processing_inputs[1]["S3Input"]["S3Uri"]

    assert job_description["ProcessingJobName"].startswith(
        "test-sklearn-with-customizations")

    assert job_description["ProcessingJobStatus"] == "Completed"

    output_config = job_description["ProcessingOutputConfig"]
    assert output_config["KmsKeyId"] == output_kms_key
    assert output_config["Outputs"][0]["OutputName"] == "dummy_output"

    cluster_config = job_description["ProcessingResources"]["ClusterConfig"]
    assert cluster_config["InstanceCount"] == 1
    assert cluster_config["InstanceType"] == cpu_instance_type
    assert cluster_config["VolumeSizeInGB"] == 100

    app_specification = job_description["AppSpecification"]
    assert app_specification["ContainerArguments"] == ["-v"]
    assert app_specification["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert app_specification["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
Exemplo n.º 12
0
# Local-mode example: run `processing_script.py` as a word-count job and
# report where its 'word_count_data' output was written.

# For local mode a dummy role will be sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = SKLearnProcessor(framework_version='0.20.0',
                             instance_count=1,
                             instance_type='local',
                             role=role)

print('Starting processing job.')
print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
processor.run(code='processing_script.py',
              inputs=[ProcessingInput(
                  source='./input_data/',
                  destination='/opt/ml/processing/input_data/')],
              outputs=[ProcessingOutput(
                  output_name='word_count_data',
                  source='/opt/ml/processing/processed_data/')],
              arguments=['job-type', 'word-count'])

# Describe the most recent job and pull out its output configuration.
preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

# Bind the name up front so a missing output is reported explicitly instead
# of surfacing as a NameError in the final print.
word_count_data_file = None
for output in output_config['Outputs']:
    if output['OutputName'] == 'word_count_data':
        word_count_data_file = output['S3Output']['S3Uri']

if word_count_data_file is None:
    raise RuntimeError(
        "Processing job description contains no output named 'word_count_data'."
    )

print('Output file is located on: {}'.format(word_count_data_file))
Exemplo n.º 13
0
def test_sklearn_with_all_customizations(sagemaker_session):
    """Run a fully customized processor and check the ``process`` arguments.

    Every customization set on the processor and on ``run`` must appear in the
    keyword arguments received by ``sagemaker_session.process``, with the
    uploaded code appended after the user-supplied input.
    """
    network = NetworkConfig(
        subnets=["my_subnet_id"],
        security_group_ids=["my_security_group_id"],
        enable_network_isolation=True,
    )
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=ROLE,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network,
        sagemaker_session=sagemaker_session,
    )

    dataset_input = ProcessingInput(
        source="s3://path/to/my/dataset/census.csv",
        destination="/container/path/",
        input_name="my_dataset",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
        s3_compression_type="None",
    )
    job_output = ProcessingOutput(
        source="/container/path/",
        destination="s3://uri/",
        output_name="my_output",
        s3_upload_mode="EndOfJob",
    )

    # os.path.isdir reports True while run() executes.
    with patch("os.path.isdir", return_value=True):
        sklearn_processor.run(
            code="/local/path/to/sklearn_transformer.py",
            inputs=[dataset_input],
            outputs=[job_output],
            arguments=["--drop-columns", "'SelfEmployed'"],
            wait=True,
            logs=False,
            job_name="my_job_name",
            experiment_config={"ExperimentName": "AnExperiment"},
        )

    expected_dataset_input = {
        "InputName": "my_dataset",
        "S3Input": {
            "S3Uri": "s3://path/to/my/dataset/census.csv",
            "LocalPath": "/container/path/",
            "S3DataType": "S3Prefix",
            "S3InputMode": "File",
            "S3DataDistributionType": "FullyReplicated",
            "S3CompressionType": "None",
        },
    }
    expected_code_input = {
        "InputName": "code",
        "S3Input": {
            "S3Uri": "mocked_s3_uri_from_upload_data",
            "LocalPath": "/opt/ml/processing/input/code",
            "S3DataType": "S3Prefix",
            "S3InputMode": "File",
            "S3DataDistributionType": "FullyReplicated",
            "S3CompressionType": "None",
        },
    }
    expected_output_config = {
        "Outputs": [{
            "OutputName": "my_output",
            "S3Output": {
                "S3Uri": "s3://uri/",
                "LocalPath": "/container/path/",
                "S3UploadMode": "EndOfJob",
            },
        }],
        "KmsKeyId": "arn:aws:kms:us-west-2:012345678901:key/kms-key",
    }
    expected_resources = {
        "ClusterConfig": {
            "InstanceType": "ml.m4.xlarge",
            "InstanceCount": 1,
            "VolumeSizeInGB": 100,
        }
    }
    expected_app_specification = {
        "ImageUri":
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3",
        "ContainerArguments": ["--drop-columns", "'SelfEmployed'"],
        "ContainerEntrypoint": [
            "python3",
            "/opt/ml/processing/input/code/sklearn_transformer.py",
        ],
    }
    expected_network_config = {
        "EnableNetworkIsolation": True,
        "VpcConfig": {
            "SecurityGroupIds": ["my_security_group_id"],
            "Subnets": ["my_subnet_id"],
        },
    }

    sagemaker_session.process.assert_called_with(
        inputs=[expected_dataset_input, expected_code_input],
        output_config=expected_output_config,
        job_name=sklearn_processor._current_job_name,
        resources=expected_resources,
        stopping_condition={"MaxRuntimeInSeconds": 3600},
        app_specification=expected_app_specification,
        environment={"my_env_variable": "my_env_variable_value"},
        network_config=expected_network_config,
        role_arn=ROLE,
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        experiment_config={"ExperimentName": "AnExperiment"},
    )
# Local-mode example: run `processing_script.py` as a word-tokenize job.
# NOTE(review): `role` is not assigned in this snippet; it must already be
# defined by code that runs before this point.
processor = SKLearnProcessor(framework_version='0.20.0',
                             instance_count=1,
                             instance_type='local',
                             role=role)

print('Starting processing job.')
print(
    'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
)
# Two inputs are passed (dependencies and input data) and one named output
# ('tokenized_words_data') is declared.
processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(
                      source='./dependencies/',
                      destination='/opt/ml/processing/dependencies/'),
                  ProcessingInput(source='./input_data/',
                                  destination='/opt/ml/processing/input_data/')
              ],
              outputs=[
                  ProcessingOutput(output_name='tokenized_words_data',
                                   source='/opt/ml/processing/processed_data/')
              ],
              arguments=['job-type', 'word-tokenize'])

# Describe the most recent job and pull out its output configuration.
preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

# Remember the S3 URI of the 'tokenized_words_data' output.
# NOTE(review): `tokenized_words_data_file` stays unassigned when no output
# carries that name — confirm later code guards against that.
for output in output_config['Outputs']:
    if output['OutputName'] == 'tokenized_words_data':
        tokenized_words_data_file = output['S3Output']['S3Uri']