Пример #1
0
def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    """Verify ``create_model`` on an estimator built with a custom image URI.

    The resulting model must reuse the estimator's session, image, role,
    container log level and source directory, use the overridden entry point,
    and be named with the value returned by ``name_from_base``, whose last
    call must have received ``"selfdrivingcars"``.
    """
    custom_image = "selfdrivingcars:9000"
    log_level = '"logging.INFO"'
    code_location = "s3://mybucket/source"

    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=log_level,
        source_dir=code_location,
    )
    estimator.fit(job_name="new_name")

    # The patched name generator decides what the model will be called.
    generated_name = "model_name"
    name_from_base.return_value = generated_name
    deploy_entry_point = "deploy_script.py"
    model = estimator.create_model(entry_point=deploy_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image_uri == custom_image
    assert model.entry_point == deploy_entry_point
    assert model.role == ROLE
    assert model.name == generated_name
    assert model.container_log_level == log_level
    assert model.source_dir == code_location

    name_from_base.assert_called_with("selfdrivingcars")
Пример #2
0
def test_create_mxnet_model(sagemaker_session, rl_coach_mxnet_version):
    """Check the model created from a fitted Coach/MXNet estimator.

    ``create_model`` is expected to return an ``MXNetModel`` that carries the
    estimator's session, entry point, role, log level and source directory,
    is named after the training job, has no VPC config, and whose framework
    version is the one ``TOOLKIT_FRAMEWORK_VERSION_MAP`` lists for the toolkit
    version under test.
    """
    log_level = '"logging.INFO"'
    code_location = 's3://mybucket/source'
    training_job = 'new_name'

    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=log_level,
        source_dir=code_location,
    )
    estimator.fit(inputs='s3://mybucket/train', job_name=training_job)
    model = estimator.create_model()

    # Look up the MXNet version paired with this Coach toolkit version.
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_version = coach_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    assert isinstance(model, MXNetModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == expected_version
    assert model.py_version == PYTHON_VERSION
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == training_job
    assert model.container_log_level == log_level
    assert model.source_dir == code_location
    assert model.vpc_config is None
Пример #3
0
def test_attach_wrong_framework(sagemaker_session):
    """``RLEstimator.attach`` must reject a job trained with a mismatching image.

    ``describe_training_job`` is mocked to return a description whose training
    image is ``sagemaker-mxnet-py2-cpu:1.0.4``; attaching is expected to raise
    ``ValueError`` with a message about the image not matching the requested
    framework.
    """
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4"
    rjd = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    # Check the raised exception's own message; ``str(error)`` would format the
    # ExceptionInfo wrapper, whose text differs between pytest versions.
    assert "didn't use image for requested framework" in str(error.value)
Пример #4
0
def test_create_tf_model(name_from_base, sagemaker_session, coach_tensorflow_version):
    """Verify the model created from a fitted Coach/TensorFlow estimator.

    ``create_model`` should return a ``TensorFlowModel`` whose framework
    version comes from ``TOOLKIT_FRAMEWORK_VERSION_MAP``, whose name is the
    value returned by ``name_from_base``, and which has no VPC config. The
    first ``name_from_base`` call must use one of the two accepted base names.
    """
    log_level = '"logging.INFO"'
    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_tensorflow_version,
        framework=RLFramework.TENSORFLOW,
        container_log_level=log_level,
        source_dir="s3://mybucket/source",
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    # The patched name generator decides what the model will be called.
    generated_name = "model_name"
    name_from_base.return_value = generated_name
    model = estimator.create_model()

    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_version = coach_versions[coach_tensorflow_version][RLFramework.TENSORFLOW.value]

    assert isinstance(model, TensorFlowModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == expected_version
    assert model.role == ROLE
    assert model.name == generated_name
    assert model._container_log_level == log_level
    assert model.vpc_config is None

    # Only the positional arguments of the very first call are inspected.
    accepted_bases = ("sagemaker-rl-tensorflow", "sagemaker-rl-coach-container")
    first_call_positional = name_from_base.call_args_list[0][0]
    assert first_call_positional[0] in accepted_bases
Пример #5
0
def test_create_model_with_optional_params(sagemaker_session, coach_mxnet_version):
    """Check that ``create_model`` honours explicitly passed overrides.

    A role, entry point, VPC config override and model name are passed to
    ``create_model`` and each must show up unchanged on the returned model.
    """
    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level='"logging.INFO"',
        source_dir="s3://mybucket/source",
    )
    estimator.fit(job_name="new_name")

    # Values that differ from what the estimator itself was built with.
    override_role = "role"
    override_entry_point = "deploy_script.py"
    override_vpc = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    override_name = "model-name"

    model = estimator.create_model(
        role=override_role,
        entry_point=override_entry_point,
        vpc_config_override=override_vpc,
        name=override_name,
    )

    assert model.role == override_role
    assert model.vpc_config == override_vpc
    assert model.entry_point == override_entry_point
    assert model.name == override_name
Пример #6
0
def test_create_model_with_custom_image(sagemaker_session):
    """Verify ``create_model`` on an estimator built with a custom image name.

    The model must keep the estimator's session, image, role, log level and
    source directory, use the overridden entry point, and be named after the
    training job.
    """
    custom_image = 'selfdrivingcars:9000'
    log_level = '"logging.INFO"'
    code_location = 's3://mybucket/source'
    training_job = 'new_name'
    deploy_entry_point = 'deploy_script.py'

    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=log_level,
        source_dir=code_location,
    )
    estimator.fit(job_name=training_job)
    model = estimator.create_model(entry_point=deploy_entry_point)

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == deploy_entry_point
    assert model.role == ROLE
    assert model.name == training_job
    assert model.container_log_level == log_level
    assert model.source_dir == code_location
Пример #7
0
def test_create_model_with_optional_params(sagemaker_session,
                                           rl_coach_mxnet_version):
    """Check that ``create_model`` honours explicitly passed overrides.

    A role, entry point and VPC config override are passed to ``create_model``
    and each must show up unchanged on the returned model.
    """
    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level='"logging.INFO"',
        source_dir='s3://mybucket/source',
    )
    estimator.fit(job_name='new_name')

    # Values that differ from what the estimator itself was built with.
    override_role = 'role'
    override_entry_point = 'deploy_script.py'
    override_vpc = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}

    model = estimator.create_model(
        role=override_role,
        entry_point=override_entry_point,
        vpc_config_override=override_vpc,
    )

    assert model.role == override_role
    assert model.vpc_config == override_vpc
    assert model.entry_point == override_entry_point
Пример #8
0
def test_create_tf_model(sagemaker_session, rl_coach_tf_version):
    """Verify the model created from a fitted Coach/TensorFlow estimator.

    ``create_model`` should return a ``tfs.Model`` named after the training
    job, with no VPC config, and with the framework version that
    ``TOOLKIT_FRAMEWORK_VERSION_MAP`` lists for the toolkit version under test.
    """
    log_level = '"logging.INFO"'
    training_job = "new_name"

    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_tf_version,
        framework=RLFramework.TENSORFLOW,
        container_log_level=log_level,
        source_dir="s3://mybucket/source",
    )
    estimator.fit(inputs="s3://mybucket/train", job_name=training_job)
    model = estimator.create_model()

    # Look up the TensorFlow version paired with this Coach toolkit version.
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_version = coach_versions[rl_coach_tf_version][RLFramework.TENSORFLOW.value]

    assert isinstance(model, tfs.Model)
    assert model.sagemaker_session == sagemaker_session
    assert model._framework_version == expected_version
    assert model.role == ROLE
    assert model.name == training_job
    assert model._container_log_level == log_level
    assert model.vpc_config is None
Пример #9
0
def test_cartpole(docker_image, sagemaker_local_session, processor, tmpdir):
    """Train the Coach cartpole example locally and check the output files.

    The instance type is ``local`` for the ``cpu`` processor and ``local_gpu``
    otherwise. After ``fit`` the output directory must contain a ``success``
    file and ``model.tar.gz`` must exist under ``tmpdir``.
    """
    if processor == 'cpu':
        instance_type = 'local'
    else:
        instance_type = 'local_gpu'

    coach_hyperparameters = {
        "save_model": 1,
        "RLCOACH_PRESET": "preset_cartpole_clippedppo",
        "rl.agent_params.algorithm.discount": 0.9,
        "rl.evaluation_steps:EnvironmentEpisodes": 1,
    }
    estimator = RLEstimator(
        entry_point='train_coach.py',
        source_dir=os.path.join(RESOURCE_PATH, 'coach_cartpole'),
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_local_session,
        image_name=docker_image,
        output_path='file://{}'.format(tmpdir),
        dependencies=[os.path.join(RESOURCE_PATH, 'sagemaker_rl')],
        hyperparameters=coach_hyperparameters,
    )
    estimator.fit()

    output_root = str(tmpdir)
    local_mode_utils.assert_output_files_exist(output_root, 'output', ['success'])
    model_archive = os.path.join(output_root, 'model.tar.gz')
    assert os.path.exists(model_archive), 'model file not found'
Пример #10
0
def test_attach_wrong_framework(sagemaker_session):
    """``RLEstimator.attach`` must reject a job trained with a mismatching image.

    ``describe_training_job`` is mocked to return a description whose training
    image is ``sagemaker-mxnet-py2-cpu:1.0.4``; attaching is expected to raise
    ``ValueError`` with a message about the image not matching the requested
    framework.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'
    rjd = {'AlgorithmSpecification': {'TrainingInputMode': 'File',
                                      'TrainingImage': training_image},
           'HyperParameters':
               {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                'checkpoint_path': '"s3://other/1508872349"',
                'sagemaker_program': '"iris-dnn-classifier.py"',
                'sagemaker_enable_cloudwatch_metrics': 'false',
                'sagemaker_container_log_level': '"logging.INFO"',
                'training_steps': '100',
                'sagemaker_region': '"us-west-2"'},
           'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
           'ResourceConfig':
               {'VolumeSizeInGB': 30,
                'InstanceCount': 1,
                'InstanceType': 'ml.c4.xlarge'},
           'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
           'TrainingJobName': 'neo',
           'TrainingJobStatus': 'Completed',
           'OutputDataConfig': {'KmsKeyId': '',
                                'S3OutputPath': 's3://place/output/neo'},
           'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=rjd)

    with pytest.raises(ValueError) as error:
        RLEstimator.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    # Check the raised exception's own message; ``str(error)`` would format the
    # ExceptionInfo wrapper, whose text differs between pytest versions.
    assert "didn't use image for requested framework" in str(error.value)
Пример #11
0
 def _submit_job_request(self, estimator: RLEstimator) -> object:
     """Start the estimator's training job and return its description.

     Calls ``estimator.fit`` with ``wait=False``, records the job name
     reported by ``estimator.latest_training_job`` on
     ``self._rlestimator_job_name``, and returns whatever
     ``self._sm_client.describe_training_job`` yields for that name.
     """
     # wait=False keeps the current thread from blocking on the job.
     estimator.fit(job_name=self._rlestimator_job_name, wait=False)
     submitted_name = estimator.latest_training_job.job_name
     self._rlestimator_job_name = submitted_name
     return self._sm_client.describe_training_job(TrainingJobName=submitted_name)
Пример #12
0
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    """Check ``fit`` and ``create_model`` for a Coach/MXNet estimator.

    Asserts which session and boto-session methods were called by ``fit``,
    compares the recorded ``train`` keyword arguments with the expected
    training job (including the experiment config), and then verifies the
    container definition of the created model for GPU and CPU.
    """
    train_uri = "s3://mybucket/train"
    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
    )
    estimator.fit(inputs=train_uri, experiment_config=EXPERIMENT_CONFIG)

    # Each recorded call is indexed as (name, args, kwargs); compare names only.
    session_call_names = [call[0] for call in sagemaker_session.method_calls]
    assert session_call_names == ["train", "logs_for_job"]
    boto_call_names = [call[0] for call in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_kwargs = _create_train_job(
        RLToolkit.COACH.value, rl_coach_mxnet_version, RLFramework.MXNET.value
    )
    s3_source = expected_train_kwargs["input_config"][0]["DataSource"]["S3DataSource"]
    s3_source["S3Uri"] = train_uri
    expected_train_kwargs["experiment_config"] = EXPERIMENT_CONFIG

    # Index 2 of the first recorded call holds its keyword arguments.
    recorded_train_kwargs = sagemaker_session.method_calls[0][2]
    assert recorded_train_kwargs == expected_train_kwargs

    model = estimator.create_model()
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    mxnet_version = coach_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    image_template = "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3"
    source_archive = "s3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz".format(
        TIMESTAMP
    )
    expected_container_def = {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": source_archive,
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": image_template.format(mxnet_version),
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    assert expected_container_def == model.prepare_container_def(GPU)

    assert "cpu" in model.prepare_container_def(CPU)["Image"]
Пример #13
0
def test_gym(sagemaker_session, ecr_image, instance_type, framework):
    """Run the gym example as a training job within a 15-minute timeout.

    The entry point is ``launcher.sh`` for the ``tensorflow`` framework and
    ``gym_envs.py`` for any other value.
    """
    if framework == 'tensorflow':
        entry_script = 'launcher.sh'
    else:
        entry_script = 'gym_envs.py'

    estimator = RLEstimator(
        entry_point=entry_script,
        source_dir=os.path.join(RESOURCE_PATH, 'gym'),
        role='SageMakerRole',
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )

    with timeout(minutes=15):
        estimator.fit()
Пример #14
0
def test_rl(strftime, sagemaker_session, rl_coach_mxnet_version):
    """Check ``fit`` and ``create_model`` for a Coach/MXNet estimator.

    Asserts which session and boto-session methods were called by ``fit``,
    compares the recorded ``train`` keyword arguments with the expected
    training job, and then verifies the container definition of the created
    model for GPU and CPU.
    """
    train_uri = 's3://mybucket/train'
    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=rl_coach_mxnet_version,
        framework=RLFramework.MXNET,
    )
    estimator.fit(inputs=train_uri)

    # Each recorded call is indexed as (name, args, kwargs); compare names only.
    session_call_names = [call[0] for call in sagemaker_session.method_calls]
    assert session_call_names == ['train', 'logs_for_job']
    boto_call_names = [call[0] for call in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_kwargs = _create_train_job(
        RLToolkit.COACH.value, rl_coach_mxnet_version, RLFramework.MXNET.value
    )
    s3_source = expected_train_kwargs['input_config'][0]['DataSource']['S3DataSource']
    s3_source['S3Uri'] = train_uri

    # Index 2 of the first recorded call holds its keyword arguments.
    recorded_train_kwargs = sagemaker_session.method_calls[0][2]
    assert recorded_train_kwargs == expected_train_kwargs

    model = estimator.create_model()
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    mxnet_version = coach_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    image_template = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py3'
    source_archive = 's3://notmybucket/sagemaker-rl-mxnet-{}/source/sourcedir.tar.gz'.format(
        TIMESTAMP
    )
    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': source_archive,
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': image_template.format(mxnet_version),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }
    assert expected_container_def == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
Пример #15
0
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attach to a mocked Coach/MXNet training job and check the estimator.

    ``describe_training_job`` is replaced by a mock returning a canned job
    description; every attribute asserted below is expected to be populated
    from that description.
    """
    image_template = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3"
    training_image = image_template.format(
        RLFramework.MXNET.value, RLToolkit.COACH.value, rl_coach_mxnet_version
    )
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_framework_version = coach_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"train_coach.py"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == "neo"
    assert attached.framework == RLFramework.MXNET.value
    assert attached.toolkit == RLToolkit.COACH.value
    assert attached.framework_version == expected_framework_version
    assert attached.toolkit_version == rl_coach_mxnet_version
    assert attached.role == "arn:aws:iam::366:role/SageMakerRole"
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == "File"
    assert attached.base_job_name == "neo"
    assert attached.output_path == "s3://place/output/neo"
    assert attached.output_kms_key == ""
    assert attached.hyperparameters()["training_steps"] == "100"
    assert attached.source_dir == "s3://some/sourcedir.tar.gz"
    assert attached.entry_point == "train_coach.py"
    assert attached.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
Пример #16
0
def test_ray(sagemaker_session, ecr_image, instance_type, framework):
    """Run the Ray cartpole example as a training job within 15 minutes.

    The entry point is ``train_ray_tf.py`` for the ``tensorflow`` framework
    and ``train_ray_torch.py`` for any other value.
    """
    if framework == 'tensorflow':
        entry_script = 'train_ray_tf.py'
    else:
        entry_script = 'train_ray_torch.py'

    estimator = RLEstimator(
        entry_point=entry_script,
        source_dir=os.path.join(RESOURCE_PATH, 'ray_cartpole'),
        role='SageMakerRole',
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )

    with timeout(minutes=15):
        estimator.fit()
Пример #17
0
def test_ray_tf(sagemaker_session, ecr_image, instance_type):
    """Run ``train_ray.py`` from the Ray cartpole resources within 15 minutes."""
    estimator = RLEstimator(
        entry_point='train_ray.py',
        source_dir=os.path.join(RESOURCE_PATH, 'ray_cartpole'),
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
    )

    # The timeout context bounds the training run.
    with timeout(minutes=15):
        estimator.fit()
Пример #18
0
def test_coach_mxnet(sagemaker_session, coach_mxnet_latest_version, cpu_instance_type):
    """Train a Coach/MXNet estimator, re-attach to its job, deploy and predict.

    The estimator comes from ``_test_coach``. Training is started under a
    unique job name, the estimator is rebuilt with ``RLEstimator.attach``,
    deployed with ``mxnet_deploy.py`` as entry point, and a prediction for an
    all-zero observation must yield two values strictly between 0 and 1.
    """
    estimator = _test_coach(
        sagemaker_session, RLFramework.MXNET, coach_mxnet_latest_version, cpu_instance_type
    )
    job_name = unique_name_from_base("test-coach-mxnet")

    with timeout(minutes=15):
        # ``wait`` must be a real boolean: the former string "False" is truthy,
        # so ``fit`` blocked even though a non-blocking start was intended.
        estimator.fit(wait=False, job_name=job_name)

        # NOTE(review): the SDK documents ``attach`` as blocking until an
        # in-progress job completes, which is what deploy below relies on;
        # confirm for the SDK version pinned by this project.
        estimator = RLEstimator.attach(
            estimator.latest_training_job.name, sagemaker_session=sagemaker_session
        )

    endpoint_name = "test-mxnet-coach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            1, cpu_instance_type, entry_point="mxnet_deploy.py", endpoint_name=endpoint_name
        )

        observation = numpy.asarray([0, 0, 0, 0])
        action = predictor.predict(observation)

    assert 0 < action[0][0] < 1
    assert 0 < action[0][1] < 1
Пример #19
0
def test_wrong_type_parameters(sagemaker_session):
    """An unsupported toolkit/framework/version mix must raise ``AttributeError``.

    The estimator is built with the Coach toolkit, the TensorFlow framework
    and ``RLEstimator.RAY_LATEST_VERSION`` as toolkit version; the error
    message must mention that the combination is not supported.
    """
    with pytest.raises(AttributeError) as excinfo:
        RLEstimator(
            toolkit=RLToolkit.COACH,
            framework=RLFramework.TENSORFLOW,
            toolkit_version=RLEstimator.RAY_LATEST_VERSION,
            entry_point=SCRIPT_PATH,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            train_instance_count=INSTANCE_COUNT,
            train_instance_type=INSTANCE_TYPE,
        )
    raised_message = str(excinfo.value)
    assert 'combination is not supported.' in raised_message
Пример #20
0
def test_attach_custom_image(sagemaker_session):
    """Attach to a mocked job trained with a custom image and check the image.

    ``describe_training_job`` is replaced by a mock returning a canned job
    description whose training image is ``rl:latest``; the attached estimator
    must report that image both as ``image_name`` and from ``train_image()``.
    """
    training_image = 'rl:latest'
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File', 'TrainingImage': training_image},
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == 'neo'
    assert attached.image_name == training_image
    assert attached.train_image() == training_image
Пример #21
0
def test_attach_custom_image(sagemaker_session):
    """Attach to a mocked job trained with a custom image and check the image.

    ``describe_training_job`` is replaced by a mock returning a canned job
    description whose training image is ``rl:latest``; the attached estimator
    must report that image both as ``image_name`` and from ``train_image()``.
    """
    training_image = "rl:latest"
    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == "neo"
    assert attached.image_name == training_image
    assert attached.train_image() == training_image
Пример #22
0
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attach to a mocked Coach/MXNet training job and check the estimator.

    ``describe_training_job`` is replaced by a mock returning a canned job
    description; every attribute asserted below is expected to be populated
    from that description.
    """
    image_template = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3'
    training_image = image_template.format(
        RLFramework.MXNET.value, RLToolkit.COACH.value, rl_coach_mxnet_version
    )
    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_framework_version = coach_versions[rl_coach_mxnet_version][RLFramework.MXNET.value]

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"train_coach.py"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    job_description = {
        'AlgorithmSpecification': {'TrainingInputMode': 'File', 'TrainingImage': training_image},
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    describe_mock = Mock(name='describe_training_job', return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = RLEstimator.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.latest_training_job.job_name == 'neo'
    assert attached.framework == RLFramework.MXNET.value
    assert attached.toolkit == RLToolkit.COACH.value
    assert attached.framework_version == expected_framework_version
    assert attached.toolkit_version == rl_coach_mxnet_version
    assert attached.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == 'File'
    assert attached.base_job_name == 'neo'
    assert attached.output_path == 's3://place/output/neo'
    assert attached.output_kms_key == ''
    assert attached.hyperparameters()['training_steps'] == '100'
    assert attached.source_dir == 's3://some/sourcedir.tar.gz'
    assert attached.entry_point == 'train_coach.py'
    assert attached.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
Пример #23
0
def test_vw_cb_explore(local_instance_type, sagemaker_local_session, docker_image,
                       tmpdir, training_data_bandits, role):
    """Run the ``train_cb_explore.py`` script from the ``vw`` resources in local mode.

    Fits an ``RLEstimator`` on ``training_data_bandits`` with the supplied
    docker image, then verifies that the success marker, the model files and
    the ``model.tar.gz`` archive were written beneath ``tmpdir``.
    """
    output_root = str(tmpdir)
    vw_sources = os.path.join(RESOURCE_PATH, 'vw')

    rl_estimator = RLEstimator(
        entry_point="train_cb_explore.py",
        source_dir=vw_sources,
        role=role,
        train_instance_count=1,
        hyperparameters={"num_arms": 7},
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        output_path='file://{}'.format(tmpdir),
        image_name=docker_image,
    )
    rl_estimator.fit(inputs=training_data_bandits)

    # Training must leave a success marker and both model artifacts.
    local_mode_utils.assert_output_files_exist(output_root, 'output', ['success'])
    local_mode_utils.assert_output_files_exist(
        output_root, 'model', ['vw.model', 'vw.metadata'])

    archive_path = os.path.join(output_root, 'model.tar.gz')
    assert os.path.exists(archive_path), 'model file not found'
Пример #24
0
    def configure_estimator(self):
        """Create ``self.estimator`` and start its training job with ``wait=False``.

        The hyperparameters combine S3 keys derived from ``self.s3_prefix``
        with the values stored in ``self.hyperparam_data``. Once ``fit`` has
        been called, the name of the latest training job is saved on
        ``self.job_name`` and printed.
        """
        prefix = self.s3_prefix
        tuning = self.hyperparam_data

        hyperparameters = {
            # Where the training container finds its inputs.
            "s3_bucket": self.s3_bucket,
            "s3_prefix": prefix,
            "aws_region": self.aws_region,
            "preset_s3_key": "%s/presets/preset.py" % prefix,
            "model_metadata_s3_key": "%s/model_metadata.json" % prefix,
            "environment_s3_key": "%s/environments/deepracer_racetrack_env.py" % prefix,
            # Values taken from the stored hyperparameter data.
            "batch_size": tuning['batch_size'],
            "num_epochs": tuning['optimization_epochs'],
            "beta_entropy": tuning['beta_entropy'],
            "lr": tuning['learning_rate'],
            "num_episodes_between_training": 20,
            "discount_factor": tuning['discount'],
        }

        self.estimator = RLEstimator(
            entry_point=const.entry_point,
            source_dir=const.source_dir,
            image_name=self.custom_image_name,
            dependencies=["common/"],
            role=self.sagemaker_role,
            train_instance_type=self.instance_type,
            train_instance_count=self.instance_pool_count,
            output_path=self.s3_output_path,
            base_job_name=self.job_name_prefix,
            metric_definitions=self.metric_definitions,
            train_max_run=self.job_duration_in_seconds,
            hyperparameters=hyperparameters,
            subnets=self.deepracer_subnets,
            security_group_ids=self.deepracer_security_groups,
        )

        # wait=False: return as soon as the job is submitted.
        self.estimator.fit(wait=False)

        self.job_name = self.estimator.latest_training_job.job_name
        print("Training job: %s" % self.job_name)
Пример #25
0
def test_gym(local_instance_type, sagemaker_local_session, docker_image,
             tmpdir, framework):
    """Train the ``gym`` resources in local mode and check the produced artifacts.

    The entry point is ``launcher.sh`` when ``framework`` is ``'tensorflow'``
    and ``gym_envs.py`` for any other framework value.
    """
    if framework == 'tensorflow':
        entry_script = 'launcher.sh'
    else:
        entry_script = 'gym_envs.py'

    rl_estimator = RLEstimator(
        entry_point=entry_script,
        source_dir=os.path.join(RESOURCE_PATH, 'gym'),
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        output_path='file://{}'.format(tmpdir),
        image_name=docker_image,
    )
    rl_estimator.fit()

    output_root = str(tmpdir)
    local_mode_utils.assert_output_files_exist(output_root, 'output', ['success'])

    archive_path = os.path.join(output_root, 'model.tar.gz')
    assert os.path.exists(archive_path), 'model file not found'
Пример #26
0
def test_wrong_toolkit_format(sagemaker_session):
    """``RLEstimator`` must raise ``ValueError`` when ``toolkit`` is the string ``'coach'``.

    The error message is expected to contain ``'Invalid type'``.
    """
    with pytest.raises(ValueError) as excinfo:
        RLEstimator(
            toolkit='coach',
            framework=RLFramework.TENSORFLOW,
            toolkit_version=RLEstimator.COACH_LATEST_VERSION,
            entry_point=SCRIPT_PATH,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            train_instance_count=INSTANCE_COUNT,
            train_instance_type=INSTANCE_TYPE,
            framework_version=None,
        )

    error_text = str(excinfo.value)
    assert 'Invalid type' in error_text
Пример #27
0
def test_missing_required_parameters(sagemaker_session):
    """``RLEstimator`` must raise ``AttributeError`` when built without toolkit,
    toolkit version, framework or image name, and the message must say so.
    """
    # Adjacent literals are joined at compile time into one expected message.
    expected_message = ('Please provide `toolkit`, `toolkit_version`, `framework`'
                        ' or `image_name` parameter.')

    with pytest.raises(AttributeError) as excinfo:
        RLEstimator(
            entry_point=SCRIPT_PATH,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            train_instance_count=INSTANCE_COUNT,
            train_instance_type=INSTANCE_TYPE,
        )

    assert expected_message in str(excinfo.value)
Пример #28
0
def test_ray_tf(local_instance_type, sagemaker_local_session, docker_image,
                tmpdir):
    """Train ``train_ray.py`` from the ``ray_cartpole`` resources in local mode.

    After fitting with the supplied docker image, checks that the success
    marker and the ``model.tar.gz`` archive exist beneath ``tmpdir``.
    """
    rl_estimator = RLEstimator(
        entry_point='train_ray.py',
        source_dir=os.path.join(RESOURCE_PATH, 'ray_cartpole'),
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        output_path='file://{}'.format(tmpdir),
        image_name=docker_image,
    )
    rl_estimator.fit()

    output_root = str(tmpdir)
    local_mode_utils.assert_output_files_exist(output_root, 'output', ['success'])

    archive_path = os.path.join(output_root, 'model.tar.gz')
    assert os.path.exists(archive_path), 'model file not found'
def test_ray_tf(sagemaker_session, ray_tensorflow_latest_version,
                cpu_instance_type):
    """Fit a Ray/TensorFlow ``RLEstimator`` and confirm that ``deploy`` is rejected.

    Training runs under a 15-minute timeout with a unique job name. Calling
    ``deploy`` afterwards must raise ``NotImplementedError`` whose message
    states that automatic deployment of Ray models is not available.
    """
    rl_estimator = RLEstimator(
        entry_point="train_ray.py",
        source_dir=os.path.join(DATA_DIR, "ray_cartpole"),
        toolkit=RLToolkit.RAY,
        framework=RLFramework.TENSORFLOW,
        toolkit_version=ray_tensorflow_latest_version,
        sagemaker_session=sagemaker_session,
        role="SageMakerRole",
        instance_type=cpu_instance_type,
        instance_count=1,
    )
    training_job_name = unique_name_from_base("test-ray-tf")

    with timeout(minutes=15):
        rl_estimator.fit(job_name=training_job_name)

    with pytest.raises(NotImplementedError) as excinfo:
        rl_estimator.deploy(1, cpu_instance_type)

    expected_message = "Automatic deployment of Ray models is not currently available"
    assert expected_message in str(excinfo.value)
Пример #30
0
def test_create_mxnet_model(name_from_base, sagemaker_session,
                            coach_mxnet_version):
    """``create_model`` on a Coach/MXNet estimator must yield a matching ``MXNetModel``.

    The model is expected to carry over the estimator's session, entry point,
    role, log level and source dir, to use the framework version that
    ``TOOLKIT_FRAMEWORK_VERSION_MAP`` lists for the toolkit version, and to be
    named via ``name_from_base("sagemaker-rl-mxnet")``.
    """
    log_level = '"logging.INFO"'
    code_location = "s3://mybucket/source"

    estimator = RLEstimator(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        toolkit=RLToolkit.COACH,
        toolkit_version=coach_mxnet_version,
        framework=RLFramework.MXNET,
        container_log_level=log_level,
        source_dir=code_location,
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    # Fix the generated name before the model is created.
    expected_name = "model_name"
    name_from_base.return_value = expected_name
    model = estimator.create_model()

    coach_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[RLToolkit.COACH.value]
    expected_framework_version = coach_versions[coach_mxnet_version][RLFramework.MXNET.value]

    assert isinstance(model, MXNetModel)
    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == expected_framework_version
    assert model.py_version == PYTHON_VERSION
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == expected_name
    assert model.container_log_level == log_level
    assert model.source_dir == code_location
    assert model.vpc_config is None

    name_from_base.assert_called_with("sagemaker-rl-mxnet")