Exemplo n.º 1
0
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Verify that ``RLEstimator.attach`` restores an estimator from a job description.

    ``describe_training_job`` on the session's client is replaced with a mock that
    returns a canned description of a completed Coach/MXNet training job named
    "neo". Each attribute of the attached estimator is then compared against the
    values that went into that description.
    """
    toolkit = RLToolkit.COACH.value
    framework = RLFramework.MXNET.value

    image_uri = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3".format(
        framework, toolkit, rl_coach_mxnet_version
    )
    versions_for_toolkit = TOOLKIT_FRAMEWORK_VERSION_MAP[toolkit]
    expected_framework_version = versions_for_toolkit[rl_coach_mxnet_version][framework]

    # Values shared between the canned description and the assertions below.
    job_name = "neo"
    role_arn = "arn:aws:iam::366:role/SageMakerRole"
    output_path = "s3://place/output/neo"
    max_run_seconds = 24 * 60 * 60

    algorithm_spec = {"TrainingInputMode": "File", "TrainingImage": image_uri}
    # Several string values carry an extra pair of double quotes; the assertions
    # on source_dir and entry_point expect them back without those quotes.
    job_hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"train_coach.py"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": algorithm_spec,
        "HyperParameters": job_hyperparameters,
        "RoleArn": role_arn,
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": max_run_seconds},
        "TrainingJobName": job_name,
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": output_path},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = RLEstimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == job_name
    assert estimator.framework == framework
    assert estimator.toolkit == toolkit
    assert estimator.framework_version == expected_framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version
    assert estimator.role == role_arn
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == max_run_seconds
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == job_name
    assert estimator.output_path == output_path
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "train_coach.py"
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
Exemplo n.º 2
0
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Check that ``RLEstimator.attach`` rebuilds an estimator from a described job.

    The session client's ``describe_training_job`` is swapped for a mock that
    returns a fixed description of a completed Coach/MXNet job called 'neo';
    the attached estimator's attributes are then checked against it.
    """
    toolkit_name = RLToolkit.COACH.value
    framework_name = RLFramework.MXNET.value

    training_image_uri = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3'.format(
        framework_name, toolkit_name, rl_coach_mxnet_version
    )
    toolkit_versions = TOOLKIT_FRAMEWORK_VERSION_MAP[toolkit_name]
    expected_framework_version = toolkit_versions[rl_coach_mxnet_version][framework_name]

    # Literals used both in the mocked response and in the expectations.
    neo = 'neo'
    role = 'arn:aws:iam::366:role/SageMakerRole'
    s3_output = 's3://place/output/neo'
    one_day = 24 * 60 * 60

    # Some hyperparameter strings are wrapped in an extra pair of double quotes;
    # the source_dir / entry_point assertions expect the unquoted form.
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"train_coach.py"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resources = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    described_job = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image_uri,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': role,
        'ResourceConfig': resources,
        'StoppingCondition': {'MaxRuntimeInSeconds': one_day},
        'TrainingJobName': neo,
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': s3_output},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job',
        return_value=described_job,
    )

    estimator = RLEstimator.attach(training_job_name=neo, sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == neo
    assert estimator.framework == framework_name
    assert estimator.toolkit == toolkit_name
    assert estimator.framework_version == expected_framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version
    assert estimator.role == role
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == one_day
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == neo
    assert estimator.output_path == s3_output
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'train_coach.py'
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
# Point the training job at a pretrained model only when one was requested.
# NOTE(review): `pretrained` is assumed to be a boolean flag set earlier in the
# script, so a plain truth test replaces the non-idiomatic `== True` comparison.
if pretrained:
    hyperparameters_core['pretrained_s3_bucket'] = str(s3_pretrained_bucket)
    hyperparameters_core['pretrained_s3_prefix'] = s3_pretrained_prefix

# Download the hyperparameter file from our local bucket into an in-memory
# buffer and parse it as UTF-8 encoded JSON.
hyperparameter_data = io.BytesIO()
s3Client.download_fileobj(s3_bucket, hyperparameter_file, hyperparameter_data)
hyperparameters_nn = json.loads(hyperparameter_data.getvalue().decode("utf-8"))

# Merge both sets; for keys present in both, the downloaded file's value wins
# because it is unpacked last.
hyperparameters = {**hyperparameters_core, **hyperparameters_nn}
print("Configured following hyperparameters")
print(hyperparameters)

estimator = RLEstimator(
    entry_point="training_worker.py",
    source_dir='markov',
    dependencies=["common/sagemaker_rl", "markov"],
    sagemaker_session=sage_session,
    # bypass sagemaker SDK validation of the role
    role="aaa/",
    train_instance_type=instance_type,
    train_instance_count=1,
    output_path=s3_output_path,
    base_job_name=job_name,
    image_name=image_name,
    train_max_run=job_duration_in_seconds,  # Maximum runtime in seconds
    hyperparameters=hyperparameters,
    metric_definitions=RLEstimator.default_metric_definitions(RLToolkit.COACH),
)

# Launch the training job; wait=False asks the SDK not to block until it finishes.
estimator.fit(job_name=job_name, wait=False)