def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Attach to a mocked Coach/MXNet training job and verify the estimator.

    ``describe_training_job`` on the session's SageMaker client is replaced
    with a mock that returns a canned description of a job named ``neo``.
    ``RLEstimator.attach`` is then called and the resulting estimator's
    attributes are compared against the values in that description.
    """
    job_name = "neo"
    role_arn = "arn:aws:iam::366:role/SageMakerRole"
    output_location = "s3://place/output/neo"
    max_runtime_seconds = 24 * 60 * 60

    toolkit_name = RLToolkit.COACH.value
    framework_name = RLFramework.MXNET.value

    image_uri = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3".format(
        framework_name, toolkit_name, rl_coach_mxnet_version
    )
    versions_for_toolkit = TOOLKIT_FRAMEWORK_VERSION_MAP[toolkit_name]
    expected_framework_version = versions_for_toolkit[rl_coach_mxnet_version][framework_name]

    # String-valued hyperparameters carry an extra layer of double quotes
    # inside the description; the assertions below expect them unquoted.
    job_hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"train_coach.py"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": image_uri,
        },
        "HyperParameters": job_hyperparameters,
        "RoleArn": role_arn,
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": max_runtime_seconds},
        "TrainingJobName": job_name,
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": output_location},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=job_description
    )

    estimator = RLEstimator.attach(
        training_job_name=job_name, sagemaker_session=sagemaker_session
    )

    # Job identity plus toolkit/framework resolution.
    assert estimator.latest_training_job.job_name == job_name
    assert estimator.framework == framework_name
    assert estimator.toolkit == toolkit_name
    assert estimator.framework_version == expected_framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version

    # Role, resources, and I/O configuration.
    assert estimator.role == role_arn
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == max_runtime_seconds
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == job_name
    assert estimator.output_path == output_location
    assert estimator.output_kms_key == ""

    # Hyperparameters, source location, and metric definitions.
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "train_coach.py"
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
def test_attach(sagemaker_session, rl_coach_mxnet_version):
    """Verify ``RLEstimator.attach`` against a canned training-job description.

    The session's ``describe_training_job`` client call is mocked to return a
    fixed description for the job ``neo``. Every assertion afterwards
    compares an attribute of the attached estimator with the value placed in
    that description.
    """
    coach = RLToolkit.COACH.value
    mxnet = RLFramework.MXNET.value

    training_image = (
        '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-rl-{}:{}{}-cpu-py3'
        .format(mxnet, coach, rl_coach_mxnet_version)
    )
    framework_version = TOOLKIT_FRAMEWORK_VERSION_MAP[coach][rl_coach_mxnet_version][mxnet]

    algorithm_spec = {'TrainingInputMode': 'File', 'TrainingImage': training_image}
    # String-valued entries are wrapped in an extra pair of double quotes in
    # the description; the assertions below expect them unquoted.
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"train_coach.py"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    resource_config = {
        'VolumeSizeInGB': 30,
        'InstanceCount': 1,
        'InstanceType': 'ml.c4.xlarge',
    }
    output_config = {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'}

    described_job = {
        'AlgorithmSpecification': algorithm_spec,
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': resource_config,
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': output_config,
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    describe_mock = Mock(name='describe_training_job', return_value=described_job)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    estimator = RLEstimator.attach(training_job_name='neo',
                                   sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.framework == mxnet
    assert estimator.toolkit == coach
    assert estimator.framework_version == framework_version
    assert estimator.toolkit_version == rl_coach_mxnet_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'train_coach.py'
    assert estimator.metric_definitions == RLEstimator.default_metric_definitions(RLToolkit.COACH)
# Assemble the training hyperparameters and launch the RL training job.

# Add the pretrained-model location only when a pretrained run is requested.
# NOTE(review): this was `pretrained == True`; the plain truthiness test is
# equivalent when `pretrained` is a bool -- confirm at its definition that it
# is never a non-bool value such as a string.
if pretrained:
    hyperparameters_core['pretrained_s3_bucket'] = "{}".format(s3_pretrained_bucket)
    hyperparameters_core['pretrained_s3_prefix'] = s3_pretrained_prefix

# Downloading the hyperparameter file from our local bucket.
hyperparameter_data = io.BytesIO()
s3Client.download_fileobj(s3_bucket, hyperparameter_file, hyperparameter_data)
hyperparameters_nn = json.loads(hyperparameter_data.getvalue().decode("utf-8"))

# Merge both sources; on a key collision the value from the downloaded file
# wins because it is unpacked last.
hyperparameters = {**hyperparameters_core, **hyperparameters_nn}
print("Configured following hyperparameters")
print(hyperparameters)

estimator = RLEstimator(
    entry_point="training_worker.py",
    source_dir='markov',
    dependencies=["common/sagemaker_rl", "markov"],
    sagemaker_session=sage_session,
    # bypass sagemaker SDK validation of the role
    role="aaa/",
    train_instance_type=instance_type,
    train_instance_count=1,
    output_path=s3_output_path,
    base_job_name=job_name,
    image_name=image_name,
    train_max_run=job_duration_in_seconds,  # Maximum runtime in seconds
    hyperparameters=hyperparameters,
    metric_definitions=RLEstimator.default_metric_definitions(RLToolkit.COACH),
)

# Start training under the same name that was given as base_job_name.
estimator.fit(job_name=job_name, wait=False)