def test_attach_wrong_framework(sagemaker_session):
    """Attaching SKLearn to a job that used another framework's image must fail.

    The mocked ``describe_training_job`` response reports an MXNet training
    image, and ``SKLearn.attach`` is expected to raise ``ValueError`` for it.
    """
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        SKLearn.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Check the message of the raised exception itself (``error.value``).
    # ``str(error)`` would stringify pytest's ExceptionInfo wrapper, whose
    # format is a pytest implementation detail rather than the error message.
    assert "didn't use image for requested framework" in str(error.value)
def test_attach_custom_image(sagemaker_session):
    """Attach to a job trained on a custom image and check the image is reported back.

    ``describe_training_job`` is mocked to return a description whose
    ``TrainingImage`` is the custom image URI.
    """
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/my_custom_sklearn_image:latest"

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = SKLearn.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Both the attribute and the accessor must report the custom image.
    assert attached.image_uri == training_image
    assert attached.training_image_uri() == training_image
def test_attach_deploy(sklearn_training_job, sagemaker_session):
    """Attach to an existing training job, deploy it, and run the prediction check."""
    endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = SKLearn.attach(sklearn_training_job, sagemaker_session=sagemaker_session)
        _predict_and_assert(
            attached.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        )
def test_async_fit(
    sagemaker_session,
    cpu_instance_type,
    sklearn_latest_version,
    sklearn_latest_py_version,
):
    """Start training with ``wait=False``, then re-attach to the job and deploy it.

    ``sklearn_latest_py_version`` is requested as a fixture but is not
    referenced in the test body.
    """
    endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout(minutes=5):
        job_name = _run_mnist_training_job(
            sagemaker_session,
            cpu_instance_type,
            sklearn_version=sklearn_latest_version,
            wait=False,
        )
        print("Waiting to re-attach to the training job: %s" % job_name)
        # Pause before re-attaching to the job that was started without waiting.
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % job_name)
        attached = SKLearn.attach(
            training_job_name=job_name, sagemaker_session=sagemaker_session
        )
        _predict_and_assert(
            attached.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        )
def test_attach(sagemaker_session, sklearn_version):
    """Attach to a mocked training job and verify the reconstructed estimator.

    ``describe_training_job`` is replaced with a mock returning a canned job
    description; the assertions check that the attached estimator's
    attributes match the values in that description.
    """
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}".format(
        sklearn_version, PYTHON_VERSION
    )

    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_enable_cloudwatch_metrics": "false",
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    describe_mock = Mock(name="describe_training_job", return_value=job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_mock

    attached = SKLearn.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Job identity
    assert attached._current_job_name == "neo"
    assert attached.latest_training_job.job_name == "neo"
    # Framework details parsed from the training image
    assert attached.py_version == PYTHON_VERSION
    assert attached.framework_version == sklearn_version
    # Settings copied from the job description
    assert attached.role == "arn:aws:iam::366:role/SageMakerRole"
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == "File"
    assert attached.base_job_name == "neo"
    assert attached.output_path == "s3://place/output/neo"
    assert attached.output_kms_key == ""
    # Hyperparameters and source locations
    assert attached.hyperparameters()["training_steps"] == "100"
    assert attached.source_dir == "s3://some/sourcedir.tar.gz"
    assert attached.entry_point == "iris-dnn-classifier.py"
def test_attach(sagemaker_session, sklearn_version):
    """Attach to a mocked training job and verify the reconstructed estimator.

    ``describe_training_job`` is replaced with a mock returning a canned job
    description; the assertions check that the attached estimator's
    attributes match the values in that description.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}'.format(
        sklearn_version, PYTHON_VERSION)

    algorithm_spec = {
        'TrainingInputMode': 'File',
        'TrainingImage': training_image,
    }
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    job_description = {
        'AlgorithmSpecification': algorithm_spec,
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job',
        return_value=job_description,
    )

    attached = SKLearn.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Job identity
    assert attached._current_job_name == 'neo'
    assert attached.latest_training_job.job_name == 'neo'
    # Framework details parsed from the training image
    assert attached.py_version == PYTHON_VERSION
    assert attached.framework_version == sklearn_version
    # Settings copied from the job description
    assert attached.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert attached.train_instance_count == 1
    assert attached.train_max_run == 24 * 60 * 60
    assert attached.input_mode == 'File'
    assert attached.base_job_name == 'neo'
    assert attached.output_path == 's3://place/output/neo'
    assert attached.output_kms_key == ''
    # Hyperparameters and source locations
    assert attached.hyperparameters()['training_steps'] == '100'
    assert attached.source_dir == 's3://some/sourcedir.tar.gz'
    assert attached.entry_point == 'iris-dnn-classifier.py'
def test_attach_wrong_framework(sagemaker_session):
    """Attaching SKLearn to a job that used another framework's image must fail.

    The mocked ``describe_training_job`` response reports an MXNet training
    image, and ``SKLearn.attach`` is expected to raise ``ValueError`` for it.
    """
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-cpu:1.0.4'
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        SKLearn.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Check the message of the raised exception itself (``error.value``).
    # ``str(error)`` would stringify pytest's ExceptionInfo wrapper, whose
    # format is a pytest implementation detail rather than the error message.
    assert "didn't use image for requested framework" in str(error.value)
def build_and_train_estimator(data_desc: str, classifier: str, count: int = 1, wait: bool = False,
                              **hyperparams: object) -> Tuple[SKLearn, str]:
    """
    Creates a sagemaker training job, or attaches to an existing one with the same name

    :param data_desc: name of data to use (unique)
    :param classifier: name of sklearn classifier
    :param count: cache buster
    :param wait: passed to ``fit`` when a new job is started; when True, an
        already existing job is attached to regardless of its status
    :param hyperparams: hyperparameters for the model
    :return: tuple of (estimator, model_name)
    :raises Warning: if a job named ``model_name`` already exists, its status is
        neither 'Completed' nor 'Failed', and ``wait`` is False
    """
    model_name = build_model_name(data_desc, classifier, hyperparams, count)
    print('model_name', model_name)

    # check if model has already been built on this data
    import boto3
    client = boto3.client('sagemaker')
    try:
        # Only the lookup is guarded, so a ClientError raised later by
        # SKLearn.attach is no longer swallowed and mistaken for "no such job".
        response = client.describe_training_job(TrainingJobName=model_name)
    except ClientError:
        # NOTE(review): presumably this means the job does not exist yet;
        # other client errors (e.g. permissions) also land here - confirm.
        response = None

    # if it has check if it's finished and attach
    if response is not None:
        if wait or response['TrainingJobStatus'] in ('Completed', 'Failed'):
            return SKLearn.attach(model_name), model_name
        raise Warning(f"{model_name} isn't finished training yet")

    # no existing job was found: start a new one under the same name
    output_location = f's3://{bucket}/{S3_MODEL_DIR / data_desc}'
    estimator = SKLearn(
        'train_and_deploy.py',
        source_dir='sagemaker_container',
        code_location=output_location,
        output_path=output_location,
        train_instance_type=TRAIN_INSTANCE,
        framework_version='0.23-1',
        role=role,
        hyperparameters={
            'classifier': classifier,
            **hyperparams
        })
    estimator.fit(f's3://{bucket}/{S3_FEATURE_DIR / data_desc}', wait=wait, job_name=model_name)
    return estimator, model_name
def test_async_fit(sagemaker_session):
    """Start training with ``wait=False``, then re-attach to the job and deploy it."""
    instance_type = "ml.c4.xlarge"
    endpoint_name = 'test-sklearn-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        job_name = _run_mnist_training_job(
            sagemaker_session,
            instance_type,
            sklearn_full_version=SKLEARN_VERSION,
            wait=False,
        )
        print("Waiting to re-attach to the training job: %s" % job_name)
        # Pause before re-attaching to the job that was started without waiting.
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % job_name)
        attached = SKLearn.attach(
            training_job_name=job_name, sagemaker_session=sagemaker_session
        )
        _predict_and_assert(
            attached.deploy(1, instance_type, endpoint_name=endpoint_name)
        )
def test_attach_custom_image(sagemaker_session):
    """Attach to a job trained on a custom image and check the image is reported back.

    ``describe_training_job`` is mocked to return a description whose
    ``TrainingImage`` is the custom image URI.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_sklearn_image:latest'

    algorithm_spec = {
        'TrainingInputMode': 'File',
        'TrainingImage': training_image,
    }
    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    job_description = {
        'AlgorithmSpecification': algorithm_spec,
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job',
        return_value=job_description,
    )

    attached = SKLearn.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Both the attribute and the accessor must report the custom image.
    assert attached.image_name == training_image
    assert attached.train_image() == training_image