def test_attach_wrong_framework(sagemaker_session):
    """Attaching MXNet to a job trained with a non-MXNet image raises ValueError.

    The mocked ``describe_training_job`` response reports a
    ``sagemaker-tensorflow`` training image, so ``MXNet.attach`` is expected
    to reject the job.
    """
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:1.0.4',
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Check the message of the raised exception itself. ``error`` is pytest's
    # ExceptionInfo wrapper, and its own string form varies between pytest
    # versions, so it is not a reliable place to look for the message.
    assert "didn't use image for requested framework" in str(error.value)
def test_custom_image_estimator_deploy(sagemaker_session):
    """An image passed to ``create_model`` is the image set on the model."""
    custom_image = "mycustomimage:latest"

    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )
    estimator.fit(inputs="s3://mybucket/train", job_name="new_name")

    created_model = estimator.create_model(image=custom_image)
    assert created_model.image == custom_image
def test_estimator_script_mode_launch_parameter_server(warning, sagemaker_session):
    """The launch-parameter-server hyperparameter is "true" for LAUNCH_PS_DISTRIBUTIONS_DICT.

    Also checks that the ``warning`` mock was called with the
    ``"distributions"`` / ``"distribution"`` argument pair.
    """
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        distributions=LAUNCH_PS_DISTRIBUTIONS_DICT,
        framework_version="1.3.0",
    )

    launch_ps_value = estimator.hyperparameters().get(MXNet.LAUNCH_PS_ENV_NAME)
    assert launch_ps_value == "true"
    warning.assert_called_with("distributions", "distribution")
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Train and batch-transform MNIST with a VPC config and verify it is applied.

    The subnets and security group given to the estimator must appear in both
    the training job description and the model description.
    """
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)

    mx = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "train"),
        key_prefix="integ-test-data/mxnet_mnist/train",
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "test"),
        key_prefix="integ-test-data/mxnet_mnist/test",
    )

    training_job_name = unique_name_from_base("test-mxnet-vpc")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_s3, "test": test_s3}, job_name=training_job_name)

    training_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name
    )
    training_vpc = training_desc["VpcConfig"]
    assert set(subnet_ids) == set(training_vpc["Subnets"])
    assert [security_group_id] == training_vpc["SecurityGroupIds"]

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "transform", "data.csv"),
        key_prefix="integ-test-data/mxnet_mnist/transform",
    )

    transformer = _create_transformer_and_transform_job(mx, transform_s3, cpu_instance_type)
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        # Describe the model while still inside the context manager.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        model_vpc = model_desc["VpcConfig"]
        assert set(subnet_ids) == set(model_vpc["Subnets"])
        assert [security_group_id] == model_vpc["SecurityGroupIds"]
def test_local_transform_mxnet(
    sagemaker_local_session,
    tmpdir,
    mxnet_inference_latest_version,
    mxnet_inference_latest_py_version,
    cpu_instance_type,
):
    """Train MNIST with a local session, run a local transform, and check the output file."""
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")

    mx = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=mxnet_inference_latest_version,
        py_version=mxnet_inference_latest_py_version,
        sagemaker_session=sagemaker_local_session,
    )

    train_data = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "train"),
        key_prefix="integ-test-data/mxnet_mnist/train",
    )
    test_data = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "test"),
        key_prefix="integ-test-data/mxnet_mnist/test",
    )

    with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
        mx.fit({"train": train_data, "test": test_data})

    transform_data = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "transform"),
        key_prefix="integ-test-data/mxnet_mnist/transform",
    )

    output_path = "file://%s" % (str(tmpdir))
    transformer = mx.transformer(
        1,
        "local",
        assemble_with="Line",
        max_payload=1,
        strategy="SingleRecord",
        output_path=output_path,
    )

    # The transform and the wait both run while holding the lock on LOCK_PATH.
    with lock.lock(LOCK_PATH):
        transformer.transform(transform_data, content_type="text/csv", split_type="Line")
        transformer.wait()

    expected_output = os.path.join(str(tmpdir), "data.csv.out")
    assert os.path.exists(expected_output)
def test_estimator_script_mode_dont_launch_parameter_server(sagemaker_session):
    """With the parameter server disabled, the launch hyperparameter is "false"."""
    disabled_ps = {"parameter_server": {"enabled": False}}

    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        distributions=disabled_ps,
        framework_version="1.3.0",
    )

    launch_ps_value = estimator.hyperparameters().get(MXNet.LAUNCH_PS_ENV_NAME)
    assert launch_ps_value == "false"
def test_transform_mxnet(sagemaker_session, mxnet_full_version):
    """Train MNIST, run a transform job, and verify its KMS key and output filter."""
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")

    mx = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "train"),
        key_prefix="integ-test-data/mxnet_mnist/train",
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "test"),
        key_prefix="integ-test-data/mxnet_mnist/test",
    )

    training_job_name = unique_name_from_base("test-mxnet-transform")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_s3, "test": test_s3}, job_name=training_job_name)

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "transform", "data.csv"),
        key_prefix="integ-test-data/mxnet_mnist/transform",
    )

    kms_key_arn = get_or_create_kms_key(sagemaker_session)
    output_filter = "$"

    transformer = _create_transformer_and_transform_job(
        mx,
        transform_s3,
        kms_key_arn,
        input_filter=None,
        output_filter=output_filter,
        join_source=None,
    )
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()

    sm_client = transformer.sagemaker_session.sagemaker_client
    transform_desc = sm_client.describe_transform_job(
        TransformJobName=transformer.latest_transform_job.name
    )
    assert kms_key_arn == transform_desc["TransformResources"]["VolumeKmsKeyId"]
    assert output_filter == transform_desc["DataProcessing"]["OutputFilter"]
def test_mxnet_neo(strftime, sagemaker_session, neo_mxnet_version):
    """Compile a trained MXNet estimator and deploy with and without the compiled model.

    Verifies the sequence of session calls made by ``fit`` and
    ``compile_model``, the arguments of the compile call, the compiled
    model's image URI, and that deploying to a different target raises.
    """
    mx = MXNet(
        entry_point=SCRIPT_PATH,
        framework_version="1.6",
        py_version="py3",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        base_job_name="sagemaker-mxnet",
    )
    mx.fit()

    input_shape = {"data": [100, 1, 28, 28]}
    output_location = "s3://neo-sdk-test"

    compiled_model = mx.compile_model(
        target_instance_family="ml_c4",
        input_shape=input_shape,
        output_path=output_location,
        framework="mxnet",
        framework_version=neo_mxnet_version,
    )

    expected_call_names = [
        "train",
        "logs_for_job",
        "sagemaker_client.describe_training_job",
        "compile_model",
        "wait_for_compilation_job",
    ]
    recorded_call_names = [call[0] for call in sagemaker_session.method_calls]
    assert recorded_call_names == expected_call_names

    expected_compile_args = _create_compilation_job(json.dumps(input_shape), output_location)
    # Index 3 is the "compile_model" call; element 2 of a call is its kwargs.
    actual_compile_args = sagemaker_session.method_calls[3][2]
    assert expected_compile_args == actual_compile_args

    assert compiled_model.image_uri == _neo_inference_image(neo_mxnet_version)

    predictor = mx.deploy(1, CPU, use_compiled_model=True)
    assert isinstance(predictor, MXNetPredictor)

    with pytest.raises(Exception) as wrong_target:
        mx.deploy(1, CPU_C5, use_compiled_model=True)
    assert str(wrong_target.value).startswith("No compiled model for")

    # deploy without sagemaker Neo should continue to work
    mx.deploy(1, CPU)
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    """Train and batch-transform MNIST with a VPC config and verify it is applied.

    The subnets and security group given to the estimator must appear in both
    the training job description and the model description.
    """
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    entry_script = os.path.join(mnist_dir, 'mnist.py')

    boto_session = sagemaker_session.boto_session
    ec2_client = boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    mx = MXNet(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train',
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test',
    )

    training_job_name = unique_name_from_base('test-mxnet-vpc')
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_s3, 'test': test_s3}, job_name=training_job_name)

    training_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name)
    training_vpc = training_desc['VpcConfig']
    assert set(subnet_ids) == set(training_vpc['Subnets'])
    assert [security_group_id] == training_vpc['SecurityGroupIds']

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'transform', 'data.csv'),
        key_prefix='integ-test-data/mxnet_mnist/transform',
    )

    transformer = _create_transformer_and_transform_job(mx, transform_s3)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        # Describe the model while still inside the context manager.
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_vpc = model_desc['VpcConfig']
        assert set(subnet_ids) == set(model_vpc['Subnets'])
        assert [security_group_id] == model_vpc['SecurityGroupIds']
def test_single_transformer_multiple_jobs(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """One transformer can run two transform jobs, each with its own output path.

    After each ``transform`` call the transformer's ``output_path`` must be
    ``s3://<default bucket>/<job name>`` for the job just started.
    """
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")

    mx = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "train"),
        key_prefix="integ-test-data/mxnet_mnist/train",
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "test"),
        key_prefix="integ-test-data/mxnet_mnist/test",
    )

    training_job_name = unique_name_from_base("test-mxnet-transform")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_s3, "test": test_s3}, job_name=training_job_name)

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "transform", "data.csv"),
        key_prefix="integ-test-data/mxnet_mnist/transform",
    )

    transformer = mx.transformer(1, cpu_instance_type)

    first_job_name = unique_name_from_base("test-mxnet-transform")
    transformer.transform(transform_s3, content_type="text/csv", job_name=first_job_name)

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        assert transformer.output_path == "s3://{}/{}".format(
            sagemaker_session.default_bucket(), first_job_name
        )

        second_job_name = unique_name_from_base("test-mxnet-transform")
        transformer.transform(transform_s3, content_type="text/csv", job_name=second_job_name)
        assert transformer.output_path == "s3://{}/{}".format(
            sagemaker_session.default_bucket(), second_job_name
        )
def test_keras_training(docker_image, sagemaker_local_session, local_instance_type,
                        framework_version, tmpdir):
    """Run the Keras MNIST script and check every entry in MODEL_SUCCESS_FILES exists."""
    keras_dir = os.path.join(RESOURCE_PATH, 'keras')
    entry_script = os.path.join(keras_dir, 'keras_mnist.py')

    estimator = MXNet(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        image_name=docker_image,
        framework_version=framework_version,
        output_path='file://{}'.format(tmpdir),
    )

    training_data = 'file://{}'.format(os.path.join(keras_dir, 'data'))
    estimator.fit({'train': training_data})

    output_root = str(tmpdir)
    for directory, files in MODEL_SUCCESS_FILES.items():
        local_mode_utils.assert_output_files_exist(output_root, directory, files)
def test_onnx_export(docker_image, sagemaker_local_session, local_instance_type,
                     framework_version, tmpdir):
    """Fit SCRIPT_PATH with no inputs and check that ``output/success`` was written."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        instance_count=1,
        instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        framework_version=framework_version,
        output_path='file://{}'.format(tmpdir),
    )
    estimator.fit()

    output_root = str(tmpdir)
    local_mode_utils.assert_output_files_exist(output_root, 'output', ['success'])
def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_version):
    """Train MNIST with a local session, run a local transform, and check the output file.

    The transform runs while holding an exclusive ``fcntl`` lock on
    ``LOCK_PATH``. The lock file is closed, and the lock released, even when
    the transform raises.
    """
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    mx = MXNet(entry_point=script_path, role='SageMakerRole',
               train_instance_count=1, train_instance_type='ml.c4.xlarge',
               framework_version=mxnet_full_version,
               sagemaker_session=sagemaker_local_session)

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=15):
        mx.fit({'train': train_input, 'test': test_input})

    transform_input_path = os.path.join(data_path, 'transform')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    output_path = 'file://%s' % (str(tmpdir))
    transformer = mx.transformer(1, 'local', assemble_with='Line', max_payload=1,
                                 strategy='SingleRecord', output_path=output_path)

    # Since Local Mode uses the same port for serving, we need a lock in order
    # to allow concurrent test execution.
    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        try:
            transformer.transform(transform_input, content_type='text/csv', split_type='Line')
            transformer.wait()
            time.sleep(5)
        finally:
            # Always release the lock so a failure here cannot block other tests.
            fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)

    assert os.path.exists(os.path.join(str(tmpdir), 'data.csv.out'))
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job with a custom image keeps that image on the estimator."""
    training_image = 'ubuntu:latest'

    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=job_description)

    attached = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.image_name == training_image
    assert attached.train_image() == training_image
def test_s3_input_mode(sagemaker_session, tuner):
    """The input mode given to ``s3_input`` is forwarded by ``tuner.fit``."""
    expected_input_mode = 'Pipe'

    entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
    estimator = MXNet(
        entry_point=entry_script,
        role=ROLE,
        framework_version=FRAMEWORK_VERSION,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=TRAIN_INSTANCE_TYPE,
        sagemaker_session=sagemaker_session,
    )
    tuner.estimator = estimator

    tuner.tags = [{'Name': 'some-tag-without-a-value'}]

    tuner._hyperparameter_ranges = {
        'num_components': IntegerParameter(2, 4),
        'algorithm_mode': CategoricalParameter(['regular', 'randomized']),
    }

    training_input = s3_input('s3://mybucket/train_manifest', input_mode=expected_input_mode)
    tuner.fit(inputs=training_input)

    # Second recorded session call; element 2 of a call holds its kwargs.
    actual_input_mode = sagemaker_session.method_calls[1][2]['input_mode']
    assert actual_input_mode == expected_input_mode
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job with a custom image keeps that image on the estimator."""
    training_image = 'ubuntu:latest'

    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=job_description)

    attached = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.image_name == training_image
    assert attached.train_image() == training_image
def test_onnx_export(docker_image, sagemaker_local_session, local_instance_type,
                     framework_version, tmpdir):
    """Fit on the ``mxnet_module`` data and check that ``model/model.onnx`` was written."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        image_name=docker_image,
        framework_version=framework_version,
        output_path='file://{}'.format(tmpdir),
    )

    training_data = 'file://{}'.format(os.path.join(ONNX_PATH, 'mxnet_module'))
    estimator.fit({'train': training_data})

    output_root = str(tmpdir)
    local_mode_utils.assert_output_files_exist(output_root, 'model', ['model.onnx'])
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job with a custom image keeps that image URI on the estimator."""
    training_image = "ubuntu:latest"

    algorithm_spec = {"TrainingInputMode": "File", "TrainingImage": training_image}
    hyperparameters = {
        "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
        "sagemaker_program": '"iris-dnn-classifier.py"',
        "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
        "sagemaker_container_log_level": '"logging.INFO"',
        "sagemaker_job_name": '"neo"',
        "training_steps": "100",
        "sagemaker_region": '"us-west-2"',
    }
    resource_config = {
        "VolumeSizeInGB": 30,
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",
    }
    job_description = {
        "AlgorithmSpecification": algorithm_spec,
        "HyperParameters": hyperparameters,
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": resource_config,
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=job_description
    )

    attached = MXNet.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert attached.image_uri == training_image
    assert attached.training_image_uri() == training_image
def mxnet_model(sagemaker_local_session):
    """Train the MNIST script with the local session and return a model created from it."""
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    entry_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')

    estimator = MXNet(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type='local',
        sagemaker_session=sagemaker_local_session,
    )

    train_data = estimator.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train',
    )
    test_data = estimator.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test',
    )

    estimator.fit({'train': train_data, 'test': test_data})
    return estimator.create_model(1)
def test_empty_framework_version(warning, sagemaker_session):
    """``framework_version=None`` falls back to ``defaults.MXNET_VERSION`` and warns."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=None,
    )

    assert estimator.framework_version == defaults.MXNET_VERSION
    warning.assert_called_with(defaults.MXNET_VERSION, estimator.LATEST_VERSION)
def test_s3_input_mode(sagemaker_session, tuner):
    """The input mode given to ``s3_input`` is forwarded by ``tuner.fit``."""
    expected_input_mode = "Pipe"

    entry_script = os.path.join(DATA_DIR, "mxnet_mnist", "failure_script.py")
    estimator = MXNet(
        entry_point=entry_script,
        role=ROLE,
        framework_version=FRAMEWORK_VERSION,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=TRAIN_INSTANCE_TYPE,
        sagemaker_session=sagemaker_session,
    )
    tuner.estimator = estimator

    tuner.tags = [{"Name": "some-tag-without-a-value"}]

    tuner._hyperparameter_ranges = {
        "num_components": IntegerParameter(2, 4),
        "algorithm_mode": CategoricalParameter(["regular", "randomized"]),
    }

    training_input = s3_input("s3://mybucket/train_manifest", input_mode=expected_input_mode)
    tuner.fit(inputs=training_input)

    # Second recorded session call; element 2 of a call holds its kwargs.
    actual_input_mode = sagemaker_session.method_calls[1][2]["input_mode"]
    assert actual_input_mode == expected_input_mode
def test_mxnet_airflow_config_uploads_data_source_to_s3(
    sagemaker_session, cpu_instance_type, mxnet_full_version
):
    """Building the Airflow training config puts data at the submit-directory S3 URL."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_dir = os.path.join(DATA_DIR, "chainer_mnist")
        entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

        estimator = MXNet(
            entry_point=entry_script,
            role=ROLE,
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        channels = {
            "train": "file://" + os.path.join(data_dir, "train"),
            "test": "file://" + os.path.join(data_dir, "test"),
        }

        training_config = _build_airflow_workflow(
            estimator=estimator,
            instance_type=cpu_instance_type,
            inputs=channels,
        )

        # The hyperparameter value is wrapped in double quotes; strip them
        # to get the bare S3 URL.
        submit_directory = training_config["HyperParameters"]["sagemaker_submit_directory"]
        _assert_that_s3_url_contains_data(sagemaker_session, submit_directory.strip('"'))
def test_transform_mxnet(sagemaker_session, mxnet_full_version):
    """Train MNIST, run a transform job, and verify its KMS key and output filter."""
    mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
    entry_script = os.path.join(mnist_dir, 'mnist.py')

    mx = MXNet(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train',
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test',
    )

    training_job_name = unique_name_from_base('test-mxnet-transform')
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_s3, 'test': test_s3}, job_name=training_job_name)

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, 'transform', 'data.csv'),
        key_prefix='integ-test-data/mxnet_mnist/transform',
    )

    kms_key_arn = get_or_create_kms_key(sagemaker_session)
    output_filter = "$"

    transformer = _create_transformer_and_transform_job(
        mx,
        transform_s3,
        kms_key_arn,
        input_filter=None,
        output_filter=output_filter,
        join_source=None,
    )
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()

    sm_client = transformer.sagemaker_session.sagemaker_client
    transform_desc = sm_client.describe_transform_job(
        TransformJobName=transformer.latest_transform_job.name)
    assert kms_key_arn == transform_desc['TransformResources']['VolumeKmsKeyId']
    assert output_filter == transform_desc['DataProcessing']['OutputFilter']
def test_requirements_file(image_uri, sagemaker_local_session, local_instance_type,
                           framework_version, tmpdir):
    """Fit ``entry.py`` from SOURCE_PATH and check the expected ``output`` files exist."""
    estimator = MXNet(
        entry_point='entry.py',
        source_dir=SOURCE_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        image_name=image_uri,
        framework_version=framework_version,
        output_path='file://{}'.format(tmpdir),
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    expected_files = MODEL_SUCCESS_FILES['output']
    local_mode_utils.assert_output_files_exist(str(tmpdir), 'output', expected_files)
def test_create_model_with_optional_params(sagemaker_session):
    """``role`` and ``model_server_workers`` passed to ``create_model`` land on the model."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        base_job_name='job',
        source_dir='s3://mybucket/source',
        enable_cloudwatch_metrics='true',
    )
    estimator.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    created_model = estimator.create_model(
        role=new_role, model_server_workers=model_server_workers)

    assert created_model.role == new_role
    assert created_model.model_server_workers == model_server_workers
def test_stop_transform_job(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Start a transform job, stop it, and check its status is "Stopped"."""
    mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")
    entry_script = os.path.join(mnist_dir, "mnist.py")
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    mx = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    train_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "train"),
        key_prefix="integ-test-data/mxnet_mnist/train",
    )
    test_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "test"),
        key_prefix="integ-test-data/mxnet_mnist/test",
    )

    training_job_name = unique_name_from_base("test-mxnet-transform")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_s3, "test": test_s3}, job_name=training_job_name)

    transform_s3 = mx.sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "transform", "data.csv"),
        key_prefix="integ-test-data/mxnet_mnist/transform",
    )

    transformer = mx.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_s3, content_type="text/csv")

    # Wait before asking for the job to be stopped.
    time.sleep(15)

    latest_transform_job_name = transformer.latest_transform_job.name
    print("Attempting to stop {}".format(latest_transform_job_name))

    transformer.stop_transform_job()

    sm_client = transformer.latest_transform_job.sagemaker_session.sagemaker_client
    transform_desc = sm_client.describe_transform_job(
        TransformJobName=latest_transform_job_name
    )
    assert transform_desc["TransformJobStatus"] == "Stopped"
def test_single_machine(docker_image, sagemaker_local_session, local_instance_type,
                        framework_version, tmpdir):
    """Train SCRIPT_PATH on one instance and delegate checks to ``_train_and_assert_success``."""
    estimator = MXNet(
        entry_point=SCRIPT_PATH,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=local_instance_type,
        sagemaker_session=sagemaker_local_session,
        image_name=docker_image,
        framework_version=framework_version,
        output_path='file://{}'.format(tmpdir),
    )

    output_root = str(tmpdir)
    _train_and_assert_success(estimator, output_root)
def create_mxnet_estimator(
    session: Session,
    descriptor: BenchmarkDescriptor,
    source_dir: str,
    config: SageMakerExecutorConfig,
) -> Framework:
    """Build an ``MXNet`` estimator for the given benchmark descriptor.

    The estimator arguments come from ``_create_common_estimator_args`` and
    the hyperparameters from ``get_hyper_params(descriptor)``. The common
    arguments are logged at INFO level before the estimator is built.
    """
    kwargs = _create_common_estimator_args(session, descriptor, source_dir, config)
    logger.info(f"Creating MXNet Estimator with parameters {kwargs}")

    hyper_params = get_hyper_params(descriptor)
    estimator = MXNet(**kwargs, hyperparameters=hyper_params)
    return estimator
def test_attach(sagemaker_session, mxnet_version):
    """``MXNet.attach`` rebuilds an estimator from a described training job.

    Every asserted attribute is compared against the value placed in the
    mocked ``describe_training_job`` response.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:{}-cpu-py2'.format(
        mxnet_version)

    role_arn = 'arn:aws:iam::366:role/SageMakerRole'
    max_runtime = 24 * 60 * 60

    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"',
        },
        'RoleArn': role_arn,
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': max_runtime},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=job_description)

    estimator = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py2'
    assert estimator.framework_version == mxnet_version
    assert estimator.role == role_arn
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == max_runtime
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
    assert estimator.tags == LIST_TAGS_RESULT['Tags']
def test_attach(sagemaker_session, mxnet_version):
    """Check that ``MXNet.attach`` rebuilds an estimator from a described job.

    The session's ``describe_training_job`` is mocked to return a fixed job
    description, and every assertion below compares an estimator attribute
    with the corresponding value in that description.
    """
    training_image = (
        "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:{}-cpu-py2".format(
            mxnet_version
        )
    )
    max_runtime = {"MaxRuntimeInSeconds": 24 * 60 * 60}
    output_config = {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}
    job_output = {"S3TrainingJobOutput": "s3://here/output.tar.gz"}

    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": max_runtime,
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": output_config,
        "TrainingJobOutput": job_output,
    }

    sagemaker_client = sagemaker_session.sagemaker_client
    sagemaker_client.describe_training_job = Mock(
        name="describe_training_job",
        return_value=returned_job_description,
    )

    estimator = MXNet.attach(
        training_job_name="neo",
        sagemaker_session=sagemaker_session,
    )

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == "py2"
    assert estimator.framework_version == mxnet_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
    assert estimator.tags == LIST_TAGS_RESULT["Tags"]
def test_mxnet_local_mode(sagemaker_local_session, mxnet_full_version):
    """Train the MNIST script on a 'local' instance, then deploy and predict.

    Training data is uploaded through the estimator's session; serving runs
    while holding ``local_mode_utils.lock()``, and the endpoint is deleted in
    a ``finally`` clause.
    """
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')

    mx = MXNet(
        entry_point=script_path,
        role='SageMakerRole',
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type='local',
        sagemaker_session=sagemaker_local_session,
        framework_version=mxnet_full_version,
    )

    train_dir = os.path.join(data_path, 'train')
    train_input = mx.sagemaker_session.upload_data(
        path=train_dir,
        key_prefix='integ-test-data/mxnet_mnist/train',
    )
    test_dir = os.path.join(data_path, 'test')
    test_input = mx.sagemaker_session.upload_data(
        path=test_dir,
        key_prefix='integ-test-data/mxnet_mnist/test',
    )

    channels = {'train': train_input, 'test': test_input}
    mx.fit(channels)
    endpoint_name = mx.latest_training_job.name

    with local_mode_utils.lock():
        try:
            predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
            # A single all-zero 28x28 image is enough to exercise the endpoint.
            blank_image = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(blank_image)
        finally:
            mx.delete_endpoint()
def test_mxnet_local_data_local_script(mxnet_full_version):
    """Train and serve MNIST locally from ``file://`` inputs with a LocalNoS3Session.

    Serving runs while holding ``lock.lock(LOCK_PATH)``; the endpoint is
    deleted in a ``finally`` clause.
    """
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(data_path, "mnist.py")

    estimator_kwargs = dict(
        entry_point=script_path,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="local",
        framework_version=mxnet_full_version,
        sagemaker_session=LocalNoS3Session(),
    )
    mx = MXNet(**estimator_kwargs)

    channels = {
        "train": "file://" + os.path.join(data_path, "train"),
        "test": "file://" + os.path.join(data_path, "test"),
    }
    mx.fit(channels)
    endpoint_name = mx.latest_training_job.name

    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(1, "local", endpoint_name=endpoint_name)
            # One all-zero 28x28 image is sent as the prediction payload.
            payload = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(payload)
        finally:
            mx.delete_endpoint()
def test_mxnet_local_mode(sagemaker_local_session, mxnet_full_version):
    """Run MNIST training, deployment and one prediction on a 'local' instance.

    Inputs are uploaded via the estimator's session. Serving happens under
    ``lock.lock(LOCK_PATH)`` and the endpoint is always deleted afterwards.
    """
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        framework_version=mxnet_full_version,
    )

    train_path = os.path.join(data_path, "train")
    train_input = mx.sagemaker_session.upload_data(
        path=train_path, key_prefix="integ-test-data/mxnet_mnist/train"
    )
    test_path = os.path.join(data_path, "test")
    test_input = mx.sagemaker_session.upload_data(
        path=test_path, key_prefix="integ-test-data/mxnet_mnist/test"
    )

    inputs = {"train": train_input, "test": test_input}
    mx.fit(inputs)
    endpoint_name = mx.latest_training_job.name

    with lock.lock(LOCK_PATH):
        try:
            predictor = mx.deploy(1, "local", endpoint_name=endpoint_name)
            zeros = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(zeros)
        finally:
            mx.delete_endpoint()
def test_mxnet_local_data_local_script():
    """Train and serve MNIST in local mode using ``file://`` inputs.

    Training runs with a ``LocalNoS3Session``. Serving is guarded by an
    exclusive ``fcntl`` lock on ``LOCK_PATH``. The lock file is opened with a
    context manager so it is closed on every exit path, the lock is acquired
    before the ``try`` so cleanup only runs once the lock is actually held,
    and the unlock sits in its own ``finally`` so it still happens if
    ``delete_endpoint`` raises.
    """
    script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

    mx = MXNet(entry_point=script_path, role='SageMakerRole',
               train_instance_count=1, train_instance_type='local',
               sagemaker_session=LocalNoS3Session())

    train_input = 'file://' + os.path.join(data_path, 'train')
    test_input = 'file://' + os.path.join(data_path, 'test')

    mx.fit({'train': train_input, 'test': test_input})
    endpoint_name = mx.latest_training_job.name

    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        try:
            predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
            data = numpy.zeros(shape=(1, 1, 28, 28))
            predictor.predict(data)
        finally:
            try:
                mx.delete_endpoint()
                # NOTE(review): presumably gives the local container time to
                # release the serving port before the next test deploys - confirm.
                time.sleep(5)
            finally:
                fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
def test_create_model_with_custom_image(sagemaker_session):
    """Verify ``create_model`` carries the estimator's custom image and settings to the model."""
    custom_image = 'mxnet:2.0'
    source_dir = 's3://mybucket/source'
    container_log_level = '"logging.INFO"'
    enable_cloudwatch_metrics = 'true'
    job_name = 'new_name'

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=container_log_level,
        base_job_name='job',
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )
    mx.fit(inputs='s3://mybucket/train', job_name=job_name)

    model = mx.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
def test_transform_mxnet(sagemaker_session):
    """Train MNIST on ml.c4.xlarge, then run a transform job over an uploaded CSV and wait for it."""
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    mx = MXNet(
        entry_point=script_path,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
    )

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train',
    )
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test',
    )

    # Training is bounded by the default training timeout.
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path,
        key_prefix=transform_input_key_prefix,
    )

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    transformer.wait()
def test_attach(sagemaker_session, mxnet_version):
    """Attach to a mocked 'neo' training job and compare the estimator with its description."""
    training_image = (
        '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:{}-cpu-py2'
        .format(mxnet_version)
    )

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'sagemaker_region': '"us-west-2"',
    }
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo',
        },
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    # Replace the client call so attach() reads the canned description above.
    describe_training_job = Mock(name='describe_training_job',
                                 return_value=returned_job_description)
    sagemaker_session.sagemaker_client.describe_training_job = describe_training_job

    estimator = MXNet.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py2'
    assert estimator.framework_version == mxnet_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
def test_mxnet(strftime, sagemaker_session, mxnet_version):
    """Fit, create a model and deploy with a mocked session, checking each step.

    Verifies which session and boto methods ``fit`` invoked, the arguments
    passed to ``train``, the container definition the model prepares for GPU
    and CPU instance types, and the predictor type returned by ``deploy``.
    """
    inputs = 's3://mybucket/train'

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=mxnet_version,
    )
    mx.fit(inputs=inputs)

    # Each recorded call is a (name, args, kwargs) triple; index 0 is the name.
    sagemaker_call_names = [call[0] for call in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']

    boto_call_names = [call[0] for call in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(mxnet_version)
    s3_data_source = expected_train_args['input_config'][0]['DataSource']['S3DataSource']
    s3_data_source['S3Uri'] = inputs

    # Keyword arguments of the first recorded call, i.e. 'train'.
    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = mx.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:{}-gpu-py2'
    submit_directory = 's3://mybucket/sagemaker-mxnet-{}/source/sourcedir.tar.gz'.format(TIMESTAMP)
    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': submit_directory,
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': expected_image_base.format(mxnet_version),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }
    assert environment == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']

    predictor = mx.deploy(1, GPU)
    assert isinstance(predictor, MXNetPredictor)
def test_train_image_default(sagemaker_session):
    """With no framework_version given, the training image should contain the default-version URI."""
    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    expected_uri = _get_full_image_uri(defaults.MXNET_VERSION)
    assert expected_uri in mx.train_image()