def test_tf(sagemaker_session, tf_full_version):
    """Train the iris DNN classifier, deploy it, and compare two request shapes.

    The same feature vector is sent once wrapped in a dict keyed by
    ``'inputs'`` and once as a bare list; both predictions must be equal.
    The endpoint is named after the training job and is handed to
    ``timeout_and_delete_endpoint_by_name`` for the deploy/predict phase.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator_kwargs = dict(
            entry_point=entry_script,
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='test-tf',
        )
        estimator = TensorFlow(**estimator_kwargs)

        training_data = sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf_iris',
        )
        estimator.fit(training_data)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type='ml.c4.xlarge',
            endpoint_name=endpoint_name,
        )

        sample = [6.4, 3.2, 4.5, 1.5]

        # Same features, two payload shapes: dict-wrapped and bare list.
        from_dict = predictor.predict({'inputs': sample})
        print('predict result: {}'.format(from_dict))
        from_list = predictor.predict(sample)
        print('predict result: {}'.format(from_list))

        assert from_dict == from_list
def test_cifar(sagemaker_session, tf_full_version):
    """Train the CIFAR-10 ResNet on two GPU instances, deploy it, and predict.

    Training runs with ``logs=False`` under a 45 minute timeout. The deployed
    predictor is switched to pickle serialization and fed one random
    ``(32, 32, 3)`` array; the response must contain 10 probabilities.
    """
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(
            entry_point='resnet_cifar_10.py',
            source_dir=script_path,
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=500,
            evaluation_steps=5,
            train_instance_count=2,
            train_instance_type='ml.p2.xlarge',
            sagemaker_session=sagemaker_session,
            train_max_run=45 * 60,
            base_job_name='test-cifar',
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path,
            key_prefix='data/cifar10',
        )

        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        # Pass the endpoint name explicitly so the endpoint that gets created
        # is the one the surrounding context manager was told to delete,
        # rather than relying on deploy()'s default naming.
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type='ml.p2.xlarge',
            endpoint_name=endpoint_name,
        )
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_tf_async(sagemaker_session):
    """Start iris training without waiting, then attach to the job and deploy.

    ``fit`` is called with ``wait=False``; after a 20 second pause a fresh
    estimator is obtained through ``TensorFlow.attach`` using the job name,
    deployed to an endpoint of the same name, and queried once.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        iris_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        launcher = TensorFlow(
            entry_point=iris_script,
            role='SageMakerRole',
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='test-tf',
        )

        uploaded = launcher.sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf_iris',
        )
        launcher.fit(uploaded, wait=False)
        training_job_name = launcher.latest_training_job.name
        time.sleep(20)

    # The endpoint reuses the training job's name.
    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = TensorFlow.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        predictor = attached.deploy(
            initial_instance_count=1,
            instance_type='ml.c4.xlarge',
            endpoint_name=endpoint_name,
        )

        prediction = predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(prediction))
def test_failed_tf_training(sagemaker_session, tf_full_version):
    """Run a failing training script inside a VPC and check the reported config.

    ``fit`` must raise ``ValueError`` containing 'This failure is expected'.
    The training job description is then fetched and its ``VpcConfig`` must
    list exactly the subnet and security group given to the estimator.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'iris', 'failure_script.py')

        ec2 = sagemaker_session.boto_session.client('ec2')
        subnet, security_group_id = get_or_create_subnet_and_security_group(ec2, VPC_NAME)

        estimator = TensorFlow(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            subnets=[subnet],
            security_group_ids=[security_group_id],
        )

        uploaded = estimator.sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf-failure',
        )

        with pytest.raises(ValueError) as raised:
            estimator.fit(uploaded)
        assert 'This failure is expected' in str(raised.value)

        sm_client = estimator.sagemaker_session.sagemaker_client
        description = sm_client.describe_training_job(
            TrainingJobName=estimator.latest_training_job.name)
        vpc_config = description['VpcConfig']
        assert [subnet] == vpc_config['Subnets']
        assert [security_group_id] == vpc_config['SecurityGroupIds']
def test_run_tensorboard_locally_without_awscli_binary(time, strftime, popen, call, access, sagemaker_session):
    """Expect ``EnvironmentError`` with install guidance when fitting with TensorBoard.

    NOTE(review): ``time``, ``strftime``, ``popen``, ``call`` and ``access``
    are presumably mocks injected by patch decorators outside this view.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    expected_message = (
        'The AWS CLI is not installed in the system. Please install the AWS CLI using the '
        'following command: \n pip install awscli'
    )

    with pytest.raises(EnvironmentError) as raised:
        estimator.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)
    assert str(raised.value) == expected_message
def test_train_image_default(sagemaker_session):
    """With no framework version given, the training image contains the default CPU URI."""
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    expected_uri = _get_full_cpu_image_uri(defaults.TF_VERSION)
    assert expected_uri in estimator.train_image()
def test_run_tensorboard_locally(sleep, time, strftime, popen, call, access, rmtree, mkdtemp, sync, sagemaker_session):
    """Fitting with ``run_tensorboard_locally=True`` launches TensorBoard on port 6006.

    NOTE(review): every parameter except ``sagemaker_session`` is presumably a
    mock injected by patch decorators outside this view.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    # poll() returning None on the spawned process object.
    popen().poll.return_value = None

    estimator.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    expected_command = [
        'tensorboard',
        '--logdir', '/my/temp/folder',
        '--host', 'localhost',
        '--port', '6006',
    ]
    popen.assert_called_with(expected_command, stderr=-1, stdout=-1)
def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call, access, socket, rmtree, mkdtemp, sync, sagemaker_session):
    """TensorBoard is launched on port 6006 and then on 6007.

    ``poll`` is set up to return ``-1`` on the first call and ``None`` on the
    second; the test then expects one launch per port.

    NOTE(review): every parameter except ``sagemaker_session`` is presumably a
    mock injected by patch decorators outside this view.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    popen().poll.side_effect = [-1, None]

    estimator.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    command_prefix = ['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port']
    for port in ('6006', '6007'):
        popen.assert_any_call(command_prefix + [port], stderr=-1, stdout=-1)
def test_create_model_with_custom_image(sagemaker_session):
    """A model created from an estimator with ``image_name`` set keeps that image."""
    custom_image = 'tensorflow:1.0'

    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level='"logging.INFO"',
        base_job_name='job',
        source_dir='s3://mybucket/source',
    )
    estimator = TensorFlow(**estimator_kwargs)

    estimator.fit(inputs='s3://mybucket/train', job_name='doing something')
    created = estimator.create_model()

    assert created.image == custom_image
def test_attach_custom_image(sagemaker_session):
    """Attaching to a job that used a custom image preserves the image.

    ``describe_training_job`` is replaced by a ``Mock`` returning a canned job
    description whose ``TrainingImage`` is a custom ECR URI.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/tensorflow_with_custom_binary:1.0'

    hyperparameters = {
        'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
        'checkpoint_path': '"s3://other/1508872349"',
        'sagemaker_program': '"iris-dnn-classifier.py"',
        'sagemaker_enable_cloudwatch_metrics': 'false',
        'sagemaker_container_log_level': '"logging.INFO"',
        'sagemaker_job_name': '"neo"',
        'training_steps': '100',
        'evaluation_steps': '10',
    }
    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': hyperparameters,
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job',
        return_value=job_description,
    )

    attached = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert attached.image_name == training_image
    assert attached.train_image() == training_image
def test_failed_tf_training(sagemaker_session, tf_full_version):
    """A failing training script surfaces as ``ValueError`` from ``fit``.

    The raised message must contain 'This failure is expected'.
    """
    with timeout(minutes=15):
        failing_script = os.path.join(DATA_DIR, 'iris', 'failure_script.py')

        estimator = TensorFlow(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        uploaded = estimator.sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf-failure',
        )

        with pytest.raises(ValueError) as raised:
            estimator.fit(uploaded)
        assert 'This failure is expected' in str(raised.value)
def test_create_model_with_optional_params(sagemaker_session):
    """``create_model`` overrides for role and worker count land on the model."""
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        base_job_name='job',
        source_dir='s3://mybucket/source',
        enable_cloudwatch_metrics='true',
    )
    estimator.fit(inputs='s3://mybucket/train', job_name='doing something')

    new_role = 'role'
    model_server_workers = 2
    overrides = {'role': new_role, 'model_server_workers': model_server_workers}
    created = estimator.create_model(**overrides)

    assert created.role == new_role
    assert created.model_server_workers == model_server_workers
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-TensorFlow image raises ``ValueError``.

    ``describe_training_job`` is replaced by a ``Mock`` returning a job
    description whose training image is an MXNet image. The error message
    must mention that the job didn't use an image for the requested framework.
    """
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0'
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    # `error` is pytest's ExceptionInfo wrapper; the raised exception (and its
    # message) lives on `.value`. str(error) is not the exception message on
    # current pytest versions.
    assert "didn't use image for requested framework" in str(error.value)
def test_mnist_with_checkpoint_config(sagemaker_session, instance_type, tf_full_version):
    """Train MNIST in script mode with a checkpoint config and verify it round-trips.

    After training, the expected model files must exist under
    ``estimator.model_dir``, the training-job analytics dataframe must be
    non-empty, and the ``CheckpointConfig`` reported by
    ``describe_training_job`` must equal the S3 URI / local path passed in.
    """
    bucket = sagemaker_session.default_bucket()
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(bucket, sagemaker_timestamp())
    checkpoint_local_path = "/test/checkpoint/path"

    step_metric = {"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=tf_full_version,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[step_metric],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
    )

    mnist_data = os.path.join(MNIST_RESOURCE_PATH, "data")
    inputs = estimator.sagemaker_session.upload_data(
        path=mnist_data, key_prefix="scriptmode/mnist"
    )

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)

    metrics_frame = estimator.training_job_analytics.dataframe()
    assert metrics_frame.size > 0

    expected_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    assert job_description["CheckpointConfig"] == expected_checkpoint_config
def main(args):
    """Configure and launch a SageMaker TensorFlow training job from ``args``.

    A timestamp (day-month-year-hour-minute) is appended to ``args.s3_path``
    and also used in the job name, which is built from ``args.user_id`` and
    ``args.instance_name``. Hyperparameters returned by
    ``format_hyperparameters(args)`` are merged over a base configuration and
    passed to the estimator, which is fitted on the ``coco`` and ``weights``
    channels with MPI distribution enabled.

    Args:
        args: Parsed arguments object. Reads ``num_workers_per_host``,
            ``data_channel``, ``weights_channel``, ``s3_path``, ``user_id``,
            ``instance_name``, ``main_script``, ``source_dir``, ``image``,
            ``sagemaker_role``, ``instance_count`` and ``instance_type``.
    """
    # Local import keeps this block self-contained. ``args.s3_path`` is
    # presumably an S3 URI (it feeds ``output_path``), which always uses
    # forward slashes; os.path.join would use the host OS separator.
    import posixpath

    hyperparameters = format_hyperparameters(args)

    now = datetime.now()
    time_str = now.strftime("%d-%m-%Y-%H-%M")

    distributions = {
        "mpi": {
            "enabled": True,
            "processes_per_host": args.num_workers_per_host,
            "custom_mpi_options": "-x OMPI_MCA_btl_vader_single_copy_mechanism=none -x TF_CUDNN_USE_AUTOTUNE=0"
        }
    }

    channels = {'coco': args.data_channel, 'weights': args.weights_channel}

    s3_path = posixpath.join(args.s3_path, time_str)
    job_name = '{}-{}-{}'.format(args.user_id, args.instance_name, time_str)
    output_path = posixpath.join(s3_path, "output", job_name)

    configuration = {
        'configuration': 'ci/frcnn/sagemaker_default_model_config.py',
        's3_path': s3_path,
        'instance_name': job_name
    }
    # Values from format_hyperparameters win over the base entries above.
    configuration.update(hyperparameters)

    estimator = TensorFlow(
        entry_point=args.main_script,
        source_dir=args.source_dir,
        image_name=args.image,
        role=args.sagemaker_role,
        framework_version="2.1.0",
        py_version="py3",
        train_instance_count=args.instance_count,
        train_instance_type=args.instance_type,
        distributions=distributions,
        output_path=output_path,
        train_volume_size=200,
        hyperparameters=configuration,
    )

    # wait=True: the lines below run only after fit() returns.
    estimator.fit(channels, wait=True, logs='All', job_name=job_name)

    print("Started Sagemaker job: {}".format(job_name))
    pprint.pprint(configuration)
def test_transformer_creation_with_endpoint_type(create_model, sagemaker_session):
    """``transformer`` forwards endpoint type and entry point to ``create_model``.

    ``create_model`` is configured to return a ``Mock`` model; the test then
    checks the arguments passed to ``create_model`` and to the returned
    model's ``transformer`` method.
    """
    fake_model = Mock()
    create_model.return_value = fake_model

    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    estimator.transformer(
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        endpoint_type="tensorflow-serving",
        entry_point=SERVING_SCRIPT_FILE,
    )

    expected_create_model_kwargs = {
        "endpoint_type": "tensorflow-serving",
        "model_server_workers": None,
        "role": ROLE,
        "vpc_config_override": "VPC_CONFIG_DEFAULT",
        "entry_point": SERVING_SCRIPT_FILE,
    }
    create_model.assert_called_with(**expected_create_model_kwargs)

    expected_transformer_kwargs = {
        "accept": None,
        "assemble_with": None,
        "env": None,
        "max_concurrent_transforms": None,
        "max_payload": None,
        "output_kms_key": None,
        "output_path": None,
        "strategy": None,
        "tags": None,
        "volume_kms_key": None,
    }
    fake_model.transformer.assert_called_with(
        INSTANCE_COUNT, INSTANCE_TYPE, **expected_transformer_kwargs
    )
def test_mnist(sagemaker_session, instance_type):
    """Train MNIST on the latest TensorFlow version and check artifacts and metrics.

    After ``fit`` the expected model files must exist under
    ``estimator.model_dir`` and the training-job analytics dataframe (which
    is also printed) must be non-empty.
    """
    step_metric = {'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version='py3',
        framework_version=TensorFlow.LATEST_VERSION,
        metric_definitions=[step_metric],
        base_job_name=unique_name_from_base('test-tf-sm-mnist'),
    )

    data_dir = os.path.join(RESOURCE_PATH, 'data')
    uploaded = estimator.sagemaker_session.upload_data(
        path=data_dir,
        key_prefix='scriptmode/mnist',
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(uploaded)

    expected_files = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta']
    _assert_s3_files_exist(estimator.model_dir, expected_files)

    metrics_frame = estimator.training_job_analytics.dataframe()
    print(metrics_frame)
    assert metrics_frame.size > 0
def test_tf_script_mode_attach(sagemaker_session, tf_version):
    """Attach to a script-mode job and verify the reconstructed estimator.

    ``describe_training_job`` is replaced by a ``Mock`` returning a canned
    description whose training image is a py3 CPU TensorFlow image tagged
    with ``tf_version``. The attached estimator's attributes must match the
    values in that description.
    """
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py3-cpu:{}-cpu-py3'.format(
        tf_version)
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py3'
    assert estimator.framework_version == tf_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    # The input_mode check used to be repeated on two consecutive lines.
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters() is not None
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version):
    """Train MNIST on two instances with the parameter-server hyperparameter enabled.

    After training, a checkpoint must exist under ``estimator.model_dir`` and
    the ``estimator.model_data`` object must exist in S3.
    """
    resources = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
    mnist_root = os.path.join(resources, "mnist")

    estimator = TensorFlow(
        entry_point=os.path.join(mnist_root, "mnist_estimator.py"),
        role="SageMakerRole",
        hyperparameters={"sagemaker_parameter_server_enabled": True},
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
    )

    uploaded = estimator.sagemaker_session.upload_data(
        path=os.path.join(mnist_root, "data-distributed"),
        key_prefix="scriptmode/mnist-distributed",
    )

    job_name = unique_name_from_base("test-tf-sm-distributed-mnist")
    estimator.fit(uploaded, job_name=job_name)

    region = sagemaker_session.boto_region_name
    _assert_checkpoint_exists(region, estimator.model_dir, 0)
    _assert_s3_file_exists(region, estimator.model_data)
def test_distributed_training_smdataparallel_script_mode(
        sagemaker_session, instance_type, ecr_image, tmpdir, framework_version):
    """
    Run the SMDataParallel single-node MNIST shell script via script mode.

    The ``instance_type`` argument is overridden with ``ml.p3.16xlarge``.

    TODO: Enable debugger_hook_config post smdebug support for smdataparallel
    """
    # Deliberately replaces the fixture-provided value.
    instance_type = "ml.p3.16xlarge"

    estimator_kwargs = dict(
        entry_point='smdataparallel_mnist_script_mode.sh',
        source_dir=MNIST_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=1,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version='py3',
        debugger_hook_config=False,
        sagemaker_session=sagemaker_session,
    )
    estimator = TensorFlow(**estimator_kwargs)

    job_name = unique_name_from_base('test-tf-smdataparallel')
    estimator.fit(job_name=job_name)
def test_distributed_training_smdataparallel_script_mode(
        n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version):
    """
    Run the SMDataParallel single-node MNIST shell script via script mode.

    ``validate_or_skip_smdataparallel`` is called on the image first, the
    ``instance_type`` argument is overridden with ``ml.p3.16xlarge``, and the
    estimator is created with the smdistributed dataparallel distribution
    enabled.
    """
    validate_or_skip_smdataparallel(n_virginia_ecr_image)

    # Deliberately replaces the fixture-provided value.
    instance_type = "ml.p3.16xlarge"
    dataparallel_enabled = {"smdistributed": {"dataparallel": {"enabled": True}}}

    estimator_kwargs = dict(
        entry_point='smdataparallel_mnist_script_mode.sh',
        source_dir=MNIST_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=1,
        image_uri=n_virginia_ecr_image,
        framework_version=framework_version,
        py_version='py3',
        sagemaker_session=n_virginia_sagemaker_session,
        distribution=dataparallel_enabled,
    )
    estimator = TensorFlow(**estimator_kwargs)

    job_name = unique_name_from_base('test-tf-smdataparallel')
    estimator.fit(job_name=job_name)
def test_mnist_distributed(sagemaker_session, instance_type):
    """Train MNIST on two instances with ``PARAMETER_SERVER_DISTRIBUTION``.

    After training, the expected model files must exist under
    ``estimator.model_dir``.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=tests.integ.PYTHON_VERSION,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        distributions=PARAMETER_SERVER_DISTRIBUTION,
    )

    data_dir = os.path.join(RESOURCE_PATH, 'data')
    uploaded = estimator.sagemaker_session.upload_data(
        path=data_dir,
        key_prefix='scriptmode/distributed_mnist',
    )

    timeout_minutes = tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES
    with tests.integ.timeout.timeout(minutes=timeout_minutes):
        job_name = unique_name_from_base('test-tf-sm-distributed')
        estimator.fit(inputs=uploaded, job_name=job_name)

    expected_files = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta']
    _assert_s3_files_exist(estimator.model_dir, expected_files)
def test_attach(sagemaker_session, tensorflow_training_version, tensorflow_training_py_version):
    """Attach to a completed job and verify the reconstructed estimator.

    Skipped for TensorFlow versions above 1.12. ``describe_training_job`` is
    replaced by a ``Mock`` returning a canned description whose training
    image comes from ``image_uris.retrieve``; the attached estimator's
    attributes must match the values in that description.
    """
    if Version(tensorflow_training_version) > Version("1.12"):
        pytest.skip("framework_name_from_image doesn't infer info from DLC image URIs.")

    training_image = image_uris.retrieve(
        "tensorflow",
        region=REGION,
        version=tensorflow_training_version,
        py_version=tensorflow_training_py_version,
        instance_type="ml.c4.xlarge",
        image_scope="training",
    )
    rjd = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == tensorflow_training_py_version
    assert estimator.framework_version == tensorflow_training_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.instance_count == 1
    assert estimator.max_run == 24 * 60 * 60
    # The input_mode check used to be repeated on two consecutive lines.
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters() is not None
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
    assert estimator.training_image_uri() == training_image
def test_native(self, sagemaker_session, ecr_image, framework_version, instance_type, instance_count, tmpdir, mnist_dataset, capsys):
    """Train MNIST with the training-compiler hyperparameter enabled.

    After training, the model must have been exported to S3 and the captured
    output is passed to ``_assert_training_compiler_invoked``.
    """
    mnist_script = os.path.join(resource_path, 'mnist', 'mnist.py')
    compiler_hyperparameters = {
        TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
    }

    estimator = TensorFlow(
        entry_point=mnist_script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters=compiler_hyperparameters,
    )

    job_name = unique_name_from_base('test-TF-trcomp-DT')
    estimator.fit(mnist_dataset, job_name=job_name)

    _assert_model_exported_to_s3(estimator)
    _assert_training_compiler_invoked(capsys.readouterr())
def test_empty_framework_version(warning, sagemaker_session):
    """``framework_version=None`` falls back to the default and triggers the warning.

    NOTE(review): ``warning`` is presumably a mock injected by a patch
    decorator outside this view.
    """
    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=None,
    )
    estimator = TensorFlow(**estimator_kwargs)

    default_version = defaults.TF_VERSION
    assert estimator.framework_version == default_version
    warning.assert_called_with(default_version, estimator.LATEST_VERSION)
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference

    Two instances are used with inter-container traffic encryption enabled.
    A two-job hyperparameter tuning run over ``learning_rate`` minimizes the
    ``loss`` metric and is waited on inside the tuning timeout.
    """
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    iris_script = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    ec2 = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2, security_group_id)

    estimator = TensorFlow(
        entry_point=iris_script,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    uploaded = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")

    search_space = {"learning_rate": ContinuousParameter(0.05, 0.2)}
    objective = "loss"
    loss_metric = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective,
        search_space,
        loss_metric,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(uploaded, job_name=tuning_job_name)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Short pause before blocking on the tuning job.
        time.sleep(15)
        tuner.wait()
def test_create_model_with_custom_image(sagemaker_session):
    """A model created from an estimator with ``image_uri`` set keeps that URI."""
    custom_image = "tensorflow:1.0"

    estimator_kwargs = dict(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level='"logging.INFO"',
        base_job_name="job",
        source_dir="s3://mybucket/source",
    )
    estimator = TensorFlow(**estimator_kwargs)

    estimator.fit(inputs="s3://mybucket/train", job_name="doing something")
    created = estimator.create_model()

    assert created.image_uri == custom_image
def main():
    """Download the data, then train the CIFAR-10 script with a ``local_gpu`` estimator.

    Training and validation channels are read from ``file://`` paths under
    ``./data``. Progress messages are printed before and after training.
    """
    download_training_and_eval_data()

    print('Starting model training.')
    first_run_note = (
        'Note: if launching for the first time in local mode, '
        'container image download might take a few minutes to complete.'
    )
    print(first_run_note)

    estimator_settings = dict(
        entry_point='cifar10_tf2.py',
        source_dir='source_dir',
        role=DUMMY_IAM_ROLE,
        instance_count=1,
        instance_type='local_gpu',
        framework_version='2.4.1',
        py_version='py37',
    )
    cifar10_estimator = TensorFlow(**estimator_settings)

    channels = {
        'training': 'file://./data/training',
        'validation': 'file://./data/validation',
    }
    cifar10_estimator.fit(channels)

    print('Completed model training')
def run_test(sagemaker_session, ecr_image, instance_type, framework_version, test_data, record_wrapper_type=None):
    """Train the pipe-mode script on ``test_data`` through a Pipe input channel.

    Args:
        sagemaker_session: Session passed to the estimator.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Training instance type.
        framework_version: TensorFlow framework version for the estimator.
        test_data: Value passed as ``s3_data`` to ``s3_input``.
        record_wrapper_type: Optional ``record_wrapping`` value for the input.
    """
    pipemode_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'resources', 'pipemode')
    entry_script = os.path.join(pipemode_dir, 'pipemode.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=1,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        script_mode=True,
        input_mode='Pipe',
        hyperparameters={'dimension': DIMENSION},
    )

    pipe_channel = s3_input(
        s3_data=test_data,
        distribution='FullyReplicated',
        record_wrapping=record_wrapper_type,
        input_mode='Pipe',
    )

    with timeout(minutes=20):
        job_name = unique_name_from_base('test-sagemaker-pipemode')
        estimator.fit({'elizabeth': pipe_channel}, job_name=job_name)
def test_create_model(sagemaker_session, tf_version):
    """``create_model`` copies session, versions, script and naming from the estimator."""
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    job_name = 'doing something'

    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        container_log_level=container_log_level,
        base_job_name='job',
        source_dir=source_dir,
    )
    estimator.fit(inputs='s3://mybucket/train', job_name=job_name)

    created = estimator.create_model()

    assert created.sagemaker_session == sagemaker_session
    assert created.framework_version == tf_version
    assert created.py_version == estimator.py_version
    assert created.entry_point == SCRIPT_PATH
    assert created.role == ROLE
    assert created.name == job_name
    assert created.container_log_level == container_log_level
    assert created.source_dir == source_dir
def _test_distributed_mnist_ps_function(ecr_image, sagemaker_session, instance_type, framework_version):
    """Train MNIST on two instances with the parameter-server hyperparameter enabled.

    After training, a checkpoint must exist under ``estimator.model_dir``.
    """
    resources = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    mnist_root = os.path.join(resources, 'mnist')

    estimator = TensorFlow(
        entry_point=os.path.join(mnist_root, 'mnist_estimator.py'),
        role='SageMakerRole',
        hyperparameters={'sagemaker_parameter_server_enabled': True},
        instance_count=2,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
    )

    uploaded = estimator.sagemaker_session.upload_data(
        path=os.path.join(mnist_root, 'data-distributed'),
        key_prefix='scriptmode/mnist-distributed',
    )

    job_name = unique_name_from_base('test-tf-sm-distributed-mnist')
    estimator.fit(uploaded, job_name=job_name)

    _assert_checkpoint_exists(sagemaker_session.boto_region_name, estimator.model_dir, 0)
def tf_training_job(sagemaker_session, tf_full_version):
    """Run the iris DNN classifier training job and return its name.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        iris_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator_kwargs = dict(
            entry_point=iris_script,
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='test-tf',
        )
        estimator = TensorFlow(**estimator_kwargs)

        uploaded = sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf_iris',
        )
        estimator.fit(uploaded)

        print('job succeeded: {}'.format(estimator.latest_training_job.name))
        return estimator.latest_training_job.name
def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version):
    """Train the MNIST smdebug script and check the model artifact exists in S3.

    The script receives an ``smdebug_path`` hyperparameter of
    ``/tmp/ml/output/tensors``.
    """
    resources = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    mnist_root = os.path.join(resources, 'mnist')

    estimator = TensorFlow(
        entry_point=os.path.join(mnist_root, 'mnist_smdebug.py'),
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={'smdebug_path': '/tmp/ml/output/tensors'},
    )

    uploaded = estimator.sagemaker_session.upload_data(
        path=os.path.join(mnist_root, 'data'),
        key_prefix='scriptmode/mnist_smdebug',
    )

    job_name = unique_name_from_base('test-sagemaker-mnist-smdebug')
    estimator.fit(uploaded, job_name=job_name)

    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
def test_create_model_with_optional_params(sagemaker_session):
    """``create_model`` overrides for role, worker count and VPC config land on the model."""
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level='"logging.INFO"',
        base_job_name='job',
        source_dir='s3://mybucket/source',
        enable_cloudwatch_metrics='true',
    )
    estimator.fit(inputs='s3://mybucket/train', job_name='doing something')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}

    created = estimator.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
    )

    assert created.role == new_role
    assert created.model_server_workers == model_server_workers
    assert created.vpc_config == vpc_config
def test_inference_compiler_neo(self, sagemaker_session, ecr_image, framework_version, instance_type, instance_count, tmpdir, capsys, mnist_dataset):
    """Train MNIST with the training compiler enabled, then call ``compile_model``.

    After training, the model must have been exported to S3 and the captured
    output is passed to ``_assert_training_compiler_invoked``. The compile
    output path is the model data location with the trailing
    ``output/model.tar.gz`` removed.
    """
    mnist_script = os.path.join(resource_path, 'mnist', 'mnist.py')
    compiler_hyperparameters = {
        TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
    }

    estimator = TensorFlow(
        entry_point=mnist_script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters=compiler_hyperparameters,
    )

    job_name = unique_name_from_base('test-TF-trcomp-serving')
    estimator.fit(mnist_dataset, job_name=job_name)

    _assert_model_exported_to_s3(estimator)
    _assert_training_compiler_invoked(capsys.readouterr())

    compile_output_prefix = estimator.model_data.replace('output/model.tar.gz', '')
    compile_settings = dict(
        target_instance_family='ml_p3',
        input_shape={'data': [1, 28, 28]},
        output_path=compile_output_prefix,
        framework='keras',
        framework_version='2.6.0',
    )
    estimator.compile_model(**compile_settings)
def test_tf_async(sagemaker_session):
    """Start iris training without waiting, then attach to the job and deploy it.

    The estimator is re-created through ``TensorFlow.attach`` from the training
    job name before an endpoint is deployed and queried once.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator_kwargs = {
            'entry_point': entry_script,
            'role': 'SageMakerRole',
            'training_steps': 1,
            'evaluation_steps': 1,
            'hyperparameters': {'input_tensor_name': 'inputs'},
            'train_instance_count': 1,
            'train_instance_type': 'ml.c4.xlarge',
            'sagemaker_session': sagemaker_session,
            'base_job_name': 'test-tf',
        }
        estimator = TensorFlow(**estimator_kwargs)

        training_data = estimator.sagemaker_session.upload_data(
            path=DATA_PATH,
            key_prefix='integ-test-data/tf_iris',
        )
        estimator.fit(training_data, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    # The endpoint shares its name with the training job.
    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        json_predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type='ml.c4.xlarge',
            endpoint_name=endpoint_name,
        )

        prediction = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(prediction))
def test_mnist_efs(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Train MNIST with an EFS ``FileSystemInput`` and check the output files.

    Role, subnet, security groups and the EFS id all come from the
    ``efs_fsx_setup`` fixture mapping.
    """
    estimator_kwargs = {
        "entry_point": SCRIPT,
        "role": efs_fsx_setup["role_name"],
        "instance_count": 1,
        "instance_type": cpu_instance_type,
        "sagemaker_session": sagemaker_session,
        "framework_version": tensorflow_training_latest_version,
        "py_version": tensorflow_training_latest_py_version,
        "subnets": [efs_fsx_setup["subnet_id"]],
        "security_group_ids": efs_fsx_setup["security_group_ids"],
    }
    estimator = TensorFlow(**estimator_kwargs)

    efs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        job_name = unique_name_from_base("test-mnist-efs")
        estimator.fit(inputs=efs_input, job_name=job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-TensorFlow image must raise ``ValueError``.

    ``describe_training_job`` is mocked to return a description whose training
    image is an MXNet container, so ``TensorFlow.attach`` is expected to reject it.
    """
    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }

    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    # Match against the raised exception itself: ``str()`` of the ExceptionInfo
    # wrapper is a pytest-version-dependent representation, not the message.
    assert "didn't use image for requested framework" in str(error.value)
def test_deploy(sagemaker_session, tf_version):
    """Deploying a fitted estimator must call ``create_model`` with the expected container.

    The expected image URI is built for the CPU/py2 image of ``tf_version``.
    """
    estimator_kwargs = {
        "entry_point": SCRIPT,
        "source_dir": SOURCE_DIR,
        "role": ROLE,
        "framework_version": tf_version,
        "train_instance_count": 2,
        "train_instance_type": INSTANCE_TYPE_CPU,
        "sagemaker_session": sagemaker_session,
        "base_job_name": "test-cifar",
    }
    estimator = TensorFlow(**estimator_kwargs)

    estimator.fit("s3://mybucket/train")
    print("job succeeded: {}".format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    expected_image = IMAGE_URI_FORMAT_STRING.format(
        REGION, CPU_IMAGE_NAME, tf_version, "cpu", "py2"
    )
    expected_environment = {
        "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        "SAGEMAKER_SUBMIT_DIRECTORY": SOURCE_DIR,
        "SAGEMAKER_REQUIREMENTS": "",
        "SAGEMAKER_REGION": REGION,
        "SAGEMAKER_PROGRAM": SCRIPT,
    }
    expected_container = {
        "Environment": expected_environment,
        "Image": expected_image,
        "ModelDataUrl": "s3://m/m.tar.gz",
    }

    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name, ROLE, expected_container
    )
def test_deploy(sagemaker_session):
    """Deploying a TF 2.3.0 estimator must call ``create_model`` with the serving container.

    The model name is not checked (``ANY``); the container definition and the
    ``vpc_config``/``enable_network_isolation``/``tags`` keywords are.
    """
    framework_version = "2.3.0"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version=framework_version,
        py_version="py37",
        instance_count=2,
        instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )

    estimator.fit("s3://mybucket/train")
    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    expected_container = {
        "Image": IMAGE_URI_FORMAT_STRING.format(
            REGION, REPOSITORY, framework_version, PROCESSOR
        ),
        "Environment": {"SAGEMAKER_TFS_NGINX_LOGLEVEL": "info"},
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    expected_keywords = {
        "vpc_config": None,
        "enable_network_isolation": False,
        "tags": None,
    }

    sagemaker_session.create_model.assert_called_with(
        ANY, ROLE, expected_container, **expected_keywords
    )
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    """Exercise fit, create_model and deploy of a ``TensorFlow`` estimator on a mocked session.

    Checks the session calls made by ``fit``, the train arguments, the container
    definition of the created model, and the predictor type from ``deploy``.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    train_uri = 's3://mybucket/train'

    # Return values for the two patched upload helpers passed in as e_tar and m_tar.
    estimator_code_uri = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=estimator_code_uri, script_name=SCRIPT_FILE)
    model_code_uri = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=model_code_uri, script_name=SCRIPT_FILE)

    estimator.fit(inputs=train_uri)

    invoked_methods = [call[0] for call in sagemaker_session.method_calls]
    assert invoked_methods == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    s3_data_source = expected_train_args['input_config'][0]['DataSource']['S3DataSource']
    s3_data_source['S3Uri'] = train_uri

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = estimator.create_model()

    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(
                BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }

    assert expected_container_def == model.prepare_container_def(INSTANCE_TYPE)
    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']

    predictor = estimator.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_mnist_async(sagemaker_session):
    """Start tagged MNIST training without waiting, then attach, deploy and check tags.

    Tags are checked through the module's ``_assert_*_tags_match`` helpers for
    the training job, the endpoint and the model.
    """
    estimator_kwargs = {
        'entry_point': SCRIPT,
        'role': ROLE,
        'train_instance_count': 1,
        'train_instance_type': 'ml.c5.4xlarge',
        'sagemaker_session': sagemaker_session,
        'py_version': 'py3',
        'framework_version': TensorFlow.LATEST_VERSION,
        'base_job_name': unique_name_from_base('test-tf-sm-mnist'),
        'tags': TAGS,
    }
    estimator = TensorFlow(**estimator_kwargs)

    training_data = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist',
    )
    estimator.fit(training_data, wait=False)
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)

    # The endpoint shares its name with the training job.
    endpoint_name = training_job_name
    _assert_training_job_tags_match(
        sagemaker_session.sagemaker_client,
        estimator.latest_training_job.name,
        TAGS,
    )

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session,
        )
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type='ml.c4.xlarge',
            endpoint_name=endpoint_name,
        )

        prediction = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(prediction))

        _assert_endpoint_tags_match(
            sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
        _assert_model_tags_match(
            sagemaker_session.sagemaker_client,
            estimator.latest_training_job.name,
            TAGS,
        )
def test_attach_wrong_framework(sagemaker_session):
    """Attaching to a job trained with a non-TensorFlow image must raise ``ValueError``.

    ``describe_training_job`` is mocked to return a description whose training
    image is an MXNet container, so ``TensorFlow.attach`` is expected to reject it.
    """
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0',
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }

    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    # Match against the raised exception itself: ``str()`` of the ExceptionInfo
    # wrapper is a pytest-version-dependent representation, not the message.
    assert "didn't use image for requested framework" in str(error.value)
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    """Exercise fit, create_model and deploy of a ``TensorFlow`` estimator on a mocked session.

    Checks the session calls made by ``fit``, the train arguments, the container
    definition of the created model, and the predictor type from ``deploy``.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    train_uri = 's3://mybucket/train'

    # Return values for the two patched upload helpers passed in as e_tar and m_tar.
    estimator_code_uri = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=estimator_code_uri, script_name=SCRIPT_FILE)
    model_code_uri = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=model_code_uri, script_name=SCRIPT_FILE)

    estimator.fit(inputs=train_uri)

    invoked_methods = [call[0] for call in sagemaker_session.method_calls]
    assert invoked_methods == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    s3_data_source = expected_train_args['input_config'][0]['DataSource']['S3DataSource']
    s3_data_source['S3Uri'] = train_uri

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = estimator.create_model()

    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(
                BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }

    assert expected_container_def == model.prepare_container_def(INSTANCE_TYPE)
    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']

    predictor = estimator.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_deploy(sagemaker_session, tf_version):
    """Deploying a fitted estimator must call ``create_model`` with the expected container.

    The expected image URI is built for the CPU/py2 image of ``tf_version``.
    """
    estimator_kwargs = {
        'entry_point': SCRIPT,
        'source_dir': SOURCE_DIR,
        'role': ROLE,
        'framework_version': tf_version,
        'train_instance_count': 2,
        'train_instance_type': INSTANCE_TYPE_CPU,
        'sagemaker_session': sagemaker_session,
        'base_job_name': 'test-cifar',
    }
    estimator = TensorFlow(**estimator_kwargs)

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    expected_image = IMAGE_URI_FORMAT_STRING.format(
        REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    expected_environment = {
        'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
        'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
        'SAGEMAKER_REQUIREMENTS': '',
        'SAGEMAKER_REGION': REGION,
        'SAGEMAKER_PROGRAM': SCRIPT,
    }
    expected_container = {
        'Environment': expected_environment,
        'Image': expected_image,
        'ModelDataUrl': 's3://m/m.tar.gz',
    }

    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name, ROLE, expected_container)
def test_keras(sagemaker_session, tf_full_version):
    """Train the Keras CIFAR-10 script, deploy it, and check the prediction shape.

    The prediction for one random 32x32x3 input is expected to carry ten
    probability values.
    """
    cifar_dir = os.path.join(DATA_DIR, 'cifar_10')
    source_dir = os.path.join(cifar_dir, 'source')
    data_dir = os.path.join(cifar_dir, 'data')

    with timeout(minutes=45):
        estimator_kwargs = {
            'entry_point': 'keras_cnn_cifar_10.py',
            'source_dir': source_dir,
            'role': 'SageMakerRole',
            'sagemaker_session': sagemaker_session,
            'hyperparameters': {'learning_rate': 1e-4, 'decay': 1e-6},
            'training_steps': 500,
            'evaluation_steps': 5,
            'train_instance_count': 1,
            'train_instance_type': 'ml.c4.xlarge',
            'train_max_run': 45 * 60,
        }
        estimator = TensorFlow(**estimator_kwargs)

        training_data = estimator.sagemaker_session.upload_data(
            path=data_dir,
            key_prefix='data/cifar10',
        )
        estimator.fit(training_data)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        sample = np.random.randn(32, 32, 3)
        response = predictor.predict(sample)
        probabilities = response['outputs']['probabilities']['floatVal']
        assert len(probabilities) == 10
def test_attach(sagemaker_session, tf_version):
    """``TensorFlow.attach`` must rebuild an estimator from a job description.

    ``describe_training_job`` is mocked to return a completed job that used the
    TensorFlow py2 CPU image for ``tf_version``; every estimator attribute is
    then compared with the value in that description.
    """
    training_image = (
        '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:{}-cpu-py2'
        .format(tf_version)
    )
    job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image,
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'evaluation_steps': '10',
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge',
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=job_description)

    estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py2'
    assert estimator.framework_version == tf_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    # input_mode was previously asserted twice; a single check is sufficient.
    assert estimator.input_mode == 'File'
    assert estimator.training_steps == 100
    assert estimator.evaluation_steps == 10
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    # hyperparameters() is compared with the string form '100', whereas the
    # training_steps attribute above is compared with the integer 100.
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
    assert estimator.checkpoint_path == 's3://other/1508872349'
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    """Train and serve the iris classifier in Local Mode.

    Training runs under a five minute timeout. Serving is guarded by an
    exclusive file lock on ``LOCK_PATH``; the dict-style and list-style
    prediction requests are expected to return the same result.
    """
    # Open the lock file in a ``with`` block so its descriptor is always closed,
    # including when training or serving fails.
    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()

        with timeout(minutes=5):
            script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

            estimator = TensorFlow(entry_point=script_path,
                                   role='SageMakerRole',
                                   framework_version=tf_full_version,
                                   training_steps=1,
                                   evaluation_steps=1,
                                   hyperparameters={'input_tensor_name': 'inputs'},
                                   train_instance_count=1,
                                   train_instance_type='local',
                                   base_job_name='test-tf',
                                   sagemaker_session=sagemaker_local_session)

            inputs = estimator.sagemaker_session.upload_data(
                path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
            estimator.fit(inputs)
            print('job succeeded: {}'.format(estimator.latest_training_job.name))

        endpoint_name = estimator.latest_training_job.name
        try:
            # Since Local Mode uses the same port for serving, we need a lock in order
            # to allow concurrent test execution. The serving test is really fast so it still
            # makes sense to allow this behavior.
            fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)

            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            assert dict_result == list_result
        finally:
            try:
                estimator.delete_endpoint()
                time.sleep(5)
            finally:
                # Release the lock even if endpoint cleanup raises, so other
                # tests waiting on it are not blocked.
                fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)