def test_tf_vpc_multi(sagemaker_session, tf_full_version):
    """Test TensorFlow multi-instance using the same VpcConfig for training and inference."""
    instance_type = 'ml.c4.xlarge'
    instance_count = 2

    train_input = sagemaker_session.upload_data(
        path=os.path.join(DATA_DIR, 'iris', 'data'),
        key_prefix='integ-test-data/tf_iris')
    script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)
    setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(entry_point=script_path,
                           role='SageMakerRole',
                           framework_version=tf_full_version,
                           training_steps=1,
                           evaluation_steps=1,
                           hyperparameters={'input_tensor_name': 'inputs'},
                           train_instance_count=instance_count,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-vpc-tf',
                           subnets=subnet_ids,
                           security_group_ids=[security_group_id],
                           encrypt_inter_container_traffic=True)

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(train_input)
        print('training job succeeded: {}'.format(estimator.latest_training_job.name))

    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=estimator.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
    assert job_desc['EnableInterContainerTrafficEncryption'] is True

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        json_predictor = model.deploy(initial_instance_count=instance_count,
                                      instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result

        model_desc = sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
        assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
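# The two VPC helpers used above live in the integration-test utilities and are
# not shown in this section. A minimal sketch of what they might do, using only
# standard boto3 EC2 client calls; the real helpers likely tag and reuse
# resources across runs rather than creating them fresh each time:
def get_or_create_vpc_resources(ec2_client, region):
    """Return ([subnet_ids], security_group_id) for a dedicated test VPC."""
    vpc_id = ec2_client.create_vpc(CidrBlock='10.0.0.0/16')['Vpc']['VpcId']
    # Two subnets in different AZs, as multi-instance training jobs expect.
    subnet_ids = [
        ec2_client.create_subnet(VpcId=vpc_id,
                                 CidrBlock='10.0.0.0/24',
                                 AvailabilityZone=region + 'a')['Subnet']['SubnetId'],
        ec2_client.create_subnet(VpcId=vpc_id,
                                 CidrBlock='10.0.1.0/24',
                                 AvailabilityZone=region + 'b')['Subnet']['SubnetId'],
    ]
    security_group_id = ec2_client.create_security_group(
        GroupName='integ-test-sg',
        Description='SageMaker VPC integ test',
        VpcId=vpc_id)['GroupId']
    return subnet_ids, security_group_id


def setup_security_group_for_encryption(ec2_client, security_group_id):
    """Allow all traffic within the group, which inter-container encryption requires."""
    ec2_client.authorize_security_group_ingress(
        GroupId=security_group_id,
        IpPermissions=[{'IpProtocol': '-1',
                        'UserIdGroupPairs': [{'GroupId': security_group_id}]}])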
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = tf.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)

    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()
    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME,
                                                                               JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']

    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
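# _create_train_job, used by the test_tf variants here, is a test helper that
# builds the keyword arguments expected for the sagemaker_session.train() call.
# A minimal sketch of what it might return, assuming the module constants
# (ROLE, BUCKET_NAME, JOB_NAME, INSTANCE_COUNT, INSTANCE_TYPE) used elsewhere
# in these tests; the real helper also serializes the framework hyperparameters:
def _create_train_job(tf_version):
    return {
        'image': create_image_uri('us-west-2', 'tensorflow', INSTANCE_TYPE, tf_version, 'py2'),
        'input_mode': 'File',
        'input_config': [{
            'ChannelName': 'training',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3DataDistributionType': 'FullyReplicated',
                    'S3Uri': None,  # each test overwrites this with its own input location
                }
            },
        }],
        'role': ROLE,
        'job_name': JOB_NAME,
        'output_config': {'S3OutputPath': 's3://{}/'.format(BUCKET_NAME)},
        'resource_config': {
            'InstanceCount': INSTANCE_COUNT,
            'InstanceType': INSTANCE_TYPE,
            'VolumeSizeInGB': 30,
        },
        # 'hyperparameters': serialized entry point, training_steps, etc.
        'stop_condition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
    }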
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
def test_tf(time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME,
                                                                               JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", GPU_IMAGE_NAME, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU_IMAGE_NAME)

    assert 'cpu' in model.prepare_container_def(CPU_IMAGE_NAME)['Image']

    predictor = tf.deploy(1, GPU_IMAGE_NAME)
    assert isinstance(predictor, TensorFlowPredictor)
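# None of these unit tests touch AWS: the sagemaker_session argument is a mock
# whose method_calls the assertions inspect. A minimal sketch of such a
# fixture, assuming the BUCKET_NAME constant used above; the real fixture in
# the test suite may configure more attributes:
import pytest
from unittest.mock import Mock


@pytest.fixture()
def sagemaker_session():
    boto_mock = Mock(name='boto_session', region_name='us-west-2')
    session = Mock(name='sagemaker_session',
                   boto_session=boto_mock,
                   boto_region_name='us-west-2',
                   config=None,
                   local_mode=False)
    # default_bucket() backs the s3://{BUCKET_NAME}/... URIs asserted above.
    session.default_bucket = Mock(name='default_bucket', return_value=BUCKET_NAME)
    # Model artifacts reported after the mocked training job "completes";
    # this is where the 's3://m/m.tar.gz' ModelDataUrl in the asserts comes from.
    session.sagemaker_client.describe_training_job.return_value = {
        'ModelArtifacts': {'S3ModelArtifacts': 's3://m/m.tar.gz'}
    }
    return session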
def test_tf(sagemaker_session, tf_version):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    inputs = "s3://mybucket/train"
    tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job(tf_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()
    environment = {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-tensorflow-2017-11-06-14:14:15.673/source/sourcedir.tar.gz",  # noqa: E501
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REQUIREMENTS": "dummy_requirements.txt",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": create_image_uri("us-west-2", "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert "cpu" in model.prepare_container_def(INSTANCE_TYPE)["Image"]

    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def train_and_deploy(deploy_name, embedding_dim=256, units=1024, batch_size=32, epochs=20):
    """
    Start a SageMaker training job, passing the parameters to the TensorFlow model,
    then deploy the model to a SageMaker endpoint.

    :return: SageMaker TensorFlow estimator object.
    """
    # Set up the training job.
    model_artifacts_location = os.getenv("S3_SAGEMAKER_ARTIFACTS")
    tf_estimator = TensorFlow(entry_point='rnn.py',
                              role=os.getenv("SAGEMAKER_ROLE"),
                              train_instance_count=1,
                              train_instance_type='ml.p2.xlarge',
                              framework_version='2.3.0',
                              model_dir='/opt/ml/model',
                              output_path=model_artifacts_location,
                              py_version='py37',
                              script_mode=True,
                              hyperparameters={
                                  'embed-dim': embedding_dim,
                                  'rnn-units': units,
                                  'batch-size': batch_size,
                                  'epochs': epochs
                              })

    # Start the training job.
    tf_estimator.fit()

    # Create the SageMaker model.
    tf_estimator.create_model()

    # Deploy to an endpoint.
    deploy(tf_estimator.model_data, deploy_name)

    return tf_estimator
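# The deploy() helper called above is defined elsewhere. A minimal sketch of
# what it might do, assuming the SageMaker Python SDK v1 TensorFlow Serving
# Model class and the same SAGEMAKER_ROLE environment variable; the instance
# type here is illustrative and the real helper may configure more:
import os

from sagemaker.tensorflow.serving import Model


def deploy(model_data, deploy_name):
    """Wrap trained model artifacts in a serving model and create an endpoint."""
    model = Model(model_data=model_data,
                  role=os.getenv("SAGEMAKER_ROLE"),
                  framework_version='2.3.0')
    return model.deploy(initial_instance_count=1,
                        instance_type='ml.m5.large',
                        endpoint_name=deploy_name)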
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    image_name=custom_image,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
def test_create_model_with_optional_params(sagemaker_session, tensorflow_inference_version,
                                           tensorflow_inference_py_version):
    if version.Version(tensorflow_inference_version) < version.Version("1.11"):
        pytest.skip(
            "Legacy TF version requires explicit image URI, and "
            "this logic is tested in test_create_model_with_custom_image."
        )

    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        framework_version=tensorflow_inference_version,
        py_version=tensorflow_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        output_path="s3://mybucket/output",
    )

    tf._current_job_name = "doing something"

    new_role = "role"
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model_name = "model-name"
    model = tf.create_model(
        role=new_role,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
        name=model_name,
        enable_network_isolation=True,
    )

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
    assert model.name == model_name
    assert model.enable_network_isolation()
def test_create_model(name_from_base, sagemaker_session, tensorflow_inference_version,
                      tensorflow_inference_py_version):
    if version.Version(tensorflow_inference_version) < version.Version("1.11"):
        pytest.skip(
            "Legacy TF version requires explicit image URI, and "
            "this logic is tested in test_create_model_with_custom_image."
        )

    container_log_level = '"logging.INFO"'
    base_job_name = "job"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        source_dir="s3://mybucket/source",
        framework_version=tensorflow_inference_version,
        py_version=tensorflow_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name=base_job_name,
        enable_network_isolation=True,
        output_path="s3://mybucket/output",
    )

    tf._current_job_name = "doing something"

    model_name = "doing something else"
    name_from_base.return_value = model_name
    model = tf.create_model()

    name_from_base.assert_called_with("job")

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tensorflow_inference_version
    assert model.entry_point is None
    assert model.role == ROLE
    assert model.name == model_name
    assert model._container_log_level == container_log_level
    assert model.source_dir is None
    assert model.vpc_config is None
    assert model.enable_network_isolation()
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "tensorflow:1.0"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = tf.create_model()

    assert model.image_uri == custom_image
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
# The head of this constructor call is truncated in the source; the
# entry_point and role values below are assumptions reconstructed from the
# SAGEMAKER_PROGRAM value and the create_model(role=role) call further down.
iris_estimator = TensorFlow(entry_point='iris_dnn_classifier.py',
                            role=role,
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            training_steps=900,
                            evaluation_steps=100)
pprint.pprint(vars(iris_estimator))

container = '520713654638.dkr.ecr.{}.amazonaws.com/sagemaker-tensorflow:{}-cpu-{}'.format(
    boto3.Session().region_name, iris_estimator.framework_version, iris_estimator.py_version)

iris_estimator.fit(train_data_location)
pprint.pprint(vars(iris_estimator))

pprint.pprint('Creating model')
iris_model = iris_estimator.create_model(role=role)
pprint.pprint(vars(iris_model))

# pprint.pprint('Deploy model')
# iris_predictor = iris_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
# pprint.pprint(vars(iris_predictor))

containerEnv = {
    "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
    "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
    "SAGEMAKER_PROGRAM": "iris_dnn_classifier.py",
    "SAGEMAKER_REGION": boto3.Session().region_name,
    "SAGEMAKER_SUBMIT_DIRECTORY": iris_model.source_dir,
}

best_model = iris_model.model_data
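# container, containerEnv, and best_model are built above but never used in
# this snippet. A minimal sketch of how they could be wired together with the
# generic sagemaker.model.Model class (SDK v1 signature assumed; the endpoint
# name and instance type are illustrative):
from sagemaker.model import Model

best_iris_model = Model(model_data=best_model,
                        image=container,
                        role=role,
                        env=containerEnv)
iris_predictor = best_iris_model.deploy(initial_instance_count=1,
                                        instance_type='ml.m4.xlarge',
                                        endpoint_name='iris-dnn-endpoint')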