Example #1
def test_tf_vpc_multi(sagemaker_session, tf_full_version):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = 'ml.c4.xlarge'
    instance_count = 2

    train_input = sagemaker_session.upload_data(
        path=os.path.join(DATA_DIR, 'iris', 'data'),
        key_prefix='integ-test-data/tf_iris')
    script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(entry_point=script_path,
                           role='SageMakerRole',
                           framework_version=tf_full_version,
                           training_steps=1,
                           evaluation_steps=1,
                           hyperparameters={'input_tensor_name': 'inputs'},
                           train_instance_count=instance_count,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-vpc-tf',
                           subnets=subnet_ids,
                           security_group_ids=[security_group_id],
                           encrypt_inter_container_traffic=True)

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(train_input)
        print('training job succeeded: {}'.format(
            estimator.latest_training_job.name))

    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=estimator.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
    assert job_desc['EnableInterContainerTrafficEncryption'] is True

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        json_predictor = model.deploy(initial_instance_count=instance_count,
                                      instance_type=instance_type,
                                      endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result

    model_desc = sagemaker_session.sagemaker_client.describe_model(
        ModelName=model.name)
    assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
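The snippets on this page are lifted from test files and omit their module-level imports and constants. A minimal sketch of the scaffolding the integration test above assumes; the names come from the test itself, but the values and layout here are placeholders, not the suite's actual definitions:

# Hypothetical scaffolding for the snippet above. The real integration-test
# suite defines DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, and the VPC and
# timeout helper functions in its shared test modules.
import os

from sagemaker.tensorflow import TensorFlow

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')  # assumed layout
TRAINING_DEFAULT_TIMEOUT_MINUTES = 20  # assumed timeout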
Example #2
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = tf.create_model(role=new_role,
                            model_server_workers=model_server_workers,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
Example #3
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(
        BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(
                BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE,
                                  tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example #4
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
Example #5
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = tf.create_model(role=new_role,
                            model_server_workers=model_server_workers,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
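For context, create_model by default reuses the VPC configuration the estimator trained with; vpc_config_override replaces it, and in the SDK versions these tests target, passing None detaches the model from any VPC. A brief sketch against the estimator above (the IDs are placeholders):

# vpc_config_override takes the CreateModel VpcConfig shape; the IDs below
# are placeholders, and passing None yields a model with no VpcConfig at all.
model_in_vpc = tf.create_model(
    vpc_config_override={'Subnets': ['subnet-12345678'],
                         'SecurityGroupIds': ['sg-12345678']})
model_without_vpc = tf.create_model(vpc_config_override=None)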
Example #6
def test_tf(time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version)

    inputs = 's3://mybucket/train'

    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(
                BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", GPU_IMAGE_NAME,
                                  tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU_IMAGE_NAME)

    assert 'cpu' in model.prepare_container_def(CPU_IMAGE_NAME)['Image']
    predictor = tf.deploy(1, GPU_IMAGE_NAME)
    assert isinstance(predictor, TensorFlowPredictor)
Example #7
def test_tf(sagemaker_session, tf_version):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    inputs = "s3://mybucket/train"

    tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job(tf_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": (
                "s3://mybucket/sagemaker-tensorflow-2017-11-06-14:14:15.673/source/sourcedir.tar.gz"  # noqa: E501
            ),
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REQUIREMENTS": "dummy_requirements.txt",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": create_image_uri("us-west-2", "tensorflow", INSTANCE_TYPE,
                                  tf_version, "py2"),
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert "cpu" in model.prepare_container_def(INSTANCE_TYPE)["Image"]
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
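EXPERIMENT_CONFIG is another module-level constant the snippet leaves undefined; it follows the ExperimentConfig structure of the SageMaker CreateTrainingJob API. A plausible definition (the values are placeholders):

# Assumed definition of EXPERIMENT_CONFIG; keys mirror the SageMaker
# ExperimentConfig request structure, values are placeholders.
EXPERIMENT_CONFIG = {
    "ExperimentName": "my-experiment",
    "TrialName": "my-trial",
    "TrialComponentDisplayName": "my-trial-component",
}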
Example #8
def train_and_deploy(deploy_name,
                     embedding_dim=256,
                     units=1024,
                     batch_size=32,
                     epochs=20):
    """
    Start a Sagemaker Training job passing the parameters to the TensorFlow model. Deploy the model to a SageMaker
     endpoint.
    :return: SageMaker TensorFlow model object.
    """
    # Setup the training job.
    model_artifacts_location = os.getenv("S3_SAGEMAKER_ARTIFACTS")
    tf_estimator = TensorFlow(entry_point='rnn.py',
                              role=os.getenv("SAGEMAKER_ROLE"),
                              train_instance_count=1,
                              train_instance_type='ml.p2.xlarge',
                              framework_version='2.3.0',
                              model_dir='/opt/ml/model',
                              output_path=model_artifacts_location,
                              py_version='py37',
                              script_mode=True,
                              hyperparameters={
                                  'embed-dim': embedding_dim,
                                  'rnn-units': units,
                                  'batch-size': batch_size,
                                  'epochs': epochs
                              })

    # Start the training job.
    tf_estimator.fit()

    # Create the SageMaker Model. (The returned Model object is unused here;
    # the deploy helper below works from the raw model artifact instead.)
    tf_estimator.create_model()

    # Deploy to endpoint.
    deploy(tf_estimator.model_data, deploy_name)

    return tf_estimator
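A hypothetical call site for the helper above, assuming the S3_SAGEMAKER_ARTIFACTS and SAGEMAKER_ROLE environment variables are set and that rnn.py and the deploy helper are available; the endpoint name is a placeholder:

# Hypothetical usage of train_and_deploy.
estimator = train_and_deploy('rnn-text-gen', batch_size=64, epochs=5)
print(estimator.model_data)  # S3 URI of the trained model artifact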
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, image_name=custom_image,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
def test_create_model_with_optional_params(sagemaker_session,
                                           tensorflow_inference_version,
                                           tensorflow_inference_py_version):
    if version.Version(tensorflow_inference_version) < version.Version("1.11"):
        pytest.skip(
            "Legacy TF version requires explicit image URI, and "
            "this logic is tested in test_create_model_with_custom_image.")

    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        framework_version=tensorflow_inference_version,
        py_version=tensorflow_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        output_path="s3://mybucket/output",
    )

    tf._current_job_name = "doing something"

    new_role = "role"
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model_name = "model-name"
    model = tf.create_model(
        role=new_role,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
        name=model_name,
        enable_network_isolation=True,
    )

    assert model.role == new_role
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
    assert model.name == model_name
    assert model.enable_network_isolation()
def test_create_model(name_from_base, sagemaker_session,
                      tensorflow_inference_version,
                      tensorflow_inference_py_version):
    if version.Version(tensorflow_inference_version) < version.Version("1.11"):
        pytest.skip(
            "Legacy TF version requires explicit image URI, and "
            "this logic is tested in test_create_model_with_custom_image.")

    container_log_level = '"logging.INFO"'
    base_job_name = "job"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        source_dir="s3://mybucket/source",
        framework_version=tensorflow_inference_version,
        py_version=tensorflow_inference_py_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name=base_job_name,
        enable_network_isolation=True,
        output_path="s3://mybucket/output",
    )

    tf._current_job_name = "doing something"

    model_name = "doing something else"
    name_from_base.return_value = model_name
    model = tf.create_model()

    name_from_base.assert_called_with("job")

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tensorflow_inference_version
    assert model.entry_point is None
    assert model.role == ROLE
    assert model.name == model_name
    assert model._container_log_level == container_log_level
    assert model.source_dir is None
    assert model.vpc_config is None
    assert model.enable_network_isolation()
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "tensorflow:1.0"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = tf.create_model()

    assert model.image_uri == custom_image
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, framework_version=tf_version,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000,
                    evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version, requirements_file=REQUIREMENTS_FILE, source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py', 'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example #17
# The snippet begins mid-call; the opening of this constructor is inferred
# from the context below (entry_point from the SAGEMAKER_PROGRAM value, role
# from the later create_model(role=role) call) and may not match the source.
iris_estimator = TensorFlow(
    entry_point='iris_dnn_classifier.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    training_steps=900,
    evaluation_steps=100)

pprint.pprint(vars(iris_estimator))

container = '520713654638.dkr.ecr.{}.amazonaws.com/sagemaker-tensorflow:{}-cpu-{}'.format(
    boto3.Session().region_name, iris_estimator.framework_version,
    iris_estimator.py_version)

iris_estimator.fit(train_data_location)
pprint.pprint(vars(iris_estimator))

pprint.pprint('Creating model')
iris_model = iris_estimator.create_model(role=role)
pprint.pprint(vars(iris_model))

#pprint.pprint('Deploy model')
#iris_predictor = iris_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
#pprint.pprint(vars(iris_predictor))

containerEnv = {
    "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
    "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
    "SAGEMAKER_PROGRAM": "iris_dnn_classifier.py",
    "SAGEMAKER_REGION": boto3.Session().region_name,
    "SAGEMAKER_SUBMIT_DIRECTORY": iris_model.source_dir,
}
best_model = iris_model.model_data
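The container image URI, environment dict, and model artifact assembled above map directly onto the PrimaryContainer structure of the low-level CreateModel API. A hedged sketch of how they could be registered with boto3; this call is not part of the original snippet, and the model name is a placeholder:

# Hedged sketch: registering the assembled container definition through the
# low-level boto3 API. `role` is the execution role already used above.
sm_client = boto3.client('sagemaker')
sm_client.create_model(
    ModelName='iris-dnn-classifier',  # placeholder name
    ExecutionRoleArn=role,
    PrimaryContainer={
        'Image': container,
        'ModelDataUrl': best_model,
        'Environment': containerEnv,
    })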