Exemplo n.º 1
0
def test_tf_vpc_multi(sagemaker_session, tf_full_version):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = 'ml.c4.xlarge'
    instance_count = 2

    train_input = sagemaker_session.upload_data(
        path=os.path.join(DATA_DIR, 'iris', 'data'),
        key_prefix='integ-test-data/tf_iris')
    script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(entry_point=script_path,
                           role='SageMakerRole',
                           framework_version=tf_full_version,
                           training_steps=1,
                           evaluation_steps=1,
                           hyperparameters={'input_tensor_name': 'inputs'},
                           train_instance_count=instance_count,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-vpc-tf',
                           subnets=subnet_ids,
                           security_group_ids=[security_group_id],
                           encrypt_inter_container_traffic=True)

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(train_input)
        print('training job succeeded: {}'.format(
            estimator.latest_training_job.name))

    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=estimator.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
    assert job_desc['EnableInterContainerTrafficEncryption'] is True

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        json_predictor = model.deploy(initial_instance_count=instance_count,
                                      instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result

    model_desc = sagemaker_session.sagemaker_client.describe_model(
        ModelName=model.name)
    assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == model_desc['VpcConfig']['SecurityGroupIds']
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = cpu_instance_type
    instance_count = 2

    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = "mnist.py"

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"),
            key_prefix="scriptmode/mnist")

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(
            f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
def test_transform_pytorch_vpc_custom_model_bucket(
    sagemaker_session,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
    cpu_instance_type,
    custom_bucket_name,
):
    data_dir = os.path.join(DATA_DIR, "pytorch_mnist")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)

    model_data = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "model.tar.gz"),
        bucket=custom_bucket_name,
        key_prefix="integ-test-data/pytorch_mnist/model",
    )

    model = PyTorchModel(
        model_data=model_data,
        entry_point=os.path.join(data_dir, "mnist.py"),
        role="SageMakerRole",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        sagemaker_session=sagemaker_session,
        vpc_config={
            "Subnets": subnet_ids,
            "SecurityGroupIds": [security_group_id]
        },
        code_location="s3://{}".format(custom_bucket_name),
    )

    transform_input = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "transform", "data.npy"),
        key_prefix="integ-test-data/pytorch_mnist/transform",
    )

    transformer = model.transformer(1, cpu_instance_type)
    transformer.transform(
        transform_input,
        content_type="application/x-npy",
        job_name=unique_name_from_base("test-transform-vpc"),
    )

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"])
        assert [security_group_id
                ] == model_desc["VpcConfig"]["SecurityGroupIds"]

        model_bucket, _ = s3.parse_s3_url(
            model_desc["PrimaryContainer"]["ModelDataUrl"])
        assert custom_bucket_name == model_bucket
Exemplo n.º 4
0
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}

    objective_metric_name = "loss"
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()
Exemplo n.º 5
0
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    data_path = os.path.join(DATA_DIR, "mxnet_mnist")
    script_path = os.path.join(data_path, "mnist.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name
    )

    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
    )

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "train"), key_prefix="integ-test-data/mxnet_mnist/train"
    )
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
    )
    job_name = unique_name_from_base("test-mxnet-vpc")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({"train": train_input, "test": test_input}, job_name=job_name)

    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name
    )
    assert set(subnet_ids) == set(job_desc["VpcConfig"]["Subnets"])
    assert [security_group_id] == job_desc["VpcConfig"]["SecurityGroupIds"]

    transform_input_path = os.path.join(data_path, "transform", "data.csv")
    transform_input_key_prefix = "integ-test-data/mxnet_mnist/transform"
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"])
        assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"]
Exemplo n.º 6
0
def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
    data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
    script_path = os.path.join(data_path, 'mnist.py')

    ec2_client = sagemaker_session.boto_session.client('ec2')
    subnet_ids, security_group_id = get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_session.region_name)

    mx = MXNet(entry_point=script_path,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type='ml.c4.xlarge',
               sagemaker_session=sagemaker_session,
               framework_version=mxnet_full_version,
               subnets=subnet_ids,
               security_group_ids=[security_group_id])

    train_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'train'),
        key_prefix='integ-test-data/mxnet_mnist/train')
    test_input = mx.sagemaker_session.upload_data(
        path=os.path.join(data_path, 'test'),
        key_prefix='integ-test-data/mxnet_mnist/test')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mx.fit({'train': train_input, 'test': test_input})

    job_desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=mx.latest_training_job.name)
    assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])
    assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']

    transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
    transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
    transform_input = mx.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(mx, transform_input)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        assert set(subnet_ids) == set(model_desc['VpcConfig']['Subnets'])
        assert [security_group_id
                ] == model_desc['VpcConfig']['SecurityGroupIds']