Example #1
def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
Example #2
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500, evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
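        # Send pickled NumPy arrays instead of the default JSON payloads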
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example #3
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
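        # fit() returned immediately (wait=False); give the job a moment to start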
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
Example #4
def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
        ec2_client = sagemaker_session.boto_session.client('ec2')
        subnet, security_group_id = get_or_create_subnet_and_security_group(ec2_client, VPC_NAME)
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               subnets=[subnet],
                               security_group_ids=[security_group_id])

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)

        job_desc = estimator.sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=estimator.latest_training_job.name)
        assert [subnet] == job_desc['VpcConfig']['Subnets']
        assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
Example #5
def test_run_tensorboard_locally_without_awscli_binary(time, strftime, popen, call, access, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    with pytest.raises(EnvironmentError) as error:
        tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)
    assert str(error.value) == 'The AWS CLI is not installed in the system. Please install the AWS CLI using the ' \
                               'following command: \n pip install awscli'
Example #6
def test_train_image_default(sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE)

    assert _get_full_cpu_image_uri(defaults.TF_VERSION) in tf.train_image()
Example #7
def test_run_tensorboard_locally(sleep, time, strftime, popen, call, access, rmtree, mkdtemp, sync, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    # Simulate a TensorBoard process that keeps running (poll() returns None)
    popen().poll.return_value = None

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_called_with(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6006'],
                             stderr=-1,
                             stdout=-1)
Example #8
def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call, access, socket, rmtree, mkdtemp, sync,
                                             sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    # The first TensorBoard process exits (port 6006 in use); the retry on 6007 stays up
    popen().poll.side_effect = [-1, None]

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6006'],
                          stderr=-1, stdout=-1)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6007'],
                          stderr=-1, stdout=-1)
Example #9
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, image_name=custom_image,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image
Example #10
def test_attach_custom_image(sagemaker_session):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/tensorflow_with_custom_binary:1.0'
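    # Stubbed DescribeTrainingJob response used by TensorFlow.attach() below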
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image},
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'evaluation_steps': '10'},
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'},
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd)

    estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
Example #11
def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)
Example #12
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
Example #13
def test_attach_wrong_framework(sagemaker_session):
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0'
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name='neo',
                          sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #14
def test_mnist_with_checkpoint_config(sagemaker_session, instance_type, tf_full_version):
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        sagemaker_session.default_bucket(), sagemaker_timestamp()
    )
    checkpoint_local_path = "/test/checkpoint/path"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=tf_full_version,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
    df = estimator.training_job_analytics.dataframe()
    assert df.size > 0

    # SageMaker should echo the checkpoint settings back in DescribeTrainingJob
    expected_training_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    actual_training_checkpoint_config = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )["CheckpointConfig"]
    assert actual_training_checkpoint_config == expected_training_checkpoint_config
Example #15
def main(args):
    hyperparameters = format_hyperparameters(args)
    now = datetime.now()
    time_str = now.strftime("%d-%m-%Y-%H-%M")
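    # Distributed training via MPI; custom_mpi_options are passed straight through to mpirun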
    distributions = {
        "mpi": {
            "enabled": True,
            "processes_per_host": args.num_workers_per_host,
            "custom_mpi_options":
            "-x OMPI_MCA_btl_vader_single_copy_mechanism=none -x TF_CUDNN_USE_AUTOTUNE=0"
        }
    }
    channels = {'coco': args.data_channel, 'weights': args.weights_channel}
    s3_path = os.path.join(args.s3_path, time_str)
    job_name = '{}-{}-{}'.format(args.user_id, args.instance_name, time_str)
    output_path = os.path.join(s3_path, "output", job_name)
    configuration = {
        'configuration': 'ci/frcnn/sagemaker_default_model_config.py',
        's3_path': s3_path,
        'instance_name': job_name
    }
    configuration.update(hyperparameters)
    estimator = TensorFlow(entry_point=args.main_script,
                           source_dir=args.source_dir,
                           image_name=args.image,
                           role=args.sagemaker_role,
                           framework_version="2.1.0",
                           py_version="py3",
                           train_instance_count=args.instance_count,
                           train_instance_type=args.instance_type,
                           distributions=distributions,
                           output_path=output_path,
                           train_volume_size=200,
                           hyperparameters=configuration)

    estimator.fit(channels, wait=True, logs='All', job_name=job_name)

    print("Started Sagemaker job: {}".format(job_name))
    pprint.pprint(configuration)
Example #16
def test_transformer_creation_with_endpoint_type(create_model,
                                                 sagemaker_session):
    model = Mock()
    create_model.return_value = model

    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
    )

    tf.transformer(
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        endpoint_type="tensorflow-serving",
        entry_point=SERVING_SCRIPT_FILE,
    )

    create_model.assert_called_with(
        endpoint_type="tensorflow-serving",
        model_server_workers=None,
        role=ROLE,
        vpc_config_override="VPC_CONFIG_DEFAULT",
        entry_point=SERVING_SCRIPT_FILE,
    )
    model.transformer.assert_called_with(
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        accept=None,
        assemble_with=None,
        env=None,
        max_concurrent_transforms=None,
        max_payload=None,
        output_kms_key=None,
        output_path=None,
        strategy=None,
        tags=None,
        volume_kms_key=None,
    )
Example #17
def test_mnist(sagemaker_session, instance_type):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           metric_definitions=[{'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}],
                           base_job_name=unique_name_from_base('test-tf-sm-mnist'))
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs)
    _assert_s3_files_exist(estimator.model_dir,
                           ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
    df = estimator.training_job_analytics.dataframe()
    print(df)
    assert df.size > 0
Example #18
def test_tf_script_mode_attach(sagemaker_session, tf_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py3-cpu:{}-cpu-py3'.format(
        tf_version)
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    estimator = TensorFlow.attach(training_job_name='neo',
                                  sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py3'
    assert estimator.framework_version == tf_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters() is not None
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
Example #19
def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
    script = os.path.join(resource_path, "mnist", "mnist_estimator.py")
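    # 'sagemaker_parameter_server_enabled' turns on the built-in parameter server
    # for distributed script-mode training across the two instances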
    estimator = TensorFlow(
        entry_point=script,
        role="SageMakerRole",
        hyperparameters={"sagemaker_parameter_server_enabled": True},
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, "mnist", "data-distributed"),
        key_prefix="scriptmode/mnist-distributed",
    )
    estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist"))
    _assert_checkpoint_exists(sagemaker_session.boto_region_name, estimator.model_dir, 0)
    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
Example #20
def test_distributed_training_smdataparallel_script_mode(
        sagemaker_session, instance_type, ecr_image, tmpdir,
        framework_version):
    """
    Tests SMDataParallel single-node command via script mode

    TODO: Enable debugger_hook_config once smdebug supports SMDataParallel
    """
    instance_type = "ml.p3.16xlarge"
    estimator = TensorFlow(entry_point='smdataparallel_mnist_script_mode.sh',
                           source_dir=MNIST_PATH,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           debugger_hook_config=False,
                           sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel'))
Example #21
def test_distributed_training_smdataparallel_script_mode(
        n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image,
        tmpdir, framework_version):
    """
    Tests SMDataParallel single-node command via script mode
    """
    validate_or_skip_smdataparallel(n_virginia_ecr_image)
    instance_type = "ml.p3.16xlarge"
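    # Enable the SageMaker distributed data parallel (SMDataParallel) library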
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    estimator = TensorFlow(entry_point='smdataparallel_mnist_script_mode.sh',
                           source_dir=MNIST_PATH,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           image_uri=n_virginia_ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=n_virginia_sagemaker_session,
                           distribution=distribution)

    estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel'))
Example #22
def test_mnist_distributed(sagemaker_session, instance_type):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=2,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           py_version=tests.integ.PYTHON_VERSION,
                           script_mode=True,
                           framework_version=TensorFlow.LATEST_VERSION,
                           distributions=PARAMETER_SERVER_DISTRIBUTION)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/distributed_mnist')

    with tests.integ.timeout.timeout(
            minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs,
                      job_name=unique_name_from_base('test-tf-sm-distributed'))
    _assert_s3_files_exist(
        estimator.model_dir,
        ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
Example #23
def test_attach(sagemaker_session, tensorflow_training_version, tensorflow_training_py_version):
    if Version(tensorflow_training_version) > Version("1.12"):
        pytest.skip("framework_name_from_image doesn't infer info from DLC image URIs.")

    training_image = image_uris.retrieve(
        "tensorflow",
        region=REGION,
        version=tensorflow_training_version,
        py_version=tensorflow_training_py_version,
        instance_type="ml.c4.xlarge",
        image_scope="training",
    )
    rjd = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == tensorflow_training_py_version
    assert estimator.framework_version == tensorflow_training_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.instance_count == 1
    assert estimator.max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters() is not None
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
    assert estimator.training_image_uri() == training_image
Example #24
def test_native(self, sagemaker_session, ecr_image, framework_version,
                instance_type, instance_count, tmpdir, mnist_dataset,
                capsys):
    script = os.path.join(resource_path, 'mnist', 'mnist.py')
    estimator = TensorFlow(
        entry_point=script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={
            TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
        },
    )
    estimator.fit(mnist_dataset,
                  job_name=unique_name_from_base('test-TF-trcomp-DT'))
    _assert_model_exported_to_s3(estimator)
    captured = capsys.readouterr()
    _assert_training_compiler_invoked(captured)
Example #25
def test_empty_framework_version(warning, sagemaker_session):
    estimator = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=None,
    )

    assert estimator.framework_version == defaults.TF_VERSION
    warning.assert_called_with(defaults.TF_VERSION, estimator.LATEST_VERSION)
Example #26
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}

    objective_metric_name = "loss"
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    # Tune learning_rate to minimize the logged loss, two jobs in parallel
    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()
Example #27
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "tensorflow:1.0"
    tf = TensorFlow(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        image_uri=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = tf.create_model()

    assert model.image_uri == custom_image
Example #28
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )
    cifar10_estimator = TensorFlow(entry_point='cifar10_tf2.py',
                                   source_dir='source_dir',
                                   role=DUMMY_IAM_ROLE,
                                   instance_count=1,
                                   instance_type='local_gpu',
                                   framework_version='2.4.1',
                                   py_version='py37')

    inputs = {
        'training': 'file://./data/training',
        'validation': 'file://./data/validation'
    }
    cifar10_estimator.fit(inputs)
    print('Completed model training')
Example #29
def run_test(sagemaker_session, ecr_image, instance_type, framework_version, test_data,
             record_wrapper_type=None):
    source_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources', 'pipemode')
    script = os.path.join(source_path, 'pipemode.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True,
                           input_mode='Pipe',
                           hyperparameters={'dimension': DIMENSION})
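    # Stream the channel in Pipe mode, optionally RecordIO-wrapped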
    input = s3_input(s3_data=test_data,
                     distribution='FullyReplicated',
                     record_wrapping=record_wrapper_type,
                     input_mode='Pipe')
    with timeout(minutes=20):
        estimator.fit({'elizabeth': input},
                      job_name=unique_name_from_base('test-sagemaker-pipemode'))
Example #30
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, framework_version=tf_version,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
Example #31
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    image_name=custom_image,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image
Example #32
def _test_distributed_mnist_ps_function(ecr_image, sagemaker_session,
                                        instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    estimator = TensorFlow(
        entry_point=script,
        role='SageMakerRole',
        hyperparameters={'sagemaker_parameter_server_enabled': True},
        instance_count=2,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data-distributed'),
        key_prefix='scriptmode/mnist-distributed')
    estimator.fit(
        inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist'))
    _assert_checkpoint_exists(sagemaker_session.boto_region_name,
                              estimator.model_dir, 0)
Example #33
def tf_training_job(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

        return estimator.latest_training_job.name
Example #34
def test_smdebug(sagemaker_session, ecr_image, instance_type,
                 framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_smdebug.py')
    hyperparameters = {'smdebug_path': '/tmp/ml/output/tensors'}
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           hyperparameters=hyperparameters)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data'),
        key_prefix='scriptmode/mnist_smdebug')
    estimator.fit(
        inputs, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug'))
    _assert_s3_file_exists(sagemaker_session.boto_region_name,
                           estimator.model_data)
Example #35
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
Example #36
def test_inference_compiler_neo(self, sagemaker_session, ecr_image,
                                framework_version, instance_type,
                                instance_count, tmpdir, capsys,
                                mnist_dataset):
    script = os.path.join(resource_path, 'mnist', 'mnist.py')
    estimator = TensorFlow(
        entry_point=script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={
            TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
        },
    )
    estimator.fit(mnist_dataset,
                  job_name=unique_name_from_base('test-TF-trcomp-serving'))
    _assert_model_exported_to_s3(estimator)
    captured = capsys.readouterr()
    _assert_training_compiler_invoked(captured)
    # Compile the trained model with SageMaker Neo for ml.p3 inference
    s3_prefix = estimator.model_data.replace('output/model.tar.gz', '')
    estimator.compile_model(
        target_instance_family='ml_p3',
        input_shape={'data': [1, 28, 28]},
        output_path=s3_prefix,
        framework='keras',
        framework_version='2.6.0',
    )
Example #37
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
Example #38
def test_mnist_efs(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    role = efs_fsx_setup["role_name"]
    subnets = [efs_fsx_setup["subnet_id"]]
    security_group_ids = efs_fsx_setup["security_group_ids"]

    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=role,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    file_system_efs_id = efs_fsx_setup["file_system_efs_id"]
    content_type = "application/json"
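    # Mount the EFS file system as the training input channel instead of S3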
    file_system_input = FileSystemInput(
        file_system_id=file_system_efs_id,
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type=content_type,
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=file_system_input,
                      job_name=unique_name_from_base("test-mnist-efs"))

    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
Example #39
def test_attach_wrong_framework(sagemaker_session):
    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode":
            "File",
            "TrainingImage":
            "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {
            "MaxRuntimeInSeconds": 24 * 60 * 60
        },
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {
            "KmsKeyId": "",
            "S3OutputPath": "s3://place/output/neo"
        },
        "TrainingJobOutput": {
            "S3TrainingJobOutput": "s3://here/output.tar.gz"
        },
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description)

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name="neo",
                          sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #40
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version=tf_version,
        train_instance_count=2,
        train_instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )

    estimator.fit("s3://mybucket/train")
    print("job succeeded: {}".format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version,
                                           "cpu", "py2")
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {
            "Environment": {
                "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
                "SAGEMAKER_SUBMIT_DIRECTORY": SOURCE_DIR,
                "SAGEMAKER_REQUIREMENTS": "",
                "SAGEMAKER_REGION": REGION,
                "SAGEMAKER_PROGRAM": SCRIPT,
            },
            "Image": image,
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
    )
Example #41
def test_deploy(sagemaker_session):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version="2.3.0",
        py_version="py37",
        instance_count=2,
        instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )

    estimator.fit("s3://mybucket/train")

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, REPOSITORY, "2.3.0",
                                           PROCESSOR)
    sagemaker_session.create_model.assert_called_with(
        ANY,
        ROLE,
        {
            "Image": image,
            "Environment": {
                "SAGEMAKER_TFS_NGINX_LOGLEVEL": "info"
            },
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
        vpc_config=None,
        enable_network_isolation=False,
        tags=None,
    )
Example #42
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
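    # The patched code-upload mocks report where the training and serving
    # source bundles were staged in S3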
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(
        BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example #43
def test_mnist_async(sagemaker_session):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=1,
                           train_instance_type='ml.c5.4xlarge',
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           base_job_name=unique_name_from_base('test-tf-sm-mnist'),
                           tags=TAGS)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')
    estimator.fit(inputs, wait=False)
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)

        result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
Example #44
def test_attach_wrong_framework(sagemaker_session):
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0'
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=returned_job_description)

    with pytest.raises(ValueError) as error:
        TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #45
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000,
                    evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version, requirements_file=REQUIREMENTS_FILE, source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py', 'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example #46
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT, source_dir=SOURCE_DIR, role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2, train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment':
         {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
          'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
          'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
          'SAGEMAKER_REQUIREMENTS': '',
          'SAGEMAKER_REGION': REGION,
          'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})
Example #47
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole', sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=500, evaluation_steps=5,
                               train_instance_count=1, train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')

        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example #48
def test_attach(sagemaker_session, tf_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:{}-cpu-py2'.format(tf_version)
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'evaluation_steps': '10'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'OutputDataConfig': {'KmsKeyId': '', 'S3OutputPath': 's3://place/output/neo'},
        'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd)

    estimator = TensorFlow.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == 'py2'
    assert estimator.framework_version == tf_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.training_steps == 100
    assert estimator.evaluation_steps == 10
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
    assert estimator.checkpoint_path == 's3://other/1508872349'
Example #49
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    local_mode_lock_fd = open(LOCK_PATH, 'w')
    local_mode_lock = local_mode_lock_fd.fileno()
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=sagemaker_local_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    try:
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='local',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
    finally:
        estimator.delete_endpoint()
        time.sleep(5)
        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)