Example #1
def test_dict_of_mixed_input_types():
    input_list = _Job._format_inputs_to_input_config({
        'a': 's3://foo/bar',
        'b': s3_input('s3://whizz/bang')})

    expected = [
        {'ChannelName': 'a',
         'DataSource': {
             'S3DataSource': {
                 'S3DataDistributionType': 'FullyReplicated',
                 'S3DataType': 'S3Prefix',
                 'S3Uri': 's3://foo/bar'
             }
         }
         },
        {
            'ChannelName': 'b',
            'DataSource': {
                'S3DataSource': {
                    'S3DataDistributionType': 'FullyReplicated',
                    'S3DataType': 'S3Prefix',
                    'S3Uri': 's3://whizz/bang'
                }
            }
        }]

    # convert back into map for comparison so list order (which is arbitrary) is ignored
    assert {c['ChannelName']: c for c in input_list} == {c['ChannelName']: c for c in expected}
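For reference, each channel dictionary above is simply the .config attribute of the corresponding s3_input; a minimal sketch against the pre-2.x sagemaker.session API (URI hypothetical):

from sagemaker.session import s3_input

# the channel definition lives on .config; 'S3Prefix' and 'FullyReplicated' are the defaults
channel = s3_input('s3://foo/bar')
assert channel.config['DataSource']['S3DataSource']['S3Uri'] == 's3://foo/bar'
assert channel.config['DataSource']['S3DataSource']['S3DataDistributionType'] == 'FullyReplicated'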
Example #2
def test_format_string_uri_input():
    inputs = s3_input(BUCKET_NAME)

    s3_uri_input = _Job._format_string_uri_input(inputs)

    assert s3_uri_input.config['DataSource']['S3DataSource']['S3Uri'] == inputs.config[
        'DataSource']['S3DataSource']['S3Uri']
Example #3
def test_tf_deploy_model_server_workers_unset(sagemaker_session):
    tf = _build_tf(sagemaker_session)
    tf.fit(inputs=s3_input('s3://mybucket/train'))

    tf.deploy(initial_instance_count=1, instance_type='ml.c2.2xlarge')

    assert MODEL_SERVER_WORKERS_PARAM_NAME.upper() not in sagemaker_session.method_calls[3][1][2]['Environment']
Example #4
def test_format_inputs_to_input_config_s3_input():
    inputs = s3_input(BUCKET_NAME)

    channels = _Job._format_inputs_to_input_config(inputs)

    assert channels[0]['DataSource']['S3DataSource']['S3Uri'] == inputs.config['DataSource'][
        'S3DataSource']['S3Uri']
Example #5
def test_tf_training_and_evaluation_steps(sagemaker_session):
    job_name = "sagemaker-tensorflow-py2-gpu-2017-10-24-14-12-09"
    output_path = "s3://{}/output/{}/".format(sagemaker_session.default_bucket(), job_name)

    tf = _build_tf(sagemaker_session, training_steps=123, evaluation_steps=456, output_path=output_path)
    tf.fit(inputs=s3_input('s3://mybucket/train'))
    assert tf.hyperparameters()['training_steps'] == '123'
    assert tf.hyperparameters()['evaluation_steps'] == '456'
Example #6
def test_tf_checkpoint_not_set(sagemaker_session):
    job_name = "sagemaker-tensorflow-py2-gpu-2017-10-24-14-12-09"
    tf = _build_tf(sagemaker_session, checkpoint_path=None, base_job_name=job_name,
                   output_path="s3://{}/".format(sagemaker_session.default_bucket()))
    tf.fit(inputs=s3_input('s3://mybucket/train'), job_name=job_name)

    expected_result = '"s3://{}/{}/checkpoints"'.format(sagemaker_session.default_bucket(), job_name)
    assert tf.hyperparameters()['checkpoint_path'] == expected_result
Example #7
def test_container_log_level(sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH, role='DummyRole', sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                        container_log_level=logging.DEBUG)
    fw.fit(inputs=s3_input('s3://mybucket/train'))

    _, _, train_kwargs = sagemaker_session.train.mock_calls[0]
    assert train_kwargs['hyperparameters']['sagemaker_container_log_level'] == '10'
Example #8
def test_enable_cloudwatch_metrics(sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH, role='DummyRole', sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                        enable_cloudwatch_metrics=True)
    fw.fit(inputs=s3_input('s3://mybucket/train'))

    _, _, train_kwargs = sagemaker_session.train.mock_calls[0]
    assert train_kwargs['hyperparameters']['sagemaker_enable_cloudwatch_metrics']
Example #9
def test_fit_force_generation(strftime, sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH,
                        role=ROLE,
                        sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT,
                        train_instance_type=INSTANCE_TYPE,
                        base_job_name='some',
                        enable_cloudwatch_metrics=True)
    fw.base_job_name = None
    fw.fit(inputs=s3_input('s3://mybucket/train'))
    assert JOB_NAME == fw.latest_training_job.name
Example #10
def test_tf_deploy_model_server_workers(sagemaker_session):
    tf = _build_tf(sagemaker_session)
    tf.fit(inputs=s3_input("s3://mybucket/train"))

    tf.deploy(initial_instance_count=1, instance_type="ml.c2.2xlarge", model_server_workers=2)

    assert (
        "2"
        == sagemaker_session.method_calls[3][1][2]["Environment"][
            MODEL_SERVER_WORKERS_PARAM_NAME.upper()
        ]
    )
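As context for the indexing above: each entry of method_calls on a mock unpacks as (name, args, kwargs), so [3][1][2] is the third positional argument of the fourth recorded call; a sketch of a more readable equivalent:

# equivalent, but unpacked for readability (mock call tuples are (name, args, kwargs))
name, args, kwargs = sagemaker_session.method_calls[3]
assert args[2]['Environment'][MODEL_SERVER_WORKERS_PARAM_NAME.upper()] == '2'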
Example #11
def test_tf_checkpoint_not_set(sagemaker_session):
    job_name = "sagemaker-tensorflow-py2-gpu-2017-10-24-14-12-09"
    tf = _build_tf(sagemaker_session,
                   checkpoint_path=None,
                   base_job_name=job_name,
                   output_path="s3://{}/".format(
                       sagemaker_session.default_bucket()))
    tf.fit(inputs=s3_input('s3://mybucket/train'), job_name=job_name)

    expected_result = '"s3://{}/{}/checkpoints"'.format(
        sagemaker_session.default_bucket(), job_name)
    assert tf.hyperparameters()['checkpoint_path'] == expected_result
Example #12
def test_tf_training_and_evaluation_steps(sagemaker_session):
    job_name = "sagemaker-tensorflow-py2-gpu-2017-10-24-14-12-09"
    output_path = "s3://{}/output/{}/".format(
        sagemaker_session.default_bucket(), job_name)

    tf = _build_tf(sagemaker_session,
                   training_steps=123,
                   evaluation_steps=456,
                   output_path=output_path)
    tf.fit(inputs=s3_input('s3://mybucket/train'))
    assert tf.hyperparameters()['training_steps'] == '123'
    assert tf.hyperparameters()['evaluation_steps'] == '456'
Example #13
def test_tf_training_and_evaluation_steps_not_set(sagemaker_session):
    job_name = "sagemaker-tensorflow-py2-gpu-2017-10-24-14-12-09"
    output_path = "s3://{}/output/{}/".format(
        sagemaker_session.default_bucket(), job_name)

    tf = _build_tf(sagemaker_session,
                   training_steps=None,
                   evaluation_steps=None,
                   output_path=output_path)
    tf.fit(inputs=s3_input("s3://mybucket/train"))
    assert tf.hyperparameters()["training_steps"] == "null"
    assert tf.hyperparameters()["evaluation_steps"] == "null"
Example #14
def test_enable_cloudwatch_metrics(sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH,
                        role='DummyRole',
                        sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT,
                        train_instance_type=INSTANCE_TYPE,
                        enable_cloudwatch_metrics=True)
    fw.fit(inputs=s3_input('s3://mybucket/train'))

    _, _, train_kwargs = sagemaker_session.train.mock_calls[0]
    assert train_kwargs['hyperparameters'][
        'sagemaker_enable_cloudwatch_metrics']
Example #15
def test_container_log_level(sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH,
                        role='DummyRole',
                        sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT,
                        train_instance_type=INSTANCE_TYPE,
                        container_log_level=logging.DEBUG)
    fw.fit(inputs=s3_input('s3://mybucket/train'))

    _, _, train_kwargs = sagemaker_session.train.mock_calls[0]
    assert train_kwargs['hyperparameters'][
        'sagemaker_container_log_level'] == '10'
Example #16
def test_load_config(estimator):
    inputs = s3_input(BUCKET_NAME)

    config = _Job._load_config(inputs, estimator)

    assert config['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] == BUCKET_NAME
    assert config['role'] == ROLE
    assert config['output_config']['S3OutputPath'] == S3_OUTPUT_PATH
    assert 'KmsKeyId' not in config['output_config']
    assert config['resource_config']['InstanceCount'] == INSTANCE_COUNT
    assert config['resource_config']['InstanceType'] == INSTANCE_TYPE
    assert config['resource_config']['VolumeSizeInGB'] == VOLUME_SIZE
    assert config['stop_condition']['MaxRuntimeInSeconds'] == MAX_RUNTIME
Example #17
def _format_string_uri_input(uri_input, validate_uri=True):
    if isinstance(uri_input, str) and validate_uri and uri_input.startswith('s3://'):
        return s3_input(uri_input)
    elif isinstance(uri_input, str) and validate_uri and uri_input.startswith('file://'):
        return file_input(uri_input)
    elif isinstance(uri_input, str) and validate_uri:
        raise ValueError('Training input data must be a valid S3 or FILE URI: must start with '
                         '"s3://" or "file://"')
    elif isinstance(uri_input, str):
        return s3_input(uri_input)
    elif isinstance(uri_input, (s3_input, file_input)):
        return uri_input
    else:
        raise ValueError('Cannot format input {}. Expecting one of str, s3_input, or '
                         'file_input'.format(uri_input))
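A quick usage sketch for this helper (hypothetical URIs; s3_input and file_input from the pre-2.x SDK):

# s3:// and file:// strings are wrapped; existing input objects pass through unchanged
assert isinstance(_format_string_uri_input('s3://bucket/train'), s3_input)
assert isinstance(_format_string_uri_input('file:///tmp/train'), file_input)
existing = s3_input('s3://bucket/train')
assert _format_string_uri_input(existing) is existing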
Example #18
def _format_model_uri_input(model_uri, validate_uri=True):
    if isinstance(model_uri, string_types) and validate_uri and model_uri.startswith('s3://'):
        return s3_input(model_uri,
                        input_mode='File',
                        distribution='FullyReplicated',
                        content_type='application/x-sagemaker-model')
    elif isinstance(model_uri, string_types) and validate_uri and model_uri.startswith('file://'):
        return file_input(model_uri)
    elif isinstance(model_uri, string_types) and validate_uri:
        raise ValueError('Model URI must be a valid S3 or FILE URI: must start with '
                         '"s3://" or "file://"')
    elif isinstance(model_uri, string_types):
        return s3_input(model_uri,
                        input_mode='File',
                        distribution='FullyReplicated',
                        content_type='application/x-sagemaker-model')
    else:
        raise ValueError('Cannot format model URI {}. Expecting str'.format(model_uri))
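For reference, a hedged sketch of the happy path: an s3:// model URI comes back as a File-mode channel with the SageMaker model content type (URI hypothetical):

# the returned s3_input carries the model-specific channel settings
model_channel = _format_model_uri_input('s3://bucket/model.tar.gz')
assert model_channel.config['ContentType'] == 'application/x-sagemaker-model'
assert model_channel.config['InputMode'] == 'File'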
Example #19
def test_load_config(estimator):
    inputs = s3_input(BUCKET_NAME)

    config = _Job._load_config(inputs, estimator)

    assert config["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] == BUCKET_NAME
    assert config["role"] == ROLE
    assert config["output_config"]["S3OutputPath"] == S3_OUTPUT_PATH
    assert "KmsKeyId" not in config["output_config"]
    assert config["resource_config"]["InstanceCount"] == INSTANCE_COUNT
    assert config["resource_config"]["InstanceType"] == INSTANCE_TYPE
    assert config["resource_config"]["VolumeSizeInGB"] == VOLUME_SIZE
    assert config["stop_condition"]["MaxRuntimeInSeconds"] == MAX_RUNTIME
Example #20
def _format_s3_uri_input(input):
    if isinstance(input, str):
        if not input.startswith('s3://'):
            raise ValueError('Training input data must be a valid S3 URI and must start '
                             'with "s3://"')
        return s3_input(input)
    elif isinstance(input, s3_input):
        return input
    else:
        raise ValueError('Cannot format input {}. Expecting one of str or s3_input'.format(input))
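And a sketch of this stricter variant, which accepts only S3 inputs (hypothetical URIs; pytest assumed for the error case):

import pytest

# file:// URIs are rejected here, unlike _format_string_uri_input above
assert isinstance(_format_s3_uri_input('s3://bucket/train'), s3_input)
with pytest.raises(ValueError):
    _format_s3_uri_input('file:///tmp/train')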
Example #21
def test_fit_verify_job_name(strftime, sagemaker_session):
    fw = DummyFramework(entry_point=SCRIPT_PATH, role='DummyRole', sagemaker_session=sagemaker_session,
                        train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                        enable_cloudwatch_metrics=True, tags=TAGS)
    fw.fit(inputs=s3_input('s3://mybucket/train'))

    _, _, train_kwargs = sagemaker_session.train.mock_calls[0]

    assert train_kwargs['hyperparameters']['sagemaker_enable_cloudwatch_metrics']
    assert train_kwargs['image'] == IMAGE_NAME
    assert train_kwargs['input_mode'] == 'File'
    assert train_kwargs['tags'] == TAGS
    assert train_kwargs['job_name'] == JOB_NAME
    assert fw.latest_training_job.name == JOB_NAME
Example #22
def test_format_input_s3_input():
    input_dict = _Job._format_inputs_to_input_config(s3_input('s3://foo/bar', distribution='ShardedByS3Key',
                                                              compression='gzip', content_type='whizz',
                                                              record_wrapping='bang'))
    assert input_dict == [{
        'CompressionType': 'gzip',
        'ChannelName': 'training',
        'ContentType': 'whizz',
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3DataDistributionType': 'ShardedByS3Key',
                'S3Uri': 's3://foo/bar'}},
        'RecordWrapperType': 'bang'}]
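For comparison, a hedged sketch of the defaults when the keyword arguments are omitted (same helper, no extra channel options):

# with no options, the single channel is named 'training' and uses the defaults
default_config = _Job._format_inputs_to_input_config(s3_input('s3://foo/bar'))
assert default_config[0]['ChannelName'] == 'training'
assert default_config[0]['DataSource']['S3DataSource']['S3DataDistributionType'] == 'FullyReplicated'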
Example #23
def test_load_config_with_code_channel_no_code_uri(framework):
    inputs = s3_input(BUCKET_NAME)

    framework.model_uri = MODEL_URI
    framework.model_channel_name = MODEL_CHANNEL_NAME
    framework._enable_network_isolation = True
    config = _Job._load_config(inputs, framework)

    assert len(config["input_config"]) == 2
    assert config["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] == BUCKET_NAME
    assert config["role"] == ROLE
    assert config["output_config"]["S3OutputPath"] == S3_OUTPUT_PATH
    assert "KmsKeyId" not in config["output_config"]
    assert config["resource_config"]["InstanceCount"] == INSTANCE_COUNT
    assert config["resource_config"]["InstanceType"] == INSTANCE_TYPE
Example #24
def _format_string_uri_input(input):
    if isinstance(input, str):
        if input.startswith('s3://'):
            return s3_input(input)
        elif input.startswith('file://'):
            return file_input(input)
        else:
            raise ValueError('Training input data must be a valid S3 or FILE URI: must start '
                             'with "s3://" or "file://"')
    elif isinstance(input, (s3_input, file_input)):
        return input
    else:
        raise ValueError('Cannot format input {}. Expecting one of str, s3_input, or '
                         'file_input'.format(input))
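A sketch of the rejection branch in this variant (pytest assumed; the local path is hypothetical):

import pytest

# plain local paths are neither s3:// nor file://, so they raise
with pytest.raises(ValueError):
    _format_string_uri_input('/data/train')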
Example #25
def test_load_config_with_model_channel(estimator):
    inputs = s3_input(BUCKET_NAME)

    estimator.model_uri = MODEL_URI
    estimator.model_channel_name = MODEL_CHANNEL_NAME

    config = _Job._load_config(inputs, estimator)

    assert config["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] == BUCKET_NAME
    assert config["input_config"][1]["DataSource"]["S3DataSource"]["S3Uri"] == MODEL_URI
    assert config["input_config"][1]["ChannelName"] == MODEL_CHANNEL_NAME
    assert config["role"] == ROLE
    assert config["output_config"]["S3OutputPath"] == S3_OUTPUT_PATH
    assert "KmsKeyId" not in config["output_config"]
    assert config["resource_config"]["InstanceCount"] == INSTANCE_COUNT
    assert config["resource_config"]["InstanceType"] == INSTANCE_TYPE
    assert config["resource_config"]["VolumeSizeInGB"] == VOLUME_SIZE
    assert config["stop_condition"]["MaxRuntimeInSeconds"] == MAX_RUNTIME
Example #26
def test_load_config_with_model_channel(estimator):
    inputs = s3_input(BUCKET_NAME)

    estimator.model_uri = MODEL_URI
    estimator.model_channel_name = CHANNEL_NAME

    config = _Job._load_config(inputs, estimator)

    assert config['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] == BUCKET_NAME
    assert config['input_config'][1]['DataSource']['S3DataSource']['S3Uri'] == MODEL_URI
    assert config['input_config'][1]['ChannelName'] == CHANNEL_NAME
    assert config['role'] == ROLE
    assert config['output_config']['S3OutputPath'] == S3_OUTPUT_PATH
    assert 'KmsKeyId' not in config['output_config']
    assert config['resource_config']['InstanceCount'] == INSTANCE_COUNT
    assert config['resource_config']['InstanceType'] == INSTANCE_TYPE
    assert config['resource_config']['VolumeSizeInGB'] == VOLUME_SIZE
    assert config['stop_condition']['MaxRuntimeInSeconds'] == MAX_RUNTIME
Example #27
def test_format_input_s3_input():
    input_dict = _Job._format_inputs_to_input_config(
        s3_input(
            "s3://foo/bar",
            distribution="ShardedByS3Key",
            compression="gzip",
            content_type="whizz",
            record_wrapping="bang",
        ))
    assert input_dict == [{
        "CompressionType": "gzip",
        "ChannelName": "training",
        "ContentType": "whizz",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3DataDistributionType": "ShardedByS3Key",
                "S3Uri": "s3://foo/bar",
            }
        },
        "RecordWrapperType": "bang",
    }]
Example #28
def test_s3_input_mode(sagemaker_session, tuner):
    expected_input_mode = 'Pipe'

    script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
    mxnet = MXNet(entry_point=script_path,
                  role=ROLE,
                  framework_version=FRAMEWORK_VERSION,
                  train_instance_count=TRAIN_INSTANCE_COUNT,
                  train_instance_type=TRAIN_INSTANCE_TYPE,
                  sagemaker_session=sagemaker_session)
    tuner.estimator = mxnet

    tags = [{'Name': 'some-tag-without-a-value'}]
    tuner.tags = tags

    hyperparameter_ranges = {'num_components': IntegerParameter(2, 4),
                             'algorithm_mode': CategoricalParameter(['regular', 'randomized'])}
    tuner._hyperparameter_ranges = hyperparameter_ranges

    tuner.fit(inputs=s3_input('s3://mybucket/train_manifest', input_mode=expected_input_mode))

    actual_input_mode = sagemaker_session.method_calls[1][2]['input_mode']
    assert actual_input_mode == expected_input_mode
Example #29
def records_s3_input(self):
    """Return an s3_input to represent the training data"""
    return s3_input(self.s3_data, distribution='ShardedByS3Key', s3_data_type=self.s3_data_type)
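As context, a hedged sketch of how such a channel is typically consumed (the record_set object and estimator here are hypothetical):

# the sharded channel plugs straight into a training-channel dict
channel = record_set.records_s3_input()
assert channel.config['DataSource']['S3DataSource']['S3DataDistributionType'] == 'ShardedByS3Key'
estimator.fit({'train': channel})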
Example #30
}

# create estimator
xgb_estimator = Estimator(image_name=container,
                          hyperparameters=hyperparameters,
                          role=sagemaker_role,
                          sagemaker_session=sagemaker.session.Session(sess),
                          train_instance_count=1,
                          train_instance_type='ml.m5.4xlarge',
                          train_volume_size=5,
                          output_path=config.SAGEMAKER_MODEL_S3_DEST)

# create training inputs
sagemaker_training_job_name = config.SAGEMAKER_TRAINING_JOB_NAME_PREFIX + '-{}'.format(guid)
sagemaker_training_data = s3_input(config.SAGEMAKER_TRAINING_DATA_S3_SOURCE,
                                   content_type=config.SAGEMAKER_CONTENT_TYPE)
sagemaker_validation_data = s3_input(
    config.SAGEMAKER_VALIDATION_DATA_S3_SOURCE,
    content_type=config.SAGEMAKER_CONTENT_TYPE)
sagemaker_training_inputs = {
    'train': sagemaker_training_data,
    'validation': sagemaker_validation_data
}

# train_config specifies the SageMaker training configuration
train_config = training_config(estimator=xgb_estimator,
                               inputs=sagemaker_training_inputs,
                               job_name=sagemaker_training_job_name)

sagemaker_model_name = config.SAGEMAKER_MODEL_NAME_PREFIX + '-{}'.format(guid)
sagemaker_endpoint_name = config.SAGEMAKER_ENDPOINT_NAME_PREFIX + '-{}'.format(guid)
Example #31
sess = sagemaker.Session()
role = config['role']
bucket = config['bucket']
key_prefix = config['key-prefix']
config['job-name'] = job_name

if __name__ == '__main__':
    input_train = sess.upload_data(path='train.csv',
                                   bucket=bucket,
                                   key_prefix=key_prefix)

    input_validation = sess.upload_data(path='validation.csv',
                                        bucket=bucket,
                                        key_prefix=key_prefix)

    s3_input_train = s3_input(s3_data=input_train, content_type='text/csv')

    s3_input_validation = s3_input(s3_data=input_validation,
                                   content_type='text/csv')

    container = get_image_uri(boto3.Session().region_name, 'xgboost')

    xgb = sagemaker.estimator.Estimator(
        container,
        role,
        train_instance_count=1,
        train_instance_type='ml.m4.xlarge',
        sagemaker_session=sess,
        output_path=config['s3_output_location'])

    xgb.set_hyperparameters(max_depth=5,
Example #32
# NOTE: the snippet is truncated above; a Glue-based ETL step and the variable
# name etl_step are assumed from the Glue-style job arguments below
etl_step = steps.GlueStartJobRunStep(
    'Extract, Transform, Load',
    parameters={"JobName": job_name,
                "Arguments": {
                    '--S3_SOURCE': data_source,
                    '--S3_DEST': 's3a://{}/{}/'.format(bucket, project_name),
                    '--TRAIN_KEY': train_prefix + '/',
                    '--VAL_KEY': val_prefix + '/'}
                }
)


training_step = steps.TrainingStep(
    'Model Training', 
    estimator=xgb,
    data={
        'train': s3_input(train_data, content_type='csv'),
        'validation': s3_input(validation_data, content_type='csv')
    },
    job_name=training_job_name,
    wait_for_completion=True
)

model_step = steps.ModelStep(
    'Save Model',
    model=training_step.get_expected_model(),
    model_name=execution_input['ModelName'],
    result_path='$.ModelStepResults'
)

lambda_step = steps.compute.LambdaStep(
    'Query Training Results',
Example #33
def records_s3_input(self):
    """Return an s3_input to represent the training data"""
    return s3_input(self.s3_data,
                    distribution='ShardedByS3Key',
                    s3_data_type=self.s3_data_type)
Example #34
                                    train_instance_type='ml.c4.xlarge',
                                    output_path=output_path,
                                    sagemaker_session=sess)
num_topics = 6
ntm.set_hyperparameters(num_topics=num_topics,
                        feature_dim=vocab_size,
                        mini_batch_size=128,
                        epochs=100,
                        num_patience_epochs=5,
                        tolerance=0.001)

# In[83]:

from sagemaker.session import s3_input

s3_train = s3_input(s3_train_data, distribution='ShardedByS3Key')

# In[84]:

ntm.fit({'train': s3_train, 'test': s3_val_data})

# In[59]:

ntm_predictor = ntm.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

# In[65]:

from sagemaker.predictor import csv_serializer, json_deserializer

ntm_predictor.content_type = 'text/csv'
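The last snippet ends mid-setup; in the pre-2.x SDK the imported serializers are usually wired up before calling predict, roughly like this (payload name hypothetical):

# continuation sketch: attach the serializers, then invoke the endpoint
ntm_predictor.serializer = csv_serializer
ntm_predictor.deserializer = json_deserializer
response = ntm_predictor.predict(test_vectors)  # parsed JSON, e.g. {'predictions': [...]}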