Example #1
def test_stop_tuning_job(sagemaker_session):
    feature_num = 14
    train_input = np.random.rand(1000, feature_num)

    rcf = RandomCutForest(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge',
                          num_trees=50, num_samples_per_tree=20, sagemaker_session=sagemaker_session,
                          base_job_name='test-randomcutforest')

    records = rcf.record_set(train_input)
    records.distribution = 'FullyReplicated'

    test_records = rcf.record_set(train_input, channel='test')
    test_records.distribution = 'FullyReplicated'

    hyperparameter_ranges = {'num_trees': IntegerParameter(50, 100),
                             'num_samples_per_tree': IntegerParameter(1, 2)}

    objective_metric_name = 'test:f1'
    tuner = HyperparameterTuner(estimator=rcf, objective_metric_name=objective_metric_name,
                                hyperparameter_ranges=hyperparameter_ranges, objective_type='Maximize', max_jobs=2,
                                max_parallel_jobs=2)

    tuner.fit([records, test_records])

    time.sleep(15)

    latest_tuning_job_name = tuner.latest_tuning_job.name

    print('Attempting to stop {}'.format(latest_tuning_job_name))

    tuner.stop_tuning_job()

    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client\
        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
    assert desc['HyperParameterTuningJobStatus'] == 'Stopping'
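A minimal follow-up sketch, assuming the same boto3 SageMaker client as in the test: after stop_tuning_job() the status moves from 'Stopping' to the terminal 'Stopped', which can be polled with the same describe call (wait_until_stopped is a hypothetical helper).

import time

def wait_until_stopped(sagemaker_client, tuning_job_name, poll_seconds=15):
    # Poll the tuning job until it leaves the transient 'Stopping' status.
    while True:
        desc = sagemaker_client.describe_hyper_parameter_tuning_job(
            HyperParameterTuningJobName=tuning_job_name)
        status = desc['HyperParameterTuningJobStatus']
        if status != 'Stopping':
            return status  # e.g. 'Stopped'
        time.sleep(poll_seconds)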
Example #2
def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)

    hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
    objective_metric_name = 'accuracy'
    metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}]

    tuner = HyperparameterTuner(estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'mnist', 'data'),
            key_prefix='scriptmode/mnist')

        tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)
        tuner.wait()
Example #3
def test_model_dir_with_training_job_name(sagemaker_session, image_uri,
                                          instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), "../..",
                                 "resources")
    script = os.path.join(resource_path, "tuning_model_dir", "entry.py")

    estimator = TensorFlow(
        entry_point=script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=1,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        sagemaker_session=sagemaker_session,
    )

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="accuracy",
        hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)},
        metric_definitions=[{
            "Name": "accuracy",
            "Regex": "accuracy=([01])"
        }],
        max_jobs=1,
        max_parallel_jobs=1,
    )

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base("test-tf-model-dir", max_length=32))
    tuner.wait()
Example #4
def test_marketplace_tuning_job(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'marketplace', 'training')
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(algorithm_arn=algorithm_arn,
                                  role='SageMakerRole',
                                  train_instance_count=1,
                                  train_instance_type='ml.c4.xlarge',
                                  sagemaker_session=sagemaker_session,
                                  base_job_name='test-marketplace')

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix='integ-test-data/marketplace/train')

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {'max_leaf_nodes': IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(estimator=mktplace,
                                base_tuning_job_name='byo',
                                objective_metric_name='validation:accuracy',
                                hyperparameter_ranges=hyperparameter_ranges,
                                max_jobs=2,
                                max_parallel_jobs=2)

    tuner.fit({'training': train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
Example #5
def test_tuning(sagemaker_session, ecr_image, instance_type):
    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               hyperparameters={'epochs': 1})

    hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
    objective_metric_name = 'Validation-accuracy'
    metric_definitions = [
        {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

    tuner = HyperparameterTuner(mx,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'train'),
                                                       key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'test'),
                                                      key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        tuner.wait()
Example #6
def test_tuning_mxnet(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [{'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]
        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges, metric_definitions,
                                    max_jobs=4, max_parallel_jobs=2)

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/mxnet_mnist/test')
        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
Example #7
def _tune(
    kmeans_estimator,
    kmeans_train_set,
    tuner=None,
    hyperparameter_ranges=None,
    job_name=None,
    warm_start_config=None,
    wait=True,
    max_jobs=2,
    max_parallel_jobs=2,
    early_stopping_type="Off",
):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):

        if not tuner:
            tuner = HyperparameterTuner(
                estimator=kmeans_estimator,
                objective_metric_name="test:msd",
                hyperparameter_ranges=hyperparameter_ranges,
                objective_type="Minimize",
                max_jobs=max_jobs,
                max_parallel_jobs=max_parallel_jobs,
                warm_start_config=warm_start_config,
                early_stopping_type=early_stopping_type,
            )

        records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
        test_record_set = kmeans_estimator.record_set(
            kmeans_train_set[0][:100], channel="test")

        print(
            "Started hyperparameter tuning job with name: {}".format(job_name))
        tuner.fit([records, test_record_set], job_name=job_name, wait=wait)

    return tuner
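The warm_start_config argument above takes a sagemaker.tuner.WarmStartConfig. A hedged usage sketch, assuming a finished parent tuning job whose name is a placeholder:

from sagemaker.tuner import WarmStartConfig, WarmStartTypes

# Seed a new tuning job with the results of an earlier, identical one.
warm_start_config = WarmStartConfig(
    warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
    parents={"parent-tuning-job-name"})

tuner = _tune(kmeans_estimator, kmeans_train_set,
              hyperparameter_ranges=hyperparameter_ranges,
              job_name="warm-start-tuning-job",
              warm_start_config=warm_start_config)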
Example #8
def _tune(kmeans_estimator,
          kmeans_train_set,
          tuner=None,
          hyperparameter_ranges=None,
          job_name=None,
          warm_start_config=None,
          wait_till_terminal=True,
          max_jobs=2,
          max_parallel_jobs=2,
          early_stopping_type='Off'):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):

        if not tuner:
            tuner = HyperparameterTuner(
                estimator=kmeans_estimator,
                objective_metric_name='test:msd',
                hyperparameter_ranges=hyperparameter_ranges,
                objective_type='Minimize',
                max_jobs=max_jobs,
                max_parallel_jobs=max_parallel_jobs,
                warm_start_config=warm_start_config,
                early_stopping_type=early_stopping_type)

        records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
        test_record_set = kmeans_estimator.record_set(
            kmeans_train_set[0][:100], channel='test')

        tuner.fit([records, test_record_set], job_name=job_name)
        print('Started hyperparameter tuning job with name:' +
              tuner.latest_tuning_job.name)

        if wait_till_terminal:
            tuner.wait()

    return tuner
Example #9
def test_attach_tuning_pytorch(sagemaker_session):
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    mnist_script = os.path.join(mnist_dir, "mnist.py")

    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        train_instance_count=1,
        py_version=PYTHON_VERSION,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = "evaluation-accuracy"
        metric_definitions = [{
            "Name": "evaluation-accuracy",
            "Regex": r"Overall test accuracy: (\d+)"
        }]
        hyperparameter_ranges = {"batch-size": IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )

        tuning_job_name = unique_name_from_base("pytorch", max_length=32)
        tuner.fit({"training": training_data}, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == "Auto"

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = attached_tuner.deploy(1, "ml.c4.xlarge")
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
Example #10
def get_xgb_tuner(output_path, model_name):
    xgb = _init_model(role, output_path, model_name)

    # Set core hyperparameters
    xgb.set_hyperparameters(
        eval_metric='rmse',
        objective='reg:linear',  # plenty of options out there: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters
        num_round=100,
        rate_drop=0.3,
        tweedie_variance_power=1.4)

    hyperparameters_to_tune = {
        'eta': ContinuousParameter(0, 1),
        'min_child_weight': ContinuousParameter(1, 10),
        'alpha': ContinuousParameter(0, 2),
        'max_depth': IntegerParameter(1, 10)
    }

    tuner = HyperparameterTuner(
        xgb,
        'validation:rmse',  # objective metric
        hyperparameters_to_tune,
        max_jobs=20,
        max_parallel_jobs=3,
        base_tuning_job_name=model_name + "-tuner",
        objective_type='Minimize')
    return tuner
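A hedged usage sketch for the tuner returned above; the bucket, channel URIs, and model name are placeholders. HyperparameterTuner.analytics() exposes the per-training-job results as a pandas DataFrame.

tuner = get_xgb_tuner('s3://my-bucket/xgb-output', 'my-model')
tuner.fit({'train': 's3://my-bucket/train',
           'validation': 's3://my-bucket/validation'})
tuner.wait()

# Rank the trials by the 'validation:rmse' objective (lower is better).
df = tuner.analytics().dataframe()
print(df.sort_values('FinalObjectiveValue').head())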
Example #11
    def model_fit(
        self,
        inputs: Dict[str, str],
        hparam: Dict[str, Any] = None,
    ) -> None:

        if hparam is not None:

            tuner = HyperparameterTuner(
                estimator=self.estimator,
                objective_metric_name=hparam.get('objective_metric_name'),
                metric_definitions=hparam.get('metric_definitions'),
                hyperparameter_ranges=hparam.get('hyperparameter_ranges'),
                objective_type=hparam.get('objective_type'),
                max_jobs=hparam.get('max_jobs'),
                max_parallel_jobs=hparam.get('max_parallel_jobs'),
                tags=self._project_tag,
                base_tuning_job_name=self._training_job_name,
            )
            tuner.fit(
                inputs=inputs,
                job_name=self._training_job_name,
                wait=False,
                logs='All',
            )

        else:

            self.estimator.fit(
                inputs=inputs,
                job_name=self._training_job_name,
                wait=False,
                logs='All',
            )
Example #12
def _test_model_dir_with_training_job_name_function(ecr_image,
                                                    sagemaker_session,
                                                    instance_type,
                                                    framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '../..',
                                 'resources')
    script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=sagemaker_session)

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='accuracy',
        hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
        metric_definitions=[{
            'Name': 'accuracy',
            'Regex': 'accuracy=([01])'
        }],
        max_jobs=1,
        max_parallel_jobs=1)

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
    tuner.wait()
Example #13
def test_marketplace_tuning_job(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "marketplace", "training")
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(
        algorithm_arn=algorithm_arn,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-marketplace",
    )

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix="integ-test-data/marketplace/train")

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {"max_leaf_nodes": IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(
        estimator=mktplace,
        base_tuning_job_name="byo",
        objective_metric_name="validation:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuner.fit({"training": train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
Example #14
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = cpu_instance_type
    instance_count = 2

    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = "mnist.py"

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"),
            key_prefix="scriptmode/mnist")

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(
            f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
Example #15
def test_tuning_lda(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'lda')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records are assumed to have the same feature dimension
        feature_num = int(
            all_records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole',
                  train_instance_type='ml.c4.xlarge',
                  num_topics=10,
                  sagemaker_session=sagemaker_session,
                  base_job_name='test-lda')

        record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set.channel = 'test'

        # specify which hp you want to optimize over
        hyperparameter_ranges = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2)
        }
        objective_metric_name = 'test:pwll'

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2)

        tuner.fit([record_set, test_record_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
Example #16
def test_tuning_mxnet(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            framework_version=mxnet_training_latest_version,
            sagemaker_session=sagemaker_session,
        )

        hyperparameter_ranges = {
            "learning-rate": ContinuousParameter(0.01, 0.2)
        }
        objective_metric_name = "Validation-accuracy"
        metric_definitions = [{
            "Name": "Validation-accuracy",
            "Regex": "Validation-accuracy=([0-9\\.]+)"
        }]
        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        tuning_job_name = unique_name_from_base("tune-mxnet", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit({"train": train_input, "test": test_input},
                  job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
Example #17
def test_tuning_step_with_single_algo_tuner(pipeline_session, entry_point):
    inputs = TrainingInput(
        s3_data=f"s3://{pipeline_session.default_bucket()}/training-data")

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    hyperparameter_ranges = {
        "batch-size": IntegerParameter(64, 128),
    }

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{
            "Name": "test:acc",
            "Regex": "Overall test accuracy: (.*?);"
        }],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with warnings.catch_warnings(record=True) as w:
        step_args = tuner.fit(inputs=inputs)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    with warnings.catch_warnings(record=True) as w:
        step = TuningStep(
            name="MyTuningStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
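As a hedged extension of the pipeline above, a TuningStep can hand its best model artifact to a downstream step; the bucket is a placeholder:

# S3 URI of the best-ranked (top_k=0) model produced by the tuning step,
# resolvable at pipeline execution time for a later model or register step.
best_model_uri = step.get_top_model_s3_uri(
    top_k=0, s3_bucket=pipeline_session.default_bucket())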
Example #18
def test_attach_tuning_pytorch(sagemaker_session):
    mnist_dir = os.path.join(DATA_DIR, 'pytorch_mnist')
    mnist_script = os.path.join(mnist_dir, 'mnist.py')

    estimator = PyTorch(entry_point=mnist_script,
                        role='SageMakerRole',
                        train_instance_count=1,
                        py_version=PYTHON_VERSION,
                        train_instance_type='ml.c4.xlarge',
                        sagemaker_session=sagemaker_session)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = 'evaluation-accuracy'
        metric_definitions = [{
            'Name': 'evaluation-accuracy',
            'Regex': r'Overall test accuracy: (\d+)'
        }]
        hyperparameter_ranges = {'batch-size': IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2,
                                    max_parallel_jobs=2,
                                    early_stopping_type='Auto')

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'training'),
            key_prefix='integ-test-data/pytorch_mnist/training')

        tuning_job_name = unique_name_from_base('pytorch', max_length=32)
        tuner.fit({'training': training_data}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == 'Auto'

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
Example #19
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session,
                           cpu_instance_type):
    subnets = [efs_fsx_setup.subnet_id]
    security_group_ids = efs_fsx_setup.security_group_ids
    role = efs_fsx_setup.role_name
    kmeans = KMeans(
        role=role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
        train_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        test_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
            channel="test",
        )

        job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([train_records, test_records], job_name=job_name)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
Example #20
def test_validate_parameter_ranges_string_value_validation_error(sagemaker_session):
    pca = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
              base_job_name='pca', sagemaker_session=sagemaker_session)

    invalid_hyperparameter_ranges = {'algorithm_mode': CategoricalParameter([0, 5])}

    with pytest.raises(ValueError) as e:
        HyperparameterTuner(estimator=pca, objective_metric_name=OBJECTIVE_METRIC_NAME,
                            hyperparameter_ranges=invalid_hyperparameter_ranges, metric_definitions=METRIC_DEFINTIONS)

    assert 'Value must be one of "regular" and "randomized"' in str(e)
Example #21
def test_tuning_tf(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

        estimator = TensorFlow(
            entry_point=script_path,
            role="SageMakerRole",
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={"input_tensor_name": "inputs"},
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
        hyperparameter_ranges = {
            "learning_rate": ContinuousParameter(0.05, 0.2)
        }

        objective_metric_name = "loss"
        metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            objective_type="Minimize",
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, "ml.c4.xlarge")

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = predictor.predict({"inputs": features})
        print("predict result: {}".format(dict_result))
        list_result = predictor.predict(features)
        print("predict result: {}".format(list_result))

        assert dict_result == list_result
Example #22
def test_validate_parameter_ranges_number_validation_error(sagemaker_session):
    pca = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
              base_job_name='pca', sagemaker_session=sagemaker_session)

    invalid_hyperparameter_ranges = {'num_components': IntegerParameter(-1, 2)}

    with pytest.raises(ValueError) as e:
        HyperparameterTuner(estimator=pca, objective_metric_name=OBJECTIVE_METRIC_NAME,
                            hyperparameter_ranges=invalid_hyperparameter_ranges, metric_definitions=METRIC_DEFINTIONS)

    assert 'Value must be an integer greater than zero' in str(e)
Example #23
    def get_sagemaker_tuner(self, **kwargs):
        return HyperparameterTuner(
            base_tuning_job_name=self.get_tuning_job_name(),
            estimator=self.get_sagemaker_estimator(),
            objective_metric_name=self.get_tuner_objective_metric_name(),
            objective_type=kwargs.get("objective_type", "Minimize"),
            hyperparameter_ranges=kwargs["hyperparameter_ranges"],
            metric_definitions=self.get_tuner_metric_definitions(),
            max_jobs=kwargs.get("max_jobs", 1),
            max_parallel_jobs=kwargs.get("max_parallel_jobs", 1),
        )
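A hedged usage sketch; trainer stands for an instance of the (unshown) enclosing class, whose estimator and metric helpers are assumed to exist, and train_input and validation_input are placeholder S3 channels:

tuner = trainer.get_sagemaker_tuner(
    hyperparameter_ranges={"eta": ContinuousParameter(0.01, 0.3)},
    max_jobs=10,
    max_parallel_jobs=2)
tuner.fit({"train": train_input, "validation": validation_input})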
Example #24
def test_tuning_chainer(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            py_version=PYTHON_VERSION,
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [
            {'Name': 'Validation-accuracy',
             'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2, max_parallel_jobs=2)

        tuning_job_name = unique_name_from_base('chainer', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size
Example #25
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}

    objective_metric_name = "loss"
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()
Example #26
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=20):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Example #27
def test_tuning_step(sfn_client, record_set_for_hyperparameter_tuning,
                     sagemaker_role_arn, sfn_role_arn):
    job_name = generate_job_name()

    kmeans = KMeans(role=sagemaker_role_arn,
                    instance_count=1,
                    instance_type=INSTANCE_TYPE,
                    k=10)

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    tuner = HyperparameterTuner(
        estimator=kmeans,
        objective_metric_name="test:msd",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # Build workflow definition
    tuning_step = TuningStep('Tuning',
                             tuner=tuner,
                             job_name=job_name,
                             data=record_set_for_hyperparameter_tuning)
    tuning_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([tuning_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        # Create workflow and check definition
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base(
                "integ-test-tuning-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn)

        # Execute workflow
        execution = workflow.execute()
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get(
            "HyperParameterTuningJobStatus") == "Completed"

        # Cleanup
        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
Example #28
def test_tuning_tf(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='tune-tf')

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        hyperparameter_ranges = {
            'learning_rate': ContinuousParameter(0.05, 0.2)
        }

        objective_metric_name = 'loss'
        metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    objective_type='Minimize',
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit(inputs)

        print('Started hyperparameter tuning job with name:' +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
Example #29
def test_tuning_tf_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    role = efs_fsx_setup["role_name"]
    subnets = [efs_fsx_setup["subnet_id"]]
    security_group_ids = efs_fsx_setup["security_group_ids"]

    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=role,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]
    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=MAX_JOBS,
        max_parallel_jobs=MAX_PARALLEL_JOBS,
    )

    file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
    file_system_input = FileSystemInput(file_system_id=file_system_fsx_id,
                                        file_system_type="FSxLustre",
                                        directory_path=FSX_DIR_PATH)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuning_job_name = unique_name_from_base(
            "test-tuning-tf-script-mode-lustre", max_length=32)
        tuner.fit(file_system_input, job_name=tuning_job_name)
        time.sleep(15)
        tuner.wait()
    best_training_job = tuner.best_training_job()
    assert best_training_job
Example #30
def sagemaker_hyperparam_tuning(sm_estimator, train_s3, hyperparameter_ranges,
                                metric_definitions, tuning_job_name, max_jobs,
                                max_parallel_jobs):
    objective_metric_name = 'validation:error'
    objective_type = 'Minimize'
    tuner = HyperparameterTuner(estimator=sm_estimator,
                                objective_metric_name=objective_metric_name,
                                hyperparameter_ranges=hyperparameter_ranges,
                                metric_definitions=metric_definitions,
                                max_jobs=max_jobs,
                                max_parallel_jobs=max_parallel_jobs,
                                objective_type=objective_type)

    tuner.fit(train_s3, job_name=tuning_job_name, wait=False)
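Because fit is called with wait=False above, the tuning job runs asynchronously; a minimal sketch of reattaching from another process by name:

# Reattach to the in-progress job and block until it finishes.
tuner = HyperparameterTuner.attach(tuning_job_name)
tuner.wait()
print(tuner.best_training_job())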