def test_stop_tuning_job(sagemaker_session):
    """Start a RandomCutForest tuning job, stop it, and verify status 'Stopping'."""
    feature_num = 14
    train_input = np.random.rand(1000, feature_num)

    rcf = RandomCutForest(role='SageMakerRole', train_instance_count=1,
                          train_instance_type='ml.c4.xlarge',
                          num_trees=50, num_samples_per_tree=20,
                          sagemaker_session=sagemaker_session,
                          base_job_name='test-randomcutforest')

    # Same data is used for both the train and test channels.
    records = rcf.record_set(train_input)
    records.distribution = 'FullyReplicated'

    test_records = rcf.record_set(train_input, channel='test')
    test_records.distribution = 'FullyReplicated'

    hyperparameter_ranges = {'num_trees': IntegerParameter(50, 100),
                             'num_samples_per_tree': IntegerParameter(1, 2)}

    objective_metric_name = 'test:f1'
    tuner = HyperparameterTuner(estimator=rcf,
                                objective_metric_name=objective_metric_name,
                                hyperparameter_ranges=hyperparameter_ranges,
                                objective_type='Maximize', max_jobs=2,
                                max_parallel_jobs=2)

    tuner.fit([records, test_records])

    # Give the tuning job a moment to start before attempting to stop it.
    time.sleep(15)

    latest_tuning_job_name = tuner.latest_tuning_job.name

    print('Attempting to stop {}'.format(latest_tuning_job_name))

    tuner.stop_tuning_job()

    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client\
        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
    assert desc['HyperParameterTuningJobStatus'] == 'Stopping'
def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
    """Run a small TensorFlow script-mode tuning job over MNIST and wait for it."""
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)

    # Tune only the epoch count; the objective is parsed from training logs.
    hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
    objective_metric_name = 'accuracy'
    metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}]

    tuner = HyperparameterTuner(estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'mnist', 'data'),
            key_prefix='scriptmode/mnist')

        tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)
        tuner.wait()
def test_model_dir_with_training_job_name(sagemaker_session, image_uri, instance_type, framework_version):
    """Run a one-job tuning run whose user script validates the generated model_dir."""
    resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources")
    script = os.path.join(resource_path, "tuning_model_dir", "entry.py")

    estimator = TensorFlow(
        entry_point=script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=1,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        sagemaker_session=sagemaker_session,
    )

    # 'arbitrary_value' only exists so the tuner has something to vary.
    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="accuracy",
        hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)},
        metric_definitions=[{
            "Name": "accuracy",
            "Regex": "accuracy=([01])"
        }],
        max_jobs=1,
        max_parallel_jobs=1,
    )

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base("test-tf-model-dir", max_length=32))
    tuner.wait()
def test_marketplace_tuning_job(sagemaker_session):
    """Tune an AWS Marketplace algorithm (AlgorithmEstimator) end to end."""
    data_path = os.path.join(DATA_DIR, 'marketplace', 'training')
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    # ALGORITHM_ARN is a %-format template parameterized by region and account.
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(algorithm_arn=algorithm_arn,
                                  role='SageMakerRole',
                                  train_instance_count=1,
                                  train_instance_type='ml.c4.xlarge',
                                  sagemaker_session=sagemaker_session,
                                  base_job_name='test-marketplace')

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix='integ-test-data/marketplace/train')

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {'max_leaf_nodes': IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(estimator=mktplace,
                                base_tuning_job_name='byo',
                                objective_metric_name='validation:accuracy',
                                hyperparameter_ranges=hyperparameter_ranges,
                                max_jobs=2,
                                max_parallel_jobs=2)

    # include_cls_metadata=False keeps SDK class metadata out of the submitted
    # hyperparameters — presumably required by the marketplace algo; confirm.
    tuner.fit({'training': train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
def test_tuning(sagemaker_session, ecr_image, instance_type):
    """Tune an MXNet MNIST training script over learning-rate and wait."""
    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               hyperparameters={'epochs': 1})

    hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
    objective_metric_name = 'Validation-accuracy'
    metric_definitions = [
        {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

    tuner = HyperparameterTuner(mx,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        # Unique S3 prefix per run to avoid clobbering earlier test data.
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'train'),
                                                       key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'test'),
                                                      key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        tuner.wait()
def test_tuning_mxnet(sagemaker_session):
    """Tune MXNet MNIST, then deploy the best training job and smoke-test predict."""
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [{'Name': 'Validation-accuracy',
                               'Regex': 'Validation-accuracy=([0-9\\.]+)'}]
        tuner = HyperparameterTuner(estimator, objective_metric_name,
                                    hyperparameter_ranges, metric_definitions,
                                    max_jobs=4, max_parallel_jobs=2)

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/mxnet_mnist/test')
        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # Endpoint is created from the best job and torn down by the context manager.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def _tune(
    kmeans_estimator,
    kmeans_train_set,
    tuner=None,
    hyperparameter_ranges=None,
    job_name=None,
    warm_start_config=None,
    wait=True,
    max_jobs=2,
    max_parallel_jobs=2,
    early_stopping_type="Off",
):
    """Launch (or reuse) a KMeans hyperparameter tuning job.

    When ``tuner`` is falsy, a new HyperparameterTuner minimizing ``test:msd``
    is built from the remaining keyword arguments.  Returns the tuner that
    started the job.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        # Honor a caller-supplied tuner; otherwise create the default one.
        tuner = tuner or HyperparameterTuner(
            estimator=kmeans_estimator,
            objective_metric_name="test:msd",
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=max_jobs,
            max_parallel_jobs=max_parallel_jobs,
            warm_start_config=warm_start_config,
            early_stopping_type=early_stopping_type,
        )

        # The same 100-record sample feeds both the train and test channels.
        sample = kmeans_train_set[0][:100]
        train_channel = kmeans_estimator.record_set(sample)
        test_channel = kmeans_estimator.record_set(sample, channel="test")

        print(
            "Started hyperparameter tuning job with name: {}".format(job_name))
        tuner.fit([train_channel, test_channel], job_name=job_name, wait=wait)

    return tuner
def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
          hyperparameter_ranges=None, job_name=None, warm_start_config=None,
          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2,
          early_stopping_type='Off'):
    """Run (or reuse) a KMeans tuning job; optionally block until it terminates.

    Returns the HyperparameterTuner used, so callers can inspect or attach to it.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        if not tuner:
            tuner = HyperparameterTuner(
                estimator=kmeans_estimator,
                objective_metric_name='test:msd',
                hyperparameter_ranges=hyperparameter_ranges,
                objective_type='Minimize',
                max_jobs=max_jobs,
                max_parallel_jobs=max_parallel_jobs,
                warm_start_config=warm_start_config,
                early_stopping_type=early_stopping_type)

        # The same 100-record sample serves as both train and test channels.
        records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
        test_record_set = kmeans_estimator.record_set(
            kmeans_train_set[0][:100], channel='test')

        tuner.fit([records, test_record_set], job_name=job_name)
        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        if wait_till_terminal:
            tuner.wait()

    return tuner
def test_attach_tuning_pytorch(sagemaker_session):
    """Tune PyTorch MNIST, re-attach to the finished job, deploy, and predict."""
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    mnist_script = os.path.join(mnist_dir, "mnist.py")

    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        train_instance_count=1,
        py_version=PYTHON_VERSION,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = "evaluation-accuracy"
        metric_definitions = [{
            "Name": "evaluation-accuracy",
            "Regex": r"Overall test accuracy: (\d+)"
        }]
        hyperparameter_ranges = {"batch-size": IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )

        tuning_job_name = unique_name_from_base("pytorch", max_length=32)
        tuner.fit({"training": training_data}, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    # Verify attach() restores tuner settings from the completed job.
    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == "Auto"

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, "ml.c4.xlarge")
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def get_xgb_tuner(output_path, model_name):
    """Build a HyperparameterTuner around an XGBoost model for this job.

    Core hyperparameters are fixed on the estimator; eta, min_child_weight,
    alpha and max_depth are left for the tuner to search while minimizing
    validation RMSE.
    """
    xgb = _init_model(role, output_path, model_name)

    # Fixed (non-tuned) hyperparameters.
    # objective options: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters
    xgb.set_hyperparameters(eval_metric='rmse',
                            objective='reg:linear',
                            num_round=100,
                            rate_drop=0.3,
                            tweedie_variance_power=1.4)

    search_space = {
        'eta': ContinuousParameter(0, 1),
        'min_child_weight': ContinuousParameter(1, 10),
        'alpha': ContinuousParameter(0, 2),
        'max_depth': IntegerParameter(1, 10),
    }

    return HyperparameterTuner(xgb,
                               'validation:rmse',  # objective metric
                               search_space,
                               max_jobs=20,
                               max_parallel_jobs=3,
                               base_tuning_job_name=model_name + "-tuner",
                               objective_type='Minimize')
def model_fit(
    self,
    inputs: Dict[str, str],
    hparam: Dict[str, Any] = None,
) -> None:
    """Launch training asynchronously.

    With ``hparam`` supplied, starts a hyperparameter tuning job configured
    from its keys; otherwise fits the estimator directly.  Neither path
    blocks (``wait=False``).
    """
    if hparam is None:
        # Plain (non-tuning) training job.
        self.estimator.fit(
            inputs=inputs,
            job_name=self._training_job_name,
            wait=False,
            logs='All',
        )
        return

    tuner = HyperparameterTuner(
        estimator=self.estimator,
        objective_metric_name=hparam.get('objective_metric_name'),
        metric_definitions=hparam.get('metric_definitions'),
        hyperparameter_ranges=hparam.get('hyperparameter_ranges'),
        objective_type=hparam.get('objective_type'),
        max_jobs=hparam.get('max_jobs'),
        max_parallel_jobs=hparam.get('max_parallel_jobs'),
        tags=self._project_tag,
        base_tuning_job_name=self._training_job_name,
    )
    tuner.fit(
        inputs=inputs,
        job_name=self._training_job_name,
        wait=False,
        logs='All',
    )
def _test_model_dir_with_training_job_name_function(ecr_image, sagemaker_session, instance_type, framework_version):
    """Run a one-job tuning run whose user script validates the generated model_dir."""
    resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources')
    script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=sagemaker_session)

    # 'arbitrary_value' only exists so the tuner has something to vary.
    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='accuracy',
        hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
        metric_definitions=[{
            'Name': 'accuracy',
            'Regex': 'accuracy=([01])'
        }],
        max_jobs=1,
        max_parallel_jobs=1)

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
    tuner.wait()
def test_marketplace_tuning_job(sagemaker_session, cpu_instance_type):
    """Tune an AWS Marketplace algorithm (AlgorithmEstimator) end to end."""
    data_path = os.path.join(DATA_DIR, "marketplace", "training")
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    # ALGORITHM_ARN is a %-format template parameterized by region and account.
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(
        algorithm_arn=algorithm_arn,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-marketplace",
    )

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix="integ-test-data/marketplace/train")

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {"max_leaf_nodes": IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(
        estimator=mktplace,
        base_tuning_job_name="byo",
        objective_metric_name="validation:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # include_cls_metadata=False keeps SDK class metadata out of the submitted
    # hyperparameters — presumably required by the marketplace algo; confirm.
    tuner.fit({"training": train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = cpu_instance_type
    instance_count = 2

    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = "mnist.py"

    # Reuse (or create) VPC resources and enable inter-container encryption.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"),
            key_prefix="scriptmode/mnist")

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(
            f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
def test_tuning_lda(sagemaker_session):
    """Tune the built-in LDA algorithm, deploy the best job, and verify predictions."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'lda')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        feature_num = int(
            all_records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge',
                  num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-lda')

        # Build train and test channels from the same local record file.
        record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)

        test_record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set.channel = 'test'

        # specify which hp you want to optimize over
        hyperparameter_ranges = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2)
        }
        objective_metric_name = 'test:pwll'

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2)

        tuner.fit([record_set, test_record_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
def test_tuning_mxnet(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Tune MXNet MNIST, then deploy the best training job and smoke-test predict."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            framework_version=mxnet_training_latest_version,
            sagemaker_session=sagemaker_session,
        )

        hyperparameter_ranges = {
            "learning-rate": ContinuousParameter(0.01, 0.2)
        }
        objective_metric_name = "Validation-accuracy"
        metric_definitions = [{
            "Name": "Validation-accuracy",
            "Regex": "Validation-accuracy=([0-9\\.]+)"
        }]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        tuning_job_name = unique_name_from_base("tune-mxnet", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit({
            "train": train_input,
            "test": test_input
        }, job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    # Endpoint is created from the best job and torn down by the context manager.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def test_tuning_step_with_single_algo_tuner(pipeline_session, entry_point):
    """TuningStep from PipelineSession step_args: tuner.fit warns, TuningStep doesn't."""
    inputs = TrainingInput(
        s3_data=f"s3://{pipeline_session.default_bucket()}/training-data")

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    hyperparameter_ranges = {
        "batch-size": IntegerParameter(64, 128),
    }

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{
            "Name": "test:acc",
            "Regex": "Overall test accuracy: (.*?);"
        }],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with warnings.catch_warnings(record=True) as w:
        # Under a PipelineSession, fit() does not launch a job; it returns
        # step arguments and emits a UserWarning about deferred execution.
        step_args = tuner.fit(inputs=inputs)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    with warnings.catch_warnings(record=True) as w:
        # Constructing the step from step_args should be warning-free.
        step = TuningStep(
            name="MyTuningStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    # The rendered pipeline definition should embed the step args verbatim.
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
def test_attach_tuning_pytorch(sagemaker_session):
    """Tune PyTorch MNIST, re-attach to the finished job, deploy, and predict."""
    mnist_dir = os.path.join(DATA_DIR, 'pytorch_mnist')
    mnist_script = os.path.join(mnist_dir, 'mnist.py')

    estimator = PyTorch(entry_point=mnist_script,
                        role='SageMakerRole',
                        train_instance_count=1,
                        py_version=PYTHON_VERSION,
                        train_instance_type='ml.c4.xlarge',
                        sagemaker_session=sagemaker_session)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = 'evaluation-accuracy'
        metric_definitions = [{
            'Name': 'evaluation-accuracy',
            'Regex': r'Overall test accuracy: (\d+)'
        }]
        hyperparameter_ranges = {'batch-size': IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2,
                                    max_parallel_jobs=2,
                                    early_stopping_type='Auto')

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'training'),
            key_prefix='integ-test-data/pytorch_mnist/training')

        tuning_job_name = unique_name_from_base('pytorch', max_length=32)
        tuner.fit({'training': training_data}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    # Verify attach() restores tuner settings from the completed job.
    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == 'Auto'

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Tune KMeans reading train/test channels from an FSx for Lustre file system."""
    subnets = [efs_fsx_setup.subnet_id]
    security_group_ids = efs_fsx_setup.security_group_ids
    role = efs_fsx_setup.role_name

    kmeans = KMeans(
        role=role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        # Both channels read the same FSx directory; only the channel differs.
        file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
        train_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        test_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
            channel="test",
        )

        job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([train_records, test_records], job_name=job_name)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
def test_validate_parameter_ranges_string_value_validation_error(sagemaker_session):
    """Categorical values outside an algorithm's allowed set raise ValueError."""
    pca = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
              base_job_name='pca', sagemaker_session=sagemaker_session)

    # 'algorithm_mode' only accepts specific strings, so integer categories
    # must be rejected at tuner construction time.
    invalid_hyperparameter_ranges = {'algorithm_mode': CategoricalParameter([0, 5])}

    with pytest.raises(ValueError) as e:
        HyperparameterTuner(estimator=pca,
                            objective_metric_name=OBJECTIVE_METRIC_NAME,
                            hyperparameter_ranges=invalid_hyperparameter_ranges,
                            metric_definitions=METRIC_DEFINTIONS)

    # Fixed: assert on the raised exception (e.value), not the ExceptionInfo
    # wrapper — str(e) is the traceback-entry representation and only
    # incidentally contains the message (per pytest docs on pytest.raises).
    assert 'Value must be one of "regular" and "randomized"' in str(e.value)
def test_tuning_tf(sagemaker_session):
    """Tune a TF iris classifier, deploy the best job, and compare predict formats."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

        estimator = TensorFlow(
            entry_point=script_path,
            role="SageMakerRole",
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={"input_tensor_name": "inputs"},
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
        hyperparameter_ranges = {
            "learning_rate": ContinuousParameter(0.05, 0.2)
        }

        objective_metric_name = "loss"
        metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            objective_type="Minimize",
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, "ml.c4.xlarge")

        features = [6.4, 3.2, 4.5, 1.5]
        # Dict-form and bare-list inputs must produce the same prediction.
        dict_result = predictor.predict({"inputs": features})
        print("predict result: {}".format(dict_result))
        list_result = predictor.predict(features)
        print("predict result: {}".format(list_result))

        assert dict_result == list_result
def test_validate_parameter_ranges_number_validation_error(sagemaker_session):
    """Integer ranges outside an algorithm's valid bounds raise ValueError."""
    pca = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
              base_job_name='pca', sagemaker_session=sagemaker_session)

    # 'num_components' must be a positive integer, so a range starting at -1
    # must be rejected at tuner construction time.
    invalid_hyperparameter_ranges = {'num_components': IntegerParameter(-1, 2)}

    with pytest.raises(ValueError) as e:
        HyperparameterTuner(estimator=pca,
                            objective_metric_name=OBJECTIVE_METRIC_NAME,
                            hyperparameter_ranges=invalid_hyperparameter_ranges,
                            metric_definitions=METRIC_DEFINTIONS)

    # Fixed: assert on the raised exception (e.value), not the ExceptionInfo
    # wrapper — str(e) is the traceback-entry representation and only
    # incidentally contains the message (per pytest docs on pytest.raises).
    assert 'Value must be an integer greater than zero' in str(e.value)
def get_sagemaker_tuner(self, **kwargs):
    """Construct a HyperparameterTuner for this job.

    kwargs must contain 'hyperparameter_ranges'; 'objective_type',
    'max_jobs' and 'max_parallel_jobs' are optional with defaults
    "Minimize", 1 and 1 respectively.
    """
    objective_type = kwargs.get("objective_type", "Minimize")
    max_jobs = kwargs.get("max_jobs", 1)
    max_parallel_jobs = kwargs.get("max_parallel_jobs", 1)

    return HyperparameterTuner(
        base_tuning_job_name=self.get_tuning_job_name(),
        estimator=self.get_sagemaker_estimator(),
        objective_metric_name=self.get_tuner_objective_metric_name(),
        objective_type=objective_type,
        hyperparameter_ranges=kwargs["hyperparameter_ranges"],
        metric_definitions=self.get_tuner_metric_definitions(),
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
    )
def test_tuning_chainer(sagemaker_session):
    """Tune Chainer MNIST, deploy the best job, and predict on several input shapes."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            py_version=PYTHON_VERSION,
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        # Regex matches the accuracy column in Chainer's progress-report line.
        metric_definitions = [
            {'Name': 'Validation-accuracy',
             'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuning_job_name = unique_name_from_base('chainer', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input},
                  job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        # The serving container should accept flat, NCHW, and HW-only layouts.
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    # Reuse (or create) VPC resources and enable inter-container encryption.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client,
                                                       security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}

    objective_metric_name = "loss"
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        time.sleep(15)
        tuner.wait()
def test_tuning_kmeans(sagemaker_session):
    """Tune built-in KMeans on 1P MNIST, deploy the best job, and verify predictions."""
    with timeout(minutes=20):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        # Fixed: was 'max_iterators', which is not a KMeans hyperparameter and
        # only created an unused instance attribute; the real one is
        # 'max_iterations' (cf. the sibling hyperparameters set below).
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans,
                                    objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    objective_type='Minimize',
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def test_tuning_step(sfn_client, record_set_for_hyperparameter_tuning,
                     sagemaker_role_arn, sfn_role_arn):
    """Step Functions TuningStep: build, execute, and verify a tuning workflow."""
    job_name = generate_job_name()

    kmeans = KMeans(role=sagemaker_role_arn,
                    instance_count=1,
                    instance_type=INSTANCE_TYPE,
                    k=10)

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    tuner = HyperparameterTuner(
        estimator=kmeans,
        objective_metric_name="test:msd",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # Build workflow definition
    tuning_step = TuningStep('Tuning', tuner=tuner, job_name=job_name,
                             data=record_set_for_hyperparameter_tuning)
    tuning_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([tuning_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        # Create workflow and check definition
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base(
                "integ-test-tuning-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn)

        # Execute workflow
        execution = workflow.execute()
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get(
            "HyperParameterTuningJobStatus") == "Completed"

    # Cleanup
    state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
def test_tuning_tf(sagemaker_session):
    """Tune a TensorFlow iris classifier over learning_rate, deploy the best
    training job, and verify dict-style and list-style predict calls agree."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        entry_point = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        tf_estimator = TensorFlow(
            entry_point=entry_point,
            role='SageMakerRole',
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='tune-tf',
        )

        inputs = sagemaker_session.upload_data(path=DATA_PATH,
                                               key_prefix='integ-test-data/tf_iris')

        # Minimize training loss over a small learning-rate interval.
        ranges = {'learning_rate': ContinuousParameter(0.05, 0.2)}
        metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

        tuner = HyperparameterTuner(
            tf_estimator,
            'loss',
            ranges,
            metric_definitions,
            objective_type='Minimize',
            max_jobs=2,
            max_parallel_jobs=2,
        )
        tuner.fit(inputs)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Brief pause so the tuning job is fully registered before waiting.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = predictor.predict(features)
        print('predict result: {}'.format(list_result))

        # Both input shapes must route to the same model output.
        assert dict_result == list_result
def test_tuning_tf_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Tune a TensorFlow training job whose input data is served from an FSx
    for Lustre file system, and assert that a best training job is produced."""
    setup = efs_fsx_setup

    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[setup["subnet_id"]],
        security_group_ids=setup["security_group_ids"],
    )

    tuner = HyperparameterTuner(
        estimator,
        "accuracy",
        {"epochs": IntegerParameter(1, 2)},
        [{"Name": "accuracy", "Regex": "accuracy = ([0-9\\.]+)"}],
        max_jobs=MAX_JOBS,
        max_parallel_jobs=MAX_PARALLEL_JOBS,
    )

    # Training input comes from the FSx Lustre file system, not S3.
    lustre_input = FileSystemInput(
        file_system_id=setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        job_name = unique_name_from_base("test-tuning-tf-script-mode-lustre", max_length=32)
        tuner.fit(lustre_input, job_name=job_name)
        # Brief pause so the tuning job is fully registered before waiting.
        time.sleep(15)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
def sagemaker_hyperparam_tuning(sm_estimator, train_s3, hyperparameter_ranges, metric_definitions,
                                tuning_job_name, max_jobs, max_parallel_jobs):
    """Launch (without waiting for) a hyperparameter tuning job that minimizes
    the 'validation:error' metric for the given estimator.

    Args:
        sm_estimator: SageMaker estimator to tune.
        train_s3: training input passed straight through to ``fit``.
        hyperparameter_ranges: dict mapping hyperparameter names to range objects.
        metric_definitions: regex definitions used to extract metrics from logs.
        tuning_job_name: explicit name for the tuning job.
        max_jobs: total number of training jobs the tuner may launch.
        max_parallel_jobs: number of training jobs allowed to run concurrently.
    """
    # Fire-and-forget: wait=False returns as soon as the job is submitted.
    HyperparameterTuner(
        estimator=sm_estimator,
        objective_metric_name='validation:error',
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=metric_definitions,
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
        objective_type='Minimize',
    ).fit(train_s3, job_name=tuning_job_name, wait=False)