def test_stop_tuning_job(sagemaker_session):
    """Start a RandomCutForest hyperparameter tuning job, stop it shortly
    after launch, and assert the job transitions to 'Stopping'.

    Args:
        sagemaker_session: SageMaker session fixture used for all API calls.
    """
    feature_num = 14
    train_input = np.random.rand(1000, feature_num)

    rcf = RandomCutForest(role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.c4.xlarge',
                          num_trees=50,
                          num_samples_per_tree=20,
                          sagemaker_session=sagemaker_session,
                          base_job_name='test-randomcutforest')

    # Same data is used for both channels; distribution must be set explicitly.
    records = rcf.record_set(train_input)
    records.distribution = 'FullyReplicated'

    test_records = rcf.record_set(train_input, channel='test')
    test_records.distribution = 'FullyReplicated'

    hyperparameter_ranges = {'num_trees': IntegerParameter(50, 100),
                             'num_samples_per_tree': IntegerParameter(1, 2)}
    objective_metric_name = 'test:f1'

    tuner = HyperparameterTuner(estimator=rcf,
                                objective_metric_name=objective_metric_name,
                                hyperparameter_ranges=hyperparameter_ranges,
                                objective_type='Maximize',
                                max_jobs=2,
                                max_parallel_jobs=2)

    tuner.fit([records, test_records])

    # Give the tuning job time to start before requesting the stop.
    time.sleep(15)

    latest_tuning_job_name = tuner.latest_tuning_job.name

    print('Attempting to stop {}'.format(latest_tuning_job_name))

    tuner.stop_tuning_job()

    # Query the service directly to confirm the stop request took effect.
    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client\
        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
    assert desc['HyperParameterTuningJobStatus'] == 'Stopping'
def test_stop_tuning_job(sagemaker_session):
    """Start a RandomCutForest hyperparameter tuning job, stop it shortly
    after launch, and assert the job transitions to 'Stopping'.

    NOTE(review): this is a duplicate definition of ``test_stop_tuning_job``;
    at import time the later definition shadows the earlier one, so only one
    copy ever runs — confirm whether both copies are intentional.

    Args:
        sagemaker_session: SageMaker session fixture used for all API calls.
    """
    feature_num = 14
    train_input = np.random.rand(1000, feature_num)

    rcf = RandomCutForest(role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.c4.xlarge',
                          num_trees=50,
                          num_samples_per_tree=20,
                          sagemaker_session=sagemaker_session,
                          base_job_name='test-randomcutforest')

    # Same data is used for both channels; distribution must be set explicitly.
    records = rcf.record_set(train_input)
    records.distribution = 'FullyReplicated'

    test_records = rcf.record_set(train_input, channel='test')
    test_records.distribution = 'FullyReplicated'

    hyperparameter_ranges = {'num_trees': IntegerParameter(50, 100),
                             'num_samples_per_tree': IntegerParameter(1, 2)}
    objective_metric_name = 'test:f1'

    tuner = HyperparameterTuner(estimator=rcf,
                                objective_metric_name=objective_metric_name,
                                hyperparameter_ranges=hyperparameter_ranges,
                                objective_type='Maximize',
                                max_jobs=2,
                                max_parallel_jobs=2)

    tuner.fit([records, test_records])

    # Give the tuning job time to start before requesting the stop.
    time.sleep(15)

    latest_tuning_job_name = tuner.latest_tuning_job.name

    print('Attempting to stop {}'.format(latest_tuning_job_name))

    tuner.stop_tuning_job()

    # Query the service directly to confirm the stop request took effect.
    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client\
        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
    assert desc['HyperParameterTuningJobStatus'] == 'Stopping'
def _test_model_dir_with_training_job_name_function(ecr_image, sagemaker_session, instance_type, framework_version):
    """Run a one-job tuning round whose entry script checks its model_dir.

    The `arbitrary_value` hyperparameter only exists so the tuner has a range
    to search; the real assertion lives inside the user script.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources')
    script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=1,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=sagemaker_session)

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='accuracy',
        hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
        metric_definitions=[{
            'Name': 'accuracy',
            'Regex': 'accuracy=([01])'
        }],
        max_jobs=1,
        max_parallel_jobs=1)

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
    tuner.wait()
def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
    """Tune a script-mode TensorFlow MNIST estimator over the epoch count and
    wait for the tuning job to finish."""
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)

    hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
    objective_metric_name = 'accuracy'
    # Objective metric is scraped from the training logs via this regex.
    metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}]

    tuner = HyperparameterTuner(estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'mnist', 'data'),
            key_prefix='scriptmode/mnist')

        tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)
        tuner.wait()
def test_model_dir_with_training_job_name(sagemaker_session, image_uri, instance_type, framework_version):
    """Run a one-job tuning round whose entry script checks its model_dir.

    The `arbitrary_value` hyperparameter only exists so the tuner has a range
    to search; the real assertion lives inside the user script.
    """
    resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources")
    script = os.path.join(resource_path, "tuning_model_dir", "entry.py")

    estimator = TensorFlow(
        entry_point=script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=1,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        sagemaker_session=sagemaker_session,
    )

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="accuracy",
        hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)},
        metric_definitions=[{
            "Name": "accuracy",
            "Regex": "accuracy=([01])"
        }],
        max_jobs=1,
        max_parallel_jobs=1,
    )

    # User script has logic to check for the correct model_dir
    tuner.fit(
        job_name=unique_name_from_base("test-tf-model-dir", max_length=32))
    tuner.wait()
def model_fit(
    self,
    inputs: Dict[str, str],
    hparam: Dict[str, Any] = None,
) -> None:
    """Launch training asynchronously.

    When ``hparam`` is provided, a :class:`HyperparameterTuner` job is started
    with the settings it carries; otherwise the wrapped estimator is trained
    directly. In both cases the call returns immediately (``wait=False``).

    Args:
        inputs: Channel-name to S3-URI mapping passed to ``fit``.
        hparam: Optional tuning configuration (objective metric, ranges,
            parallelism, etc.). ``None`` means plain training.
    """
    # Both branches share the exact same fit arguments.
    fit_kwargs = dict(
        inputs=inputs,
        job_name=self._training_job_name,
        wait=False,
        logs='All',
    )

    if hparam is None:
        # No tuning requested: train the estimator as-is.
        self.estimator.fit(**fit_kwargs)
        return

    tuner = HyperparameterTuner(
        estimator=self.estimator,
        objective_metric_name=hparam.get('objective_metric_name'),
        metric_definitions=hparam.get('metric_definitions'),
        hyperparameter_ranges=hparam.get('hyperparameter_ranges'),
        objective_type=hparam.get('objective_type'),
        max_jobs=hparam.get('max_jobs'),
        max_parallel_jobs=hparam.get('max_parallel_jobs'),
        tags=self._project_tag,
        base_tuning_job_name=self._training_job_name,
    )
    tuner.fit(**fit_kwargs)
def test_tuning(sagemaker_session, ecr_image, instance_type):
    """Tune an MXNet MNIST estimator over the learning rate and wait for the
    tuning job to complete."""
    mx = MXNet(entry_point=SCRIPT_PATH,
               role='SageMakerRole',
               train_instance_count=1,
               train_instance_type=instance_type,
               sagemaker_session=sagemaker_session,
               image_name=ecr_image,
               hyperparameters={'epochs': 1})

    hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
    objective_metric_name = 'Validation-accuracy'
    # Objective metric is scraped from the training logs via this regex.
    metric_definitions = [
        {'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

    tuner = HyperparameterTuner(mx,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        prefix = 'mxnet_mnist/{}'.format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'train'),
                                                       key_prefix=prefix + '/train')
        test_input = mx.sagemaker_session.upload_data(path=os.path.join(DATA_PATH, 'test'),
                                                      key_prefix=prefix + '/test')

        job_name = utils.unique_name_from_base('test-mxnet-image', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        tuner.wait()
def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
          hyperparameter_ranges=None, job_name=None, warm_start_config=None,
          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2,
          early_stopping_type='Off'):
    """Start (or reuse) a KMeans hyperparameter tuning job and return the tuner.

    When no tuner is supplied, one is created that minimizes 'test:msd'. The
    first 100 training records are used for both the train and test channels.

    Args:
        kmeans_estimator: KMeans estimator to tune.
        kmeans_train_set: Training data; element [0] holds the feature matrix.
        tuner: Existing tuner to reuse (e.g. for warm-start scenarios).
        hyperparameter_ranges: Ranges for a newly created tuner.
        job_name: Explicit tuning job name, or None for an auto-generated one.
        warm_start_config: Optional warm-start configuration for a new tuner.
        wait_till_terminal: Block until the job reaches a terminal state.
        max_jobs: Total training jobs the tuner may launch.
        max_parallel_jobs: Training jobs allowed to run concurrently.
        early_stopping_type: Early stopping mode ('Off' or 'Auto').

    Returns:
        The (possibly newly created) HyperparameterTuner.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        if not tuner:
            tuner = HyperparameterTuner(
                estimator=kmeans_estimator,
                objective_metric_name='test:msd',
                hyperparameter_ranges=hyperparameter_ranges,
                objective_type='Minimize',
                max_jobs=max_jobs,
                max_parallel_jobs=max_parallel_jobs,
                warm_start_config=warm_start_config,
                early_stopping_type=early_stopping_type)

        records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
        test_record_set = kmeans_estimator.record_set(
            kmeans_train_set[0][:100], channel='test')

        tuner.fit([records, test_record_set], job_name=job_name)
        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        if wait_till_terminal:
            tuner.wait()

    return tuner
def test_tuning_mxnet(sagemaker_session):
    """Tune an MXNet MNIST estimator on learning_rate, then deploy the best
    training job and run a smoke prediction against the endpoint."""
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        objective_metric_name = 'Validation-accuracy'
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{'Name': 'Validation-accuracy',
                               'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=4,
                                    max_parallel_jobs=2)

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # Deploy the best model and make sure the endpoint answers a prediction.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def test_marketplace_tuning_job(sagemaker_session):
    """Run a small tuning job against an AWS Marketplace algorithm."""
    data_path = os.path.join(DATA_DIR, 'marketplace', 'training')
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    # Marketplace algorithm ARNs are region/account specific.
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(algorithm_arn=algorithm_arn,
                                  role='SageMakerRole',
                                  train_instance_count=1,
                                  train_instance_type='ml.c4.xlarge',
                                  sagemaker_session=sagemaker_session,
                                  base_job_name='test-marketplace')

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix='integ-test-data/marketplace/train')

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {'max_leaf_nodes': IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(estimator=mktplace,
                                base_tuning_job_name='byo',
                                objective_metric_name='validation:accuracy',
                                hyperparameter_ranges=hyperparameter_ranges,
                                max_jobs=2,
                                max_parallel_jobs=2)

    # NOTE(review): include_cls_metadata=False presumably keeps the SDK's
    # class-metadata hyperparameters out of the request, which bring-your-own
    # algorithm containers may reject — confirm against the tuner docs.
    tuner.fit({'training': train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
def test_marketplace_tuning_job(sagemaker_session, cpu_instance_type):
    """Run a small tuning job against an AWS Marketplace algorithm."""
    data_path = os.path.join(DATA_DIR, "marketplace", "training")
    region = sagemaker_session.boto_region_name
    account = REGION_ACCOUNT_MAP[region]
    # Marketplace algorithm ARNs are region/account specific.
    algorithm_arn = ALGORITHM_ARN % (region, account)

    mktplace = AlgorithmEstimator(
        algorithm_arn=algorithm_arn,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-marketplace",
    )

    train_input = mktplace.sagemaker_session.upload_data(
        path=data_path, key_prefix="integ-test-data/marketplace/train")

    mktplace.set_hyperparameters(max_leaf_nodes=10)

    hyperparameter_ranges = {"max_leaf_nodes": IntegerParameter(1, 100000)}

    tuner = HyperparameterTuner(
        estimator=mktplace,
        base_tuning_job_name="byo",
        objective_metric_name="validation:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # NOTE(review): include_cls_metadata=False presumably keeps the SDK's
    # class-metadata hyperparameters out of the request, which bring-your-own
    # algorithm containers may reject — confirm against the tuner docs.
    tuner.fit({"training": train_input}, include_cls_metadata=False)
    time.sleep(15)
    tuner.wait()
def _tune(
    kmeans_estimator,
    kmeans_train_set,
    tuner=None,
    hyperparameter_ranges=None,
    job_name=None,
    warm_start_config=None,
    wait=True,
    max_jobs=2,
    max_parallel_jobs=2,
    early_stopping_type="Off",
):
    """Start (or reuse) a KMeans hyperparameter tuning job and return the tuner.

    When no tuner is supplied, one is created that minimizes the "test:msd"
    objective. The first 100 training records serve as both the train and the
    test channel.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        if not tuner:
            # No tuner supplied: build one over the given ranges.
            tuner = HyperparameterTuner(
                estimator=kmeans_estimator,
                objective_metric_name="test:msd",
                hyperparameter_ranges=hyperparameter_ranges,
                objective_type="Minimize",
                max_jobs=max_jobs,
                max_parallel_jobs=max_parallel_jobs,
                warm_start_config=warm_start_config,
                early_stopping_type=early_stopping_type,
            )

        sample = kmeans_train_set[0][:100]
        train_records = kmeans_estimator.record_set(sample)
        eval_records = kmeans_estimator.record_set(sample, channel="test")

        print(f"Started hyperparameter tuning job with name: {job_name}")

        tuner.fit([train_records, eval_records], job_name=job_name, wait=wait)

    return tuner
def test_attach_tuning_pytorch(sagemaker_session):
    """Tune a PyTorch MNIST estimator, re-attach to the finished tuning job by
    name, and verify the attached tuner can deploy and serve predictions."""
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    mnist_script = os.path.join(mnist_dir, "mnist.py")

    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        train_instance_count=1,
        py_version=PYTHON_VERSION,
        train_instance_type="ml.c4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = "evaluation-accuracy"
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{
            "Name": "evaluation-accuracy",
            "Regex": r"Overall test accuracy: (\d+)"
        }]
        hyperparameter_ranges = {"batch-size": IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )

        tuning_job_name = unique_name_from_base("pytorch", max_length=32)
        tuner.fit({"training": training_data}, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    # The attached tuner must reflect the original job's configuration.
    assert attached_tuner.early_stopping_type == "Auto"

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, "ml.c4.xlarge")
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)
        assert output.shape == (batch_size, 10)
def test_tuning_mxnet(sagemaker_session):
    """Tune an MXNet MNIST estimator on learning_rate, then deploy the best
    training job and run a smoke prediction against the endpoint."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(entry_point=script_path,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        objective_metric_name = 'Validation-accuracy'
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{'Name': 'Validation-accuracy',
                               'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=4,
                                    max_parallel_jobs=2)

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # Deploy the best model and make sure the endpoint answers a prediction.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def test_tuning_tf_vpc_multi(
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = cpu_instance_type
    instance_count = 2

    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = "mnist.py"

    # Reuse (or create) VPC resources and allow encrypted inter-container traffic.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        source_dir=resource_path,
        role="SageMakerRole",
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    # Objective metric is scraped from the training logs via this regex.
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"),
            key_prefix="scriptmode/mnist")

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        print(
            f"Started hyperparameter tuning job with name: {tuning_job_name}")
        tuner.fit(inputs, job_name=tuning_job_name)
def test_tuning_lda(sagemaker_session):
    """Tune an LDA estimator over alpha0/num_topics, then deploy the best
    model and check that predictions carry a topic_mixture label."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'lda')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        feature_num = int(
            all_records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole',
                  train_instance_type='ml.c4.xlarge',
                  num_topics=10,
                  sagemaker_session=sagemaker_session,
                  base_job_name='test-lda')

        # Same local data backs both the train and test channels.
        record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set.channel = 'test'

        # specify which hp you want to optimize over
        hyperparameter_ranges = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2)
        }
        objective_metric_name = 'test:pwll'

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2)

        tuner.fit([record_set, test_record_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
def test_tuning_mxnet(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Tune an MXNet MNIST estimator on learning-rate, then deploy the best
    training job and run a smoke prediction against the endpoint."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            framework_version=mxnet_training_latest_version,
            sagemaker_session=sagemaker_session,
        )

        hyperparameter_ranges = {
            "learning-rate": ContinuousParameter(0.01, 0.2)
        }

        objective_metric_name = "Validation-accuracy"
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{
            "Name": "Validation-accuracy",
            "Regex": "Validation-accuracy=([0-9\\.]+)"
        }]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2,
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        tuning_job_name = unique_name_from_base("tune-mxnet", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit({
            "train": train_input,
            "test": test_input
        }, job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    # Deploy the best model and make sure the endpoint answers a prediction.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(data)
def test_attach_tuning_pytorch(sagemaker_session):
    """Tune a PyTorch MNIST estimator, re-attach to the finished tuning job by
    name, and verify the attached tuner can deploy and serve predictions."""
    mnist_dir = os.path.join(DATA_DIR, 'pytorch_mnist')
    mnist_script = os.path.join(mnist_dir, 'mnist.py')

    estimator = PyTorch(entry_point=mnist_script,
                        role='SageMakerRole',
                        train_instance_count=1,
                        py_version=PYTHON_VERSION,
                        train_instance_type='ml.c4.xlarge',
                        sagemaker_session=sagemaker_session)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = 'evaluation-accuracy'
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{
            'Name': 'evaluation-accuracy',
            'Regex': r'Overall test accuracy: (\d+)'
        }]
        hyperparameter_ranges = {'batch-size': IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2,
                                    max_parallel_jobs=2,
                                    early_stopping_type='Auto')

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'training'),
            key_prefix='integ-test-data/pytorch_mnist/training')

        tuning_job_name = unique_name_from_base('pytorch', max_length=32)
        tuner.fit({'training': training_data}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    # The attached tuner must reflect the original job's configuration.
    assert attached_tuner.early_stopping_type == 'Auto'

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)
        assert output.shape == (batch_size, 10)
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Tune KMeans reading FSx-for-Lustre record sets from inside a VPC and
    assert a best training job is reported."""
    subnets = [efs_fsx_setup.subnet_id]
    security_group_ids = efs_fsx_setup.security_group_ids
    role = efs_fsx_setup.role_name

    kmeans = KMeans(
        role=role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        # Both channels read the same FSx directory; only the channel differs.
        file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
        train_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )
        test_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
            channel="test",
        )

        job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([train_records, test_records], job_name=job_name)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
def test_tuning_tf(sagemaker_session):
    """Tune a TensorFlow iris estimator on learning_rate, deploy the best
    model, and check dict and list inputs predict identically."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

        estimator = TensorFlow(
            entry_point=script_path,
            role="SageMakerRole",
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={"input_tensor_name": "inputs"},
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
        hyperparameter_ranges = {
            "learning_rate": ContinuousParameter(0.05, 0.2)
        }

        objective_metric_name = "loss"
        # Objective metric is scraped from the training logs via this regex.
        metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            objective_type="Minimize",
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, "ml.c4.xlarge")

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = predictor.predict({"inputs": features})
        print("predict result: {}".format(dict_result))
        list_result = predictor.predict(features)
        print("predict result: {}".format(list_result))

        assert dict_result == list_result
def test_tuning_kmeans(sagemaker_session):
    """Tune a KMeans estimator on MNIST, deploy the best model, and
    sanity-check cluster predictions from the endpoint."""
    with timeout(minutes=20):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        # Fixed: was `kmeans.max_iterators = 1`, which is not a KMeans
        # hyperparameter attribute and was silently ignored; the estimator's
        # attribute is `max_iterations`.
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans,
                                    objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    objective_type='Minimize',
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # Deploy the best model and sanity-check the labels it returns.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def test_tuning_tf_vpc_multi(sagemaker_session):
    """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
    instance_type = "ml.c4.xlarge"
    instance_count = 2

    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

    # Reuse (or create) VPC resources and allow encrypted inter-container traffic.
    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
        ec2_client, sagemaker_session.boto_region_name)
    vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        training_steps=1,
        evaluation_steps=1,
        hyperparameters={"input_tensor_name": "inputs"},
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        base_job_name="test-vpc-tf",
        subnets=subnet_ids,
        security_group_ids=[security_group_id],
        encrypt_inter_container_traffic=True,
    )

    inputs = sagemaker_session.upload_data(
        path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}

    objective_metric_name = "loss"
    # Objective metric is scraped from the training logs via this regex.
    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name:" + tuning_job_name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()
def test_tuning_kmeans(sagemaker_session):
    """Tune a KMeans estimator on MNIST, deploy the best model, and
    sanity-check cluster predictions from the endpoint."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        # Fixed: was `kmeans.max_iterators = 1`, which is not a KMeans
        # hyperparameter attribute and was silently ignored; the estimator's
        # attribute is `max_iterations`.
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans,
                                    objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    objective_type='Minimize',
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # Deploy the best model and sanity-check the labels it returns.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def test_tuning_chainer(sagemaker_session):
    """Tune a Chainer MNIST estimator on alpha, deploy the best model, and
    check the endpoint accepts several input shapes."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            py_version=PYTHON_VERSION,
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        # Parses the accuracy column out of Chainer's progress-bar log output.
        metric_definitions = [
            {'Name': 'Validation-accuracy',
             'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator,
                                    objective_metric_name,
                                    hyperparameter_ranges,
                                    metric_definitions,
                                    max_jobs=2,
                                    max_parallel_jobs=2)

        tuning_job_name = unique_name_from_base('chainer', max_length=32)
        tuner.fit({'train': train_input, 'test': test_input}, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        # Give the tuning job time to register before blocking on completion.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size
def test_tuning_tf_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Tune a TensorFlow estimator whose training data is read from an
    FSx for Lustre file system instead of S3.
    """
    role = efs_fsx_setup["role_name"]
    subnets = [efs_fsx_setup["subnet_id"]]
    security_group_ids = efs_fsx_setup["security_group_ids"]

    # VPC config (subnets + security groups) is required so the training
    # instances can mount the FSx file system.
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=role,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    # Pull "accuracy = <float>" lines out of the training log for the objective.
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]
    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=MAX_JOBS,
        max_parallel_jobs=MAX_PARALLEL_JOBS,
    )

    file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
    file_system_input = FileSystemInput(file_system_id=file_system_fsx_id,
                                        file_system_type="FSxLustre",
                                        directory_path=FSX_DIR_PATH)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuning_job_name = unique_name_from_base(
            "test-tuning-tf-script-mode-lustre", max_length=32)
        tuner.fit(file_system_input, job_name=tuning_job_name)
        time.sleep(15)
        tuner.wait()
    best_training_job = tuner.best_training_job()
    # A completed tuning job must report a (non-empty) best training job name.
    assert best_training_job
def test_tuning_tf(sagemaker_session):
    """Tune a TensorFlow iris classifier over the learning rate, deploy the
    best model, and check dict vs. list prediction inputs agree.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='tune-tf')

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        hyperparameter_ranges = {
            'learning_rate': ContinuousParameter(0.05, 0.2)
        }

        objective_metric_name = 'loss'
        # "loss = <float>" lines in the training log feed the tuning objective.
        metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name,
                                    hyperparameter_ranges, metric_definitions,
                                    objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit(inputs)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        features = [6.4, 3.2, 4.5, 1.5]
        # Both dict-wrapped and bare-list payloads should produce the same result.
        dict_result = predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
def sagemaker_hyperparam_tuning(sm_estimator, train_s3, hyperparameter_ranges,
                                metric_definitions, tuning_job_name, max_jobs,
                                max_parallel_jobs):
    """Kick off an asynchronous hyperparameter tuning job that minimizes
    the 'validation:error' metric.

    Args:
        sm_estimator: configured SageMaker estimator to tune.
        train_s3: training input channel(s) for fit().
        hyperparameter_ranges: dict of parameter ranges to search.
        metric_definitions: log-regex definitions for the objective metric.
        tuning_job_name: explicit name for the tuning job.
        max_jobs: total training jobs the tuner may launch.
        max_parallel_jobs: training jobs allowed to run concurrently.
    """
    job = HyperparameterTuner(
        estimator=sm_estimator,
        objective_metric_name='validation:error',
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=metric_definitions,
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
        objective_type='Minimize',
    )
    # wait=False returns immediately; the caller tracks job completion itself.
    job.fit(train_s3, job_name=tuning_job_name, wait=False)
def test_stop_tuning_job(sagemaker_session, cpu_instance_type):
    """Start a RandomCutForest tuning job asynchronously, stop it shortly after,
    and verify the job transitions to 'Stopping'.
    """
    feature_num = 14
    train_input = np.random.rand(1000, feature_num)

    rcf = RandomCutForest(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        num_trees=50,
        num_samples_per_tree=20,
        sagemaker_session=sagemaker_session,
    )

    records = rcf.record_set(train_input)
    records.distribution = "FullyReplicated"

    # The same data is reused as the 'test' channel for the objective metric.
    test_records = rcf.record_set(train_input, channel="test")
    test_records.distribution = "FullyReplicated"

    hyperparameter_ranges = {
        "num_trees": IntegerParameter(50, 100),
        "num_samples_per_tree": IntegerParameter(1, 2),
    }

    objective_metric_name = "test:f1"
    tuner = HyperparameterTuner(
        estimator=rcf,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Maximize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    tuning_job_name = unique_name_from_base("test-randomcutforest", max_length=32)
    # wait=False so the job can be stopped while it is still in flight.
    tuner.fit([records, test_records], tuning_job_name, wait=False)

    time.sleep(15)

    latest_tuning_job_name = tuner.latest_tuning_job.name

    print("Attempting to stop {}".format(latest_tuning_job_name))

    tuner.stop_tuning_job()

    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=latest_tuning_job_name)
    # Stop is asynchronous; immediately after the call the job reports 'Stopping'.
    assert desc["HyperParameterTuningJobStatus"] == "Stopping"
def test_tuning_chainer(sagemaker_session):
    """Tune a Chainer MNIST estimator over alpha, then deploy the best model
    and smoke-test prediction with several input shapes.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        # Upload local train/test data to S3 so the tuning jobs can consume it.
        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        # FIX: raw string — the previous non-raw literal relied on `\[`, `\s`, `\d`
        # being unrecognized escapes (a DeprecationWarning, and a SyntaxWarning on
        # Python 3.12+). The string value is unchanged.
        metric_definitions = [
            {'Name': 'Validation-accuracy',
             'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name,
                                    hyperparameter_ranges, metric_definitions,
                                    max_jobs=2, max_parallel_jobs=2)

        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        # The serving container should accept flat, NCHW, and HW-only input shapes.
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size
def test_tuning_tf_script_mode(sagemaker_session, cpu_instance_type, tf_full_version):
    """Tune a script-mode TensorFlow MNIST estimator over epochs and wait for
    the tuning job to complete.
    """
    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
    script_path = os.path.join(resource_path, "mnist.py")

    estimator = TensorFlow(
        entry_point=script_path,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        script_mode=True,
        sagemaker_session=sagemaker_session,
        py_version=PYTHON_VERSION,
        framework_version=tf_full_version,
    )

    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
    objective_metric_name = "accuracy"
    # "accuracy = <float>" lines in the training log feed the tuning objective.
    metric_definitions = [{
        "Name": objective_metric_name,
        "Regex": "accuracy = ([0-9\\.]+)"
    }]

    tuner = HyperparameterTuner(
        estimator,
        objective_metric_name,
        hyperparameter_ranges,
        metric_definitions,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist")

        tuning_job_name = unique_name_from_base("tune-tf-script-mode", max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)

        print("Started hyperparameter tuning job with name: " + tuning_job_name)

        time.sleep(15)
        tuner.wait()
def test_tuning_lda(sagemaker_session):
    """Tune an LDA estimator on a pre-built record file, deploy the best model,
    and verify predictions include a topic mixture.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'lda')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-lda')

        # Same local files feed both the 'train' and 'test' channels.
        record_set = prepare_record_set_from_local_files(data_path, lda.data_location,
                                                         len(all_records), feature_num,
                                                         sagemaker_session)
        test_record_set = prepare_record_set_from_local_files(data_path, lda.data_location,
                                                              len(all_records), feature_num,
                                                              sagemaker_session)
        test_record_set.channel = 'test'

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'alpha0': ContinuousParameter(1, 10),
                                 'num_topics': IntegerParameter(1, 2)}
        objective_metric_name = 'test:pwll'

        tuner = HyperparameterTuner(estimator=lda, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    objective_type='Maximize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([record_set, test_record_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
def test_tuning_step_with_single_algo_tuner(pipeline_session, entry_point):
    """Build a pipeline TuningStep from a single-algorithm tuner and verify the
    rendered pipeline definition contains the step's arguments.
    """
    inputs = TrainingInput(
        s3_data=f"s3://{pipeline_session.default_bucket()}/training-data")

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    hyperparameter_ranges = {
        "batch-size": IntegerParameter(64, 128),
    }

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{
            "Name": "test:acc",
            "Regex": "Overall test accuracy: (.*?);"
        }],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # Under a PipelineSession, fit() does not launch a job: it returns step
    # arguments and emits exactly one UserWarning saying so.
    with warnings.catch_warnings(record=True) as w:
        step_args = tuner.fit(inputs=inputs)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    # Constructing the step from step_args must be warning-free.
    with warnings.catch_warnings(record=True) as w:
        step = TuningStep(
            name="MyTuningStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
def test_tuning_tf(sagemaker_session):
    """Tune a TensorFlow iris classifier over the learning rate, deploy the
    best model, and check dict vs. list prediction inputs agree.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='tune-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH,
                                               key_prefix='integ-test-data/tf_iris')
        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.05, 0.2)}

        objective_metric_name = 'loss'
        # "loss = <float>" lines in the training log feed the tuning objective.
        metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name,
                                    hyperparameter_ranges, metric_definitions,
                                    objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit(inputs)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        features = [6.4, 3.2, 4.5, 1.5]
        # Both dict-wrapped and bare-list payloads should produce the same result.
        dict_result = predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
def test_tuning_tf_script_mode(sagemaker_session):
    """Tune a script-mode TensorFlow MNIST estimator (latest framework version)
    over epochs and wait for the tuning job to complete.
    """
    resource_path = os.path.join(DATA_DIR, 'tensorflow_mnist')
    script_path = os.path.join(resource_path, 'mnist.py')

    estimator = TensorFlow(entry_point=script_path,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type='ml.m4.xlarge',
                           script_mode=True,
                           sagemaker_session=sagemaker_session,
                           py_version=PYTHON_VERSION,
                           framework_version=TensorFlow.LATEST_VERSION)

    hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
    objective_metric_name = 'accuracy'
    # "accuracy = <float>" lines in the training log feed the tuning objective.
    metric_definitions = [{
        'Name': objective_metric_name,
        'Regex': 'accuracy = ([0-9\\.]+)'
    }]

    tuner = HyperparameterTuner(estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'data'), key_prefix='scriptmode/mnist')

        tuning_job_name = unique_name_from_base('tune-tf-script-mode', max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name: ' + tuning_job_name)

        time.sleep(15)
        tuner.wait()
def test_mxnet_tuning(sagemaker_session, mxnet_full_version):
    """Launch an MXNet MNIST hyperparameter tuning job over the learning rate."""
    with timeout(minutes=15):
        tuning_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        mxnet_estimator = MXNet(entry_point=tuning_script,
                                role='SageMakerRole',
                                framework_version=mxnet_full_version,
                                train_instance_count=1,
                                train_instance_type='ml.m4.xlarge',
                                sagemaker_session=sagemaker_session,
                                base_job_name='hpo')

        # Search only over the learning rate; accuracy is scraped from the logs.
        ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}
        metric_defs = [{'Name': 'Validation-accuracy',
                        'Regex': 'Validation-accuracy=([0-9\\.]+)'}]

        job = HyperparameterTuner(mxnet_estimator,
                                  'Validation-accuracy',
                                  ranges,
                                  metric_defs,
                                  max_jobs=4,
                                  max_parallel_jobs=2)

        s3_train = mxnet_estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        s3_test = mxnet_estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        job.fit({'train': s3_train, 'test': s3_test})
        print('tuning job successfully created: {}'.format(job.latest_tuning_job.name))
def test_attach_tuning_pytorch(sagemaker_session):
    """Tune a PyTorch MNIST estimator, re-attach to the finished tuning job by
    name, and verify the attached tuner can deploy the best model and predict.
    """
    mnist_dir = os.path.join(DATA_DIR, 'pytorch_mnist')
    mnist_script = os.path.join(mnist_dir, 'mnist.py')

    estimator = PyTorch(entry_point=mnist_script, role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        sagemaker_session=sagemaker_session)

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = 'evaluation-accuracy'
        # FIX: raw string — the previous non-raw literal relied on `\d` being an
        # unrecognized escape (a DeprecationWarning, and a SyntaxWarning on
        # Python 3.12+). The string value is unchanged.
        metric_definitions = [{'Name': 'evaluation-accuracy',
                               'Regex': r'Overall test accuracy: (\d+)'}]
        hyperparameter_ranges = {'batch-size': IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(estimator, objective_metric_name,
                                    hyperparameter_ranges, metric_definitions,
                                    max_jobs=2, max_parallel_jobs=2)

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'training'),
            key_prefix='integ-test-data/pytorch_mnist/training')
        tuner.fit({'training': training_data})

        tuning_job_name = tuner.latest_tuning_job.name

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    # Re-attaching by name must reconstruct a usable tuner from the service state.
    attached_tuner = HyperparameterTuner.attach(tuning_job_name,
                                                sagemaker_session=sagemaker_session)
    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def test_tuning(sagemaker_session, image_uri, instance_type, framework_version):
    """Run a small TensorFlow MNIST tuning job against the given image and wait
    for it to finish.
    """
    base_dir = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
    entry_script = os.path.join(base_dir, "mnist", "mnist.py")

    tf_estimator = TensorFlow(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=1,
        sagemaker_session=sagemaker_session,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
    )

    # Search over epochs only; accuracy is scraped from the training log.
    epoch_range = {"epochs": IntegerParameter(1, 2)}
    metric = "accuracy"
    metric_defs = [{"Name": metric, "Regex": "accuracy = ([0-9\\.]+)"}]

    job = HyperparameterTuner(
        tf_estimator,
        metric,
        epoch_range,
        metric_defs,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=20):
        data_s3 = tf_estimator.sagemaker_session.upload_data(
            path=os.path.join(base_dir, "mnist", "data"), key_prefix="scriptmode/mnist"
        )
        job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32)
        job.fit(data_s3, job_name=job_name)
        job.wait()
def test_tuning_byo_estimator(sagemaker_session):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparameters as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the MNIST training split; only used later as prediction input.
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
                                    objective_metric_name='test:binary_classification_accuracy',
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    max_jobs=2, max_parallel_jobs=2)

        # include_cls_metadata=False because a BYO container image has no SDK
        # estimator class to re-attach to.
        tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
        # Swap in JSON (de)serialization for the factorization-machines endpoint.
        predictor.serializer = _fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None