def test_run_hyperparameter_tuning_job_with_fail_raises(
    self,
    create_hyperparameter_tuning_job_mock,
    get_hyperparameter_tuning_job_mock_with_fail,
    sync,
):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]
            ),
            "batch_size": hpt.DiscreteParameterSpec(
                values=[16, 32], scale="linear"
            ),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    with pytest.raises(RuntimeError):
        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            sync=sync,
        )

        job.wait()

    expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto()

    create_hyperparameter_tuning_job_mock.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
    )

    assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED
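# The test above relies on a fixture that makes the polling call return a
# failed job, so job.wait() surfaces a RuntimeError. A minimal sketch of
# what such a fixture could look like -- the job_service_client module path
# and the reuse of _get_hyperparameter_tuning_job_proto() are assumptions,
# not the repo's actual fixture:
#
# @pytest.fixture
# def get_hyperparameter_tuning_job_mock_with_fail():
#     with mock.patch.object(
#         job_service_client.JobServiceClient, "get_hyperparameter_tuning_job"
#     ) as get_mock:
#         failed_job = _get_hyperparameter_tuning_job_proto()
#         failed_job.state = gca_job_state_compat.JobState.JOB_STATE_FAILED
#         get_mock.return_value = failed_job
#         yield get_mock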
def test_serialize_parameters(self):
    parameters = {
        'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
        'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
        'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
        'batch_size': hpt.DiscreteParameterSpec(
            values=[128, 256], scale='linear')
    }
    expected_outputs = [
        '{\n "parameterId": "lr",\n "doubleValueSpec": {\n'
        ' "minValue": 0.001,\n "maxValue": 0.1\n },\n'
        ' "scaleType": 2,\n "conditionalParameterSpecs": []\n}',
        '{\n "parameterId": "units",\n "integerValueSpec": {\n'
        ' "minValue": "4",\n "maxValue": "128"\n },\n'
        ' "scaleType": 1,\n "conditionalParameterSpecs": []\n}',
        '{\n "parameterId": "activation",\n "categoricalValueSpec": {\n'
        ' "values": [\n "relu",\n "selu"\n ]\n },\n'
        ' "scaleType": 0,\n "conditionalParameterSpecs": []\n}',
        '{\n "parameterId": "batch_size",\n "discreteValueSpec": {\n'
        ' "values": [\n 128.0,\n 256.0\n ]\n },\n'
        ' "scaleType": 1,\n "conditionalParameterSpecs": []\n}',
    ]
    outputs = serialize_parameters(parameters)
    self.assertEqual(outputs, expected_outputs)
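# One plausible shape for the serialize_parameters under test: render each
# SDK ParameterSpec as the camelCase proto-JSON the assertions expect.
# The private _to_parameter_spec helper and ._pb plumbing below are
# assumptions about SDK internals, not a confirmed implementation:
#
# from google.protobuf import json_format
#
# def serialize_parameters(parameters):
#     return [
#         json_format.MessageToJson(
#             spec._to_parameter_spec(parameter_id=name)._pb,
#             including_default_value_fields=True,
#         )
#         for name, spec in parameters.items()
#     ]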
def test_serialize_parameters(self):
    parameters = {
        'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
        'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
        'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
        'batch_size': hpt.DiscreteParameterSpec(
            values=[128, 256], scale='linear')
    }
    expected_outputs = [
        {
            'parameter_id': 'lr',
            'double_value_spec': {
                'min_value': 0.001,
                'max_value': 0.1
            },
            'scale_type': 2,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'units',
            'integer_value_spec': {
                'min_value': '4',
                'max_value': '128'
            },
            'scale_type': 1,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'activation',
            'categorical_value_spec': {
                'values': ['relu', 'selu']
            },
            'scale_type': 0,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'batch_size',
            'discrete_value_spec': {
                'values': [128.0, 256.0]
            },
            'scale_type': 1,
            'conditional_parameter_specs': []
        },
    ]
    outputs = serialize_parameters(parameters)
    self.assertEqual(outputs, expected_outputs)
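# The dict-based variant above suggests the same proto conversion dumped
# with snake_case field names preserved; the int64 bounds surfacing as
# strings ('4', '128') matches proto3 JSON rules. Again a hedged sketch
# over assumed private helpers, not the confirmed implementation:
#
# def serialize_parameters(parameters):
#     return [
#         json_format.MessageToDict(
#             spec._to_parameter_spec(parameter_id=name)._pb,
#             preserving_proto_field_name=True,
#             including_default_value_fields=True,
#         )
#         for name, spec in parameters.items()
#     ]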
def test_hyperparameter_tuning_job_get_state_raises_without_run(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]),
            "batch_size": hpt.DiscreteParameterSpec(
                values=[16, 32, 64], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    with pytest.raises(RuntimeError):
        print(job.state)
def test_run_hyperparameter_tuning_job_with_fail_at_creation(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]),
            "batch_size": hpt.DiscreteParameterSpec(
                values=[16, 32], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=False,
    )

    with pytest.raises(RuntimeError) as e:
        job.wait_for_resource_creation()
    assert e.match("Mock fail")

    with pytest.raises(RuntimeError) as e:
        job.resource_name
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )

    with pytest.raises(RuntimeError) as e:
        job.network
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )

    with pytest.raises(RuntimeError) as e:
        job.trials
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )
def do_hyperparameter_tuning(data_set, timestamp, develop_mode, cpu_only_mode, tf_version):
    # Vertex AI services require regional API endpoints.
    if cpu_only_mode:
        train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.{}:latest'.format(
            tf_version)
    else:
        train_image = "us-docker.pkg.dev/vertex-ai/training/tf-gpu.{}:latest".format(
            tf_version)

    # a single trial job
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    if cpu_only_mode:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket', BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs', '10',
                '--num_examples', '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4')
    else:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket', BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs', '10',
                '--num_examples', '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4',
            # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
        )

    # the tuning job
    hparam_job = aiplatform.HyperparameterTuningJob(
        # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
        display_name='hparam-{}'.format(model_display_name),
        custom_job=trial_job,
        metric_spec={'val_rmse': 'minimize'},
        parameter_spec={
            "train_batch_size": hpt.IntegerParameterSpec(min=16, max=256, scale='log'),
            "nbuckets": hpt.IntegerParameterSpec(min=5, max=10, scale='linear'),
            "dnn_hidden_units": hpt.CategoricalParameterSpec(
                values=["64,16", "64,16,4", "64,64,64,8", "256,64,16"])
        },
        max_trial_count=2 if develop_mode else NUM_HPARAM_TRIALS,
        parallel_trial_count=2,
        search_algorithm=None,  # Bayesian
    )

    hparam_job.run(sync=True)  # has to finish before we can get trials.

    # get the parameters corresponding to the best trial
    best = sorted(hparam_job.trials,
                  key=lambda x: x.final_measurement.metrics[0].value)[0]
    logging.info('Best trial: {}'.format(best))
    best_params = []
    for param in best.parameters:
        best_params.append('--{}'.format(param.parameter_id))

        if param.parameter_id in ["train_batch_size", "nbuckets"]:
            # hparam returns 10.0 even though it's an integer param. so round it.
            # but CustomTrainingJob makes integer args into floats. so make it a string
            best_params.append(str(int(round(param.value))))
        else:
            # string or float parameters
            best_params.append(param.value)

    # run the best trial to completion
    logging.info('Launching full training job with {}'.format(best_params))
    return train_custom_model(data_set, timestamp, develop_mode, cpu_only_mode,
                              tf_version, extra_args=best_params)
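# Self-contained illustration of the best-trial selection logic above,
# using stand-in objects (types.SimpleNamespace) instead of real
# aiplatform Trial protos; the attribute names (final_measurement,
# metrics[0].value, parameter_id, value) mirror the fields the function
# reads. A sketch for clarity, not part of the pipeline.
from types import SimpleNamespace


def _best_trial_selection_sketch():
    trials = [
        SimpleNamespace(
            final_measurement=SimpleNamespace(metrics=[SimpleNamespace(value=rmse)]),
            parameters=[SimpleNamespace(parameter_id='nbuckets', value=float(nb))],
        )
        for rmse, nb in [(4.2, 7), (3.9, 5), (5.1, 9)]
    ]
    # sort ascending because val_rmse is minimized; the first trial wins
    best = sorted(trials, key=lambda t: t.final_measurement.metrics[0].value)[0]
    assert int(round(best.parameters[0].value)) == 5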
def test_create_hyperparameter_tuning_job_with_enable_web_access(
    self,
    create_hyperparameter_tuning_job_mock_with_enable_web_access,
    get_hyperparameter_tuning_job_mock_with_enable_web_access,
    sync,
    caplog,
):
    caplog.set_level(logging.INFO)

    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]),
            "batch_size": hpt.DiscreteParameterSpec(
                values=[16, 32], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        enable_web_access=test_custom_job._TEST_ENABLE_WEB_ACCESS,
        sync=sync,
        create_request_timeout=None,
    )

    job.wait()

    assert "workerpool0-0" in caplog.text

    expected_hyperparameter_tuning_job = (
        _get_hyperparameter_tuning_job_proto_with_enable_web_access()
    )

    create_hyperparameter_tuning_job_mock_with_enable_web_access.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
        timeout=None,
    )

    assert job.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    assert job.network == _TEST_NETWORK
    assert job.trials == []

    caplog.clear()
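# Hedged sketch of the expected-proto helper referenced above, assuming
# _get_hyperparameter_tuning_job_proto() builds the base job proto and
# that enable_web_access lives on the trial job's CustomJobSpec; the
# repo's actual helper may differ:
#
# def _get_hyperparameter_tuning_job_proto_with_enable_web_access():
#     job_proto = _get_hyperparameter_tuning_job_proto()
#     job_proto.trial_job_spec.enable_web_access = (
#         test_custom_job._TEST_ENABLE_WEB_ACCESS
#     )
#     return job_proto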