def test_run_hyperparameter_tuning_job_with_fail_raises(
    self,
    create_hyperparameter_tuning_job_mock,
    get_hyperparameter_tuning_job_mock_with_fail,
    sync,
):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]
            ),
            "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    with pytest.raises(RuntimeError):
        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            sync=sync,
        )

        job.wait()

    expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto()

    create_hyperparameter_tuning_job_mock.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
    )

    assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED
def tune_hyperparameters(
    project: str,
    location: str,
    container_uri: str,
    training_file_path: str,
    validation_file_path: str,
    staging_bucket: str,
    max_trial_count: int,
    parallel_trial_count: int,
) -> NamedTuple(
    'Outputs',
    [("best_accuracy", float), ("best_alpha", float), ("best_max_iter", int)],
):

    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt

    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

    worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": "n1-standard-4",
                "accelerator_type": "NVIDIA_TESLA_K80",
                "accelerator_count": 1,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": container_uri,
                "args": [
                    f"--training_dataset_path={training_file_path}",
                    f"--validation_dataset_path={validation_file_path}",
                    "--hptune",
                ],
            },
        }
    ]

    custom_job = aiplatform.CustomJob(
        display_name='covertype_kfp_trial_job',
        worker_pool_specs=worker_pool_specs,
    )

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name='covertype_kfp_tuning_job',
        custom_job=custom_job,
        metric_spec={
            'accuracy': 'maximize',
        },
        parameter_spec={
            'alpha': hpt.DoubleParameterSpec(min=1.0e-4, max=1.0e-1, scale='linear'),
            'max_iter': hpt.DiscreteParameterSpec(values=[1, 2], scale='linear'),
        },
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
    )

    hp_job.run()

    metrics = [trial.final_measurement.metrics[0].value for trial in hp_job.trials]
    best_trial = hp_job.trials[metrics.index(max(metrics))]

    best_accuracy = float(best_trial.final_measurement.metrics[0].value)
    best_alpha = float(best_trial.parameters[0].value)
    best_max_iter = int(best_trial.parameters[1].value)

    return best_accuracy, best_alpha, best_max_iter
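# A minimal sketch (not part of the original sources) of how the
# tune_hyperparameters function above could be wrapped as a reusable Kubeflow
# Pipelines component, assuming the kfp v1 SDK is installed. The base image
# and package list here are illustrative assumptions.
from kfp.components import create_component_from_func

tune_hyperparameters_op = create_component_from_func(
    tune_hyperparameters,
    base_image="python:3.9",  # assumed image; any image with a Python 3 runtime works
    packages_to_install=["google-cloud-aiplatform"],  # SDK needed inside the container
)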
def test_create_custom_job_with_tensorboard(
    self, create_custom_job_v1beta1_mock, get_custom_job_mock, sync
):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        tensorboard=_TEST_TENSORBOARD_NAME,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=sync,
    )

    job.wait()

    expected_custom_job = _get_custom_job_proto(version="v1beta1")

    create_custom_job_v1beta1_mock.assert_called_once_with(
        parent=_TEST_PARENT, custom_job=expected_custom_job
    )

    expected_custom_job = _get_custom_job_proto()

    assert job.job_spec == expected_custom_job.job_spec
    assert (
        job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    )
def test_run_custom_job_with_fail_raises(
    self, create_custom_job_mock, get_custom_job_mock_with_fail, sync
):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
    )

    with pytest.raises(RuntimeError):
        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            sync=sync,
        )

        job.wait()

    expected_custom_job = _get_custom_job_proto()

    create_custom_job_mock.assert_called_once_with(
        parent=_TEST_PARENT, custom_job=expected_custom_job
    )

    assert job.job_spec == expected_custom_job.job_spec
    assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED
def test_check_custom_job_availability(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        base_output_dir=_TEST_BASE_OUTPUT_DIR,
        labels=_TEST_LABELS,
    )

    assert not job._resource_is_available
    assert job.__repr__().startswith(
        "<google.cloud.aiplatform.jobs.CustomJob object"
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
    )

    job.wait_for_resource_creation()

    assert job._resource_is_available
    assert "resource name" in job.__repr__()

    job.wait()
def test_no_staging_bucket_raises(self):
    aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION)

    with pytest.raises(RuntimeError):
        job = aiplatform.CustomJob(  # noqa: F841
            display_name=_TEST_DISPLAY_NAME,
            worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        )
def test_create_custom_job_with_enable_web_access(
    self,
    create_custom_job_mock_with_enable_web_access,
    get_custom_job_mock_with_enable_web_access,
    sync,
    caplog,
):
    caplog.set_level(logging.INFO)

    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        base_output_dir=_TEST_BASE_OUTPUT_DIR,
        labels=_TEST_LABELS,
    )

    job.run(
        enable_web_access=_TEST_ENABLE_WEB_ACCESS,
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=sync,
        create_request_timeout=None,
    )

    job.wait_for_resource_creation()

    job.wait()

    assert "workerpool0-0" in caplog.text

    assert job.resource_name == _TEST_CUSTOM_JOB_NAME

    expected_custom_job = _get_custom_job_proto_with_enable_web_access()

    create_custom_job_mock_with_enable_web_access.assert_called_once_with(
        parent=_TEST_PARENT,
        custom_job=expected_custom_job,
        timeout=None,
    )

    assert job.job_spec == expected_custom_job.job_spec
    assert (
        job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    )

    caplog.clear()
def test_custom_job_get_state_raises_without_run(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
    )

    with pytest.raises(RuntimeError):
        print(job.state)
def test_run_custom_job_with_fail_raises(
    self, create_custom_job_mock, get_custom_job_mock_with_fail, sync
):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        base_output_dir=_TEST_BASE_OUTPUT_DIR,
        labels=_TEST_LABELS,
    )

    with pytest.raises(RuntimeError) as e:
        job.wait_for_resource_creation()
    assert e.match(r"CustomJob resource is not scheduled to be created.")

    with pytest.raises(RuntimeError):
        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            sync=sync,
            create_request_timeout=None,
        )

        job.wait()

    # shouldn't fail
    job.wait_for_resource_creation()
    assert job.resource_name == _TEST_CUSTOM_JOB_NAME

    expected_custom_job = _get_custom_job_proto()

    create_custom_job_mock.assert_called_once_with(
        parent=_TEST_PARENT,
        custom_job=expected_custom_job,
        timeout=None,
    )

    assert job.job_spec == expected_custom_job.job_spec
    assert job.state == gca_job_state_compat.JobState.JOB_STATE_FAILED
def test_create_custom_job_without_base_output_dir(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
    )

    assert job.job_spec.base_output_directory.output_uri_prefix.startswith(
        f"{_TEST_STAGING_BUCKET}/aiplatform-custom-job"
    )
def test_project_id_inference(self, shared_state):
    # Collection of resources generated by this test, to be deleted during teardown
    shared_state["resources"] = []

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
    )

    worker_pool_specs = [
        {
            "machine_spec": {"machine_type": "n1-standard-4"},
            "replica_count": 1,
            "container_spec": {
                "image_uri": "python:3.9",
                "command": [
                    "sh",
                    "-exc",
                    """python3 -m pip install git+https://github.com/googleapis/python-aiplatform@main
                    "$0" "$@"
                    """,
                    "python3",
                    "-c",
                    _SCRIPT,
                ],
                "args": [],
            },
        }
    ]

    custom_job = aiplatform.CustomJob(
        display_name=self._make_display_name("custom"),
        worker_pool_specs=worker_pool_specs,
    )

    custom_job.run()

    shared_state["resources"].append(custom_job)

    assert custom_job.state == gca_job_state.JobState.JOB_STATE_SUCCEEDED
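# _SCRIPT above is defined elsewhere in the test module and is not shown here.
# A hypothetical stand-in (an assumption, not the actual _SCRIPT) that would
# exercise the same behavior: inside the CustomJob container, aiplatform.init()
# is called without an explicit project, so the SDK must infer it from the
# job's environment and credentials.
_HYPOTHETICAL_SCRIPT = """
from google.cloud import aiplatform

aiplatform.init()  # no project passed; it must be inferred inside the job
print(aiplatform.initializer.global_config.project)
"""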
def test_create_custom_job(self, create_custom_job_mock, get_custom_job_mock, sync):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        base_output_dir=_TEST_BASE_OUTPUT_DIR,
        labels=_TEST_LABELS,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=sync,
    )

    job.wait_for_resource_creation()

    assert job.resource_name == _TEST_CUSTOM_JOB_NAME

    job.wait()

    expected_custom_job = _get_custom_job_proto()

    create_custom_job_mock.assert_called_once_with(
        parent=_TEST_PARENT, custom_job=expected_custom_job
    )

    assert job.job_spec == expected_custom_job.job_spec
    assert (
        job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    )
    assert job.network == _TEST_NETWORK
def test_hyperparameter_tuning_job_get_state_raises_without_run(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]
            ),
            "batch_size": hpt.DiscreteParameterSpec(
                values=[16, 32, 64], scale="linear"
            ),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    with pytest.raises(RuntimeError):
        print(job.state)
def test_run_custom_job_with_fail_at_creation(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    job = aiplatform.CustomJob(
        display_name=_TEST_DISPLAY_NAME,
        worker_pool_specs=_TEST_WORKER_POOL_SPEC,
        base_output_dir=_TEST_BASE_OUTPUT_DIR,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=False,
    )

    with pytest.raises(RuntimeError) as e:
        job.wait_for_resource_creation()
    assert e.match("Mock fail")

    with pytest.raises(RuntimeError) as e:
        job.resource_name
    assert e.match(
        "CustomJob resource has not been created. Resource failed with: Mock fail"
    )

    with pytest.raises(RuntimeError) as e:
        job.network
    assert e.match(
        "CustomJob resource has not been created. Resource failed with: Mock fail"
    )
def test_create_hyperparameter_tuning_job_with_enable_web_access(
    self,
    create_hyperparameter_tuning_job_mock_with_enable_web_access,
    get_hyperparameter_tuning_job_mock_with_enable_web_access,
    sync,
    caplog,
):
    caplog.set_level(logging.INFO)

    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]
            ),
            "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        enable_web_access=test_custom_job._TEST_ENABLE_WEB_ACCESS,
        sync=sync,
        create_request_timeout=None,
    )

    job.wait()

    assert "workerpool0-0" in caplog.text

    expected_hyperparameter_tuning_job = (
        _get_hyperparameter_tuning_job_proto_with_enable_web_access()
    )

    create_hyperparameter_tuning_job_mock_with_enable_web_access.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
        timeout=None,
    )

    assert job.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    assert job.network == _TEST_NETWORK
    assert job.trials == []

    caplog.clear()
def hyperparameter_tuning_job_run_op(
    display_name: str,
    project: str,
    base_output_directory: str,
    worker_pool_specs: list,
    study_spec_metrics: dict,
    study_spec_parameters: list,
    max_trial_count: int,
    parallel_trial_count: int,
    max_failed_trial_count: int = 0,
    location: str = "us-central1",
    study_spec_algorithm: str = "ALGORITHM_UNSPECIFIED",
    study_spec_measurement_selection_type: str = "BEST_MEASUREMENT",
    encryption_spec_key_name: str = None,
    service_account: str = None,
    network: str = None,
) -> NamedTuple('Outputs', [
    ("trials", list),
]):
    """
    Creates a Google Cloud AI Platform HyperparameterTuning Job and waits for it
    to complete.

    For example usage, see
    https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/experimental/hyperparameter_tuning_job/hp_tuning_job_sample.ipynb.

    For more information on using hyperparameter tuning, please visit:
    https://cloud.google.com/vertex-ai/docs/training/using-hyperparameter-tuning

    Args:
        display_name (str):
            Required. The user-defined name of the HyperparameterTuningJob. The
            name can be up to 128 characters long and can consist of any UTF-8
            characters.
        project (str):
            Required. Project to run the HyperparameterTuningJob in.
        base_output_directory (str):
            Required. The Cloud Storage location to store the output of this
            HyperparameterTuningJob. The base_output_directory of each child
            CustomJob backing a Trial is set to a subdirectory with name as the
            trial id under its parent HyperparameterTuningJob's
            base_output_directory. The following Vertex AI environment variables
            will be passed to containers or python modules when this field is
            set. For a CustomJob backing a Trial of the HyperparameterTuningJob:

            * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
            * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
            * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
        worker_pool_specs (List[Dict]):
            Required. The spec of the worker pools including machine type and
            Docker image. All worker pools except the first one are optional and
            can be skipped by providing an empty value.
        study_spec_metrics (Dict[str, str]):
            Required. Dictionary representing metrics to optimize. The dictionary
            key is the metric_id, which is reported by your training job, and the
            dictionary value is the optimization goal of the metric ('minimize'
            or 'maximize'). Example:
            metrics = {'loss': 'minimize', 'accuracy': 'maximize'}
        study_spec_parameters (List[str]):
            Required. List serialized from the parameter dictionary. The
            dictionary represents parameters to optimize. The dictionary key is
            the parameter_id, which is passed into your training job as a command
            line keyword argument, and the dictionary value is the parameter
            specification of the metric. For example:

            from google.cloud.aiplatform import hyperparameter_tuning as hpt
            from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job

            parameters = hyperparameter_tuning_job.serialize_parameters({
                'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
                'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
                'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
                'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
            })

            Supported parameter specifications can be found in
            aiplatform.hyperparameter_tuning. These parameter specifications are
            currently supported: DoubleParameterSpec, IntegerParameterSpec,
            CategoricalParameterSpec, DiscreteParameterSpec.
        max_trial_count (int):
            Required. The desired total number of Trials.
        parallel_trial_count (int):
            Required. The desired number of Trials to run in parallel.
        max_failed_trial_count (Optional[int]):
            The number of failed Trials that need to be seen before failing the
            HyperparameterTuningJob. If set to 0, Vertex AI decides how many
            Trials must fail before the whole job fails.
        location (Optional[str]):
            Location to run the HyperparameterTuningJob in, defaults to
            "us-central1".
        study_spec_algorithm (Optional[str]):
            The search algorithm specified for the Study. Accepts one of the
            following:

            * `ALGORITHM_UNSPECIFIED` - If you do not specify an algorithm, your
              job uses the default Vertex AI algorithm. The default algorithm
              applies Bayesian optimization to arrive at the optimal solution
              with a more effective search over the parameter space.
            * `GRID_SEARCH` - A simple grid search within the feasible space.
              This option is particularly useful if you want to specify a
              quantity of trials that is greater than the number of points in
              the feasible space. In such cases, if you do not specify a grid
              search, the Vertex AI default algorithm may generate duplicate
              suggestions. To use grid search, all parameter specs must be of
              type `IntegerParameterSpec`, `CategoricalParameterSpec`, or
              `DiscreteParameterSpec`.
            * `RANDOM_SEARCH` - A simple random search within the feasible space.
        study_spec_measurement_selection_type (Optional[str]):
            This indicates which measurement to use if/when the service
            automatically selects the final measurement from previously reported
            intermediate measurements. Accepts: 'BEST_MEASUREMENT',
            'LAST_MEASUREMENT'. Choose this based on two considerations:
            A) Do you expect your measurements to monotonically improve? If so,
            choose 'LAST_MEASUREMENT'. On the other hand, if you're in a
            situation where your system can "over-train" and you expect the
            performance to get better for a while but then start declining,
            choose 'BEST_MEASUREMENT'.
            B) Are your measurements significantly noisy and/or irreproducible?
            If so, 'BEST_MEASUREMENT' will tend to be over-optimistic, and it may
            be better to choose 'LAST_MEASUREMENT'.
            If both or neither of (A) and (B) apply, it doesn't matter which
            selection type is chosen.
        encryption_spec_key_name (Optional[str]):
            Customer-managed encryption key options for a
            HyperparameterTuningJob. If this is set, then all resources created
            by the HyperparameterTuningJob will be encrypted with the provided
            encryption key. Has the form:
            ``projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key``.
            The key needs to be in the same region as where the compute resource
            is created.
        service_account (Optional[str]):
            Specifies the service account for workload run-as account. Users
            submitting jobs must have act-as permission on this run-as account.
        network (Optional[str]):
            The full name of the Compute Engine network to which the job should
            be peered. For example, projects/12345/global/networks/myVPC.
            Private services access must already be configured for the network.
            If left unspecified, the job is not peered with any network.

    Returns:
        List of HyperparameterTuningJob trials.
    """
    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt
    from google.cloud.aiplatform_v1.types import study
    from google.cloud.aiplatform.hyperparameter_tuning import _SCALE_TYPE_MAP

    # Reverse the _SCALE_TYPE_MAP dict for deserialization
    SCALE_MAP = dict((reversed(item) for item in _SCALE_TYPE_MAP.items()))

    PARAMETER_SPEC_MAP = {
        hpt.DoubleParameterSpec._parameter_spec_value_key: hpt.DoubleParameterSpec,
        hpt.IntegerParameterSpec._parameter_spec_value_key: hpt.IntegerParameterSpec,
        hpt.CategoricalParameterSpec._parameter_spec_value_key: hpt.CategoricalParameterSpec,
        hpt.DiscreteParameterSpec._parameter_spec_value_key: hpt.DiscreteParameterSpec,
    }

    ALGORITHM_MAP = {
        'ALGORITHM_UNSPECIFIED': None,
        'GRID_SEARCH': 'grid',
        'RANDOM_SEARCH': 'random',
    }

    MEASUREMENT_SELECTION_TYPE_MAP = {
        'BEST_MEASUREMENT': 'best',
        'LAST_MEASUREMENT': 'last',
    }

    aiplatform.init(
        project=project, location=location, staging_bucket=base_output_directory)

    # Deserialize the parameters
    parameters_kwargs = {}
    for parameter in study_spec_parameters:
        param = study.StudySpec.ParameterSpec.from_json(parameter)
        parameter_id = param.parameter_id
        param_attrs = {}
        for parameter_spec_value_key, parameter_spec in PARAMETER_SPEC_MAP.items():
            if getattr(param, parameter_spec_value_key):
                attrs = getattr(param, parameter_spec_value_key)
                for parameter, value in parameter_spec._parameter_value_map:
                    if hasattr(attrs, value):
                        param_attrs[parameter] = getattr(attrs, value)
                # Detect 'scale' in list of arguments to parameter_spec.__init__
                param_spec_code = parameter_spec.__init__.__code__
                if 'scale' in param_spec_code.co_varnames[:param_spec_code.co_argcount]:
                    param_attrs['scale'] = SCALE_MAP[param.scale_type]
                parameters_kwargs[parameter_id] = parameter_spec(
                    **param_attrs)  # pytype: disable=wrong-keyword-args
                break

    custom_job_display_name = display_name + '_custom_job'

    job = aiplatform.CustomJob(
        display_name=custom_job_display_name,
        staging_bucket=base_output_directory,
        worker_pool_specs=worker_pool_specs,
    )

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name=display_name,
        custom_job=job,
        metric_spec=study_spec_metrics,
        parameter_spec={**parameters_kwargs},
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        search_algorithm=ALGORITHM_MAP[study_spec_algorithm],
        measurement_selection=MEASUREMENT_SELECTION_TYPE_MAP[
            study_spec_measurement_selection_type
        ],
        encryption_spec_key_name=encryption_spec_key_name,
    )

    hp_job.run(
        service_account=service_account,
        network=network)

    trials = [study.Trial.to_json(trial) for trial in hp_job.trials]
    return trials  # pytype: disable=bad-return-type
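# A brief usage sketch, assuming the serialize_parameters helper referenced in
# the docstring above. The project, bucket, image URI, parameter spec, and
# trial counts are illustrative assumptions; the op is called here as a plain
# function for demonstration, whereas in practice it runs as a pipeline
# component.
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job

serialized_params = hyperparameter_tuning_job.serialize_parameters({
    'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
})

trials = hyperparameter_tuning_job_run_op(
    display_name='example-hp-job',           # assumed name
    project='my-project',                    # assumed project
    base_output_directory='gs://my-bucket/hp-output',  # assumed bucket
    worker_pool_specs=[{
        'machine_spec': {'machine_type': 'n1-standard-4'},
        'replica_count': 1,
        'container_spec': {'image_uri': 'gcr.io/my-project/trainer:latest'},
    }],
    study_spec_metrics={'accuracy': 'maximize'},
    study_spec_parameters=serialized_params,
    max_trial_count=10,
    parallel_trial_count=2,
)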
def test_run_hyperparameter_tuning_job_with_fail_at_creation(self):
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    custom_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=custom_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec={
            "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
            "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
            "activation": hpt.CategoricalParameterSpec(
                values=["relu", "sigmoid", "elu", "selu", "tanh"]
            ),
            "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
        },
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=False,
    )

    with pytest.raises(RuntimeError) as e:
        job.wait_for_resource_creation()
    assert e.match("Mock fail")

    with pytest.raises(RuntimeError) as e:
        job.resource_name
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )

    with pytest.raises(RuntimeError) as e:
        job.network
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )

    with pytest.raises(RuntimeError) as e:
        job.trials
    assert e.match(
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )