Example #1
    def test_run_hyperparameter_tuning_job_with_fail_raises(
        self,
        create_hyperparameter_tuning_job_mock,
        get_hyperparameter_tuning_job_mock_with_fail,
        sync,
    ):
        aiplatform.init(
            project=_TEST_PROJECT,
            location=_TEST_LOCATION,
            staging_bucket=_TEST_STAGING_BUCKET,
            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
        )

        custom_job = aiplatform.CustomJob(
            display_name=test_custom_job._TEST_DISPLAY_NAME,
            worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
            base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
        )

        job = aiplatform.HyperparameterTuningJob(
            display_name=_TEST_DISPLAY_NAME,
            custom_job=custom_job,
            metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
            parameter_spec={
                "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
                "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
                "activation": hpt.CategoricalParameterSpec(
                    values=["relu", "sigmoid", "elu", "selu", "tanh"]
                ),
                "batch_size": hpt.DiscreteParameterSpec(
                    values=[16, 32], scale="linear"
                ),
            },
            parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
            max_trial_count=_TEST_MAX_TRIAL_COUNT,
            max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
            search_algorithm=_TEST_SEARCH_ALGORITHM,
            measurement_selection=_TEST_MEASUREMENT_SELECTION,
            labels=_TEST_LABELS,
        )

        with pytest.raises(RuntimeError):
            job.run(
                service_account=_TEST_SERVICE_ACCOUNT,
                network=_TEST_NETWORK,
                timeout=_TEST_TIMEOUT,
                restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
                sync=sync,
            )

            job.wait()

        expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto()

        create_hyperparameter_tuning_job_mock.assert_called_once_with(
            parent=_TEST_PARENT,
            hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
        )

        assert (
            job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED
        )
Example #2
    def test_serialize_parameters(self):
        parameters = {
            'lr':
                hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
            'units':
                hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
            'activation':
                hpt.CategoricalParameterSpec(values=['relu', 'selu']),
            'batch_size':
                hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
        }
        expected_outputs = [
          '{\n  "parameterId": "lr",\n  "doubleValueSpec": {\n'
          '    "minValue": 0.001,\n    "maxValue": 0.1\n  },\n'
          '  "scaleType": 2,\n  "conditionalParameterSpecs": []\n}',
          '{\n  "parameterId": "units",\n  "integerValueSpec": {\n'
          '    "minValue": "4",\n    "maxValue": "128"\n  },\n'
          '  "scaleType": 1,\n  "conditionalParameterSpecs": []\n}',
          '{\n  "parameterId": "activation",\n  "categoricalValueSpec": {\n'
          '    "values": [\n      "relu",\n      "selu"\n    ]\n  },\n'
          '  "scaleType": 0,\n  "conditionalParameterSpecs": []\n}',
          '{\n  "parameterId": "batch_size",\n  "discreteValueSpec": {\n'
          '    "values": [\n      128.0,\n      256.0\n    ]\n  },\n'
          '  "scaleType": 1,\n  "conditionalParameterSpecs": []\n}',
        ]

        outputs = serialize_parameters(parameters)
        self.assertEqual(outputs, expected_outputs)
Example #3
  def test_serialize_parameters(self):
    parameters = {
        'lr':
            hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
        'units':
            hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
        'activation':
            hpt.CategoricalParameterSpec(values=['relu', 'selu']),
        'batch_size':
            hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
    }
    expected_outputs = [
        {
            'parameter_id': 'lr',
            'double_value_spec': {
                'min_value': 0.001,
                'max_value': 0.1
            },
            'scale_type': 2,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'units',
            'integer_value_spec': {
                'min_value': '4',
                'max_value': '128'
            },
            'scale_type': 1,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'activation',
            'categorical_value_spec': {
                'values': ['relu', 'selu']
            },
            'scale_type': 0,
            'conditional_parameter_specs': []
        },
        {
            'parameter_id': 'batch_size',
            'discrete_value_spec': {
                'values': [128.0, 256.0]
            },
            'scale_type': 1,
            'conditional_parameter_specs': []
        },
    ]

    outputs = serialize_parameters(parameters)
    self.assertEqual(outputs, expected_outputs)
Example #4
    def test_hyperparameter_tuning_job_get_state_raises_without_run(self):
        aiplatform.init(
            project=_TEST_PROJECT,
            location=_TEST_LOCATION,
            staging_bucket=_TEST_STAGING_BUCKET,
            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
        )

        custom_job = aiplatform.CustomJob(
            display_name=test_custom_job._TEST_DISPLAY_NAME,
            worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
            base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
        )

        job = aiplatform.HyperparameterTuningJob(
            display_name=_TEST_DISPLAY_NAME,
            custom_job=custom_job,
            metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
            parameter_spec={
                "lr":
                hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
                "units":
                hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
                "activation":
                hpt.CategoricalParameterSpec(
                    values=["relu", "sigmoid", "elu", "selu", "tanh"]),
                "batch_size":
                hpt.DiscreteParameterSpec(values=[16, 32, 64], scale="linear"),
            },
            parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
            max_trial_count=_TEST_MAX_TRIAL_COUNT,
            max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
            search_algorithm=_TEST_SEARCH_ALGORITHM,
            measurement_selection=_TEST_MEASUREMENT_SELECTION,
        )

        with pytest.raises(RuntimeError):
            print(job.state)
Example #5
    def test_run_hyperparameter_tuning_job_with_fail_at_creation(self):
        aiplatform.init(
            project=_TEST_PROJECT,
            location=_TEST_LOCATION,
            staging_bucket=_TEST_STAGING_BUCKET,
            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
        )

        custom_job = aiplatform.CustomJob(
            display_name=test_custom_job._TEST_DISPLAY_NAME,
            worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
            base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
        )

        job = aiplatform.HyperparameterTuningJob(
            display_name=_TEST_DISPLAY_NAME,
            custom_job=custom_job,
            metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
            parameter_spec={
                "lr":
                hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
                "units":
                hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
                "activation":
                hpt.CategoricalParameterSpec(
                    values=["relu", "sigmoid", "elu", "selu", "tanh"]),
                "batch_size":
                hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
            },
            parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
            max_trial_count=_TEST_MAX_TRIAL_COUNT,
            max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
            search_algorithm=_TEST_SEARCH_ALGORITHM,
            measurement_selection=_TEST_MEASUREMENT_SELECTION,
        )

        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            sync=False,
        )

        with pytest.raises(RuntimeError) as e:
            job.wait_for_resource_creation()
        assert e.match("Mock fail")

        with pytest.raises(RuntimeError) as e:
            job.resource_name
        assert e.match(
            "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
        )

        with pytest.raises(RuntimeError) as e:
            job.network
        assert e.match(
            "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
        )

        with pytest.raises(RuntimeError) as e:
            job.trials
        assert e.match(
            "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
        )
Example #6
def do_hyperparameter_tuning(data_set, timestamp, develop_mode, cpu_only_mode,
                             tf_version):
    # Vertex AI services require regional API endpoints.
    if cpu_only_mode:
        train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.{}:latest'.format(
            tf_version)
    else:
        train_image = "us-docker.pkg.dev/vertex-ai/training/tf-gpu.{}:latest".format(
            tf_version)

    # a single trial job
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    if cpu_only_mode:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket',
                BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs',
                '10',
                '--num_examples',
                '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4')
    else:
        trial_job = aiplatform.CustomJob.from_local_script(
            display_name='train-{}'.format(model_display_name),
            script_path="model.py",
            container_uri=train_image,
            args=[
                '--bucket',
                BUCKET,
                '--skip_full_eval',  # no need to evaluate on test data set
                '--num_epochs',
                '10',
                '--num_examples',
                '500000'  # 1/10 actual size to finish faster
            ],
            requirements=['cloudml-hypertune'],  # any extra Python packages
            replica_count=1,
            machine_type='n1-standard-4',
            # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
        )

    # the tuning job
    hparam_job = aiplatform.HyperparameterTuningJob(
        # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
        display_name='hparam-{}'.format(model_display_name),
        custom_job=trial_job,
        metric_spec={'val_rmse': 'minimize'},
        parameter_spec={
            "train_batch_size":
            hpt.IntegerParameterSpec(min=16, max=256, scale='log'),
            "nbuckets":
            hpt.IntegerParameterSpec(min=5, max=10, scale='linear'),
            "dnn_hidden_units":
            hpt.CategoricalParameterSpec(
                values=["64,16", "64,16,4", "64,64,64,8", "256,64,16"])
        },
        max_trial_count=2 if develop_mode else NUM_HPARAM_TRIALS,
        parallel_trial_count=2,
        search_algorithm=None,  # Bayesian
    )

    hparam_job.run(sync=True)  # has to finish before we can get trials.

    # get the parameters corresponding to the best trial
    best = sorted(hparam_job.trials,
                  key=lambda x: x.final_measurement.metrics[0].value)[0]
    logging.info('Best trial: {}'.format(best))
    best_params = []
    for param in best.parameters:
        best_params.append('--{}'.format(param.parameter_id))

        if param.parameter_id in ["train_batch_size", "nbuckets"]:
            # The tuning service reports integer params as floats (e.g. 10.0),
            # so round them. CustomTrainingJob would turn integer args back into
            # floats, so pass the value as a string instead.
            best_params.append(str(int(round(param.value))))
        else:
            # string or float parameters
            best_params.append(param.value)

    # run the best trial to completion
    logging.info('Launching full training job with {}'.format(best_params))
    return train_custom_model(data_set,
                              timestamp,
                              develop_mode,
                              cpu_only_mode,
                              tf_version,
                              extra_args=best_params)
Example #7
    def test_create_hyperparameter_tuning_job_with_enable_web_access(
        self,
        create_hyperparameter_tuning_job_mock_with_enable_web_access,
        get_hyperparameter_tuning_job_mock_with_enable_web_access,
        sync,
        caplog,
    ):
        caplog.set_level(logging.INFO)

        aiplatform.init(
            project=_TEST_PROJECT,
            location=_TEST_LOCATION,
            staging_bucket=_TEST_STAGING_BUCKET,
            encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
        )

        custom_job = aiplatform.CustomJob(
            display_name=test_custom_job._TEST_DISPLAY_NAME,
            worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
            base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
        )

        job = aiplatform.HyperparameterTuningJob(
            display_name=_TEST_DISPLAY_NAME,
            custom_job=custom_job,
            metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
            parameter_spec={
                "lr":
                hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
                "units":
                hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
                "activation":
                hpt.CategoricalParameterSpec(
                    values=["relu", "sigmoid", "elu", "selu", "tanh"]),
                "batch_size":
                hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
            },
            parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
            max_trial_count=_TEST_MAX_TRIAL_COUNT,
            max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
            search_algorithm=_TEST_SEARCH_ALGORITHM,
            measurement_selection=_TEST_MEASUREMENT_SELECTION,
            labels=_TEST_LABELS,
        )

        job.run(
            service_account=_TEST_SERVICE_ACCOUNT,
            network=_TEST_NETWORK,
            timeout=_TEST_TIMEOUT,
            restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
            enable_web_access=test_custom_job._TEST_ENABLE_WEB_ACCESS,
            sync=sync,
            create_request_timeout=None,
        )

        job.wait()

        assert "workerpool0-0" in caplog.text

        expected_hyperparameter_tuning_job = (
            _get_hyperparameter_tuning_job_proto_with_enable_web_access())

        create_hyperparameter_tuning_job_mock_with_enable_web_access.assert_called_once_with(
            parent=_TEST_PARENT,
            hyperparameter_tuning_job=expected_hyperparameter_tuning_job,
            timeout=None,
        )

        assert job.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
        assert job.network == _TEST_NETWORK
        assert job.trials == []

        caplog.clear()