from typing import List

from google.cloud import aiplatform


def custom_training_job_sample(
    project: str,
    location: str,
    bucket: str,
    display_name: str,
    script_path: str,
    script_args: List[str],
    container_uri: str,
    model_serving_container_image_uri: str,
    requirements: List[str],
    replica_count: int,
):
    aiplatform.init(project=project, location=location, staging_bucket=bucket)

    job = aiplatform.CustomTrainingJob(
        display_name=display_name,
        script_path=script_path,
        container_uri=container_uri,
        requirements=requirements,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )

    model = job.run(
        args=script_args,
        replica_count=replica_count,
        model_display_name=display_name,
    )

    return model
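# A hypothetical invocation of the sample above, sketched for illustration.
# Every value here is a placeholder assumption, not taken from the original:
model = custom_training_job_sample(
    project="your-project-id",
    location="us-central1",
    bucket="gs://your-staging-bucket",
    display_name="my-custom-job",
    script_path="task.py",
    script_args=["--epochs=10"],
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-12:latest",
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest",
    requirements=["pandas"],
    replica_count=1,
)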
import logging
import time

from google.cloud import aiplatform


def train_model(bucket_name: str) -> str:
    aiplatform.init(project=PROJECT, staging_bucket=bucket_name)

    job = aiplatform.CustomTrainingJob(
        display_name="climate_script_colab",
        script_path="task.py",
        container_uri="us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-7:latest",
    )
    job.run(
        accelerator_type="NVIDIA_TESLA_K80",
        accelerator_count=1,
        args=[f"--bucket={bucket_name}"],
    )
    logging.info(f"train_model resource_name: {job.resource_name}")

    # Wait until the model training job finishes.
    status = None
    logging.info("Waiting for model to train.")
    for _ in range(0, TIMEOUT_SEC, POLL_INTERVAL_SEC):
        # https://googleapis.dev/python/aiplatform/latest/aiplatform_v1/job_service.html
        status = job.state.name
        if status in VERTEX_AI_FINISHED_STATE:
            break
        time.sleep(POLL_INTERVAL_SEC)

    logging.info(f"Model job finished with status {status}")
    assert status == VERTEX_AI_SUCCESS_STATE
    # Yields rather than returns, so the caller (e.g. a pytest fixture) can
    # run teardown after the resource name has been consumed.
    yield job.resource_name
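# The sample above references module-level constants that are not shown.
# A minimal sketch of plausible definitions; the names match the sample, but
# the concrete values are assumptions, not the original configuration.
# job.state is a PipelineState enum, so .name yields strings like
# "PIPELINE_STATE_SUCCEEDED":
PROJECT = "your-project-id"  # assumed placeholder
TIMEOUT_SEC = 30 * 60  # assumed: give training up to 30 minutes
POLL_INTERVAL_SEC = 60  # assumed: check job state once a minute
VERTEX_AI_SUCCESS_STATE = "PIPELINE_STATE_SUCCEEDED"
VERTEX_AI_FINISHED_STATE = {
    "PIPELINE_STATE_SUCCEEDED",
    "PIPELINE_STATE_FAILED",
    "PIPELINE_STATE_CANCELLED",
}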
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip  # assumed alias for AcceleratorType


def train_custom_model(data_set, timestamp, develop_mode, cpu_only_mode, tf_version, extra_args=None):
    # Set up training and deployment infra
    if cpu_only_mode:
        train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.{}:latest'.format(tf_version)
        deploy_image = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.{}:latest'.format(tf_version)
    else:
        train_image = "us-docker.pkg.dev/vertex-ai/training/tf-gpu.{}:latest".format(tf_version)
        deploy_image = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.{}:latest".format(tf_version)

    # train
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    job = aiplatform.CustomTrainingJob(
        display_name='train-{}'.format(model_display_name),
        script_path="model.py",
        container_uri=train_image,
        requirements=['cloudml-hypertune'],  # any extra Python packages
        model_serving_container_image_uri=deploy_image)

    model_args = [
        '--bucket', BUCKET,
    ]
    if develop_mode:
        model_args += ['--develop']
    if extra_args:
        model_args += extra_args

    if cpu_only_mode:
        model = job.run(
            dataset=data_set,
            # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
            predefined_split_column_name='data_split',
            model_display_name=model_display_name,
            args=model_args,
            replica_count=1,
            machine_type='n1-standard-4',
            sync=develop_mode)
    else:
        model = job.run(
            dataset=data_set,
            # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
            predefined_split_column_name='data_split',
            model_display_name=model_display_name,
            args=model_args,
            replica_count=1,
            machine_type='n1-standard-4',
            # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
            sync=develop_mode)
    return model
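# A hypothetical call to train_custom_model, assuming the module-level BUCKET
# and ENDPOINT_NAME constants sketched here; all concrete values are
# illustrative placeholders, not the original configuration:
BUCKET = "your-staging-bucket"  # assumed placeholder
ENDPOINT_NAME = "my-model-endpoint"  # assumed placeholder

data_set = aiplatform.TabularDataset("projects/.../locations/us-central1/datasets/...")
model = train_custom_model(
    data_set,
    timestamp="20240101-120000",
    develop_mode=True,   # runs synchronously and passes --develop to model.py
    cpu_only_mode=True,  # avoids requesting GPU quota
    tf_version="2-12",
)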
from typing import List, Optional, Union

from google.cloud import aiplatform


def create_training_pipeline_custom_job_sample(
    project: str,
    location: str,
    staging_bucket: str,
    display_name: str,
    script_path: str,
    container_uri: str,
    model_serving_container_image_uri: str,
    dataset_id: Optional[str] = None,
    model_display_name: Optional[str] = None,
    args: Optional[List[Union[str, float, int]]] = None,
    replica_count: int = 0,
    machine_type: str = "n1-standard-4",
    accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
    accelerator_count: int = 0,
    training_fraction_split: float = 0.8,
    validation_fraction_split: float = 0.1,
    test_fraction_split: float = 0.1,
    sync: bool = True,
):
    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

    job = aiplatform.CustomTrainingJob(
        display_name=display_name,
        script_path=script_path,
        container_uri=container_uri,
        model_serving_container_image_uri=model_serving_container_image_uri,
    )

    # This example uses an ImageDataset, but you can use another type
    dataset = aiplatform.ImageDataset(dataset_id) if dataset_id else None

    model = job.run(
        dataset=dataset,
        model_display_name=model_display_name,
        args=args,
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        training_fraction_split=training_fraction_split,
        validation_fraction_split=validation_fraction_split,
        test_fraction_split=test_fraction_split,
        sync=sync,
    )

    model.wait()

    print(model.display_name)
    print(model.resource_name)
    print(model.uri)
    return model
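# A hypothetical call to the sample above; the project, bucket, script, and
# container URIs are placeholders, not values from the original. Note the
# sample's replica_count default of 0 must be overridden for a real run:
model = create_training_pipeline_custom_job_sample(
    project="your-project-id",
    location="us-central1",
    staging_bucket="gs://your-staging-bucket",
    display_name="custom-train-sample",
    script_path="task.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-12:latest",
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest",
    replica_count=1,
)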
def test_dataset_create_to_model_predict(
    self,
    create_dataset_mock,  # noqa: F811
    import_data_mock,  # noqa: F811
    predict_client_predict_mock,  # noqa: F811
    mock_python_package_to_gcs,  # noqa: F811
    mock_pipeline_service_create,  # noqa: F811
    mock_model_service_get,  # noqa: F811
    mock_pipeline_service_get,  # noqa: F811
    sync,
):
    aiplatform.init(
        project=test_datasets._TEST_PROJECT,
        staging_bucket=test_training_jobs._TEST_BUCKET_NAME,
        credentials=test_training_jobs._TEST_CREDENTIALS,
    )

    my_dataset = aiplatform.ImageDataset.create(
        display_name=test_datasets._TEST_DISPLAY_NAME,
        encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
        sync=sync,
    )

    my_dataset.import_data(
        gcs_source=test_datasets._TEST_SOURCE_URI_GCS,
        import_schema_uri=test_datasets._TEST_IMPORT_SCHEMA_URI,
        data_item_labels=test_datasets._TEST_DATA_LABEL_ITEMS,
        sync=sync,
    )

    job = aiplatform.CustomTrainingJob(
        display_name=test_training_jobs._TEST_DISPLAY_NAME,
        script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME,
        container_uri=test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
        model_serving_container_image_uri=test_training_jobs._TEST_SERVING_CONTAINER_IMAGE,
        model_serving_container_predict_route=test_training_jobs._TEST_SERVING_CONTAINER_PREDICTION_ROUTE,
        model_serving_container_health_route=test_training_jobs._TEST_SERVING_CONTAINER_HEALTH_ROUTE,
    )

    model_from_job = job.run(
        dataset=my_dataset,
        base_output_dir=test_training_jobs._TEST_BASE_OUTPUT_DIR,
        args=test_training_jobs._TEST_RUN_ARGS,
        replica_count=1,
        machine_type=test_training_jobs._TEST_MACHINE_TYPE,
        accelerator_type=test_training_jobs._TEST_ACCELERATOR_TYPE,
        accelerator_count=test_training_jobs._TEST_ACCELERATOR_COUNT,
        model_display_name=test_training_jobs._TEST_MODEL_DISPLAY_NAME,
        training_fraction_split=test_training_jobs._TEST_TRAINING_FRACTION_SPLIT,
        validation_fraction_split=test_training_jobs._TEST_VALIDATION_FRACTION_SPLIT,
        test_fraction_split=test_training_jobs._TEST_TEST_FRACTION_SPLIT,
        sync=sync,
    )

    created_endpoint = models.Endpoint.create(
        display_name=test_endpoints._TEST_DISPLAY_NAME,
        encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
        sync=sync,
    )

    my_endpoint = model_from_job.deploy(
        encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME, sync=sync
    )

    endpoint_deploy_return = created_endpoint.deploy(model_from_job, sync=sync)

    assert endpoint_deploy_return is None

    if not sync:
        my_endpoint.wait()
        created_endpoint.wait()

    test_prediction = created_endpoint.predict(
        instances=[[1.0, 2.0, 3.0], [1.0, 3.0, 4.0]], parameters={"param": 3.0}
    )

    true_prediction = models.Prediction(
        predictions=test_endpoints._TEST_PREDICTION,
        deployed_model_id=test_endpoints._TEST_ID,
    )

    assert true_prediction == test_prediction
    predict_client_predict_mock.assert_called_once_with(
        endpoint=test_endpoints._TEST_ENDPOINT_NAME,
        instances=[[1.0, 2.0, 3.0], [1.0, 3.0, 4.0]],
        parameters={"param": 3.0},
    )

    expected_dataset = gca_dataset.Dataset(
        display_name=test_datasets._TEST_DISPLAY_NAME,
        metadata_schema_uri=test_datasets._TEST_METADATA_SCHEMA_URI_NONTABULAR,
        metadata=test_datasets._TEST_NONTABULAR_DATASET_METADATA,
        encryption_spec=_TEST_ENCRYPTION_SPEC,
    )

    expected_import_config = gca_dataset.ImportDataConfig(
        gcs_source=gca_io.GcsSource(uris=[test_datasets._TEST_SOURCE_URI_GCS]),
        import_schema_uri=test_datasets._TEST_IMPORT_SCHEMA_URI,
        data_item_labels=test_datasets._TEST_DATA_LABEL_ITEMS,
    )

    create_dataset_mock.assert_called_once_with(
        parent=test_datasets._TEST_PARENT,
        dataset=expected_dataset,
        metadata=test_datasets._TEST_REQUEST_METADATA,
    )

    import_data_mock.assert_called_once_with(
        name=test_datasets._TEST_NAME, import_configs=[expected_import_config]
    )

    expected_dataset.name = test_datasets._TEST_NAME
    assert my_dataset._gca_resource == expected_dataset

    mock_python_package_to_gcs.assert_called_once_with(
        gcs_staging_dir=test_training_jobs._TEST_BUCKET_NAME,
        project=test_training_jobs._TEST_PROJECT,
        credentials=initializer.global_config.credentials,
    )

    true_args = test_training_jobs._TEST_RUN_ARGS

    true_worker_pool_spec = {
        "replica_count": test_training_jobs._TEST_REPLICA_COUNT,
        "machine_spec": {
            "machine_type": test_training_jobs._TEST_MACHINE_TYPE,
            "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
            "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
        },
        "python_package_spec": {
            "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
            "python_module": source_utils._TrainingScriptPythonPackager.module_name,
            "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH],
            "args": true_args,
        },
    }

    true_fraction_split = gca_training_pipeline.FractionSplit(
        training_fraction=test_training_jobs._TEST_TRAINING_FRACTION_SPLIT,
        validation_fraction=test_training_jobs._TEST_VALIDATION_FRACTION_SPLIT,
        test_fraction=test_training_jobs._TEST_TEST_FRACTION_SPLIT,
    )

    true_container_spec = gca_model.ModelContainerSpec(
        image_uri=test_training_jobs._TEST_SERVING_CONTAINER_IMAGE,
        predict_route=test_training_jobs._TEST_SERVING_CONTAINER_PREDICTION_ROUTE,
        health_route=test_training_jobs._TEST_SERVING_CONTAINER_HEALTH_ROUTE,
    )

    true_managed_model = gca_model.Model(
        display_name=test_training_jobs._TEST_MODEL_DISPLAY_NAME,
        container_spec=true_container_spec,
    )

    true_input_data_config = gca_training_pipeline.InputDataConfig(
        fraction_split=true_fraction_split,
        dataset_id=my_dataset.name,
        gcs_destination=gca_io.GcsDestination(
            output_uri_prefix=test_training_jobs._TEST_BASE_OUTPUT_DIR
        ),
    )

    true_training_pipeline = gca_training_pipeline.TrainingPipeline(
        display_name=test_training_jobs._TEST_DISPLAY_NAME,
        training_task_definition=schema.training_job.definition.custom_task,
        training_task_inputs=json_format.ParseDict(
            {
                "worker_pool_specs": [true_worker_pool_spec],
                "base_output_directory": {
                    "output_uri_prefix": test_training_jobs._TEST_BASE_OUTPUT_DIR
                },
            },
            struct_pb2.Value(),
        ),
        model_to_upload=true_managed_model,
        input_data_config=true_input_data_config,
    )

    mock_pipeline_service_create.assert_called_once_with(
        parent=initializer.global_config.common_location_path(),
        training_pipeline=true_training_pipeline,
    )

    assert job._gca_resource is mock_pipeline_service_get.return_value

    mock_model_service_get.assert_called_once_with(
        name=test_training_jobs._TEST_MODEL_NAME
    )

    assert model_from_job._gca_resource is mock_model_service_get.return_value
    assert job.get_model()._gca_resource is mock_model_service_get.return_value
    assert not job.has_failed
    assert job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
def test_end_to_end_tabular(self, shared_state):
    """Build dataset, train a custom and AutoML model, deploy, and get predictions"""

    assert shared_state["bucket"]
    bucket = shared_state["bucket"]
    blob = bucket.blob(_BLOB_PATH)

    # Download the CSV file into memory and save it directly to the staging bucket
    with request.urlopen(_DATASET_SRC) as response:
        data = response.read()
        blob.upload_from_string(data)

    # Collection of resources generated by this test, to be deleted during teardown
    shared_state["resources"] = []

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
    )

    # Create and import to single managed dataset for both training jobs
    ds = aiplatform.TabularDataset.create(
        display_name=f"{self._temp_prefix}-dataset-{uuid.uuid4()}",
        gcs_source=[f'gs://{shared_state["staging_bucket_name"]}/{_BLOB_PATH}'],
        sync=False,
    )

    shared_state["resources"].extend([ds])

    # Define both training jobs
    custom_job = aiplatform.CustomTrainingJob(
        display_name=f"{self._temp_prefix}-train-housing-custom-{uuid.uuid4()}",
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
        requirements=["gcsfs==0.7.1"],
        model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
    )

    automl_job = aiplatform.AutoMLTabularTrainingJob(
        display_name=f"{self._temp_prefix}-train-housing-automl-{uuid.uuid4()}",
        optimization_prediction_type="regression",
        optimization_objective="minimize-rmse",
    )

    # Kick off both training jobs, AutoML job will take approx one hour to run
    custom_model = custom_job.run(
        ds,
        replica_count=1,
        model_display_name=f"{self._temp_prefix}-custom-housing-model-{uuid.uuid4()}",
        sync=False,
    )

    automl_model = automl_job.run(
        dataset=ds,
        target_column="median_house_value",
        model_display_name=f"{self._temp_prefix}-automl-housing-model-{uuid.uuid4()}",
        sync=False,
    )

    shared_state["resources"].extend(
        [automl_job, automl_model, custom_job, custom_model]
    )

    # Deploy both models after training completes
    custom_endpoint = custom_model.deploy(machine_type="n1-standard-4", sync=False)
    automl_endpoint = automl_model.deploy(machine_type="n1-standard-4", sync=False)
    shared_state["resources"].extend([automl_endpoint, custom_endpoint])

    # Send online prediction with same instance to both deployed models
    # This sample is taken from an observation where median_house_value = 94600
    custom_endpoint.wait()
    custom_prediction = custom_endpoint.predict(
        [
            {
                "longitude": -124.35,
                "latitude": 40.54,
                "housing_median_age": 52.0,
                "total_rooms": 1820.0,
                "total_bedrooms": 300.0,
                "population": 806,
                "households": 270.0,
                "median_income": 3.014700,
            },
        ]
    )

    automl_endpoint.wait()
    automl_prediction = automl_endpoint.predict(
        [
            {
                "longitude": "-124.35",
                "latitude": "40.54",
                "housing_median_age": "52.0",
                "total_rooms": "1820.0",
                "total_bedrooms": "300.0",
                "population": "806",
                "households": "270.0",
                "median_income": "3.014700",
            },
        ]
    )

    # Ensure a single prediction was returned
    assert len(custom_prediction.predictions) == 1
    assert len(automl_prediction.predictions) == 1

    # Ensure the models are remotely accurate
    try:
        automl_result = automl_prediction.predictions[0]["value"]
        custom_result = custom_prediction.predictions[0][0]
        assert 200000 > automl_result > 50000
        assert 200000 > custom_result > 50000
    except KeyError as e:
        raise RuntimeError("Unexpected prediction response structure:", e)
def test_end_to_end_tabular(self, shared_state):
    """Build dataset, train a custom and AutoML model, deploy, and get predictions"""

    assert shared_state["bucket"]
    bucket = shared_state["bucket"]
    blob = bucket.blob(_BLOB_PATH)

    # Download the CSV file into memory and save it directly to the staging bucket
    with request.urlopen(_DATASET_SRC) as response:
        data = response.read()
        blob.upload_from_string(data)

    # Collection of resources generated by this test, to be deleted during teardown
    shared_state["resources"] = []

    aiplatform.init(
        project=e2e_base._PROJECT,
        location=e2e_base._LOCATION,
        staging_bucket=shared_state["staging_bucket_name"],
    )

    # Create and import to single managed dataset for both training jobs
    dataset_gcs_source = f'gs://{shared_state["staging_bucket_name"]}/{_BLOB_PATH}'
    ds = aiplatform.TabularDataset.create(
        display_name=self._make_display_name("dataset"),
        gcs_source=[dataset_gcs_source],
        sync=False,
        create_request_timeout=180.0,
    )

    shared_state["resources"].extend([ds])

    # Define both training jobs
    custom_job = aiplatform.CustomTrainingJob(
        display_name=self._make_display_name("train-housing-custom"),
        script_path=_LOCAL_TRAINING_SCRIPT_PATH,
        container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
        requirements=["gcsfs==0.7.1"],
        model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
    )

    automl_job = aiplatform.AutoMLTabularTrainingJob(
        display_name=self._make_display_name("train-housing-automl"),
        optimization_prediction_type="regression",
        optimization_objective="minimize-rmse",
    )

    # Kick off both training jobs, AutoML job will take approx one hour to run
    custom_model = custom_job.run(
        ds,
        replica_count=1,
        model_display_name=self._make_display_name("custom-housing-model"),
        timeout=1234,
        restart_job_on_worker_restart=True,
        enable_web_access=True,
        sync=False,
        create_request_timeout=None,
    )

    automl_model = automl_job.run(
        dataset=ds,
        target_column="median_house_value",
        model_display_name=self._make_display_name("automl-housing-model"),
        sync=False,
    )

    shared_state["resources"].extend(
        [automl_job, automl_model, custom_job, custom_model]
    )

    # Deploy both models after training completes
    custom_endpoint = custom_model.deploy(machine_type="n1-standard-4", sync=False)
    automl_endpoint = automl_model.deploy(machine_type="n1-standard-4", sync=False)
    shared_state["resources"].extend([automl_endpoint, custom_endpoint])

    custom_batch_prediction_job = custom_model.batch_predict(
        job_display_name=self._make_display_name("automl-housing-model"),
        instances_format="csv",
        machine_type="n1-standard-4",
        gcs_source=dataset_gcs_source,
        gcs_destination_prefix=f'gs://{shared_state["staging_bucket_name"]}/bp_results/',
        sync=False,
    )

    shared_state["resources"].append(custom_batch_prediction_job)

    in_progress_done_check = custom_job.done()
    custom_job.wait_for_resource_creation()
    automl_job.wait_for_resource_creation()
    custom_batch_prediction_job.wait_for_resource_creation()

    # Send online prediction with same instance to both deployed models
    # This sample is taken from an observation where median_house_value = 94600
    custom_endpoint.wait()

    # Check scheduling is correctly set
    assert (
        custom_job._gca_resource.training_task_inputs["scheduling"]["timeout"]
        == "1234s"
    )
    assert (
        custom_job._gca_resource.training_task_inputs["scheduling"][
            "restartJobOnWorkerRestart"
        ]
        is True
    )

    custom_prediction = custom_endpoint.predict([_INSTANCE], timeout=180.0)

    custom_batch_prediction_job.wait()

    automl_endpoint.wait()
    automl_prediction = automl_endpoint.predict(
        [{k: str(v) for k, v in _INSTANCE.items()}],  # Cast int values to strings
        timeout=180.0,
    )

    # Test lazy loading of Endpoint, check getter was never called after predict()
    custom_endpoint = aiplatform.Endpoint(custom_endpoint.resource_name)
    custom_endpoint.predict([_INSTANCE])

    completion_done_check = custom_job.done()
    assert custom_endpoint._skipped_getter_call()

    assert (
        custom_job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
    )
    assert (
        automl_job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
    )
    assert (
        custom_batch_prediction_job.state == gca_job_state.JobState.JOB_STATE_SUCCEEDED
    )

    # Ensure a single prediction was returned
    assert len(custom_prediction.predictions) == 1
    assert len(automl_prediction.predictions) == 1

    # Ensure the models are remotely accurate
    try:
        automl_result = automl_prediction.predictions[0]["value"]
        custom_result = custom_prediction.predictions[0][0]
        assert 200000 > automl_result > 50000
        assert 200000 > custom_result > 50000
    except KeyError as e:
        raise RuntimeError("Unexpected prediction response structure:", e)

    # Check done() method works correctly
    assert in_progress_done_check is False
    assert completion_done_check is True
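# The test above references an _INSTANCE constant defined elsewhere in its
# module. A plausible sketch of that definition, using the observation shown
# inline in the earlier version of this test (median_house_value = 94600);
# treat the exact structure as an assumption:
_INSTANCE = {
    "longitude": -124.35,
    "latitude": 40.54,
    "housing_median_age": 52.0,
    "total_rooms": 1820.0,
    "total_bedrooms": 300.0,
    "population": 806,
    "households": 270.0,
    "median_income": 3.014700,
}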