def delete_model_from_aip_if_exists(
    api: discovery.Resource,
    ai_platform_serving_args: Dict[Text, Any],
) -> None:
    """Deletes a model from Google Cloud AI Platform if it exists.

    Issues a `projects.models.delete` request and waits for the returned
    operation via `_wait_for_operation`. An HTTP 404 response is logged and
    ignored, so deleting a model that is already gone is not an error.

    Args:
        api: Google API client resource.
        ai_platform_serving_args: Dictionary containing arguments for pushing
            to AI Platform. The keys 'model_name' and 'project_id' are read
            here. For the full set of parameters supported, refer to
            https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

    Raises:
        RuntimeError: if an error is encountered when trying to delete.
    """
    logging.info('Deleting model with from AI Platform: %s',
                 ai_platform_serving_args)
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    name = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().delete(name=name).execute()
        _wait_for_operation(api, operation, 'projects.models.delete')
    except errors.HttpError as e:
        # Deleting a non-existent model is fine; any other HTTP error is fatal.
        if e.resp.status == 404:
            logging.warning('Model %s does not exist', model_name)
        else:
            # Chain the original HttpError so the root cause stays visible.
            raise RuntimeError(
                'Deleting model from AI Platform failed: {}'.format(e)) from e
def _wait_for_operation(api: discovery.Resource,
                        operation: Dict[Text, Any],
                        method_name: Text) -> Dict[Text, Any]:
    """Waits for a long running operation.

    Polls `projects.operations.get` for `operation['name']`, sleeping
    `_POLLING_INTERVAL_IN_SECONDS` between polls, until the response reports
    'done'. The response of the final poll is reused as the result instead of
    issuing one more request after the loop.

    Args:
        api: Google API client resource.
        operation: The operation to wait for. Only its 'name' key is read.
        method_name: Operation method name for logging.

    Returns:
        The operation resource returned by the last poll.

    Raises:
        RuntimeError: If the operation completed with an error.
    """
    status_request = api.projects().operations().get(name=operation['name'])
    result = status_request.execute()
    while not result.get('done'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        logging.info('Method %s still being executed...', method_name)
        result = status_request.execute()

    if result.get('error'):
        # The operation completed with an error.
        raise RuntimeError('Failed to execute {}: {}'.format(
            method_name, result['error']))
    return result
def get_zone_tpu_types(tpu_api: discovery.Resource, project_id: str,
                       zone: str) -> Optional[List[TPUSpec]]:
    """Gets the list of TPUs available in the given zone.

    Lists the accelerator types under `projects/{project_id}/locations/{zone}`
    and converts each entry's 'type' with `gke_tpu_to_tpuspec`. Entries for
    which the conversion returns None are skipped.

    Args:
        tpu_api: tpu api instance
        project_id: project id
        zone: zone string

    Returns:
        List of supported tpu specs on success (empty when the response has no
        'acceleratorTypes' entry), None if the request returned no response.
    """
    location = 'projects/{}/locations/{}'.format(project_id, zone)

    rsp = tpu_api.projects().locations().acceleratorTypes().list(
        parent=location).execute()

    if rsp is None:
        return None

    # A missing 'acceleratorTypes' key is treated as "nothing available"
    # rather than raising KeyError.
    specs = (gke_tpu_to_tpuspec(t['type'])
             for t in rsp.get('acceleratorTypes', []))
    return [spec for spec in specs if spec is not None]
def get_gke_clusters(container: Resource, project_id: str) -> Dict:
    """
    Fetch the GKE cluster listing for a project.

    The request is issued with ``zone='-'``. A PERMISSION_DENIED error is logged
    as a warning and turned into an empty dict; any other HttpError is re-raised.

    :type container: The GCP Container resource object
    :param container: The Container resource object created by googleapiclient.discovery.build()

    :type project_id: str
    :param project_id: The Google Project Id that you are retrieving clusters from

    :rtype: Cluster Object
    :return: Cluster response object, or ``{}`` when permission is denied
    """
    try:
        return container.projects().zones().clusters().list(
            projectId=project_id, zone='-',
        ).execute()
    except HttpError as e:
        error_info = json.loads(e.content.decode('utf-8'))['error']
        if error_info['status'] != 'PERMISSION_DENIED':
            raise
        logger.warning(
            "Could not retrieve GKE clusters on project %s due to permissions issue. Code: %s, Message: %s",
            project_id,
            error_info['code'],
            error_info['message'],
        )
        return {}
def _get_study(
    service_client: discovery.Resource,
    study_parent: Text,
    study_id: Text,
    study_should_exist: bool = False,
):
    """Loads a study, retrying on HTTP errors.

    The study `{study_parent}/studies/{study_id}` is requested repeatedly, with
    a one second pause after each failed attempt, until the request succeeds or
    the number of failures reaches constants.MAX_NUM_TRIES_FOR_STUDIES.

    Args:
        service_client: An API client of Vizier service.
        study_parent: Prefix of the study name. The full study name will be
            {study_parent}/studies/{study_id}.
        study_id: An identifier of the study.
        study_should_exist: Indicates whether it should be assumed that the
            study with the given study_id exists.

    Raises:
        ValueError: If retries are exhausted, `study_should_exist` is true and
            the last error was a 404.
        RuntimeError: If retries are exhausted for any other reason.
    """
    study_name = "{}/studies/{}".format(study_parent, study_id)
    tf.get_logger().info(
        "Study already exists: {}.\nLoad existing study...".format(study_name))

    failures = 0
    while True:
        try:
            service_client.projects().locations().studies().get(
                name=study_name
            ).execute()
        except errors.HttpError as err:
            failures += 1
            if failures < constants.MAX_NUM_TRIES_FOR_STUDIES:
                # Wait 1 second before trying to get the study again.
                time.sleep(1)
                continue
            if (
                study_should_exist
                and err.resp.status == http.HTTPStatus.NOT_FOUND.value
            ):
                raise ValueError(
                    "GetStudy failed. Study not found: {}.".format(study_id)
                )
            raise RuntimeError(
                "GetStudy failed. Max retries reached: {0!s}".format(err)
            )
        else:
            return
def __read_papi_v2beta_operation_metadata(
        operation_id: str,
        genomics_v2beta_client: Resource) -> Mapping[str, Any]:
    """Fetch the operation resource for a pipelines API v2beta job ID.

    Logs the lookup, then executes `projects.locations.operations.get` with
    `operation_id` as the name and returns the response as a python dict.
    """
    logger.info(
        f'Reading PAPI v2beta operation metadata for {operation_id}...')

    operations_api = genomics_v2beta_client.projects().locations().operations()
    request = operations_api.get(name=operation_id)
    return request.execute()
def get_service_account(project_id: str, service_account_email: str,
                        iam_service: discovery.Resource) -> Dict:
    """Look up a service account through the IAM API.

    Args:
        project_id (str): Project used to build the resource name.
        service_account_email (str): Email used to build the resource name.
        iam_service (discovery.Resource): IAM API client resource.

    Returns:
        Dict: Response of the executed `projects.serviceAccounts.get` request
            for `projects/{project_id}/serviceAccounts/{service_account_email}`.
    """
    service_accounts = iam_service.projects().serviceAccounts()
    resource_name = (
        f"projects/{project_id}/serviceAccounts/{service_account_email}")
    request = service_accounts.get(name=resource_name)
    return request.execute()
def create_model_for_aip_prediction_if_not_exist(
    api: discovery.Resource,
    job_labels: Dict[Text, Text],
    ai_platform_serving_args: Dict[Text, Any],
) -> bool:
    """Creates a new model for serving with AI Platform if not exists.

    Sends a `projects.models.create` request built from the serving args. An
    HTTP 409 response is logged and reported as "not created" rather than
    raised.

    Args:
        api: Google API client resource.
        job_labels: The dict of labels that will be attached to this job.
        ai_platform_serving_args: Dictionary containing arguments for pushing
            to AI Platform. The keys 'model_name' and 'project_id' are
            required; 'regions' is optional and defaults to an empty list.

    Returns:
        Whether a new model is created.

    Raises:
        RuntimeError if model creation failed.
    """
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    regions = ai_platform_serving_args.get('regions', [])
    body = {'name': model_name, 'regions': regions, 'labels': job_labels}
    parent = 'projects/{}'.format(project_id)
    try:
        api.projects().models().create(body=body, parent=parent).execute()
    except errors.HttpError as e:
        # Creating an already existing model is fine; report it as not created.
        if e.resp.status == 409:
            logging.warning('Model %s already exists', model_name)
            return False
        # Chain the original HttpError so the root cause stays visible.
        raise RuntimeError(
            'Creating model to AI Platform failed: {}'.format(e)) from e
    return True
def create_service_account_key(service_account: Dict,
                               iam_service: discovery.Resource) -> Dict:
    """Request a new key for a service account through the IAM API.

    Args:
        service_account (Dict): Service account resource; its "name" entry is
            used as the request name.
        iam_service (discovery.Resource): IAM API client resource.

    Returns:
        Dict: Response of the executed `projects.serviceAccounts.keys.create`
            request.
    """
    keys_api = iam_service.projects().serviceAccounts().keys()
    key_request_body = {
        "privateKeyType": "TYPE_GOOGLE_CREDENTIALS_FILE",
        "keyAlgorithm": "KEY_ALG_RSA_2048",
    }
    request = keys_api.create(name=service_account["name"],
                              body=key_request_body)
    return request.execute()
def get_project_policies(
        project_id: str,
        resource_manager_service: discovery.Resource) -> Dict:
    """Fetch the IAM policy of a project.

    Args:
        project_id (str): Project passed as the `resource` of the request.
        resource_manager_service (discovery.Resource): Resource Manager API
            client resource.

    Returns:
        Dict: Response of the executed `projects.getIamPolicy` request, which
            is sent with `requestedPolicyVersion` set to 3.
    """
    projects_api = resource_manager_service.projects()
    policy_options = {"options": {"requestedPolicyVersion": 3}}
    request = projects_api.getIamPolicy(resource=project_id,
                                        body=policy_options)
    return request.execute()
def create_service_account(project_id: str, name: str, display_name: str,
                           iam_service: discovery.Resource) -> Dict:
    """Create a service account through the IAM API.

    Args:
        project_id (str): Project the account is created under; the request
            name is 'projects/' followed by this value.
        name (str): Value sent as the `accountId`.
        display_name (str): Value sent as the account's `displayName`.
        iam_service (discovery.Resource): IAM API client resource.

    Returns:
        Dict: Response of the executed `projects.serviceAccounts.create`
            request.
    """
    service_accounts = iam_service.projects().serviceAccounts()
    parent = 'projects/' + project_id
    account_spec = {
        'accountId': name,
        'serviceAccount': {'displayName': display_name},
    }
    request = service_accounts.create(name=parent, body=account_spec)
    return request.execute()
def delete_model_version_from_aip_if_exists(
    api: discovery.Resource,
    model_version: Text,
    ai_platform_serving_args: Dict[Text, Any],
) -> None:
    """Deletes a model version from Google Cloud AI Platform if version exists.

    Issues a `projects.models.versions.delete` request and waits for the
    returned operation via `_wait_for_operation`. HTTP 404 and HTTP 400
    responses are logged as warnings and ignored; any other HTTP error is
    raised as RuntimeError.

    Args:
        api: Google API client resource.
        model_version: Version of the model being deleted.
        ai_platform_serving_args: Dictionary containing arguments for pushing
            to AI Platform. The keys 'model_name' and 'project_id' are read
            here. For the full set of parameters supported, refer to
            https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

    Raises:
        RuntimeError: if an error is encountered when trying to delete.
    """
    logging.info('Deleting model version %s from AI Platform: %s',
                 model_version, ai_platform_serving_args)
    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    version_name = 'projects/{}/models/{}/versions/{}'.format(
        project_id, model_name, model_version)
    try:
        operation = api.projects().models().versions().delete(
            name=version_name).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.delete')
    except errors.HttpError as e:
        # The three cases must form one if/elif/else chain: with two separate
        # `if` statements a 404 would fall into the final `else` and raise.
        if e.resp.status == 404:
            # Deleting a non-existent model version is fine.
            logging.warning('Model version %s does not exist', version_name)
        elif e.resp.status == 400:
            logging.warning(
                'Model version %s won\'t be deleted because it is the '
                'default version and not the only version in the model',
                version_name)
        else:
            raise RuntimeError(
                'Deleting model version {} from AI Platform failed: {}'.format(
                    version_name, e)) from e
def add_service_account_policy(
        project_id: str, service_account: Dict,
        resource_manager_service: discovery.Resource) -> Dict:
    """Grant `roles/storage.objectAdmin` on a project to a service account.

    Reads the project's current IAM policy with `get_project_policies`,
    appends a binding for the service account, and writes the policy back with
    `projects.setIamPolicy`.

    Args:
        project_id (str): Project whose IAM policy is updated.
        service_account (Dict): Service account resource; its "email" entry is
            used to build the `serviceAccount:<email>` member.
        resource_manager_service (discovery.Resource): Resource Manager API
            client resource.

    Returns:
        Dict: Response of the executed `projects.setIamPolicy` request.
    """
    policy = get_project_policies(project_id, resource_manager_service)

    binding = {
        "role": "roles/storage.objectAdmin",
        "members": [f'serviceAccount:{service_account["email"]}'],
    }
    # The fetched policy may have no "bindings" key; create the list on demand
    # instead of failing with KeyError.
    policy.setdefault("bindings", []).append(binding)

    return resource_manager_service.projects().setIamPolicy(
        resource=f'{project_id}', body={
            "policy": policy
        }).execute()
def get_tpu_drivers(tpu_api: discovery.Resource, project_id: str,
                    zone: str) -> Optional[List[str]]:
    """Gets supported tpu drivers for the given project and zone.

    Lists the tensorflow versions under
    `projects/{project_id}/locations/{zone}` and returns the 'version' of each
    entry.

    Args:
        tpu_api: discovery tpu api resource
        project_id: project id
        zone: zone identifier

    Returns:
        List of supported drivers on success (empty when the response has no
        'tensorflowVersions' entry), None if the request returned no response.
    """
    location = 'projects/{}/locations/{}'.format(project_id, zone)

    rsp = tpu_api.projects().locations().tensorflowVersions().list(
        parent=location).execute()

    if rsp is None:
        logging.error('error getting tpu drivers')
        return None

    # A missing 'tensorflowVersions' key is treated as "no drivers" rather
    # than raising KeyError.
    return [d['version'] for d in rsp.get('tensorflowVersions', [])]
def deploy_model_for_aip_prediction(api: discovery.Resource,
                                    serving_path: Text,
                                    model_version: Text,
                                    ai_platform_serving_args: Dict[Text, Any],
                                    job_labels: Dict[Text, Text],
                                    skip_model_creation: bool = False,
                                    set_default_version: bool = True) -> None:
    """Deploys a model for serving with AI Platform.

    Optionally creates the model, creates a new version from a copy of
    `ai_platform_serving_args`, waits for the create operation to finish, and
    optionally marks the new version as the default one.

    Args:
        api: Google API client resource.
        serving_path: The path to the model. Must be a GCS URI.
        model_version: Version of the model being deployed. Must be different
            from what is currently being served.
        ai_platform_serving_args: Dictionary containing arguments for pushing
            to AI Platform. The full set of parameters supported can be found
            at
            https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version.
            Most keys are forwarded as-is, but following keys are handled
            specially:
              - name: this must be empty (and will be filled by pusher).
              - deployment_uri: this must be empty (and will be filled by
                pusher).
              - python_version: when left empty, this will be filled by python
                version of the environment being used.
              - runtime_version: when left empty, this will be filled by
                TensorFlow version from the environment.
              - labels: a list of job labels will be merged with user's input.
            The keys 'model_name', 'project_id' and 'regions' describe the
            model and are removed from the version request body.
        job_labels: The dict of labels that will be attached to this job. They
            are merged with optional labels from `ai_platform_serving_args`;
            on key collisions the job label wins.
        skip_model_creation: If true, the method assumes the model already
            exists in AI Platform, therefore skipping model creation.
        set_default_version: Whether set the newly deployed model version as
            the default version.

    Raises:
        RuntimeError: if an error is encountered when trying to push.
    """
    logging.info(
        'Deploying to model with version %s to AI Platform for serving: %s',
        model_version, ai_platform_serving_args)

    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    default_runtime_version = _get_tf_runtime_version(tf.__version__)
    runtime_version = ai_platform_serving_args.get('runtime_version',
                                                   default_runtime_version)
    python_version = _get_caip_python_version(runtime_version)

    if not skip_model_creation:
        create_model_for_aip_prediction_if_not_exist(api, job_labels,
                                                     ai_platform_serving_args)

    # Work on a copy so the caller's dictionary is left untouched.
    version_body = dict(ai_platform_serving_args)
    for model_only_key in ['model_name', 'project_id', 'regions']:
        version_body.pop(model_only_key, None)
    version_body['name'] = model_version
    version_body['deployment_uri'] = serving_path
    # User-supplied versions take precedence over the computed defaults.
    version_body.setdefault('runtime_version', runtime_version)
    version_body.setdefault('python_version', python_version)
    version_body['labels'] = {**version_body.get('labels', {}), **job_labels}
    logging.info(
        'Creating new version of model_name %s in project %s, request body: %s',
        model_name, project_id, version_body)

    # Push to AIP and wait for the resulting long-running operation.
    model_path = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().versions().create(
            body=version_body, parent=model_path).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.create')
    except errors.HttpError as e:
        # Creating an already existing model version is fine to ignore.
        if e.resp.status == 409:
            logging.warning('Model version %s already exists', model_version)
        else:
            raise RuntimeError(
                'Creating model verseion to AI Platform failed: {}'.format(e)
            ) from e

    if set_default_version:
        # Set the new version as default.
        api.projects().models().versions().setDefault(
            name='{}/versions/{}'.format(model_path, model_version)).execute()

    logging.info(
        'Successfully deployed model %s with version %s, serving from %s',
        model_path, model_version, serving_path)
def create_request(cluster_api: discovery.Resource, creds: Credentials,
                   cluster_name: str, project_id: str, zone: str,
                   release_channel: ReleaseChannel,
                   single_zone: bool) -> Optional[HttpRequest]:
    """Builds (but does not execute) a cluster create request.

    Args:
        cluster_api: cluster api client
        creds: credentials, used to build a compute api client
        cluster_name: name of cluster to create
        project_id: project id
        zone: zone in which to create cluster. For a single-zone cluster (see
            below), this zone will contain the cluster control plane and all
            worker nodes. For a multi-zone cluster this zone will contain the
            control plane, but worker nodes can be created in any zone in the
            same region as the control plane.
        release_channel: release channel for cluster
        single_zone: create a single-zone cluster if true, multi-zone
            otherwise. A single-zone cluster only creates worker nodes in the
            same zone as the cluster control-plane (specified in the 'zone'
            argument above), whereas a multi-zone cluster can create worker
            nodes in every zone in the same region as the cluster control
            plane. A multi-zone cluster can help job response time when a
            given zone becomes overburdened.

    Returns:
        HttpRequest on success, None otherwise (invalid zone, or failure to
        obtain resource limits or the region's zones).
    """
    parsed_zone = _parse_zone(zone)
    if parsed_zone is None:
        logging.error('invalid zone specified: {}'.format(zone))
        return None
    region, _ = parsed_zone

    compute_api = discovery.build('compute',
                                  'v1',
                                  credentials=creds,
                                  cache_discovery=False)

    limits = utils.generate_resource_limits(compute_api, project_id, region)
    if limits is None:
        logging.error('error generating resource limits')
        return None

    if single_zone:
        worker_zones = [zone]
    else:
        worker_zones = utils.get_zones_in_region(compute_api, project_id,
                                                 region)
        if worker_zones is None:
            logging.error('error getting zones for region {}'.format(region))
            return None

    cluster_spec = _create_cluster_spec(cluster_name, zone, worker_zones,
                                        limits, release_channel)
    body = _cluster_create_request_body(project_id, zone, cluster_spec)

    # see https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.zones.clusters/create
    clusters_api = cluster_api.projects().zones().clusters()
    return clusters_api.create(projectId=project_id, zone=zone, body=body)
def deploy_model_for_aip_prediction(api: discovery.Resource,
                                    serving_path: Text,
                                    model_version: Text,
                                    ai_platform_serving_args: Dict[Text, Any],
                                    job_labels: Dict[Text, Text],
                                    skip_model_creation: bool = False,
                                    set_default_version: bool = True) -> None:
    """Deploys a model for serving with AI Platform.

    Optionally creates the model, creates a new version with a fixed request
    body (name, deployment URI, runtime version, python version, labels),
    waits for the create operation to finish, and optionally marks the new
    version as the default one.

    Args:
        api: Google API client resource.
        serving_path: The path to the model. Must be a GCS URI.
        model_version: Version of the model being deployed. Must be different
            from what is currently being served.
        ai_platform_serving_args: Dictionary containing arguments for pushing
            to AI Platform. The keys 'model_name' and 'project_id' are
            required; 'runtime_version' is optional. For the full set of
            parameters supported, refer to
            https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version
        job_labels: The dict of labels that will be attached to this job.
        skip_model_creation: If true, the method assumes the model already
            exists in AI Platform, therefore skipping model creation.
        set_default_version: Whether set the newly deployed model version as
            the default version.

    Raises:
        RuntimeError: if an error is encountered when trying to push.
    """
    logging.info(
        'Deploying to model with version %s to AI Platform for serving: %s',
        model_version, ai_platform_serving_args)

    model_name = ai_platform_serving_args['model_name']
    project_id = ai_platform_serving_args['project_id']
    default_runtime_version = _get_tf_runtime_version(tf.__version__)
    runtime_version = ai_platform_serving_args.get('runtime_version',
                                                   default_runtime_version)
    python_version = _get_caip_python_version(runtime_version)

    if not skip_model_creation:
        create_model_for_aip_prediction_if_not_exist(api, job_labels,
                                                     ai_platform_serving_args)
    body = {
        'name': model_version,
        'deployment_uri': serving_path,
        'runtime_version': runtime_version,
        'python_version': python_version,
        'labels': job_labels,
    }

    # Push to AIP and wait for the resulting long-running operation.
    model_path = 'projects/{}/models/{}'.format(project_id, model_name)
    try:
        operation = api.projects().models().versions().create(
            body=body, parent=model_path).execute()
        _wait_for_operation(api, operation, 'projects.models.versions.create')
    except errors.HttpError as e:
        # Creating an already existing model version is fine to ignore.
        if e.resp.status == 409:
            logging.warning('Model version %s already exists', model_version)
        else:
            raise RuntimeError(
                'Creating model verseion to AI Platform failed: {}'.format(e)
            ) from e

    if set_default_version:
        # Set the new version as default.
        api.projects().models().versions().setDefault(
            name='{}/versions/{}'.format(model_path, model_version)).execute()

    logging.info(
        'Successfully deployed model %s with version %s, serving from %s',
        model_path, model_version, serving_path)