def _GetStagedFile(self, file_str):
    """Return the URI to submit for `file_str`, queueing local files for upload.

    A path counts as local to this machine when it carries a drive prefix or
    has no URI scheme at all. Everything else (including file:// URIs, which
    are interpreted as living on VMs) is considered already staged and is
    handed back untouched.

    Args:
      file_str: Path or URI supplied by the user.

    Returns:
      `file_str` itself for non-local files; otherwise the staging directory
      joined with the file's basename.

    Raises:
      files.Error: The local file does not exist.
      exceptions.ArgumentError: A local file must be staged but no staging
        directory could be determined.
    """
    url_parser = six.moves.urllib.parse
    drive_prefix = os.path.splitdrive(file_str)[0]
    scheme = url_parser.urlsplit(file_str, allow_fragments=False).scheme

    if scheme and not drive_prefix:
        # Non-local files are already staged.
        # TODO(b/36057257): Validate scheme.
        return file_str

    if not os.path.exists(file_str):
        raise files.Error('File Not Found: [{0}].'.format(file_str))

    if self._staging_dir is None:
        # Only reached when there really is a file to stage but the staging
        # location is unknown; already-staged files never get this far.
        raise exceptions.ArgumentError(
            'Could not determine where to stage local file {0}. When submitting '
            'a job to a cluster selected via --cluster-labels, either\n'
            '- a staging bucket must be provided via the --bucket argument, or\n'
            '- all provided files must be non-local.'.format(file_str))

    self.files_to_stage.append(file_str)
    return url_parser.urljoin(self._staging_dir, os.path.basename(file_str))
def Run(self, args):
    """List jobs, optionally restricted by cluster and by job state.

    Args:
      args: Parsed command-line arguments; reads `cluster`, `state_filter`,
        `limit` and `page_size`.

    Returns:
      A generator that wraps every listed job in `TypedJob`.

    Raises:
      exceptions.ArgumentError: `state_filter` is set to something other than
        'active' or 'inactive'.
    """
    client = self.context['dataproc_client']
    messages = self.context['dataproc_messages']
    project = properties.VALUES.core.project.Get(required=True)
    region = self.context['dataproc_region']

    list_request = self.GetRequest(messages, project, region, args)

    if args.cluster:
        list_request.clusterName = args.cluster

    state_filter = args.state_filter
    if state_filter:
        # Flag value -> name of the matching JobStateMatcher enum member.
        matcher_names = {'active': 'ACTIVE', 'inactive': 'NON_ACTIVE'}
        if state_filter not in matcher_names:
            raise exceptions.ArgumentError(
                'Invalid state-filter; [{0}].'.format(state_filter))
        matcher_enum = (
            messages.DataprocProjectsRegionsJobsListRequest
            .JobStateMatcherValueValuesEnum)
        list_request.jobStateMatcher = getattr(
            matcher_enum, matcher_names[state_filter])

    listed_jobs = list_pager.YieldFromList(
        client.projects_regions_jobs,
        list_request,
        limit=args.limit,
        field='jobs',
        batch_size=args.page_size,
        batch_size_attribute='pageSize')
    return (TypedJob(listed) for listed in listed_jobs)
def Run(self, args):
    """List jobs, optionally restricted by cluster and by job state.

    Args:
      args: Parsed command-line arguments; reads `cluster`, `state_filter`,
        `limit` and `page_size`.

    Returns:
      A generator that wraps every listed job in `TypedJob`.

    Raises:
      exceptions.ArgumentError: `state_filter` is set to something other than
        'active' or 'inactive'.
    """
    dataproc = dp.Dataproc()
    project = properties.VALUES.core.project.GetOrFail()
    region = properties.VALUES.dataproc.region.GetOrFail()

    list_request = self.GetRequest(dataproc.messages, project, region, args)

    if args.cluster:
        list_request.clusterName = args.cluster

    state_filter = args.state_filter
    if state_filter:
        # Flag value -> name of the matching JobStateMatcher enum member.
        # TODO(b/32669485) Get full flag test coverage.
        matcher_names = {'active': 'ACTIVE', 'inactive': 'NON_ACTIVE'}
        if state_filter not in matcher_names:
            raise exceptions.ArgumentError(
                'Invalid state-filter; [{0}].'.format(state_filter))
        matcher_enum = (
            dataproc.messages.DataprocProjectsRegionsJobsListRequest
            .JobStateMatcherValueValuesEnum)
        list_request.jobStateMatcher = getattr(
            matcher_enum, matcher_names[state_filter])

    listed_jobs = list_pager.YieldFromList(
        dataproc.client.projects_regions_jobs,
        list_request,
        limit=args.limit,
        field='jobs',
        batch_size=args.page_size,
        batch_size_attribute='pageSize')
    return (TypedJob(listed) for listed in listed_jobs)
def ValidateReservationAffinityGroup(args):
    """Check that a 'specific' reservation affinity comes with a reservation.

    Args:
      args: Parsed arguments. `reservation_affinity` is read if present, and
        `IsSpecified('reservation')` is consulted only when that affinity is
        'specific'.

    Raises:
      exceptions.ArgumentError: The affinity is 'specific' but --reservation
        was not specified.
    """
    if getattr(args, 'reservation_affinity', None) != 'specific':
        return
    if args.IsSpecified('reservation'):
        return
    raise exceptions.ArgumentError(
        '--reservation must be specified with --reservation-affinity=specific')
def Run(self, args):
    """Apply the requested label changes to a job and return the patched job.

    Labels are the only updatable job parameter handled here, so the command
    fails unless label updates or removals were requested.

    Args:
      args: Parsed command-line arguments; reads `id`, `update_labels` and
        `remove_labels`.

    Returns:
      The job returned by the Patch call.

    Raises:
      exceptions.ArgumentError: Neither label updates nor removals were given.
    """
    client = self.context['dataproc_client']
    messages = self.context['dataproc_messages']
    job_ref = util.ParseJob(args.id, self.context)

    if not (args.update_labels or args.remove_labels):
        raise exceptions.ArgumentError(
            'Must specify at least one job parameter to update.')

    mask_paths = ['labels']

    # labels_util.UpdateLabels builds the final labels proto from the current
    # labels plus the requested updates/removals, so the job has to be fetched
    # first to learn what its labels currently look like.
    get_request = client.MESSAGES_MODULE.DataprocProjectsRegionsJobsGetRequest(
        projectId=job_ref.projectId,
        region=job_ref.region,
        jobId=job_ref.jobId)
    job = client.projects_regions_jobs.Get(get_request)
    job.labels = labels_util.UpdateLabels(
        job.labels,
        messages.Job.LabelsValue,
        args.update_labels,
        args.remove_labels)

    patch_request = messages.DataprocProjectsRegionsJobsPatchRequest(
        projectId=job_ref.projectId,
        region=job_ref.region,
        jobId=job_ref.jobId,
        job=job,
        updateMask=','.join(mask_paths))
    patched_job = client.projects_regions_jobs.Patch(patch_request)

    log.UpdatedResource(patched_job)
    return patched_job
def GetFilesByType(args):
    """Group the files referenced by a job's arguments by their kind.

    When both a main class and a main jar are supplied, a warning is logged,
    the main jar is appended to `args.jars` and `args.main_jar` is cleared;
    `args` is modified in place in that case.

    Args:
      args: Parsed arguments; reads `main_class`, `main_jar`, `jars`,
        `archives` and `files`.

    Returns:
      A dict with the keys 'main_jar', 'jars', 'archives' and 'files'.

    Raises:
      exceptions.ArgumentError: Neither a main class nor a main jar was given.
    """
    # TODO(user): Move arg manipulation elsewhere.
    # TODO(user): Remove with GA flags 2017-04-01 (b/33298024).
    main_class = args.main_class
    main_jar = args.main_jar

    if not (main_class or main_jar):
        raise exceptions.ArgumentError('Must either specify --class or JAR.')

    if main_class and main_jar:
        log.warn(
            'You must specify exactly one of --jar and --class. '
            'This will be strictly enforced in April 2017. '
            "Use 'gcloud beta dataproc jobs submit spark' to see new behavior.")
        log.info('Passing main jar as an additional jar.')
        args.jars.append(main_jar)
        args.main_jar = None

    files_by_type = {}
    files_by_type['main_jar'] = args.main_jar
    files_by_type['jars'] = args.jars
    files_by_type['archives'] = args.archives
    files_by_type['files'] = args.files
    return files_by_type
def GetClusterConfig(args,
                     dataproc,
                     project_id,
                     compute_resources,
                     beta=False,
                     include_deprecated=True,
                     include_ttl_config=False,
                     include_gke_platform_args=False):
    """Get dataproc cluster configuration.

    Builds a ClusterConfig message from the parsed flags. Besides returning the
    message, this function modifies `args` in place: `args.timeout` is increased
    by the initialization-action timeout once per initialization action, and
    the allow-zero-workers property is written into `args.properties` for
    single-node / zero-worker clusters.

    Args:
      args: Arguments parsed from argparse.ArgParser.
      dataproc: Dataproc object that contains client, messages, and resources
      project_id: Dataproc project ID
      compute_resources: compute resource for cluster
      beta: use BETA only features (not read anywhere in this function body)
      include_deprecated: whether to include deprecated args
      include_ttl_config: whether to include Scheduled Delete(TTL) args
      include_gke_platform_args: whether to include GKE-based cluster args

    Returns:
      cluster_config: Dataproc cluster configuration

    Raises:
      exceptions.ArgumentError: a gce-pd-kms-key flag was given but the KMS key
        concept did not parse to a reference.
    """
    master_accelerator_type = None
    worker_accelerator_type = None
    secondary_worker_accelerator_type = None

    # The *_accelerator_count locals are only bound together with the matching
    # *_accelerator_type, and are only read further down when that type is set.
    if args.master_accelerator:
        master_accelerator_type = args.master_accelerator['type']
        master_accelerator_count = args.master_accelerator.get('count', 1)

    if args.worker_accelerator:
        worker_accelerator_type = args.worker_accelerator['type']
        worker_accelerator_count = args.worker_accelerator.get('count', 1)

    # The secondary-worker flag is listed before the preemptible-worker one.
    secondary_worker_accelerator = _FirstNonNone(
        args.secondary_worker_accelerator, args.preemptible_worker_accelerator)
    if secondary_worker_accelerator:
        secondary_worker_accelerator_type = secondary_worker_accelerator[
            'type']
        secondary_worker_accelerator_count = secondary_worker_accelerator.get(
            'count', 1)

    # Resolve non-zonal GCE resources
    # We will let the server resolve short names of zonal resources because
    # if auto zone is requested, we will not know the zone before sending the
    # request
    image_ref = args.image and compute_resources.Parse(
        args.image,
        params={'project': project_id},
        collection='compute.images')
    network_ref = args.network and compute_resources.Parse(
        args.network,
        params={'project': project_id},
        collection='compute.networks')
    # NOTE(review): GetOrFail is passed uncalled here - presumably Parse
    # resolves callable params lazily; confirm against the resource parser.
    subnetwork_ref = args.subnet and compute_resources.Parse(
        args.subnet,
        params={
            'project': project_id,
            'region': properties.VALUES.compute.region.GetOrFail,
        },
        collection='compute.subnetworks')
    timeout_str = six.text_type(args.initialization_action_timeout) + 's'
    init_actions = [
        dataproc.messages.NodeInitializationAction(
            executableFile=exe, executionTimeout=timeout_str)
        for exe in (args.initialization_actions or [])
    ]
    # Increase the client timeout for each initialization action.
    args.timeout += args.initialization_action_timeout * len(init_actions)

    expanded_scopes = compute_helpers.ExpandScopeAliases(args.scopes)

    software_config = dataproc.messages.SoftwareConfig(
        imageVersion=args.image_version)

    # The deprecated *_boot_disk_size_gb flags are the starting value only when
    # include_deprecated is set; the byte-based flags override them when given.
    if include_deprecated:
        master_boot_disk_size_gb = args.master_boot_disk_size_gb
    else:
        master_boot_disk_size_gb = None
    if args.master_boot_disk_size:
        master_boot_disk_size_gb = (api_utils.BytesToGb(
            args.master_boot_disk_size))

    if include_deprecated:
        worker_boot_disk_size_gb = args.worker_boot_disk_size_gb
    else:
        worker_boot_disk_size_gb = None
    if args.worker_boot_disk_size:
        worker_boot_disk_size_gb = (api_utils.BytesToGb(
            args.worker_boot_disk_size))

    secondary_worker_boot_disk_size_gb = (api_utils.BytesToGb(
        _FirstNonNone(args.secondary_worker_boot_disk_size,
                      args.preemptible_worker_boot_disk_size)))

    if args.single_node or args.num_workers == 0:
        # Explicitly specifying --num-workers=0 gives you a single node cluster,
        # but if --num-workers is omitted, args.num_workers is None (not 0), and
        # this property will not be set
        args.properties[constants.ALLOW_ZERO_WORKERS_PROPERTY] = 'true'

    if args.properties:
        software_config.properties = encoding.DictToAdditionalPropertyMessage(
            args.properties,
            dataproc.messages.SoftwareConfig.PropertiesValue,
            sort_items=True)

    if args.components:
        # Convert every requested component into the optional-component enum.
        software_config_cls = dataproc.messages.SoftwareConfig
        software_config.optionalComponents.extend(
            list(
                map(
                    software_config_cls.
                    OptionalComponentsValueListEntryValuesEnum,
                    args.components)))

    gce_cluster_config = dataproc.messages.GceClusterConfig(
        networkUri=network_ref and network_ref.SelfLink(),
        subnetworkUri=subnetwork_ref and subnetwork_ref.SelfLink(),
        internalIpOnly=args.no_address,
        serviceAccount=args.service_account,
        serviceAccountScopes=expanded_scopes,
        zoneUri=properties.VALUES.compute.zone.GetOrFail())

    reservation_affinity = GetReservationAffinity(args, dataproc)
    gce_cluster_config.reservationAffinity = reservation_affinity

    if args.tags:
        gce_cluster_config.tags = args.tags

    if args.metadata:
        # args.metadata is iterated as a sequence of dicts; flatten them into
        # one ordered mapping (later entries win on duplicate keys).
        flat_metadata = collections.OrderedDict([(k, v) for d in args.metadata
                                                 for k, v in d.items()])
        gce_cluster_config.metadata = encoding.DictToAdditionalPropertyMessage(
            flat_metadata, dataproc.messages.GceClusterConfig.MetadataValue)

    master_accelerators = []
    if master_accelerator_type:
        master_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=master_accelerator_type,
                acceleratorCount=master_accelerator_count))
    worker_accelerators = []
    if worker_accelerator_type:
        worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=worker_accelerator_type,
                acceleratorCount=worker_accelerator_count))
    secondary_worker_accelerators = []
    if secondary_worker_accelerator_type:
        secondary_worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=secondary_worker_accelerator_type,
                acceleratorCount=secondary_worker_accelerator_count))

    cluster_config = dataproc.messages.ClusterConfig(
        configBucket=args.bucket,
        gceClusterConfig=gce_cluster_config,
        masterConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_masters,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.master_machine_type,
            accelerators=master_accelerators,
            diskConfig=GetDiskConfig(dataproc, args.master_boot_disk_type,
                                     master_boot_disk_size_gb,
                                     args.num_master_local_ssds),
            minCpuPlatform=args.master_min_cpu_platform),
        workerConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.worker_machine_type,
            accelerators=worker_accelerators,
            diskConfig=GetDiskConfig(
                dataproc,
                args.worker_boot_disk_type,
                worker_boot_disk_size_gb,
                args.num_worker_local_ssds,
            ),
            minCpuPlatform=args.worker_min_cpu_platform),
        initializationActions=init_actions,
        softwareConfig=software_config,
    )

    # Kerberos: a config file takes precedence; otherwise Kerberos is enabled
    # from the root principal password URI together with the KMS key concept.
    if args.kerberos_config_file or args.kerberos_root_principal_password_uri:
        cluster_config.securityConfig = dataproc.messages.SecurityConfig()
        if args.kerberos_config_file:
            cluster_config.securityConfig.kerberosConfig = ParseKerberosConfigFile(
                dataproc, args.kerberos_config_file)
        else:
            kerberos_config = dataproc.messages.KerberosConfig()
            kerberos_config.enableKerberos = True
            if args.kerberos_root_principal_password_uri:
                kerberos_config.rootPrincipalPasswordUri = \
                    args.kerberos_root_principal_password_uri
                kerberos_kms_ref = args.CONCEPTS.kerberos_kms_key.Parse()
                kerberos_config.kmsKeyUri = kerberos_kms_ref.RelativeName()
            cluster_config.securityConfig.kerberosConfig = kerberos_config

    if args.autoscaling_policy:
        cluster_config.autoscalingConfig = dataproc.messages.AutoscalingConfig(
            policyUri=args.CONCEPTS.autoscaling_policy.Parse().RelativeName())

    if include_ttl_config:
        # Attach a lifecycle config only if at least one TTL flag was given.
        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = six.text_type(args.max_age) + 's'
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = six.text_type(args.max_idle) + 's'
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config

    if hasattr(args.CONCEPTS, 'kms_key'):
        kms_ref = args.CONCEPTS.kms_key.Parse()
        if kms_ref:
            encryption_config = dataproc.messages.EncryptionConfig()
            encryption_config.gcePdKmsKeyName = kms_ref.RelativeName()
            cluster_config.encryptionConfig = encryption_config
        else:
            # Did user use any gce-pd-kms-key flags?
            for keyword in [
                'gce-pd-kms-key', 'gce-pd-kms-key-project',
                'gce-pd-kms-key-location', 'gce-pd-kms-key-keyring'
            ]:
                if getattr(args, keyword.replace('-', '_'), None):
                    raise exceptions.ArgumentError(
                        '--gce-pd-kms-key was not fully specified.')

    # Secondary worker group is optional. However, users may specify
    # future pVMs configuration at creation time.
    num_secondary_workers = _FirstNonNone(args.num_secondary_workers,
                                          args.num_preemptible_workers)
    secondary_worker_boot_disk_type = _FirstNonNone(
        args.secondary_worker_boot_disk_type,
        args.preemptible_worker_boot_disk_type)
    num_secondary_worker_local_ssds = _FirstNonNone(
        args.num_secondary_worker_local_ssds,
        args.num_preemptible_worker_local_ssds)
    # The secondary group reuses the *worker* min-CPU-platform flag, both in
    # the condition below and in the config it builds.
    if (num_secondary_workers is not None or
            secondary_worker_boot_disk_size_gb is not None or
            secondary_worker_boot_disk_type is not None or
            num_secondary_worker_local_ssds is not None or
            args.worker_min_cpu_platform is not None or
            args.secondary_worker_type != 'unspecified'):
        cluster_config.secondaryWorkerConfig = (
            dataproc.messages.InstanceGroupConfig(
                numInstances=num_secondary_workers,
                accelerators=secondary_worker_accelerators,
                diskConfig=GetDiskConfig(
                    dataproc,
                    secondary_worker_boot_disk_type,
                    secondary_worker_boot_disk_size_gb,
                    num_secondary_worker_local_ssds,
                ),
                minCpuPlatform=args.worker_min_cpu_platform,
                preemptibility=_GetType(dataproc, args.secondary_worker_type)))

    if include_gke_platform_args:
        if args.enable_component_gateway:
            cluster_config.endpointConfig = dataproc.messages.EndpointConfig(
                enableHttpPortAccess=args.enable_component_gateway)
        if args.gke_cluster is not None:
            location = args.zone or args.region
            target_gke_cluster = 'projects/{0}/locations/{1}/clusters/{2}'.format(
                project_id, location, args.gke_cluster)
            cluster_config.gkeClusterConfig = dataproc.messages.GkeClusterConfig(
                namespacedGkeDeploymentTarget=dataproc.messages.
                NamespacedGkeDeploymentTarget(
                    targetGkeCluster=target_gke_cluster,
                    clusterNamespace=args.gke_cluster_namespace))
            # A GKE-targeted cluster drops all GCE instance-group settings
            # that were built above.
            cluster_config.gceClusterConfig = None
            cluster_config.masterConfig = None
            cluster_config.workerConfig = None
            cluster_config.secondaryWorkerConfig = None

    return cluster_config
def GetClusterConfig(args, dataproc, project_id, compute_resources, beta=False):
    """Get dataproc cluster configuration.

    Builds a ClusterConfig message from the parsed flags. Besides returning the
    message, this function modifies `args` in place: `args.timeout` is increased
    by the initialization-action timeout once per initialization action, and
    the allow-zero-workers property is written into `args.properties` for
    single-node / zero-worker clusters.

    Args:
      args: Arguments parsed from argparse.ArgParser.
      dataproc: Dataproc object that contains client, messages, and resources
      project_id: Dataproc project ID
      compute_resources: compute resource for cluster
      beta: use BETA only features (accelerators, optional components, min CPU
        platform, lifecycle config and disk encryption key are only read when
        this is true)

    Returns:
      cluster_config: Dataproc cluster configuration

    Raises:
      exceptions.ArgumentError: in beta, a gce-pd-kms-key flag was given but the
        KMS key concept did not parse to a reference.
    """
    master_accelerator_type = None
    worker_accelerator_type = None
    master_accelerator_count = None
    worker_accelerator_count = None
    if beta:
        if args.master_accelerator:
            master_accelerator_type = args.master_accelerator['type']
            master_accelerator_count = args.master_accelerator.get('count', 1)
        if args.worker_accelerator:
            worker_accelerator_type = args.worker_accelerator['type']
            worker_accelerator_count = args.worker_accelerator.get('count', 1)

    # Resolve non-zonal GCE resources
    # We will let the server resolve short names of zonal resources because
    # if auto zone is requested, we will not know the zone before sending the
    # request
    image_ref = args.image and compute_resources.Parse(
        args.image,
        params={'project': project_id},
        collection='compute.images')
    network_ref = args.network and compute_resources.Parse(
        args.network,
        params={'project': project_id},
        collection='compute.networks')
    # NOTE(review): GetOrFail is passed uncalled here - presumably Parse
    # resolves callable params lazily; confirm against the resource parser.
    subnetwork_ref = args.subnet and compute_resources.Parse(
        args.subnet,
        params={
            'project': project_id,
            'region': properties.VALUES.compute.region.GetOrFail,
        },
        collection='compute.subnetworks')
    timeout_str = str(args.initialization_action_timeout) + 's'
    init_actions = [
        dataproc.messages.NodeInitializationAction(
            executableFile=exe, executionTimeout=timeout_str)
        for exe in (args.initialization_actions or [])
    ]
    # Increase the client timeout for each initialization action.
    args.timeout += args.initialization_action_timeout * len(init_actions)

    expanded_scopes = compute_helpers.ExpandScopeAliases(args.scopes)

    software_config = dataproc.messages.SoftwareConfig(
        imageVersion=args.image_version)

    # The *_boot_disk_size_gb flags provide the starting value; the byte-based
    # flags override them when given.
    master_boot_disk_size_gb = args.master_boot_disk_size_gb
    if args.master_boot_disk_size:
        master_boot_disk_size_gb = (api_utils.BytesToGb(
            args.master_boot_disk_size))

    worker_boot_disk_size_gb = args.worker_boot_disk_size_gb
    if args.worker_boot_disk_size:
        worker_boot_disk_size_gb = (api_utils.BytesToGb(
            args.worker_boot_disk_size))

    preemptible_worker_boot_disk_size_gb = (api_utils.BytesToGb(
        args.preemptible_worker_boot_disk_size))

    if args.single_node or args.num_workers == 0:
        # Explicitly specifying --num-workers=0 gives you a single node cluster,
        # but if --num-workers is omitted, args.num_workers is None (not 0), and
        # this property will not be set
        args.properties[constants.ALLOW_ZERO_WORKERS_PROPERTY] = 'true'

    if args.properties:
        software_config.properties = encoding.DictToMessage(
            args.properties, dataproc.messages.SoftwareConfig.PropertiesValue)

    if beta:
        if args.components:
            # Convert every requested component into the optional-component
            # enum.
            software_config_cls = dataproc.messages.SoftwareConfig
            software_config.optionalComponents.extend(
                list(
                    map(
                        software_config_cls.
                        OptionalComponentsValueListEntryValuesEnum,
                        args.components)))

    gce_cluster_config = dataproc.messages.GceClusterConfig(
        networkUri=network_ref and network_ref.SelfLink(),
        subnetworkUri=subnetwork_ref and subnetwork_ref.SelfLink(),
        internalIpOnly=args.no_address,
        serviceAccount=args.service_account,
        serviceAccountScopes=expanded_scopes,
        zoneUri=properties.VALUES.compute.zone.GetOrFail())

    if args.tags:
        gce_cluster_config.tags = args.tags

    if args.metadata:
        # args.metadata is iterated as a sequence of dicts; flatten them into
        # one mapping (later entries win on duplicate keys).
        flat_metadata = dict(
            (k, v) for d in args.metadata for k, v in d.items())
        gce_cluster_config.metadata = encoding.DictToMessage(
            flat_metadata, dataproc.messages.GceClusterConfig.MetadataValue)

    master_accelerators = []
    if master_accelerator_type:
        master_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=master_accelerator_type,
                acceleratorCount=master_accelerator_count))
    worker_accelerators = []
    if worker_accelerator_type:
        worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=worker_accelerator_type,
                acceleratorCount=worker_accelerator_count))

    cluster_config = dataproc.messages.ClusterConfig(
        configBucket=args.bucket,
        gceClusterConfig=gce_cluster_config,
        masterConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_masters,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.master_machine_type,
            accelerators=master_accelerators,
            diskConfig=GetDiskConfig(dataproc, args.master_boot_disk_type,
                                     master_boot_disk_size_gb,
                                     args.num_master_local_ssds)),
        workerConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.worker_machine_type,
            accelerators=worker_accelerators,
            diskConfig=GetDiskConfig(
                dataproc,
                args.worker_boot_disk_type,
                worker_boot_disk_size_gb,
                args.num_worker_local_ssds,
            )),
        initializationActions=init_actions,
        softwareConfig=software_config,
    )

    if beta:
        cluster_config.masterConfig.minCpuPlatform = args.master_min_cpu_platform
        cluster_config.workerConfig.minCpuPlatform = args.worker_min_cpu_platform

    if beta:
        # Attach a lifecycle config only if at least one TTL flag was given.
        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config

    if beta and hasattr(args.CONCEPTS, 'kms_key'):
        kms_ref = args.CONCEPTS.kms_key.Parse()
        if kms_ref:
            encryption_config = dataproc.messages.EncryptionConfig()
            encryption_config.gcePdKmsKeyName = kms_ref.RelativeName()
            cluster_config.encryptionConfig = encryption_config
        else:
            # Did user use any gce-pd-kms-key flags?
            for keyword in [
                'gce-pd-kms-key', 'gce-pd-kms-key-project',
                'gce-pd-kms-key-location', 'gce-pd-kms-key-keyring'
            ]:
                if getattr(args, keyword.replace('-', '_'), None):
                    raise exceptions.ArgumentError(
                        '--gce-pd-kms-key was not fully specified.')

    # Secondary worker group is optional. However, users may specify
    # future pVMs configuration at creation time.
    if (args.num_preemptible_workers is not None or
            preemptible_worker_boot_disk_size_gb is not None or
            args.preemptible_worker_boot_disk_type is not None or
            (beta and args.worker_min_cpu_platform is not None)):
        cluster_config.secondaryWorkerConfig = (
            dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers,
                diskConfig=GetDiskConfig(
                    dataproc,
                    args.preemptible_worker_boot_disk_type,
                    preemptible_worker_boot_disk_size_gb,
                    None,
                )))
        # The secondary group reuses the *worker* min-CPU-platform flag.
        if beta and args.worker_min_cpu_platform:
            cluster_config.secondaryWorkerConfig.minCpuPlatform = (
                args.worker_min_cpu_platform)

    return cluster_config
def Run(self, args):
    """Patch a Dataproc cluster with the changes requested on the command line.

    Worker counts, lifecycle settings (BETA track only) and label changes are
    collected into a ClusterConfig plus an update mask and sent as a single
    Patch request. Unless the async flag is set, the call then waits for the
    operation and re-fetches the cluster.

    Args:
      args: Parsed command-line arguments.

    Returns:
      The cluster as fetched after the update operation finished, or None when
      the async flag is set.

    Raises:
      exceptions.ArgumentError: No updatable cluster parameter was specified.
    """
    dataproc = dp.Dataproc(self.ReleaseTrack())

    cluster_ref = util.ParseCluster(args.name, dataproc)

    cluster_config = dataproc.messages.ClusterConfig()
    changed_fields = []

    has_changes = False

    if args.num_workers is not None:
        worker_config = dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers)
        cluster_config.workerConfig = worker_config
        changed_fields.append('config.worker_config.num_instances')
        has_changes = True

    if args.num_preemptible_workers is not None:
        worker_config = dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_preemptible_workers)
        cluster_config.secondaryWorkerConfig = worker_config
        changed_fields.append(
            'config.secondary_worker_config.num_instances')
        has_changes = True

    if self.ReleaseTrack() == base.ReleaseTrack.BETA:
        # Attach a lifecycle config only if at least one TTL flag was given.
        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config
            changed_fields.append('config.lifecycle_config')
            has_changes = True

    # Update labels if the user requested it
    labels = None
    if args.update_labels or args.remove_labels:
        has_changes = True
        changed_fields.append('labels')

        # We need to fetch cluster first so we know what the labels look like.
        # The labels_util.UpdateLabels will fill out the proto for us with all
        # the updates and removals, but first we need to provide the current
        # state of the labels
        get_cluster_request = (
            dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
                projectId=cluster_ref.projectId,
                region=cluster_ref.region,
                clusterName=cluster_ref.clusterName))
        current_cluster = dataproc.client.projects_regions_clusters.Get(
            get_cluster_request)
        labels = labels_util.UpdateLabels(
            current_cluster.labels,
            dataproc.messages.Cluster.LabelsValue,
            args.update_labels,
            args.remove_labels)

    if not has_changes:
        raise exceptions.ArgumentError(
            'Must specify at least one cluster parameter to update.')

    cluster = dataproc.messages.Cluster(
        config=cluster_config,
        clusterName=cluster_ref.clusterName,
        labels=labels,
        projectId=cluster_ref.projectId)

    request = dataproc.messages.DataprocProjectsRegionsClustersPatchRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        cluster=cluster,
        updateMask=','.join(changed_fields))

    if (self.ReleaseTrack() == base.ReleaseTrack.BETA and
            args.graceful_decommission_timeout):
        request.gracefulDecommissionTimeout = (
            str(args.graceful_decommission_timeout) + 's')

    operation = dataproc.client.projects_regions_clusters.Patch(request)

    # `async` is a reserved keyword from Python 3.7 on, so writing the
    # attribute access as `args.async` is a SyntaxError there. getattr reads
    # the very same attribute and parses on every Python version.
    if getattr(args, 'async'):
        log.status.write('Updating [{0}] with operation [{1}].'.format(
            cluster_ref, operation.name))
        return

    util.WaitForOperation(
        dataproc,
        operation,
        message='Waiting for cluster update operation',
        timeout_s=args.timeout)

    # Re-fetch so the caller gets the cluster state after the update.
    request = dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
        projectId=cluster_ref.projectId,
        region=cluster_ref.region,
        clusterName=cluster_ref.clusterName)
    cluster = dataproc.client.projects_regions_clusters.Get(request)
    log.UpdatedResource(cluster_ref)
    return cluster
def Run(self, args):
    """Patch a Dataproc cluster with the changes requested on the command line.

    Worker counts, autoscaling policy and lifecycle settings (both BETA track
    only) and label changes are collected into a ClusterConfig plus an update
    mask and sent as a single Patch request. Unless the async flag is set, the
    call then waits for the operation and re-fetches the cluster.

    Args:
      args: Parsed command-line arguments.

    Returns:
      The cluster as fetched after the update operation finished, or None when
      the async flag is set.

    Raises:
      exceptions.ArgumentError: No updatable cluster parameter was specified.
    """
    dataproc = dp.Dataproc(self.ReleaseTrack())

    cluster_ref = util.ParseCluster(args.name, dataproc)

    cluster_config = dataproc.messages.ClusterConfig()
    changed_fields = []

    has_changes = False

    if args.num_workers is not None:
        worker_config = dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers)
        cluster_config.workerConfig = worker_config
        changed_fields.append('config.worker_config.num_instances')
        has_changes = True

    if args.num_preemptible_workers is not None:
        worker_config = dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_preemptible_workers)
        cluster_config.secondaryWorkerConfig = worker_config
        changed_fields.append(
            'config.secondary_worker_config.num_instances')
        has_changes = True

    if self.ReleaseTrack() == base.ReleaseTrack.BETA:
        if args.autoscaling_policy:
            cluster_config.autoscalingConfig = dataproc.messages.AutoscalingConfig(
                policyUri=args.CONCEPTS.autoscaling_policy.Parse(
                ).RelativeName())
            changed_fields.append('config.autoscaling_config.policy_uri')
            has_changes = True
        elif args.autoscaling_policy == '' or args.disable_autoscaling:  # pylint: disable=g-explicit-bool-comparison
            # Disabling autoscaling. Don't need to explicitly set
            # cluster_config.autoscaling_config to None.
            changed_fields.append('config.autoscaling_config.policy_uri')
            has_changes = True

        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
            changed_fields.append(
                'config.lifecycle_config.auto_delete_ttl')
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_fields.append(
                'config.lifecycle_config.auto_delete_time')
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
            changed_fields.append(
                'config.lifecycle_config.idle_delete_ttl')
            changed_config = True
        # The --no-max-* flags clear a TTL: the field is listed in the update
        # mask while its value is left unset.
        if args.no_max_age:
            lifecycle_config.autoDeleteTtl = None
            changed_fields.append(
                'config.lifecycle_config.auto_delete_ttl')
            changed_config = True
        if args.no_max_idle:
            lifecycle_config.idleDeleteTtl = None
            changed_fields.append(
                'config.lifecycle_config.idle_delete_ttl')
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config
            has_changes = True

    # Put in a thunk so we only make this call if needed
    def _GetCurrentLabels():
        # We need to fetch cluster first so we know what the labels look like.
        # The labels_util will fill out the proto for us with all the updates
        # and removals, but first we need to provide the current state of the
        # labels
        get_cluster_request = (
            dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
                projectId=cluster_ref.projectId,
                region=cluster_ref.region,
                clusterName=cluster_ref.clusterName))
        current_cluster = dataproc.client.projects_regions_clusters.Get(
            get_cluster_request)
        return current_cluster.labels

    labels_update = labels_util.ProcessUpdateArgsLazy(
        args,
        dataproc.messages.Cluster.LabelsValue,
        orig_labels_thunk=_GetCurrentLabels)
    if labels_update.needs_update:
        has_changes = True
        changed_fields.append('labels')
    labels = labels_update.GetOrNone()

    if not has_changes:
        raise exceptions.ArgumentError(
            'Must specify at least one cluster parameter to update.')

    cluster = dataproc.messages.Cluster(
        config=cluster_config,
        clusterName=cluster_ref.clusterName,
        labels=labels,
        projectId=cluster_ref.projectId)

    request = dataproc.messages.DataprocProjectsRegionsClustersPatchRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        cluster=cluster,
        updateMask=','.join(changed_fields),
        requestId=util.GetUniqueId())

    if args.graceful_decommission_timeout is not None:
        request.gracefulDecommissionTimeout = (
            str(args.graceful_decommission_timeout) + 's')

    operation = dataproc.client.projects_regions_clusters.Patch(request)

    # `async` is a reserved keyword from Python 3.7 on, so writing the
    # attribute access as `args.async` is a SyntaxError there. getattr reads
    # the very same attribute and parses on every Python version.
    if getattr(args, 'async'):
        log.status.write('Updating [{0}] with operation [{1}].'.format(
            cluster_ref, operation.name))
        return

    util.WaitForOperation(
        dataproc,
        operation,
        message='Waiting for cluster update operation',
        timeout_s=args.timeout)

    # Re-fetch so the caller gets the cluster state after the update.
    request = dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
        projectId=cluster_ref.projectId,
        region=cluster_ref.region,
        clusterName=cluster_ref.clusterName)
    cluster = dataproc.client.projects_regions_clusters.Get(request)
    log.UpdatedResource(cluster_ref)
    return cluster
def Run(self, args):
    """Patch an existing cluster with the settings supplied in args.

    A sparse ClusterConfig and a matching update mask are assembled from
    whichever flags were given (primary/secondary worker counts, autoscaling
    policy, lifecycle settings, labels, driver pool size). The patch request
    is then sent and, unless args.async_ is set, the operation is waited on
    and the cluster is read back.

    Args:
      args: The parsed command-line namespace.

    Returns:
      The cluster message fetched after the update finished, or None when
      args.async_ is set.

    Raises:
      exceptions.ArgumentError: If no updatable parameter was supplied, or if
        the current cluster lists more than one driver pool.
    """
    dataproc = dp.Dataproc(self.ReleaseTrack())
    cluster_ref = args.CONCEPTS.cluster.Parse()
    msgs = dataproc.messages

    patch_config = msgs.ClusterConfig()
    # Update-mask paths, kept in the order the flags are processed. An empty
    # list at the end means the user asked for no change at all.
    mask_paths = []

    if args.num_workers is not None:
        patch_config.workerConfig = msgs.InstanceGroupConfig(
            numInstances=args.num_workers)
        mask_paths.append('config.worker_config.num_instances')

    secondary_count = _FirstNonNone(args.num_preemptible_workers,
                                    args.num_secondary_workers)
    if secondary_count is not None:
        patch_config.secondaryWorkerConfig = msgs.InstanceGroupConfig(
            numInstances=secondary_count)
        mask_paths.append('config.secondary_worker_config.num_instances')

    if args.autoscaling_policy:
        policy_ref = args.CONCEPTS.autoscaling_policy.Parse()
        patch_config.autoscalingConfig = msgs.AutoscalingConfig(
            policyUri=policy_ref.RelativeName())
        mask_paths.append('config.autoscaling_config.policy_uri')
    elif args.autoscaling_policy == '' or args.disable_autoscaling:  # pylint: disable=g-explicit-bool-comparison
        # Disabling autoscaling: the mask path alone is enough, there is no
        # need to explicitly set autoscalingConfig to None.
        mask_paths.append('config.autoscaling_config.policy_uri')

    lifecycle = msgs.LifecycleConfig()
    # One (message attribute, new value, mask path) entry per lifecycle flag
    # that was supplied, in the order the flags are checked.
    lifecycle_edits = []
    if args.max_age is not None:
        lifecycle_edits.append(
            ('autoDeleteTtl',
             six.text_type(args.max_age) + 's',
             'config.lifecycle_config.auto_delete_ttl'))
    if args.expiration_time is not None:
        lifecycle_edits.append(
            ('autoDeleteTime',
             times.FormatDateTime(args.expiration_time),
             'config.lifecycle_config.auto_delete_time'))
    if args.max_idle is not None:
        lifecycle_edits.append(
            ('idleDeleteTtl',
             six.text_type(args.max_idle) + 's',
             'config.lifecycle_config.idle_delete_ttl'))
    if args.no_max_age:
        lifecycle_edits.append(
            ('autoDeleteTtl', None,
             'config.lifecycle_config.auto_delete_ttl'))
    if args.no_max_idle:
        lifecycle_edits.append(
            ('idleDeleteTtl', None,
             'config.lifecycle_config.idle_delete_ttl'))

    for attr_name, new_value, path in lifecycle_edits:
        setattr(lifecycle, attr_name, new_value)
        mask_paths.append(path)
    if lifecycle_edits:
        patch_config.lifecycleConfig = lifecycle

    def _FetchCluster():
        """Reads the cluster from the API (labels, node pools, final state)."""
        get_request = msgs.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        return dataproc.client.projects_regions_clusters.Get(get_request)

    # labels_util fills out the labels proto with all updates and removals,
    # but it needs the current labels to start from. They are handed over as
    # a thunk so the cluster is only fetched if that is actually needed.
    labels_update = labels_util.ProcessUpdateArgsLazy(
        args,
        msgs.Cluster.LabelsValue,
        orig_labels_thunk=lambda: _FetchCluster().labels)
    if labels_update.needs_update:
        mask_paths.append('labels')
    labels = labels_update.GetOrNone()

    if args.driver_pool_size is not None:
        # The existing node pools are read back so that node pool ids and
        # other attributes not shared with the user are carried over.
        # Driver pools can currently only be updated with NO other updates;
        # frontend validation is relied upon to enforce that until mixed
        # updates are allowed.
        node_pools = _FetchCluster().config.auxiliaryNodePoolConfigs
        driver_role = msgs.NodePoolConfig.RolesValueListEntryValuesEnum.DRIVER
        # Expected to hold at most one entry.
        driver_pools = [
            pool for pool in node_pools if driver_role in pool.roles
        ]
        if len(driver_pools) > 1:
            raise exceptions.ArgumentError(
                'At most one driver pool can be specified per cluster.')
        if driver_pools:
            driver_pools[0].nodePoolConfig.numInstances = args.driver_pool_size
        else:
            # Only relevant when scaling from 0 -> N nodes. This is not
            # supported initially; frontend validation decides whether it is
            # allowed.
            node_pools.append(
                msgs.NodePoolConfig(
                    nodePoolConfig=msgs.InstanceGroupConfig(
                        numInstances=args.driver_pool_size),
                    roles=[driver_role]))
        patch_config.auxiliaryNodePoolConfigs = node_pools
        mask_paths.append('config.auxiliary_node_pool_configs')

    if not mask_paths:
        raise exceptions.ArgumentError(
            'Must specify at least one cluster parameter to update.')

    patched_cluster = msgs.Cluster(
        config=patch_config,
        clusterName=cluster_ref.clusterName,
        labels=labels,
        projectId=cluster_ref.projectId)

    patch_request = msgs.DataprocProjectsRegionsClustersPatchRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        cluster=patched_cluster,
        updateMask=','.join(mask_paths),
        requestId=util.GetUniqueId())

    if args.graceful_decommission_timeout is not None:
        patch_request.gracefulDecommissionTimeout = (
            six.text_type(args.graceful_decommission_timeout) + 's')

    operation = dataproc.client.projects_regions_clusters.Patch(patch_request)

    if args.async_:
        log.status.write('Updating [{0}] with operation [{1}].'.format(
            cluster_ref, operation.name))
        return None

    util.WaitForOperation(
        dataproc,
        operation,
        message='Waiting for cluster update operation',
        timeout_s=args.timeout)

    # Read the cluster back so the caller gets its post-update state.
    updated_cluster = _FetchCluster()
    log.UpdatedResource(cluster_ref)
    return updated_cluster
def Run(self, args):
    """Resize a cluster's primary and/or preemptible worker groups.

    Args:
      args: The parsed command-line namespace. Reads name, new_num_workers
        (deprecated alias that overrides num_workers), num_workers,
        num_preemptible_workers and the async flag.

    Returns:
      The cluster message fetched after the update finished, or None when
      the async flag is set.

    Raises:
      exceptions.ArgumentError: If none of the worker-count parameters was
        supplied.
    """
    client = self.context['dataproc_client']
    messages = self.context['dataproc_messages']

    cluster_ref = util.ParseCluster(args.name, self.context)

    cluster_config = messages.ClusterConfig()
    # Paths joined into the request's updateMask.
    changed_fields = []

    has_changes = False

    if args.new_num_workers is not None:
        log.warn(
            '--new-num-workers parameter is deprecated and will be removed '
            'in a future release. Please use --num-workers instead')
        # The deprecated flag takes over the value of its replacement.
        args.num_workers = args.new_num_workers

    if args.num_workers is not None:
        worker_config = messages.InstanceGroupConfig(
            numInstances=args.num_workers)
        cluster_config.workerConfig = worker_config
        changed_fields.append('config.worker_config.num_instances')
        has_changes = True

    if args.num_preemptible_workers is not None:
        worker_config = messages.InstanceGroupConfig(
            numInstances=args.num_preemptible_workers)
        cluster_config.secondaryWorkerConfig = worker_config
        changed_fields.append(
            'config.secondary_worker_config.num_instances')
        has_changes = True

    if not has_changes:
        raise exceptions.ArgumentError(
            'Must specify at least one cluster parameter to update.')

    cluster = messages.Cluster(
        config=cluster_config,
        clusterName=cluster_ref.clusterName,
        projectId=cluster_ref.projectId)

    request = messages.DataprocProjectsRegionsClustersPatchRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        cluster=cluster,
        updateMask=','.join(changed_fields))

    operation = client.projects_regions_clusters.Patch(request)

    # `async` is a reserved keyword from Python 3.7 on, so the attribute
    # access `args.async` no longer parses. Look the flag up by name instead,
    # accepting both the keyword-safe `async_` destination and the legacy
    # `async` one.
    run_async = getattr(args, 'async_', None)
    if run_async is None:
        run_async = getattr(args, 'async', False)

    if run_async:
        log.status.write('Updating [{0}] with operation [{1}].'.format(
            cluster_ref, operation.name))
        return None

    util.WaitForOperation(
        operation,
        self.context,
        message='Waiting for cluster update operation',
        timeout_s=3600 * 3)

    # Read the cluster back so the caller gets its post-update state.
    request = client.MESSAGES_MODULE.DataprocProjectsRegionsClustersGetRequest(
        projectId=cluster_ref.projectId,
        region=cluster_ref.region,
        clusterName=cluster_ref.clusterName)
    cluster = client.projects_regions_clusters.Get(request)
    log.UpdatedResource(cluster_ref)
    return cluster
def Run(self, args):
    """Update a cluster's worker counts and/or labels.

    Args:
      args: The parsed command-line namespace. Reads name, num_workers,
        num_preemptible_workers, update_labels, remove_labels and the async
        flag.

    Returns:
      The cluster message fetched after the update finished, or None when
      the async flag is set.

    Raises:
      exceptions.ArgumentError: If no updatable parameter was supplied.
    """
    client = self.context['dataproc_client']
    messages = self.context['dataproc_messages']

    cluster_ref = util.ParseCluster(args.name, self.context)

    cluster_config = messages.ClusterConfig()
    # Paths joined into the request's updateMask.
    changed_fields = []

    has_changes = False

    if args.num_workers is not None:
        worker_config = messages.InstanceGroupConfig(
            numInstances=args.num_workers)
        cluster_config.workerConfig = worker_config
        changed_fields.append('config.worker_config.num_instances')
        has_changes = True

    if args.num_preemptible_workers is not None:
        worker_config = messages.InstanceGroupConfig(
            numInstances=args.num_preemptible_workers)
        cluster_config.secondaryWorkerConfig = worker_config
        changed_fields.append(
            'config.secondary_worker_config.num_instances')
        has_changes = True

    # Update labels if the user requested it
    labels = None
    if args.update_labels or args.remove_labels:
        has_changes = True
        changed_fields.append('labels')

        # We need to fetch the cluster first so we know what the labels look
        # like. labels_util.UpdateLabels will fill out the proto for us with
        # all the updates and removals, but first we need to provide the
        # current state of the labels.
        get_cluster_request = (
            client.MESSAGES_MODULE.DataprocProjectsRegionsClustersGetRequest(
                projectId=cluster_ref.projectId,
                region=cluster_ref.region,
                clusterName=cluster_ref.clusterName))
        current_cluster = client.projects_regions_clusters.Get(
            get_cluster_request)
        labels = labels_util.UpdateLabels(
            current_cluster.labels,
            messages.Cluster.LabelsValue,
            args.update_labels,
            args.remove_labels)

    if not has_changes:
        raise exceptions.ArgumentError(
            'Must specify at least one cluster parameter to update.')

    cluster = messages.Cluster(
        config=cluster_config,
        clusterName=cluster_ref.clusterName,
        labels=labels,
        projectId=cluster_ref.projectId)

    request = messages.DataprocProjectsRegionsClustersPatchRequest(
        clusterName=cluster_ref.clusterName,
        region=cluster_ref.region,
        projectId=cluster_ref.projectId,
        cluster=cluster,
        updateMask=','.join(changed_fields))

    operation = client.projects_regions_clusters.Patch(request)

    # `async` is a reserved keyword from Python 3.7 on, so the attribute
    # access `args.async` no longer parses. Look the flag up by name instead,
    # accepting both the keyword-safe `async_` destination and the legacy
    # `async` one.
    run_async = getattr(args, 'async_', None)
    if run_async is None:
        run_async = getattr(args, 'async', False)

    if run_async:
        log.status.write('Updating [{0}] with operation [{1}].'.format(
            cluster_ref, operation.name))
        return None

    util.WaitForOperation(
        operation,
        self.context,
        message='Waiting for cluster update operation',
        timeout_s=3600 * 3)

    # Read the cluster back so the caller gets its post-update state.
    request = client.MESSAGES_MODULE.DataprocProjectsRegionsClustersGetRequest(
        projectId=cluster_ref.projectId,
        region=cluster_ref.region,
        clusterName=cluster_ref.clusterName)
    cluster = client.projects_regions_clusters.Get(request)
    log.UpdatedResource(cluster_ref)
    return cluster