def _Create(self):
  """Issues the gcloud command that creates this cluster.

  The `beta` command group is used, and an `accelerator` flag is attached,
  only when the cluster is configured with GPUs. Autoscaling bounds are
  passed only when `enable_autoscaling` is set.
  """
  gpu_count = self.gpu_count
  command_path = ['container', 'clusters', 'create', self.name]
  if gpu_count:
    command_path = ['beta'] + command_path
  create_cmd = util.GcloudCommand(self, *command_path)
  flags = create_cmd.flags

  if gpu_count:
    # TODO(ferneyhough): Make cluster version a flag, and allow it
    # to be specified in the spec (this will require a new spec class
    # for google_container_engine however).
    flags['accelerator'] = gce_virtual_machine.GenerateAcceleratorSpecString(
        self.gpu_type, gpu_count)

  flags['cluster-version'] = self.cluster_version
  flags['scopes'] = 'cloud-platform'

  if self.enable_autoscaling:
    create_cmd.args.append('--enable-autoscaling')
    flags['max-nodes'] = self.max_nodes
    flags['min-nodes'] = self.min_nodes

  flags['num-nodes'] = self.num_nodes
  flags['machine-type'] = self.machine_type

  # Provisioning a large GPU-accelerated GKE cluster can take many minutes,
  # hence the generous timeout.
  create_cmd.Issue(timeout=900, env=self._GetRequiredGkeEnv())
def _AddNodeParamsToCmd(self, vm_config, num_nodes, name, cmd):
  """Adds the node-level options from `vm_config` to `cmd` in place.

  Args:
    vm_config: Node configuration. Optional settings (GPU, min CPU platform,
      threads per core, boot disk size/type, local SSD count) are forwarded
      only when they are truthy.
    num_nodes: Value for the `num-nodes` flag.
    name: Node pool name, written to the `pkb_nodepool` node label.
    cmd: Command object whose `flags` mapping is filled in; its
      `use_alpha_gcloud` attribute is set when threads-per-core is requested.
  """
  flags = cmd.flags

  gpu_count = vm_config.gpu_count
  if gpu_count:
    flags['accelerator'] = gce_virtual_machine.GenerateAcceleratorSpecString(
        vm_config.gpu_type, gpu_count)

  min_cpu_platform = vm_config.min_cpu_platform
  if min_cpu_platform:
    flags['min-cpu-platform'] = min_cpu_platform

  threads_per_core = vm_config.threads_per_core
  if threads_per_core:
    # TODO(user): Remove when threads-per-core is available in GA
    cmd.use_alpha_gcloud = True
    flags['threads-per-core'] = threads_per_core

  # TODO(pclay): Switch local-ssd-count to local-ssd-volumes, which support
  # NVME, when it leaves alpha. See
  # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
  optional_disk_flags = (
      ('disk-size', vm_config.boot_disk_size),
      ('disk-type', vm_config.boot_disk_type),
      ('local-ssd-count', vm_config.max_local_disks),
  )
  for flag_name, flag_value in optional_disk_flags:
    if flag_value:
      flags[flag_name] = flag_value

  flags['num-nodes'] = num_nodes

  machine_type = vm_config.machine_type
  if machine_type is None:
    # No named machine type: describe a custom one from cpus and memory_mib.
    machine_type = 'custom-{0}-{1}'.format(
        vm_config.cpus, vm_config.memory_mib)
  flags['machine-type'] = machine_type

  flags['node-labels'] = f'pkb_nodepool={name}'
def _Create(self):
  """Creates the cluster.

  Builds a `gcloud container clusters create` command from this object's
  configuration and issues it. The `beta` command group is used when a
  minimum CPU platform or GPUs are requested.

  As a side effect, `use_application_default_credentials` is set to False
  when no service account flag is given and the value returned by
  util.GetDefaultUser() contains 'gserviceaccount.com' (that value is then
  passed as the cluster's service account).

  Raises:
    errors.Benchmarks.InsufficientCapacityCloudFailure: If the command fails
      and its stderr contains ZONE_RESOURCE_POOL_EXHAUSTED.
    errors.Resource.CreationError: If the command fails for any other reason.
  """
  if self.min_cpu_platform or self.gpu_count:
    cmd = util.GcloudCommand(
        self, 'beta', 'container', 'clusters', 'create', self.name)
  else:
    cmd = util.GcloudCommand(
        self, 'container', 'clusters', 'create', self.name)

  cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    cmd.args.append('--enable-kubernetes-alpha')
    cmd.args.append('--no-enable-autorepair')
    cmd.args.append('--no-enable-autoupgrade')

  # Service account precedence: explicit flag, then the default user if it
  # looks like a service account, otherwise fall back to the cloud-platform
  # scope.
  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  elif 'gserviceaccount.com' in user:
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    cmd.flags['scopes'] = 'cloud-platform'

  if self.gpu_count:
    cmd.flags['accelerator'] = (
        gce_virtual_machine.GenerateAcceleratorSpecString(
            self.gpu_type, self.gpu_count))
  if self.min_cpu_platform:
    cmd.flags['min-cpu-platform'] = self.min_cpu_platform

  # Autoscaling is requested whenever the node bounds differ from the
  # initial node count.
  if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes

  cmd.flags['num-nodes'] = self.num_nodes

  if self.machine_type is None:
    cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        self.cpus, self.memory)
  else:
    cmd.flags['machine-type'] = self.machine_type

  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()

  # This command needs a long timeout due to the many minutes it
  # can take to provision a large GPU-accelerated GKE cluster.
  _, stderr, retcode = cmd.Issue(
      timeout=900, env=self._GetRequiredGkeEnv(), raise_on_failure=False)

  if retcode != 0:
    # Log specific type of failure, if known.
    if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
      # logging.error, not logging.exception: no exception is being handled
      # at this point, so there is no traceback to attach to the record.
      logging.error('Container resources exhausted: %s', stderr)
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Container resources exhausted in zone %s: %s' %
          (self.zone, stderr))
    raise errors.Resource.CreationError(stderr)
def _Create(self):
  """Issues the gcloud command that creates this cluster.

  The `beta` command group is selected when either a minimum CPU platform or
  GPUs are configured. When no machine type is set, a custom machine type
  string is built from `cpus` and `memory`.
  """
  gpu_count = self.gpu_count
  min_cpu_platform = self.min_cpu_platform

  command_path = ['container', 'clusters', 'create', self.name]
  if min_cpu_platform or gpu_count:
    command_path = ['beta'] + command_path
  create_cmd = util.GcloudCommand(self, *command_path)
  flags = create_cmd.flags

  flags['cluster-version'] = self.cluster_version
  flags['scopes'] = 'cloud-platform'

  if gpu_count:
    flags['accelerator'] = gce_virtual_machine.GenerateAcceleratorSpecString(
        self.gpu_type, gpu_count)
  if min_cpu_platform:
    flags['min-cpu-platform'] = min_cpu_platform

  if self.enable_autoscaling:
    create_cmd.args.append('--enable-autoscaling')
    flags['max-nodes'] = self.max_nodes
    flags['min-nodes'] = self.min_nodes

  flags['num-nodes'] = self.num_nodes

  machine_type = self.machine_type
  if machine_type is None:
    machine_type = 'custom-{0}-{1}'.format(self.cpus, self.memory)
  flags['machine-type'] = machine_type

  # Provisioning a large GPU-accelerated GKE cluster can take many minutes,
  # hence the generous timeout.
  create_cmd.Issue(timeout=900, env=self._GetRequiredGkeEnv())
def _AddNodeParamsToCmd(self, vm_config, num_nodes, name, cmd):
  """Adds the node-level options from `vm_config` to `cmd` in place.

  Args:
    vm_config: Node configuration. Optional settings (GPU, min CPU platform,
      threads per core, boot disk size/type, local SSD count, zone) are
      forwarded only when they are truthy.
    num_nodes: Value for the `num-nodes` flag.
    name: Node pool name, written to the `pkb_nodepool` node label.
    cmd: Command object whose `flags` mapping and `args` list are extended;
      its `use_alpha_gcloud` attribute is set when threads-per-core is
      requested.
  """
  flags = cmd.flags
  extra_args = cmd.args

  gpu_count = vm_config.gpu_count
  if gpu_count:
    flags['accelerator'] = gce_virtual_machine.GenerateAcceleratorSpecString(
        vm_config.gpu_type, gpu_count)

  min_cpu_platform = vm_config.min_cpu_platform
  if min_cpu_platform:
    flags['min-cpu-platform'] = min_cpu_platform

  threads_per_core = vm_config.threads_per_core
  if threads_per_core:
    # TODO(user): Remove when threads-per-core is available in GA
    cmd.use_alpha_gcloud = True
    flags['threads-per-core'] = threads_per_core

  # TODO(pclay): Switch local-ssd-count to local-ssd-volumes, which support
  # NVME, when it leaves alpha. See
  # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
  optional_disk_flags = (
      ('disk-size', vm_config.boot_disk_size),
      ('disk-type', vm_config.boot_disk_type),
      ('local-ssd-count', vm_config.max_local_disks),
  )
  for flag_name, flag_value in optional_disk_flags:
    if flag_value:
      flags[flag_name] = flag_value

  flags['num-nodes'] = num_nodes

  # vm_config.zone may be a comma-separated list of zones.
  node_locations = vm_config.zone
  if node_locations:
    flags['node-locations'] = node_locations

  machine_type = vm_config.machine_type
  if machine_type is None:
    # No named machine type: describe a custom one from cpus and memory_mib.
    machine_type = 'custom-{0}-{1}'.format(
        vm_config.cpus, vm_config.memory_mib)
  flags['machine-type'] = machine_type

  extra_args.append(
      '--enable-gvnic' if FLAGS.gke_enable_gvnic else '--no-enable-gvnic')

  # If using a fixed version (or the default) do not enable upgrades.
  on_release_channel = self.cluster_version in RELEASE_CHANNELS
  if not on_release_channel:
    extra_args.append('--no-enable-autoupgrade')

  flags['node-labels'] = f'pkb_nodepool={name}'
def _Create(self):
  """Issues the gcloud command that creates this cluster.

  The `beta` command group is selected when either a minimum CPU platform or
  GPUs are configured. If the value returned by util.GetDefaultUser()
  contains 'gserviceaccount.com' it is passed as the cluster's service
  account and `use_application_default_credentials` is set to False;
  otherwise the `cloud-platform` scope is requested.
  """
  gpu_count = self.gpu_count
  min_cpu_platform = self.min_cpu_platform

  command_path = ['container', 'clusters', 'create', self.name]
  if min_cpu_platform or gpu_count:
    command_path = ['beta'] + command_path
  create_cmd = util.GcloudCommand(self, *command_path)
  flags = create_cmd.flags
  extra_args = create_cmd.args

  flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    extra_args.extend([
        '--enable-kubernetes-alpha',
        '--no-enable-autorepair',
        '--no-enable-autoupgrade',
    ])

  default_user = util.GetDefaultUser()
  if 'gserviceaccount.com' not in default_user:
    flags['scopes'] = 'cloud-platform'
  else:
    flags['service-account'] = default_user
    self.use_application_default_credentials = False

  if gpu_count:
    flags['accelerator'] = gce_virtual_machine.GenerateAcceleratorSpecString(
        self.gpu_type, gpu_count)
  if min_cpu_platform:
    flags['min-cpu-platform'] = min_cpu_platform

  # Autoscaling is requested unless both bounds equal the initial node count.
  fixed_size = (self.min_nodes == self.num_nodes and
                self.max_nodes == self.num_nodes)
  if not fixed_size:
    extra_args.append('--enable-autoscaling')
    flags['max-nodes'] = self.max_nodes
    flags['min-nodes'] = self.min_nodes

  flags['num-nodes'] = self.num_nodes

  machine_type = self.machine_type
  if machine_type is None:
    machine_type = 'custom-{0}-{1}'.format(self.cpus, self.memory)
  flags['machine-type'] = machine_type

  flags['metadata'] = util.MakeFormattedDefaultTags()
  flags['labels'] = util.MakeFormattedDefaultTags()

  # Provisioning a large GPU-accelerated GKE cluster can take many minutes,
  # hence the generous timeout.
  create_cmd.Issue(timeout=900, env=self._GetRequiredGkeEnv())
def _Create(self):
  """Issues the gcloud command that creates this cluster.

  Without GPUs the plain `container clusters create` command is used. With
  GPUs the `beta` command group is used with a pinned cluster version and an
  `accelerator` flag.
  """
  gpu_count = self.gpu_count
  if not gpu_count:
    create_cmd = util.GcloudCommand(
        self, 'container', 'clusters', 'create', self.name)
  else:
    # TODO(ferneyhough): Make cluster version a flag, and allow it
    # to be specified in the spec (this will require a new spec class
    # for google_container_engine however).
    create_cmd = util.GcloudCommand(
        self, 'beta', 'container', 'clusters', 'create', self.name,
        '--cluster-version', '1.9.2-gke.1')
    accelerator_spec = gce_virtual_machine.GenerateAcceleratorSpecString(
        self.gpu_type, gpu_count)
    create_cmd.flags['accelerator'] = accelerator_spec

  flags = create_cmd.flags
  flags['num-nodes'] = self.num_nodes
  flags['machine-type'] = self.machine_type

  create_cmd.Issue(timeout=600, env=self._GetRequiredGkeEnv())
def _Create(self):
  """Creates the cluster.

  Builds a `gcloud container clusters create` command from this object's
  configuration and `self.vm_config`, then issues it.

  As a side effect, `use_application_default_credentials` is set to False
  when no service account flag is given and the value returned by
  util.GetDefaultUser() matches SERVICE_ACCOUNT_PATTERN (that value is then
  passed as the cluster's service account).

  Raises:
    errors.Benchmarks.InsufficientCapacityCloudFailure: If the command fails
      and its stderr contains ZONE_RESOURCE_POOL_EXHAUSTED.
    errors.Resource.CreationError: If the command fails and
      util.CheckGcloudResponseKnownFailures does not raise first.
  """
  cmd = util.GcloudCommand(self, 'container', 'clusters', 'create', self.name)

  cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    cmd.args.append('--enable-kubernetes-alpha')
    cmd.args.append('--no-enable-autorepair')
    cmd.args.append('--no-enable-autoupgrade')

  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  # Matches service accounts that either definitely belong to this project or
  # are a GCP managed service account like the GCE default service account,
  # which we can't tell to which project they belong.
  elif re.match(SERVICE_ACCOUNT_PATTERN, user):
    logging.info(
        'Re-using configured service-account for GKE Cluster: %s', user)
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    logging.info('Using default GCE service account for GKE cluster')
    cmd.flags['scopes'] = 'cloud-platform'

  if self.vm_config.gpu_count:
    cmd.flags['accelerator'] = (
        gce_virtual_machine.GenerateAcceleratorSpecString(
            self.vm_config.gpu_type, self.vm_config.gpu_count))
  if self.vm_config.min_cpu_platform:
    cmd.flags['min-cpu-platform'] = self.vm_config.min_cpu_platform

  if self.vm_config.boot_disk_size:
    cmd.flags['disk-size'] = self.vm_config.boot_disk_size
  if self.vm_config.boot_disk_type:
    cmd.flags['disk-type'] = self.vm_config.boot_disk_type
  if self.vm_config.max_local_disks:
    # TODO(pclay): Switch to local-ssd-volumes which support NVME when it
    # leaves alpha. See
    # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
    cmd.flags['local-ssd-count'] = self.vm_config.max_local_disks

  # Autoscaling is requested whenever the node bounds differ from the
  # initial node count.
  if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes

  cmd.flags['num-nodes'] = self.num_nodes

  if self.vm_config.machine_type is None:
    cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        self.vm_config.cpus, self.vm_config.memory_mib)
  else:
    cmd.flags['machine-type'] = self.vm_config.machine_type

  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()

  # This command needs a long timeout due to the many minutes it
  # can take to provision a large GPU-accelerated GKE cluster.
  _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
  if retcode:
    # Log specific type of failure, if known.
    if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
      # logging.error, not logging.exception: no exception is being handled
      # at this point, so there is no traceback to attach to the record.
      logging.error('Container resources exhausted: %s', stderr)
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Container resources exhausted in zone %s: %s' %
          (self.zone, stderr))
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)