def _Create(self):
  """Creates the Dataproc cluster and raises on unrecoverable failure."""
  cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  # Without dedicated workers the cluster must be created in single-node mode.
  if self.spec.worker_count:
    cmd.flags['num-workers'] = self.spec.worker_count
  else:
    cmd.flags['single-node'] = True
  # Optional components requested for the cluster.
  if self.spec.applications:
    logging.info('Include the requested applications')
    cmd.flags['optional-components'] = ','.join(self.spec.applications)
  group_spec = self.spec.worker_group
  # TODO(pclay): stop ignoring spec.master_group?
  for role in ('worker', 'master'):
    # Masters reuse the worker group's machine/disk configuration.
    if group_spec.vm_spec.machine_type:
      self._AddToCmd(cmd, '%s-machine-type' % role,
                     group_spec.vm_spec.machine_type)
    if group_spec.disk_spec.disk_size:
      self._AddToCmd(cmd, '%s-boot-disk-size' % role,
                     '{}GB'.format(str(group_spec.disk_spec.disk_size)))
    if group_spec.disk_spec.disk_type:
      self._AddToCmd(cmd, '%s-boot-disk-type' % role,
                     group_spec.disk_spec.disk_type)
      self.dpb_hdfs_type = disk_to_hdfs_map[group_spec.disk_spec.disk_type]
    if group_spec.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-%s-local-ssds' % role,
                     group_spec.vm_spec.num_local_ssds)
  cmd.flags['zone'] = self.dpb_service_zone
  if self.dpb_version:
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  # TODO(saksena): Retrieve the cluster create time and hold in a var
  # 900s == 15 min creation timeout.
  _, stderr, retcode = cmd.Issue(timeout=900, raise_on_failure=False)
  if retcode:
    # Translate known gcloud failures (e.g. quota) before the generic error.
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
def _Create(self):
  """Create a GCE VM instance.

  Issues the gcloud create command and triages the result:
    * Dedicated-host capacity exhausted: hard failure when vms-per-host is
      fixed, otherwise provision a new host and signal a retryable error.
    * Regular zone stockout: InsufficientCapacityCloudFailure.
    * Rate limiting, preemption, or anything else: specific errors below.

  Raises:
    errors.Resource.CreationError: unrecoverable creation failure.
    errors.Resource.RetryableCreationError: after adding a dedicated host.
    errors.Benchmarks.InsufficientCapacityCloudFailure: stockout, or the spot
      VM was preempted before it started.
    errors.Benchmarks.QuotaFailure.RateLimitExceededError: rate limited.
  """
  # Snapshot the host count so we can tell, under the lock below, whether
  # another thread already added a host in the meantime.
  num_hosts = len(self.host_list)
  with open(self.ssh_public_key) as f:
    public_key = f.read().rstrip('\n')
  with vm_util.NamedTemporaryFile(mode='w', dir=vm_util.GetTempDir(),
                                  prefix='key-metadata') as tf:
    # gcloud consumes the SSH key as a "user:key" metadata file.
    tf.write('%s:%s\n' % (self.user_name, public_key))
    tf.close()
    create_cmd = self._GenerateCreateCommand(tf.name)
    # raise_on_failure=False: all the triage below needs stderr/retcode.
    _, stderr, retcode = create_cmd.Issue(
        timeout=_GCE_VM_CREATE_TIMEOUT, raise_on_failure=False)
  if (self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr):
    if self.num_vms_per_host:
      # The vms-per-host ratio is pinned, so we may not add more hosts.
      raise errors.Resource.CreationError(
          'Failed to create host: %d vms of type %s per host exceeds '
          'memory capacity limits of the host' %
          (self.num_vms_per_host, self.machine_type))
    else:
      logging.warning(
          'Creation failed due to insufficient host capacity. A new host will '
          'be created and instance creation will be retried.')
      with self._host_lock:
        # Only add a host if no other thread beat us to it.
        if num_hosts == len(self.host_list):
          host = GceSoleTenantNodeGroup(self.node_template,
                                        self.zone, self.project)
          self.host_list.append(host)
          host.Create()
        self.node_group = self.host_list[-1]
      raise errors.Resource.RetryableCreationError()
  if (not self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr):
    logging.error(util.STOCKOUT_MESSAGE)
    raise errors.Benchmarks.InsufficientCapacityCloudFailure(
        util.STOCKOUT_MESSAGE)
  # Translates known gcloud failure text (e.g. quota) into specific errors.
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
  if retcode:
    if (create_cmd.rate_limited and 'already exists' in stderr and
        FLAGS.retry_on_rate_limited):
      # Gcloud create commands may still create VMs despite being rate
      # limited; 'already exists' means the VM actually came up.
      return
    if util.RATE_LIMITED_MESSAGE in stderr:
      raise errors.Benchmarks.QuotaFailure.RateLimitExceededError(stderr)
    if self.preemptible and _FAILED_TO_START_DUE_TO_PREEMPTION in stderr:
      self.spot_early_termination = True
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Interrupted before VM started')
    raise errors.Resource.CreationError(
        'Failed to create VM: %s return code: %s' % (stderr, retcode))
def _Create(self):
  """Creates the disk via `gcloud compute disks create`.

  Raises:
    Whatever util.CheckGcloudResponseKnownFailures raises for known
    gcloud failures (e.g. quota/stockout).
  """
  cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name)
  cmd.flags['size'] = self.disk_size
  cmd.flags['type'] = self.disk_type
  if self.image:
    cmd.flags['image'] = self.image
  if self.image_project:
    cmd.flags['image-project'] = self.image_project
  # Fix: pass raise_on_failure=False (as the sibling disk _Create methods do)
  # so a failing command reaches CheckGcloudResponseKnownFailures instead of
  # raising a generic error inside Issue(), which made the check dead code.
  _, stderr, retcode = cmd.Issue(raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _Create(self):
  """Creates the labeled disk via `gcloud compute disks create`."""
  cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name)
  cmd.flags['size'] = self.disk_size
  cmd.flags['type'] = self.disk_type
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # Image flags are only meaningful when an image was configured.
  for flag_name, flag_value in (('image', self.image),
                                ('image-project', self.image_project)):
    if flag_value:
      cmd.flags[flag_name] = flag_value
  # Do not raise inside Issue(): known gcloud failures are translated below.
  _, stderr, retcode = cmd.Issue(raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _IssueResourceCreationCommand(self, cmd):
  """Issues a command to gcloud to create resources."""
  # Long timeout: provisioning a large GPU-accelerated GKE cluster can take
  # many minutes.
  _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
  if not retcode:
    return
  # Log specific type of failure, if known.
  if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
    logging.exception('Container resources exhausted: %s', stderr)
    raise errors.Benchmarks.InsufficientCapacityCloudFailure(
        'Container resources exhausted in zone %s: %s' % (self.zone, stderr))
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
  raise errors.Resource.CreationError(stderr)
def _Create(self):
  """Creates the disk, optionally regional and/or with provisioned IOPS."""
  cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name)
  cmd.flags['size'] = self.disk_size
  cmd.flags['type'] = self.disk_type
  # Provisioned IOPS are only configurable on pd-extreme disks.
  if self.disk_type == PD_EXTREME and self.provisioned_iops:
    cmd.flags['provisioned-iops'] = self.provisioned_iops
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  for flag_name, flag_value in (('image', self.image),
                                ('image-project', self.image_project)):
    if flag_value:
      cmd.flags[flag_name] = flag_value
  if self.replica_zones:
    # A regional disk is addressed by region + replica zones, not by zone.
    cmd.flags['region'] = self.region
    cmd.flags['replica-zones'] = ','.join(self.replica_zones)
    del cmd.flags['zone']
  # Do not raise inside Issue(): known gcloud failures are translated below.
  _, stderr, retcode = cmd.Issue(raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _Create(self):
  """Create a GCE VM instance.

  Handles dedicated-host capacity exhaustion by provisioning a new host and
  retrying, and translates zone stockouts and other known gcloud failures.

  Raises:
    errors.Resource.RetryableCreationError: after adding a dedicated host.
    errors.Benchmarks.InsufficientCapacityCloudFailure: on zone stockout.
    errors.Resource.CreationError: on any other creation failure.
  """
  # Snapshot the host count so we can tell, under the lock below, whether
  # another thread already added a host in the meantime.
  num_hosts = len(self.host_list)
  with open(self.ssh_public_key) as f:
    public_key = f.read().rstrip('\n')
  with vm_util.NamedTemporaryFile(mode='w', dir=vm_util.GetTempDir(),
                                  prefix='key-metadata') as tf:
    # gcloud consumes the SSH key as a "user:key" metadata file.
    tf.write('%s:%s\n' % (self.user_name, public_key))
    tf.close()
    create_cmd = self._GenerateCreateCommand(tf.name)
    # Fix: raise_on_failure=False. Without it Issue() raises on any failure,
    # making every stockout/retry branch below unreachable dead code.
    _, stderr, retcode = create_cmd.Issue(
        timeout=_GCE_VM_CREATE_TIMEOUT, raise_on_failure=False)
  if (self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr and not self.num_vms_per_host):
    logging.warning(
        'Creation failed due to insufficient host capacity. A new host will '
        'be created and instance creation will be retried.')
    with self._host_lock:
      # Only add a host if no other thread beat us to it.
      if num_hosts == len(self.host_list):
        host = GceSoleTenantNodeGroup(self.node_template,
                                      self.zone, self.project)
        self.host_list.append(host)
        host.Create()
      self.node_group = self.host_list[-1]
    raise errors.Resource.RetryableCreationError()
  if (not self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr):
    logging.error(STOCKOUT_MESSAGE)
    raise errors.Benchmarks.InsufficientCapacityCloudFailure(STOCKOUT_MESSAGE)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
  if retcode:
    # Fix: the format arguments were swapped (retcode, stderr), producing
    # messages like "Failed to create VM: 1 return code: <stderr>".
    raise errors.Resource.CreationError(
        'Failed to create VM: %s return code: %s' % (stderr, retcode))
def ExtractDataset(self, dest_bucket, dataset=None, tables=None,
                   dest_format='CSV'):
  """Extract all tables in a dataset to a GCS bucket.

  Args:
    dest_bucket: Name of the bucket to extract the data to. Should already
      exist.
    dataset: Optional name of the dataset. If none, will be extracted from the
      cluster_identifier.
    tables: Optional list of table names to extract. If none, all tables in
      the dataset will be extracted.
    dest_format: Format to extract data in. Can be one of: CSV, JSON, or Avro.
  """
  if tables is None:
    tables = self.GetAllTablesInDataset(dataset)
  gcs_uri = 'gs://' + dest_bucket
  # Make sure the bucket is empty; best-effort, so failures are ignored.
  vm_util.IssueCommand(['gsutil', '-m', 'rm', gcs_uri + '/**'],
                       raise_on_failure=False)
  project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
  # Fix: derive the wildcard extension from dest_format rather than
  # hard-coding '.csv'; unchanged for the default CSV case.
  extension = dest_format.lower()
  for table in tables:
    cmd = [
        'bq', 'extract',
        '--destination_format=%s' % dest_format,
        '%s.%s' % (project_dataset, table),
        '%s/%s/*.%s' % (gcs_uri, table, extension)
    ]
    # raise_on_failure=False so the known-failure check below actually runs.
    _, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False)
    # There is a 10T daily limit on extracting from BQ. Large datasets will
    # inherently hit this limit and benchmarks shouldn't use those.
    gcp_util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _Create(self):
  """Creates the GKE cluster and raises on unrecoverable failure."""
  cmd = util.GcloudCommand(self, 'container', 'clusters', 'create', self.name)
  cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    cmd.args.extend(['--enable-kubernetes-alpha', '--no-enable-autorepair',
                     '--no-enable-autoupgrade'])
  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  elif re.match(SERVICE_ACCOUNT_PATTERN, user):
    # The configured user either definitely belongs to this project or is a
    # GCP-managed service account (e.g. the GCE default) whose project we
    # cannot determine; reuse it in both cases.
    logging.info('Re-using configured service-account for GKE Cluster: %s',
                 user)
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    logging.info('Using default GCE service account for GKE cluster')
    cmd.flags['scopes'] = 'cloud-platform'
  node = self.vm_config
  if node.gpu_count:
    cmd.flags['accelerator'] = (
        gce_virtual_machine.GenerateAcceleratorSpecString(
            node.gpu_type, node.gpu_count))
  # Optional node flags, set only when configured.
  for flag_name, flag_value in (('min-cpu-platform', node.min_cpu_platform),
                                ('disk-size', node.boot_disk_size),
                                ('disk-type', node.boot_disk_type)):
    if flag_value:
      cmd.flags[flag_name] = flag_value
  if node.max_local_disks:
    # TODO(pclay): Switch to local-ssd-volumes which support NVME when it
    # leaves alpha. See
    # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
    cmd.flags['local-ssd-count'] = node.max_local_disks
  # Autoscaling is only needed when min/max differ from the steady-state size.
  if not self.min_nodes == self.num_nodes == self.max_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes
  cmd.flags['num-nodes'] = self.num_nodes
  if node.machine_type is None:
    # No explicit machine type: build a custom type from vCPUs and memory.
    cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        node.cpus, node.memory_mib)
  else:
    cmd.flags['machine-type'] = node.machine_type
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # This command needs a long timeout due to the many minutes it can take to
  # provision a large GPU-accelerated GKE cluster.
  _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
  if retcode:
    # Log specific type of failure, if known.
    if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
      logging.exception('Container resources exhausted: %s', stderr)
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Container resources exhausted in zone %s: %s' %
          (self.zone, stderr))
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
def _CreateGcloudSqlInstance(self):
  """Creates the Cloud SQL instance with `gcloud beta sql instances create`.

  Builds the create invocation from the spec (engine, sizing, storage, HA,
  backups) and issues it, translating known gcloud failures.

  Raises:
    Exception: if the spec provides neither custom sizing (cpus + memory) nor
      a standard machine_type.
  """
  storage_size = self.spec.db_disk_spec.disk_size
  instance_zone = self.spec.db_spec.zone
  authorized_network = self._GetAuthorizedNetworks([self.client_vm])
  database_version_string = self._GetEngineVersionString(
      self.spec.engine, self.spec.engine_version)
  cmd_string = [
      self, 'beta', 'sql', 'instances', 'create', self.instance_id,
      '--quiet', '--format=json', '--activation-policy=ALWAYS',
      '--assign-ip',
      '--authorized-networks=%s' % authorized_network,
      '--zone=%s' % instance_zone,
      '--database-version=%s' % database_version_string,
      '--storage-size=%d' % storage_size,
      '--labels=%s' % util.MakeFormattedDefaultTags(),
  ]
  if self.spec.engine == relational_db.MYSQL:
    cmd_string.append('--enable-bin-log')
  if self.spec.engine == relational_db.SQLSERVER:
    # `--root-password` is required when creating SQL Server instances.
    cmd_string.append('--root-password={0}'.format(
        self.spec.database_password))
  if (self.spec.db_spec.cpus and self.spec.db_spec.memory):
    # Custom machine: validate, then pass explicit cpu/memory.
    self._ValidateSpec()
    memory = self.spec.db_spec.memory
    cpus = self.spec.db_spec.cpus
    self._ValidateMachineType(memory, cpus)
    cmd_string.append('--cpu={}'.format(cpus))
    cmd_string.append('--memory={}MiB'.format(memory))
  elif hasattr(self.spec.db_spec, 'machine_type'):
    # Standard machine: Cloud SQL calls these "tiers".
    cmd_string.append('--tier=%s' % self.spec.db_spec.machine_type)
  else:
    raise Exception('Unspecified machine type')
  if self.spec.high_availability:
    cmd_string.append(self._GetHighAvailabilityFlag())
  if self.spec.backup_enabled:
    cmd_string.append('--backup')
    cmd_string.append('--backup-start-time={}'.format(
        self.spec.backup_start_time))
  else:
    cmd_string.append('--no-backup')
  cmd = util.GcloudCommand(*cmd_string)
  cmd.flags['project'] = self.project
  # Fix: pass raise_on_failure=False so a failing command reaches
  # CheckGcloudResponseKnownFailures instead of raising a generic error
  # inside Issue(), which made the check dead code.
  _, stderr, retcode = cmd.Issue(timeout=CREATION_TIMEOUT,
                                 raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _Create(self):
  """Creates the Dataproc cluster, recording its reported creation time."""
  cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  # Without dedicated workers the cluster must be created in single-node mode.
  if self.spec.worker_count:
    cmd.flags['num-workers'] = self.spec.worker_count
  else:
    cmd.flags['single-node'] = True
  # Optional components requested for the cluster.
  if self.spec.applications:
    logging.info('Include the requested applications')
    cmd.flags['optional-components'] = ','.join(self.spec.applications)
  # Enable component gateway for debuggability. Does not impact performance.
  cmd.flags['enable-component-gateway'] = True
  group_spec = self.spec.worker_group
  # TODO(pclay): stop ignoring spec.master_group?
  for role in ('worker', 'master'):
    # Masters reuse the worker group's machine/disk configuration.
    if group_spec.vm_spec.machine_type:
      self._AddToCmd(cmd, '%s-machine-type' % role,
                     group_spec.vm_spec.machine_type)
    if group_spec.disk_spec.disk_size:
      self._AddToCmd(cmd, '%s-boot-disk-size' % role,
                     '{}GB'.format(str(group_spec.disk_spec.disk_size)))
    if group_spec.disk_spec.disk_type:
      self._AddToCmd(cmd, '%s-boot-disk-type' % role,
                     group_spec.disk_spec.disk_type)
      self.dpb_hdfs_type = disk_to_hdfs_map[group_spec.disk_spec.disk_type]
    if group_spec.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-%s-local-ssds' % role,
                     group_spec.vm_spec.num_local_ssds)
      # When local SSDs are attached, they are what storage actually uses.
      self.dpb_hdfs_type = 'Local SSD'
  cmd.flags['zone'] = self.dpb_service_zone
  if self.dpb_version:
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  if FLAGS.dpb_cluster_properties:
    cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties)
  # Ideally DpbServiceSpec would have a network spec, but EMR provisions its
  # own VPC while Dataproc is generally happy on a pre-existing network, so
  # just reuse the underlying GCE flag instead.
  if FLAGS.gce_network_name:
    cmd.flags['network'] = FLAGS.gce_network_name
  metadata = util.GetDefaultTags()
  metadata.update(flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
  cmd.flags['metadata'] = util.FormatTags(metadata)
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # 900s == 15 min creation timeout.
  stdout, stderr, retcode = cmd.Issue(timeout=900, raise_on_failure=False)
  self._cluster_create_time = self._ParseClusterCreateTime(stdout)
  if retcode:
    # Translate known gcloud failures (e.g. quota) before the generic error.
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)