def stop_instance(self, instance_id):
    """Stops the instance gracefully.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if instance can not be stopped
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return
    gce = self._connect()
    try:
        request = gce.instances().delete(project=self._project_id,
                                         instance=instance_id,
                                         zone=self._zone)
        response = self._execute_request(request)
        self._check_response(response)
    except HttpError as e:
        # If the instance does not exist, we get a 404 - just log it, and
        # return without exception so the caller can remove the reference.
        if e.resp.status == 404:
            # lazy `%` args: the message is only formatted if the
            # record is actually emitted
            log.warning("Instance to stop `%s` was not found", instance_id)
        else:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, e))
    except CloudProviderError as e:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, e))
def stop_instance(self, instance_id):
    """Gracefully terminate the instance with the given identifier.

    :param str instance_id: instance identifier
    :raises: `InstanceNotFoundError` if the instance does not exist,
        `InstanceError` for any other failure
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return
    compute = self._connect()
    try:
        delete_request = compute.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_request))
    except HttpError as err:
        # GCE reports a missing instance with HTTP status 404
        if err.resp.status == 404:
            raise InstanceNotFoundError(
                "Instance `{instance_id}` was not found".format(
                    instance_id=instance_id))
        raise InstanceError(
            "Could not stop instance `{instance_id}`: `{e}`".format(
                instance_id=instance_id, e=err))
    except CloudProviderError as err:
        raise InstanceError(
            "Could not stop instance `{instance_id}`: `{e}`".format(
                instance_id=instance_id, e=err))
def get_ips(self, instance_id):
    """Look up the public IP address of the given instance.

    :param str instance_id: id of the instance
    :return: list (ips)
    :raises: InstanceError if the ip could not be retrieved.
    """
    gce = self._connect()
    try:
        lookup = gce.instances().get(instance=instance_id,
                                     project=self._project_id,
                                     zone=self._zone)
        data = self._execute_request(lookup)
        public_ip = None
        if data and "networkInterfaces" in data:
            nics = data['networkInterfaces']
            # only the first interface's NAT address is reported
            if nics and "accessConfigs" in nics[0]:
                public_ip = nics[0]['accessConfigs'][0]['natIP']
        if not public_ip:
            # NOTE: this InstanceError is *not* caught by the handler
            # below, so it propagates to the caller as-is
            raise InstanceError("could not retrieve the ip address for "
                                "node `%s`, please check the node "
                                "through the cloud provider interface"
                                % instance_id)
        return [public_ip]
    except (HttpError, CloudProviderError) as err:
        raise InstanceError('could not retrieve the ip address of `%s`: '
                            '`%s`' % (instance_id, err))
def pause_instance(self, instance_id):
    """Pauses the instance, retaining disk and config.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if instance cannot be paused
    :return: dict - information needed to restart instance.
    """
    if not instance_id:
        log.info("Instance to pause has no instance id.")
        return
    gce = self._connect()
    try:
        request = gce.instances().stop(project=self._project_id,
                                       instance=instance_id,
                                       zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return {"instance_id": instance_id}
    except HttpError as e:
        log.error("Error stopping instance: `%s`", e)
        # the `%s` placeholder must be formatted explicitly --
        # `InstanceError("...%s", e)` would keep it literal
        raise InstanceError("Error stopping instance `%s`: `%s`"
                            % (instance_id, e))
def _load_instance(self, instance_id):
    """Checks if an instance with the given id is cached. If not it
    will connect to the cloud and put it into the local cache
    _instances.

    :param str instance_id: instance identifier
    :return: py:class:`boto.ec2.instance.Reservation` - instance
    :raises: `InstanceError` is returned if the instance can't
             be found in the local cache or in the cloud.
    """
    connection = self._connect()
    if instance_id in self._instances:
        return self._instances[instance_id]

    # Instance not in the internal dictionary.
    # First, check the internal cache:
    if instance_id not in [i.id for i in self._cached_instances]:
        # Refresh the cache, just in case
        self._cached_instances = []
        reservations = connection.get_all_instances()
        for res in reservations:
            self._cached_instances.extend(res.instances)

    for inst in self._cached_instances:
        if inst.id == instance_id:
            self._instances[instance_id] = inst
            return inst

    # If we reached this point, the instance was found neither
    # in the cache nor on the cloud.
    raise InstanceError("the given instance `%s` was not found "
                        "on the cloud" % instance_id)
def resume_instance(self, paused_info):
    """Restarts a paused instance, retaining disk and config.

    :param dict paused_info: dict as returned by :py:meth:`pause_instance`;
        must contain the key ``instance_id``
    :raises: `InstanceError` if instance cannot be resumed.
    """
    if not paused_info.get("instance_id"):
        # was misleadingly worded "Instance to stop ..."
        log.info("Instance to resume has no instance id.")
        return
    gce = self._connect()
    try:
        request = gce.instances().start(
            project=self._project_id,
            instance=paused_info["instance_id"],
            zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return
    except HttpError as e:
        log.error("Error restarting instance: `%s`", e)
        # format the placeholder explicitly -- passing `e` as a second
        # constructor argument would leave `%s` literal in the message
        raise InstanceError("Error restarting instance `%s`: `%s`"
                            % (paused_info["instance_id"], e))
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None):
    """Starts an instance in the cloud on the specified cloud
    provider (configuration option) and returns the id of the
    started instance.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :raises: `ClusterError` when the instance quota is exhausted,
        `InstanceError` for any other launch failure
    """
    connection = self._connect()
    log.debug("Checking keypair `%s`.", key_name)
    self._check_keypair(key_name, public_key_path, private_key_path)
    log.debug("Checking security group `%s`.", security_group)
    self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)
    try:
        reservation = connection.run_instances(
            image_id, key_name=key_name, security_groups=[security_group],
            instance_type=flavor, user_data=image_userdata)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        # `in` must test the *string* representation: `"..." in ex`
        # raises TypeError because exceptions are not containers
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, **kwargs):
    """Starts a new instance on the cloud using the given properties.
    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check if the security group exists
    * run the instance with the given properties

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None

    :return: str - instance id of the started instance
    """
    connection = self._connect()
    log.debug("Checking keypair `%s`.", key_name)
    # the `_check_keypair` method has to be called within a lock,
    # since it will upload the key if it does not exist and if this
    # happens for every node at the same time ec2 will throw an error
    # message (see issue #79)
    with BotoCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)
    log.debug("Checking security group `%s`.", security_group)
    self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)
    try:
        reservation = connection.run_instances(
            image_id, key_name=key_name, security_groups=[security_group],
            instance_type=flavor, user_data=image_userdata)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        # test against str(ex): `in` on the exception object itself
        # raises TypeError (exceptions are not containers)
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)
def get_ips(self, instance_id):
    """Retrieves the ip addresses (public and private) from the cloud
    provider by the given instance id.

    :param str instance_id: id of the instance
    :return: list (ips); ``[None]`` if the instance is stopping or
        terminated (no addresses are assigned in those states)
    :raises: InstanceError if the ip could not be retrieved.
    """
    if not instance_id:
        raise InstanceError("could not retrieve the ip address for node: "
                            "no associated instance id")
    gce = self._connect()
    instances = gce.instances()
    try:
        request = instances.get(instance=instance_id,
                                project=self._project_id,
                                zone=self._zone)
        response = self._execute_request(request)
        # If the instance is in status TERMINATED, then there will be
        # no IP addresses.
        if response and response['status'] in ('STOPPING', 'TERMINATED'):
            log.info("node '%s' state is '%s'; no IP address(es)"
                     % (instance_id, response['status']))
            return [None]
        ip_public = None
        # initialize both to None: previously `ip_private` was bound
        # only inside the `accessConfigs` branch, so an interface
        # without an access config caused a NameError below
        ip_private = None
        if response and "networkInterfaces" in response:
            interfaces = response['networkInterfaces']
            if interfaces:
                ip_private = interfaces[0].get('networkIP')
                if "accessConfigs" in interfaces[0]:
                    ip_public = interfaces[0]['accessConfigs'][0]['natIP']
        if ip_public and ip_private:
            return [ip_public, ip_private]
        else:
            raise InstanceError("could not retrieve the ip address for "
                                "node `%s`, please check the node "
                                "through the cloud provider interface"
                                % instance_id)
    except (HttpError, CloudProviderError) as e:
        raise InstanceError('could not retrieve the ip address of `%s`: '
                            '`%s`' % (instance_id, e))
def _load_instance(self, instance_id, force_reload=True):
    """Checks if an instance with the given id is cached. If not it
    will connect to the cloud and put it into the local cache
    _instances.

    :param str instance_id: instance identifier
    :param bool force_reload: reload instance from server
    :return: py:class:`novaclient.v1_1.servers.Server` - instance
    :raises: `InstanceError` is returned if the instance can't
             be found in the local cache or in the cloud.
    """
    if force_reload:
        try:
            # Remove from cache and get from server again
            vm = self.client.servers.get(instance_id)
            # update cache
            self._instances[instance_id] = vm
            # delete internal cache, just in case
            for i in self._cached_instances:
                if i.id == instance_id:
                    self._cached_instances.remove(i)
                    self._cached_instances.append(vm)
                    break
        except NotFound:
            raise InstanceError("the given instance `%s` was not found "
                                "on the cloud" % instance_id)
    if instance_id in self._instances:
        return self._instances[instance_id]

    # Instance not in the internal dictionary.
    # First, check the internal cache:
    if instance_id not in [i.id for i in self._cached_instances]:
        # Refresh the cache, just in case
        self._cached_instances = self.client.servers.list()

    for inst in self._cached_instances:
        if inst.id == instance_id:
            self._instances[instance_id] = inst
            return inst

    # If we reached this point, the instance was found neither
    # in the cache nor on the cloud.
    raise InstanceError("the given instance `%s` was not found "
                        "on the cloud" % instance_id)
def stop_instance(self, instance_id):
    """Issue a delete request for the given instance and check the reply.

    :param str instance_id: identifier of the instance to terminate
    :raises: `InstanceError` on any cloud-side failure
    """
    compute = self._connect()
    try:
        delete_op = compute.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_op))
    except (HttpError, CloudProviderError) as err:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, err))
def stop_instance(self, instance_id):
    """Terminate the instance identified by `instance_id`.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if instance can not be stopped
    """
    instances_api = self._connect().instances()
    try:
        outcome = self._execute_request(
            instances_api.delete(project=self._project_id,
                                 instance=instance_id,
                                 zone=self._zone))
        self._check_response(outcome)
    except (HttpError, CloudProviderError) as err:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, err))
def list_instances(self, filter=None):
    """List instances on GCE, optionally filtering the results.

    :param str filter: Filter specification; see
        https://developers.google.com/compute/docs/reference/latest/instances/list
        for details.
    :return: list of instances
    :raises: `InstanceError` if the listing request fails
    """
    gce = self._connect()
    try:
        request = gce.instances().list(
            project=self._project_id, filter=filter, zone=self._zone)
        response = self._execute_request(request)
        self._check_response(response)
    except (HttpError, CloudProviderError) as e:
        # the original format string had no `%s` placeholder, so the
        # `%` operator raised TypeError and `e` was never reported
        raise InstanceError("could not retrieve all instances on the "
                            "cloud: `%s`" % e)
    if response and 'items' in response:
        return response['items']
    else:
        return list()
def start_instance(self,
                   # these are common to any
                   # CloudProvider.start_instance() call
                   key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None,
                   # these params are specific to the
                   # GoogleCloudProvider
                   instance_name=None,
                   boot_disk_type='pd-standard',
                   boot_disk_size=10,
                   **kwargs):
    """Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str instance_name: name of the instance
    :param str boot_disk_type: boot disk type (e.g. ``pd-standard``)
    :param int boot_disk_size: boot disk size, in GB

    :return: str - instance id of the started instance
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                         % (project_url, self._zone, boot_disk_type)
    boot_disk_size_gb = boot_disk_size
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    if image_id.startswith('http://') or image_id.startswith('https://'):
        image_url = image_id
    else:
        # The image names and full resource URLs for several Google-
        # provided images (debian, centos, etc.) follow a consistent
        # pattern, and so elasticluster supports a short-hand of just
        # an image name, such as "debian-7-wheezy-v20150526".
        # The cloud project in this case is then "debian-cloud".
        #
        # Several images do not follow this convention, and so are
        # special-cased here:
        #   backports-debian -> debian-cloud
        #   ubuntu -> ubuntu-os-cloud
        #   containter-vm -> google-containers
        if image_id.startswith('container-vm-'):
            os_cloud = 'google-containers'
        elif image_id.startswith('backports-debian-'):
            os_cloud = 'debian-cloud'
        elif image_id.startswith('ubuntu-'):
            os_cloud = 'ubuntu-os-cloud'
        else:
            os = image_id.split("-")[0]
            os_cloud = "%s-cloud" % os
        image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud,
                                               image_id)

    # construct the request body
    if instance_name is None:
        instance_name = 'elasticluster-%s' % uuid.uuid4()
    # use a context manager instead of the bare `file()` call, which
    # leaked the open file handle (and `file()` is Python 2 only)
    with open(public_key_path) as pubkey:
        public_key_content = pubkey.read()

    instance = {
        'name': instance_name,
        'machineType': machine_type_url,
        'disks': [{
            'autoDelete': 'true',
            'boot': 'true',
            'type': 'PERSISTENT',
            'initializeParams': {
                'diskName': "%s-disk" % instance_name,
                'diskType': boot_disk_type_url,
                'diskSizeGb': boot_disk_size_gb,
                'sourceImage': image_url
            }
        }],
        'networkInterfaces': [{
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'External NAT'
            }],
            'network': network_url
        }],
        'serviceAccounts': [{
            'email': self._email,
            'scopes': GCE_DEFAULT_SCOPES
        }],
        "metadata": {
            "kind": "compute#metadata",
            "items": [{
                "key": "sshKeys",
                "value": "%s:%s" % (username, public_key_content)
            }]
        }
    }

    # create the instance
    gce = self._connect()
    request = gce.instances().insert(project=self._project_id,
                                     body=instance, zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_name
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(self,
                   # these are common to any
                   # CloudProvider.start_instance() call
                   key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None,
                   # these params are specific to the
                   # GoogleCloudProvider
                   instance_name=None,
                   **kwargs):
    """Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str instance_name: name of the instance

    :return: str - instance id of the started instance
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    if image_id.startswith('http://') or image_id.startswith('https://'):
        image_url = image_id
    else:
        # short-hand image names ("debian-7-wheezy-v20150526") map to
        # the "<os>-cloud" project hosting the official images
        os = image_id.split("-")[0]
        os_cloud = "%s-cloud" % os
        image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud,
                                               image_id)

    # construct the request body
    if instance_name is None:
        instance_name = 'elasticluster-%s' % uuid.uuid4()
    # use a context manager instead of the bare `file()` call, which
    # leaked the open file handle (and `file()` is Python 2 only)
    with open(public_key_path) as pubkey:
        public_key_content = pubkey.read()

    instance = {
        'name': instance_name,
        'machineType': machine_type_url,
        'disks': [{
            'autoDelete': 'true',
            'boot': 'true',
            'type': 'PERSISTENT',
            'initializeParams': {
                'diskName': "%s-disk" % instance_name,
                'sourceImage': image_url
            }
        }],
        'networkInterfaces': [{
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'External NAT'
            }],
            'network': network_url
        }],
        'serviceAccounts': [{
            'email': self._email,
            'scopes': GCE_DEFAULT_SCOPES
        }],
        "metadata": {
            "kind": "compute#metadata",
            "items": [{
                "key": "sshKeys",
                "value": "%s:%s" % (username, public_key_content)
            }]
        }
    }

    # create the instance
    gce = self._connect()
    request = gce.instances().insert(project=self._project_id,
                                     body=instance, zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_name
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   cluster_name, username=None, node_name=None,
                   **options):
    """Start a new VM on OpenNebula and return its id.

    Builds a ONE VM template from the given parameters (disk, network,
    contextualization) and submits it through the XML-RPC API.

    :param str key_name: unused here; kept for interface compatibility
    :param str public_key_path: path to the ssh public key, injected
        into the VM via the CONTEXT section
    :param str flavor: parsed by `_parse_flavor` into an optional
        template id plus a dict of template attributes
    :param str image_id: numeric image id, ``name`` or ``owner/name``
    :param str security_group: ONE security group; applied to NICs
        unless it is the literal ``'default'``
    :return: dict with key ``instance_id`` (the new VM's numeric id)
    :raises: `InstanceError` if the ONE API reports an error
    """
    template_id, attributes = self._parse_flavor(flavor)
    if node_name:
        # this only sets the VM name for display purposes
        attributes['NAME'] = node_name
    # boot disk
    attributes.setdefault('OS', {})
    boot = attributes['OS']
    boot.setdefault('BOOT', '')  # FIXME: should this be 'disk0'?
    attributes.setdefault('DISK', {})
    disk0 = attributes['DISK']
    try:
        # `image_id` is numeric
        image_id = int(image_id)
        disk0['IMAGE_ID'] = image_id
    except (TypeError, ValueError):
        # `image_id` is the disk image name; an "owner/name" form
        # selects another user's image, otherwise our own username
        if '/' in image_id:
            img_username, img_id = image_id.split('/')
        else:
            img_username = self._username
            img_id = image_id
        disk0['IMAGE'] = img_id
        disk0['IMAGE_UNAME'] = img_username
    # not attempting to merge flavor attributes into the `NIC`
    # part: network configuration should be part of either the ONE
    # template, or the ElastiCluster configuration
    nics = attributes['NIC'] = []
    network_ids = [
        netid.strip()
        for netid in options.pop('network_ids', '').split(',')
        if netid.strip() != ''
    ]
    if network_ids:
        for netid in network_ids:
            try:
                # numeric ID?
                netid = int(netid)
                nics.append({'NETWORK_ID': netid})
            except (TypeError, ValueError):
                # same "owner/name" convention as for images
                if '/' in netid:
                    net_username, net_id = netid.split('/')
                else:
                    net_username = self._username
                    net_id = netid
                nics.append({
                    'NETWORK': net_id,
                    'NETWORK_UNAME': net_username,
                })
            # NOTE(review): placement reconstructed from collapsed
            # source -- assumes the security group is applied to each
            # NIC as it is appended; confirm against upstream
            if security_group and security_group != 'default':
                nics[-1]['SECURITY_GROUP'] = security_group
    attributes.setdefault('CONTEXT', {})
    context = attributes['CONTEXT']
    # this is needed to enable networking; having the `NIC`
    # lines in template seems not to be enough in ONE 5.6.1
    context['NETWORK'] = 'YES'
    if node_name:
        context['SET_HOSTNAME'] = node_name
    if username:
        context['USERNAME'] = username
    if public_key_path:
        with open(public_key_path) as pubkey:
            context['SSH_PUBLIC_KEY'] = pubkey.read()
    if image_userdata:
        # FIXME: should be base64-encoded and use `START_SCRIPT_BASE64`
        context['START_SCRIPT'] = image_userdata
    # create VM; the XML-RPC proxy is not thread-safe, hence the lock
    with self._api_lock:
        try:
            if template_id is not None:
                # instantiate an existing template, overriding
                # attributes with the ones built above
                vm_id = self.server.template.instantiate(
                    template_id, (node_name or ''), False,
                    self._make_template_str(attributes))
            else:
                # no template: allocate a VM from scratch
                vm_id = self.server.vm.allocate(
                    self._make_template_str(attributes), False)
            return {'instance_id': vm_id}
        except pyone.OneException as err:
            raise InstanceError("Error creating node `{0}`: {1}".format(
                node_name, err))
def start_instance(self,
                   # these are common to any
                   # CloudProvider.start_instance() call
                   key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None,
                   # these params are specific to the
                   # GoogleCloudProvider
                   node_name=None,
                   boot_disk_type='pd-standard',
                   boot_disk_size=10,
                   tags=None,
                   scheduling=None,
                   accelerator_count=0,
                   accelerator_type='default',
                   allow_project_ssh_keys=True,
                   min_cpu_platform=None,
                   **kwargs):
    """
    Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str node_name: name of the instance
    :param str|Sequence tags: "Tags" to label the instance.  Can be
        either a single string (individual tags are comma-separated),
        or a sequence of strings (each string being a single tag).
    :param str scheduling: scheduling option to use for the instance
        ("preemptible")
    :param int accelerator_count: Number of accelerators (e.g., GPUs)
        to make available in instance
    :param str accelerator_type: Type of accelerator to request.  Can
        be one of:

        * Full URL specifying an accelerator type valid for the zone
          and project VMs are being created in.  For example,
          ``https://www.googleapis.com/compute/v1/projects/[PROJECT_ID]/zones/[ZONE]/acceleratorTypes/[ACCELERATOR_TYPE]``
        * An accelerator type name (any string which is not a valid
          URL).  This is internally prefixed with the string
          ``https://www.googleapis.com/compute/v1/projects/[PROJECT_ID]/zones/[ZONE]/acceleratorTypes/``
          to form a full URL.

    :param bool allow_project_ssh_keys: When ``True`` (default), SSH
        login is allowed to a node using any of the project-wide SSH
        keys (if they are defined).  When ``False``, only the SSH key
        specified by ElastiCluster config's ``[login/*]`` section will
        be allowed to log in (instance-level key).
    :param str min_cpu_platform: require CPUs of this type or better
        (e.g., "Intel Skylake")  Only used if ``accelerator_count``
        is > 0.

    :return: str - instance id of the started instance
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                         % (project_url, self._zone, boot_disk_type)
    # FIXME: `conf.py` should ensure that `boot_disk_size` has the right
    # type, so there would be no need to convert here
    boot_disk_size_gb = int(boot_disk_size)
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    if image_id.startswith('http://') or image_id.startswith('https://'):
        image_url = image_id
    else:
        # The image names and full resource URLs for several Google-
        # provided images (debian, centos, etc.) follow a consistent
        # pattern, and so elasticluster supports a short-hand of just
        # an image name, such as "debian-7-wheezy-v20150526".
        # The cloud project in this case is then "debian-cloud".
        #
        # Several images do not follow this convention, and so are
        # special-cased here:
        #   backports-debian -> debian-cloud
        #   ubuntu -> ubuntu-os-cloud
        #   containter-vm -> google-containers
        if image_id.startswith('container-vm-'):
            os_cloud = 'google-containers'
        elif image_id.startswith('backports-debian-'):
            os_cloud = 'debian-cloud'
        elif image_id.startswith('ubuntu-'):
            os_cloud = 'ubuntu-os-cloud'
        else:
            os = image_id.split("-")[0]
            os_cloud = "%s-cloud" % os
        image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud,
                                               image_id)

    # only "preemptible" (or nothing) is accepted as a scheduling option
    scheduling_option = {}
    if scheduling == 'preemptible':
        scheduling_option['preemptible'] = True
    elif scheduling is not None:
        raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

    # normalize `tags` to a sequence of strings (or None)
    if isinstance(tags, types.StringTypes):
        tags = tags.split(',')
    elif isinstance(tags, collections.Sequence):
        # ok, nothing to do
        pass
    elif tags is not None:
        raise TypeError(
            "The `tags` argument to `gce.start_instance`"
            " should be a string or a list, got {T} instead".format(
                T=type(tags)))

    with open(public_key_path, 'r') as f:
        public_key_content = f.read()

    # instance metadata: SSH key for `username` and (optionally) a
    # startup script; `block-project-ssh-keys` disables project-wide
    # keys when `allow_project_ssh_keys` is False
    compute_metadata = [
        {
            "key": "ssh-keys",
            "value": "%s:%s" % (username, public_key_content),
        },
        {
            "key": "block-project-ssh-keys",
            "value": (not allow_project_ssh_keys),
        },
    ]
    if image_userdata:
        compute_metadata.append({
            "key": "startup-script",
            "value": image_userdata,
        })

    # construct the request body
    if node_name:
        instance_id = node_name.lower().replace(
            '_', '-')  # GCE doesn't allow "_"
    else:
        instance_id = 'elasticluster-%s' % uuid.uuid4()

    instance = {
        'name': instance_id,
        'machineType': machine_type_url,
        'tags': {
            'items': tags,
        },
        'scheduling': scheduling_option,
        'disks': [{
            'autoDelete': 'true',
            'boot': 'true',
            'type': 'PERSISTENT',
            'initializeParams': {
                'diskName': "%s-disk" % instance_id,
                'diskType': boot_disk_type_url,
                'diskSizeGb': boot_disk_size_gb,
                'sourceImage': image_url
            }
        }],
        'networkInterfaces': [{
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'External NAT'
            }],
            'network': network_url
        }],
        'serviceAccounts': [{
            'email': self._email,
            'scopes': GCE_DEFAULT_SCOPES
        }],
        "metadata": {
            "kind": "compute#metadata",
            "items": compute_metadata,
        }
    }

    if min_cpu_platform is not None:
        instance['minCpuPlatform'] = min_cpu_platform

    # add accelerators/GPUs if requested
    if accelerator_count > 0:
        if (accelerator_type.startswith('https://')
                or accelerator_type.startswith('http://')):
            # use URL as-is
            accelerator_type_url = accelerator_type
        else:
            # build the full accelerator-type URL from the bare name
            accelerator_type_url = (
                'https://www.googleapis.com/compute/{api_version}/'
                'projects/{project_id}/zones/{zone}/'
                'acceleratorTypes/{accelerator_type}'.format(
                    api_version=GCE_API_VERSION,
                    project_id=self._project_id,
                    zone=self._zone,
                    accelerator_type=accelerator_type))
        log.debug(
            "VM instance `%s`:"
            " Requesting %d accelerator%s of type '%s'",
            instance_id, accelerator_count,
            ('s' if accelerator_count > 1 else ''),
            accelerator_type_url)
        instance['guestAccelerators'] = [{
            'acceleratorCount': accelerator_count,
            'acceleratorType': accelerator_type_url,
        }]
        # no live migration with GPUs,
        # see: https://cloud.google.com/compute/docs/gpus#restrictions
        instance['scheduling']['onHostMaintenance'] = 'TERMINATE'
        instance['scheduling']['automaticRestart'] = True

    # create the instance
    gce = self._connect()
    request = gce.instances().insert(project=self._project_id,
                                     body=instance, zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_id
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, node_name=None, network_ids=None,
                   price=None, timeout=None,
                   **kwargs):
    """Starts a new instance on the cloud using the given properties.
    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check if the security group exists
    * run the instance with the given properties

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param float price: Spot instance price (if 0, do not use spot
                        instances).
    :param int timeout: Timeout (in seconds) waiting for spot instances;
                        only used if price > 0.

    :return: str - instance id of the started instance
    """
    connection = self._connect()
    log.debug("Checking keypair `%s`.", key_name)
    # the `_check_keypair` method has to be called within a lock,
    # since it will upload the key if it does not exist and if this
    # happens for every node at the same time ec2 will throw an error
    # message (see issue #79)
    with BotoCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)
    log.debug("Checking security group `%s`.", security_group)
    security_group_id = self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)

    if network_ids:
        # when subnets are given, security groups must be attached to
        # the network interfaces, not passed at the API top level
        interfaces = []
        for subnet in network_ids.split(','):
            subnet_id = self._check_subnet(subnet)
            interfaces.append(
                boto.ec2.networkinterface.NetworkInterfaceSpecification(
                    subnet_id=subnet_id, groups=[security_group_id],
                    associate_public_ip_address=self.request_floating_ip))
        interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
            *interfaces)
        security_groups = []
    else:
        interfaces = None
        security_groups = [security_group]

    # get defaults for `price` and `timeout` from class instance
    if price is None:
        price = self.price
    if timeout is None:
        timeout = self.timeout

    try:
        # start spot instance if bid is specified
        if price:
            log.info("Requesting spot instance with price `%s` ...", price)
            request = connection.request_spot_instances(
                price, image_id, key_name=key_name,
                security_groups=security_groups,
                instance_type=flavor, user_data=image_userdata,
                network_interfaces=interfaces,
                instance_profile_name=self._instance_profile)[-1]

            # wait until spot request is fullfilled (will wait
            # forever if no timeout is given)
            start_time = time.time()
            timeout = (float(timeout) if timeout else 0)
            log.info("Waiting for spot instance (will time out"
                     " in %d seconds) ...", timeout)
            while request.status.code != 'fulfilled':
                if timeout and time.time() - start_time > timeout:
                    request.cancel()
                    raise RuntimeError('spot instance timed out')
                time.sleep(self.POLL_INTERVAL)
                # update request status
                request = connection.get_all_spot_instance_requests(
                    request_ids=request.id)[-1]
        else:
            reservation = connection.run_instances(
                image_id, key_name=key_name,
                security_groups=security_groups,
                instance_type=flavor, user_data=image_userdata,
                network_interfaces=interfaces,
                instance_profile_name=self._instance_profile)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        # test against str(ex): `in` on the exception object itself
        # raises TypeError (exceptions are not containers)
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)

    if price:
        vm = connection.get_only_instances(
            instance_ids=[request.instance_id])[-1]
    else:
        vm = reservation.instances[-1]
    vm.add_tag("Name", node_name)

    # cache instance object locally for faster access later on
    self._instances[vm.id] = vm
    return vm.id
def start_instance(self,
                   # these are common to any
                   # CloudProvider.start_instance() call
                   key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None,
                   # these params are specific to the
                   # GoogleCloudProvider
                   node_name=None,
                   boot_disk_type='pd-standard',
                   boot_disk_size=10,
                   tags=None,
                   scheduling=None,
                   **kwargs):
    """Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on
                               the instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str node_name: name of the instance
    :param str boot_disk_type: GCE disk type for the boot disk
                               (default: ``pd-standard``)
    :param int boot_disk_size: boot disk size in GB (default: 10)
    :param str scheduling: scheduling option to use for the instance
                           ("preemptible")
    :param str|Sequence tags: "Tags" to label the instance.
        Can be either a single string (individual tags are
        comma-separated), or a sequence of strings (each string
        being a single tag).

    :return: str - instance id of the started instance
    :raises: `InstanceError` if the scheduling option is unknown or
             the instance cannot be created
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                         % (project_url, self._zone, boot_disk_type)
    boot_disk_size_gb = boot_disk_size
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    if image_id.startswith('http://') or image_id.startswith('https://'):
        image_url = image_id
    else:
        # The image names and full resource URLs for several Google-
        # provided images (debian, centos, etc.) follow a consistent
        # pattern, and so elasticluster supports a short-hand of just
        # an image name, such as "debian-7-wheezy-v20150526".
        # The cloud project in this case is then "debian-cloud".
        #
        # Several images do not follow this convention, and so are
        # special-cased here:
        #   backports-debian -> debian-cloud
        #   ubuntu           -> ubuntu-os-cloud
        #   container-vm     -> google-containers
        if image_id.startswith('container-vm-'):
            os_cloud = 'google-containers'
        elif image_id.startswith('backports-debian-'):
            os_cloud = 'debian-cloud'
        elif image_id.startswith('ubuntu-'):
            os_cloud = 'ubuntu-os-cloud'
        else:
            os = image_id.split("-")[0]
            os_cloud = "%s-cloud" % os
        image_url = '%s%s/global/images/%s' % (
            GCE_URL, os_cloud, image_id)

    if scheduling is None:
        # use GCE's default
        scheduling_option = {}
    elif scheduling == 'preemptible':
        scheduling_option = {
            'preemptible': True
        }
    else:
        raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

    if isinstance(tags, types.StringTypes):
        tags = tags.split(',')
    elif isinstance(tags, collections.Sequence):
        # ok, nothing to do
        pass
    elif tags is not None:
        raise TypeError(
            "The `tags` argument to `gce.start_instance`"
            " should be a string or a list, got {T} instead"
            .format(T=type(tags)))

    # construct the request body
    if node_name:
        instance_id = node_name.lower().replace('_', '-')  # GCE doesn't allow "_"
    else:
        instance_id = 'elasticluster-%s' % uuid.uuid4()

    # read the public key with a context manager so the file handle
    # is closed promptly (the previous `file(...)` call leaked it)
    with open(public_key_path) as pubkey_file:
        public_key_content = pubkey_file.read()

    instance = {
        'name': instance_id,
        'machineType': machine_type_url,
        'tags': {
          'items': tags,
        },
        'scheduling': scheduling_option,
        'disks': [{
            'autoDelete': 'true',
            'boot': 'true',
            'type': 'PERSISTENT',
            'initializeParams' : {
                'diskName': "%s-disk" % instance_id,
                'diskType': boot_disk_type_url,
                'diskSizeGb': boot_disk_size_gb,
                'sourceImage': image_url
            }
        }],
        'networkInterfaces': [
            {'accessConfigs': [
                {'type': 'ONE_TO_ONE_NAT',
                 'name': 'External NAT'
                 }],
             'network': network_url
             }],
        'serviceAccounts': [
            {'email': self._email,
             'scopes': GCE_DEFAULT_SCOPES
             }],
        "metadata": {
            "kind": "compute#metadata",
            "items": [
                {
                    "key": "sshKeys",
                    "value": "%s:%s" % (username, public_key_content)
                }
            ]
        }
    }

    # create the instance
    gce = self._connect()
    request = gce.instances().insert(
        project=self._project_id, body=instance, zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_id
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(
        self,
        # these are common to any
        # CloudProvider.start_instance() call
        key_name,
        public_key_path,
        private_key_path,
        security_group,
        flavor,
        image_id,
        image_userdata,
        username=None,
        # these params are specific to the
        # GoogleCloudProvider
        node_name=None,
        boot_disk_type='pd-standard',
        boot_disk_size=10,
        tags=None,
        scheduling=None,
        **kwargs):
    """Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on
                               the instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str node_name: name of the instance
    :param str boot_disk_type: GCE disk type for the boot disk
                               (default: ``pd-standard``)
    :param int boot_disk_size: boot disk size in GB (default: 10)
    :param str scheduling: scheduling option to use for the instance
                           ("preemptible")
    :param str|Sequence tags: "Tags" to label the instance.
        Can be either a single string (individual tags are
        comma-separated), or a sequence of strings (each string
        being a single tag).

    Extra ``kwargs`` recognized (GPU support):

    * ``accelerator``: GCE accelerator type name to attach
    * ``accelerator_count``: number of accelerators (default 1 when
      ``accelerator`` is given)
    * ``accelerator_script``: named startup script for driver setup
      (currently only ``centos7-cuda8`` is recognized)

    :return: str - instance id of the started instance
    :raises: `InstanceError` if the scheduling option is unknown or
             the instance cannot be created
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                         % (project_url, self._zone, boot_disk_type)

    # Optional GPU configuration, passed through extra kwargs.
    # Initialize explicitly instead of probing `locals()` later;
    # the previous code raised `NameError` when `accelerator` was
    # given without `accelerator_count`, so default the count to 1.
    accelerator_url = None
    accelerator_count = 1
    accelerator_script = None
    for key, value in kwargs.items():
        if key == 'accelerator':
            accelerator_url = '%s/zones/%s/acceleratorTypes/%s' % (
                project_url, self._zone, value)
            log.debug("accelerator_url is %s", accelerator_url)
        elif key == 'accelerator_count':
            accelerator_count = value
            log.debug("assigning %s value of %s", key, value)
        elif key == 'accelerator_script':
            if value == 'centos7-cuda8':
                accelerator_script = (
                    "#!/bin/bash\n"
                    "if ! rpm -q cuda; then\n"
                    " curl -O http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-repo-rhel7-8.0.61-1.x86_64.rpm\n"
                    " rpm -i --force ./cuda-repo-rhel7-8.0.61-1.x86_64.rpm\n"
                    " yum clean all\n"
                    " yum install epel-release -y\n"
                    " yum update -y\n"
                    " yum install cuda -y\n"
                    "fi\n"
                    "sleep 30\n"
                    "sudo modprobe nvidia\n")
                log.debug("assigning %s %s", key, accelerator_script)

    # FIXME: `conf.py` should ensure that `boot_disk_size` has the right
    # type, so there would be no need to convert here
    boot_disk_size_gb = int(boot_disk_size)
    subnetwork_url = '%s/regions/%s/subnetworks/%s' % (
        project_url, self._region, self._subnetwork)
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    if image_id.startswith('http://') or image_id.startswith('https://'):
        image_url = image_id
    else:
        # The image names and full resource URLs for several Google-
        # provided images (debian, centos, etc.) follow a consistent
        # pattern, and so elasticluster supports a short-hand of just
        # an image name, such as "debian-7-wheezy-v20150526".
        # The cloud project in this case is then "debian-cloud".
        #
        # Several images do not follow this convention, and so are
        # special-cased here:
        #   backports-debian -> debian-cloud
        #   ubuntu           -> ubuntu-os-cloud
        #   container-vm     -> google-containers
        if image_id.startswith('container-vm-'):
            os_cloud = 'google-containers'
        elif image_id.startswith('backports-debian-'):
            os_cloud = 'debian-cloud'
        elif image_id.startswith('ubuntu-'):
            os_cloud = 'ubuntu-os-cloud'
        else:
            os = image_id.split("-")[0]
            os_cloud = "%s-cloud" % os
        image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

    if scheduling is None:
        # use GCE's default
        scheduling_option = {}
    elif scheduling == 'preemptible':
        scheduling_option = {'preemptible': True}
    else:
        raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

    if isinstance(tags, types.StringTypes):
        tags = tags.split(',')
    elif isinstance(tags, collections.Sequence):
        # ok, nothing to do
        pass
    elif tags is not None:
        raise TypeError(
            "The `tags` argument to `gce.start_instance`"
            " should be a string or a list, got {T} instead".format(
                T=type(tags)))

    # construct the request body
    if node_name:
        instance_id = node_name.lower().replace(
            '_', '-')  # GCE doesn't allow "_"
    else:
        instance_id = 'elasticluster-%s' % uuid.uuid4()

    with open(public_key_path, 'r') as f:
        public_key_content = f.read()

    instance = {
        'name': instance_id,
        'machineType': machine_type_url,
        'tags': {
            'items': tags,
        },
        'scheduling': scheduling_option,
        'disks': [{
            'autoDelete': 'true',
            'boot': 'true',
            'type': 'PERSISTENT',
            'initializeParams': {
                'diskName': "%s-disk" % instance_id,
                'diskType': boot_disk_type_url,
                'diskSizeGb': boot_disk_size_gb,
                'sourceImage': image_url
            }
        }],
        'networkInterfaces': [{
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'External NAT'
            }],
            'network': network_url,
            'subnetwork': subnetwork_url
        }],
        'serviceAccounts': [{
            'email': self._email,
            'scopes': GCE_DEFAULT_SCOPES
        }],
        "metadata": {
            "kind": "compute#metadata",
            "items": [{
                "key": "sshKeys",
                "value": "%s:%s" % (username, public_key_content)
            }]
        }
    }

    if accelerator_url is not None:
        instance["guestAccelerators"] = [{
            "acceleratorCount": accelerator_count,
            "acceleratorType": accelerator_url,
        }]
        # GCE requires that instances with accelerators terminate on
        # host maintenance (live migration is not supported)
        instance["scheduling"] = {
            "onHostMaintenance": "terminate",
            "automaticRestart": "true"
        }
        if accelerator_script is not None:
            instance["metadata"]["items"] += [{
                "key": "startup-script",
                "value": accelerator_script
            }]
            log.debug("Accelerator setup:\n%s", accelerator_script)

    # create the instance
    gce = self._connect()
    log.debug("Instance request body: %s", instance)
    request = gce.instances().insert(project=self._project_id,
                                     body=instance,
                                     zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_id
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(
        self,
        # these are common to any
        # CloudProvider.start_instance() call
        key_name,
        public_key_path,
        private_key_path,
        security_group,
        flavor,
        image_id,
        image_userdata,
        username=None,
        # these params are specific to the
        # GoogleCloudProvider
        instance_name=None):
    """
    Starts a new instance with the given properties and returns
    the instance id.

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on
                               the instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str instance_name: name of the instance; auto-generated
                              when None

    :return: str - name of the started instance
    :raises: `InstanceError` if the instance cannot be created
    """
    # construct URLs
    project_url = '%s%s' % (GCE_URL, self._project_id)
    machine_type_url = '%s/zones/%s/machineTypes/%s' \
                       % (project_url, self._zone, flavor)
    network_url = '%s/global/networks/%s' % (project_url, self._network)
    # derive the hosting cloud project from the image name prefix,
    # e.g. "debian-7-..." -> "debian-cloud"
    os = image_id.split("-")[0]
    os_cloud = "%s-cloud" % os
    image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

    # construct the request body
    if instance_name is None:
        # TODO: it would be nice to have a way to name this
        # <clustername>-<nodetype>-NNN, e.g.,
        # "mycluster-compute-001", but we take an easy path to
        # uniqueness for now.
        instance_name = 'elasticluster-%s' % uuid.uuid4()

    # read the public key with a context manager so the file handle
    # is closed promptly (the previous `file(...)` call leaked it)
    with open(public_key_path) as pubkey_file:
        public_key_content = pubkey_file.read()

    instance = {
        'name': instance_name,
        'machineType': machine_type_url,
        'image': image_url,
        'networkInterfaces': [{
            'accessConfigs': [{
                'type': 'ONE_TO_ONE_NAT',
                'name': 'External NAT'
            }],
            'network': network_url
        }],
        'serviceAccounts': [{
            'email': GCE_DEFAULT_SERVICE_EMAIL,
            'scopes': GCE_DEFAULT_SCOPES
        }],
        "metadata": {
            "kind": "compute#metadata",
            "items": [{
                "key": "sshKeys",
                "value": "%s:%s" % (username, public_key_content)
            }]
        }
    }

    # create the instance
    gce = self._connect()
    request = gce.instances().insert(project=self._project_id,
                                     body=instance,
                                     zone=self._zone)
    try:
        response = self._execute_request(request)
        response = self._wait_until_done(response)
        self._check_response(response)
        return instance_name
    except (HttpError, CloudProviderError) as e:
        log.error("Error creating instance `%s`" % e)
        raise InstanceError("Error creating instance `%s`" % e)
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, node_name=None, network_ids=None,
                   **kwargs):
    """Starts a new instance on the cloud using the given properties.
    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check if the security group exists
    * run the instance with the given properties

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
    :param str node_name: name to tag the instance with
    :param str network_ids: comma-separated list of subnet IDs to
                            attach the instance to (VPC mode)

    :return: str - instance id of the started instance
    :raises: `ClusterError` if the instance quota is exceeded,
             `InstanceError` on any other provisioning failure
    """
    connection = self._connect()

    log.debug("Checking keypair `%s`.", key_name)
    # the `_check_keypair` method has to be called within a lock,
    # since it will upload the key if it does not exist and if this
    # happens for every node at the same time ec2 will throw an error
    # message (see issue #79)
    with BotoCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)

    log.debug("Checking security group `%s`.", security_group)
    security_group_id = self._check_security_group(security_group)
    # image_id = self._find_image_id(image_id)

    if network_ids:
        interfaces = []
        for subnet in network_ids.split(','):
            subnet_id = self._check_subnet(subnet)

            interfaces.append(
                ec2.networkinterface.NetworkInterfaceSpecification(
                    subnet_id=subnet_id,
                    groups=[security_group_id],
                    associate_public_ip_address=self.request_floating_ip))
        interfaces = ec2.networkinterface.NetworkInterfaceCollection(
            *interfaces)
        # when attaching explicit network interfaces, security groups
        # are set per-interface and must not be passed separately
        security_groups = []
    else:
        interfaces = None
        security_groups = [security_group]

    try:
        reservation = connection.run_instances(
            image_id, key_name=key_name, security_groups=security_groups,
            instance_type=flavor, user_data=image_userdata,
            network_interfaces=interfaces)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        # NOTE: `"..." in ex` raised `TypeError` because exception
        # objects are not containers; match against the message text.
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)

    vm = reservation.instances[-1]
    vm.add_tag("Name", node_name)

    # cache instance object locally for faster access later on
    self._instances[vm.id] = vm

    return vm.id