Пример #1
0
    def stop_instance(self, instance_id):
        """Stops the instance by issuing a delete request for it.

        A falsy `instance_id` is logged and ignored.  An HTTP 404 reply is
        logged as a warning and swallowed, so the caller can simply drop
        its reference to the instance.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance can not be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            request = gce.instances().delete(project=self._project_id,
                                             instance=instance_id,
                                             zone=self._zone)
            response = self._execute_request(request)
            self._check_response(response)
        except HttpError as e:
            # If the instance does not exist, we get a 404 - just log it, and
            # return without exception so the caller can remove the reference.
            if e.resp.status == 404:
                log.warning("Instance to stop `%s` was not found",
                            instance_id)
            else:
                raise InstanceError("Could not stop instance `%s`: `%s`"
                                    % (instance_id, e))
        except CloudProviderError as e:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, e))
Пример #2
0
    def stop_instance(self, instance_id):
        """Issue a delete request for the instance and validate the reply.

        Nothing is done (apart from an info log line) when `instance_id`
        is falsy.

        :param str instance_id: instance identifier
        :raises: `InstanceNotFoundError` if the API answers with HTTP 404
        :raises: `InstanceError` if instance can not be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        compute = self._connect()
        failure_template = "Could not stop instance `{instance_id}`: `{e}`"

        try:
            delete_call = compute.instances().delete(
                project=self._project_id,
                instance=instance_id,
                zone=self._zone)
            reply = self._execute_request(delete_call)
            self._check_response(reply)
        except HttpError as http_err:
            # Anything but a 404 is a generic failure; a 404 means the
            # instance does not exist.
            if http_err.resp.status != 404:
                raise InstanceError(
                    failure_template.format(
                        instance_id=instance_id, e=http_err))
            raise InstanceNotFoundError(
                "Instance `{instance_id}` was not found".format(
                    instance_id=instance_id))
        except CloudProviderError as provider_err:
            raise InstanceError(
                failure_template.format(
                    instance_id=instance_id, e=provider_err))
Пример #3
0
    def get_ips(self, instance_id):
        """Retrieves the ip addresses (public) from the cloud
        provider by the given instance id.

        Only the ``natIP`` of the first access config of the first network
        interface is considered.

        :param str instance_id: id of the instance
        :return: list (ips) - a one-element list with the public ip
        :raises: InstanceError if the ip could not be retrieved.
        """
        gce = self._connect()
        instances = gce.instances()
        try:
            request = instances.get(instance=instance_id,
                                    project=self._project_id,
                                    zone=self._zone)
            response = self._execute_request(request)

            # Walk the reply defensively: an empty `accessConfigs` list or
            # a missing `natIP` key must end up as `InstanceError`, not as
            # a bare IndexError/KeyError.
            ip_public = None
            interfaces = response.get('networkInterfaces') if response else None
            if interfaces:
                access_configs = interfaces[0].get('accessConfigs')
                if access_configs:
                    ip_public = access_configs[0].get('natIP')

            if ip_public:
                return [ip_public]
            else:
                raise InstanceError("could not retrieve the ip address for "
                                    "node `%s`, please check the node "
                                    "through the cloud provider interface" %
                                    instance_id)

        except (HttpError, CloudProviderError) as e:
            raise InstanceError('could not retrieve the ip address of `%s`: '
                                '`%s`' % (instance_id, e))
Пример #4
0
    def pause_instance(self, instance_id):
        """Pauses the instance, retaining disk and config.

        Issues a stop request for the instance and waits for the resulting
        operation to finish.  A falsy `instance_id` is logged and ignored
        (``None`` is returned in that case).

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be paused

        :return: dict - information needed to restart instance
                 (``{"instance_id": instance_id}``).
        """

        if not instance_id:
            log.info("Instance to pause has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().stop(project=self._project_id,
                                           instance=instance_id,
                                           zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return {"instance_id": instance_id}
        except HttpError as e:
            log.error("Error stopping instance: `%s`", e)
            # Interpolate the error into the message: passing it as a second
            # positional argument leaves the `%s` placeholder unformatted.
            raise InstanceError("Error stopping instance `%s`" % e)
Пример #5
0
    def _load_instance(self, instance_id):
        """Checks if an instance with the given id is cached. If not it
        will connect to the cloud and put it into the local cache
        _instances.

        Lookup order: the `_instances` dictionary first, then the
        `_cached_instances` list, which is rebuilt from the reservations
        returned by the connection when it does not contain the id.

        :param str instance_id: instance identifier
        :return: py:class:`boto.ec2.instance.Reservation` - instance
        :raises: `InstanceError` is returned if the instance can't
                 be found in the local cache or in the cloud.
        """
        connection = self._connect()
        if instance_id in self._instances:
            return self._instances[instance_id]

        # Instance not in the internal dictionary.
        # First, check the internal cache:
        if not any(i.id == instance_id for i in self._cached_instances):
            # Refresh the cache, just in case
            self._cached_instances = []
            reservations = connection.get_all_instances()
            for res in reservations:
                self._cached_instances.extend(res.instances)

        for inst in self._cached_instances:
            if inst.id == instance_id:
                self._instances[instance_id] = inst
                return inst

        # If we reached this point, the instance was found neither
        # in the cache nor in the cloud.
        raise InstanceError("the given instance `%s` was not found "
                            "on the cloud" % instance_id)
Пример #6
0
    def resume_instance(self, paused_info):
        """Restarts a paused instance, retaining disk and config.

        Issues a start request for the instance named in `paused_info` and
        waits for the resulting operation to finish.  If `paused_info` has
        no (or a falsy) ``"instance_id"`` entry, this is logged and nothing
        else is done.

        :param dict paused_info: mapping with an ``"instance_id"`` key
        :raises: `InstanceError` if instance cannot be resumed.

        :return: None
        """

        if not paused_info.get("instance_id"):
            log.info("Instance to resume has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().start(
                project=self._project_id,
                instance=paused_info["instance_id"],
                zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return
        except HttpError as e:
            log.error("Error restarting instance: `%s`", e)
            # Interpolate the error into the message: passing it as a second
            # positional argument leaves the `%s` placeholder unformatted.
            raise InstanceError("Error restarting instance `%s`" % e)
Пример #7
0
    def start_instance(self, key_name, public_key_path, private_key_path,
                       security_group, flavor, image_id, image_userdata,
                       username=None):
        """
        Starts an instance in the cloud on the specified cloud
        provider (configuration option) and returns the id of the
        started instance.

        The keypair and the security group are checked before the run
        request is sent.

        :raises: `ClusterError` if the failure message mentions
                 ``TooManyInstances``, `InstanceError` for any other
                 failure of the run request
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        self._check_keypair(key_name, public_key_path, private_key_path)
        log.debug("Checking security group `%s`.", security_group)
        self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        try:
            # NOTE(review): the docstring promises the instance id, but the
            # code visible here never returns anything derived from
            # `reservation` -- confirm against the complete method.
            reservation = connection.run_instances(
                image_id, key_name=key_name, security_groups=[security_group],
                instance_type=flavor, user_data=image_userdata)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            # Search the message text: `in ex` is not a substring test on
            # the error message.
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
Пример #8
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :return: str - instance id of the started instance
        :raises: `ClusterError` if the failure message mentions
                 ``TooManyInstances``, `InstanceError` for any other
                 failure of the run request
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        # the `_check_keypair` method has to be called within a lock,
        # since it will upload the key if it does not exist and if this
        # happens for every node at the same time ec2 will throw an error
        # message (see issue #79)
        with BotoCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)

        log.debug("Checking security group `%s`.", security_group)
        self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        try:
            # NOTE(review): the docstring promises the instance id, but the
            # code visible here never returns anything derived from
            # `reservation` -- confirm against the complete method.
            reservation = connection.run_instances(
                image_id,
                key_name=key_name,
                security_groups=[security_group],
                instance_type=flavor,
                user_data=image_userdata)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            # Search the message text: `in ex` is not a substring test on
            # the error message.
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
Пример #9
0
    def get_ips(self, instance_id):
        """Retrieves the ip addresses (public and private) from the cloud
        provider by the given instance id.

        Only the first network interface is inspected: its first access
        config supplies the public ip (``natIP``) and its ``networkIP``
        the private one.

        :param str instance_id: id of the instance
        :return: list (ips) - ``[public, private]``, or ``[None]`` when the
                 instance status is ``STOPPING`` or ``TERMINATED``
        :raises: InstanceError if the ip could not be retrieved.
        """
        if not instance_id:
            raise InstanceError("could not retrieve the ip address for node: "
                                "no associated instance id")
        gce = self._connect()
        instances = gce.instances()
        try:
            request = instances.get(instance=instance_id,
                                    project=self._project_id,
                                    zone=self._zone)
            response = self._execute_request(request)
            # Both start out unset so the final check never touches an
            # unbound name.
            ip_public = None
            ip_private = None

            # If the instance is in status TERMINATED, then there will be
            # no IP addresses.
            status = response.get('status') if response else None
            if status in ('STOPPING', 'TERMINATED'):
                log.info("node '%s' state is '%s'; no IP address(es)",
                         instance_id, status)
                return [None]

            # Walk the reply defensively: an empty `accessConfigs` list or
            # a missing key must end up as `InstanceError`, not as a bare
            # IndexError/KeyError.
            interfaces = response.get('networkInterfaces') if response else None
            if interfaces:
                primary = interfaces[0]
                access_configs = primary.get('accessConfigs')
                if access_configs:
                    ip_public = access_configs[0].get('natIP')
                    ip_private = primary.get('networkIP')

            if ip_public and ip_private:
                return [ip_public, ip_private]
            else:
                raise InstanceError("could not retrieve the ip address for "
                                    "node `%s`, please check the node "
                                    "through the cloud provider interface" %
                                    instance_id)

        except (HttpError, CloudProviderError) as e:
            raise InstanceError('could not retrieve the ip address of `%s`: '
                                '`%s`' % (instance_id, e))
Пример #10
0
    def _load_instance(self, instance_id, force_reload=True):
        """Checks if an instance with the given id is cached. If not it
        will connect to the cloud and put it into the local cache
        _instances.

        With `force_reload` set, the instance is first fetched again from
        the server; the fresh object replaces the entry in `_instances`
        and any entry with the same id in `_cached_instances`.

        :param str instance_id: instance identifier
        :param bool force_reload: reload instance from server
        :return: py:class:`novaclient.v1_1.servers.Server` - instance
        :raises: `InstanceError` is returned if the instance can't
                 be found in the local cache or in the cloud.
        """
        if force_reload:
            try:
                # Fetch the current state from the server
                vm = self.client.servers.get(instance_id)
                # update cache
                self._instances[instance_id] = vm
                # replace the stale entry in the internal cache, if any;
                # the `break` makes mutating the list while looping safe
                for i in self._cached_instances:
                    if i.id == instance_id:
                        self._cached_instances.remove(i)
                        self._cached_instances.append(vm)
                        break

            except NotFound:
                raise InstanceError("the given instance `%s` was not found "
                                    "on the cloud" % instance_id)
        if instance_id in self._instances:
            return self._instances[instance_id]

        # Instance not in the internal dictionary.
        # First, check the internal cache:
        if not any(i.id == instance_id for i in self._cached_instances):
            # Refresh the cache, just in case
            self._cached_instances = self.client.servers.list()

        for inst in self._cached_instances:
            if inst.id == instance_id:
                self._instances[instance_id] = inst
                return inst

        # If we reached this point, the instance was found neither
        # in the cache nor in the cloud.
        raise InstanceError("the given instance `%s` was not found "
                            "on the cloud" % instance_id)
Пример #11
0
    def stop_instance(self, instance_id):
        """
        Issue a delete request for the instance with the given id and
        validate the reply; failures are re-raised as `InstanceError`.
        """
        compute = self._connect()

        try:
            deletion = compute.instances().delete(
                project=self._project_id,
                instance=instance_id,
                zone=self._zone)
            outcome = self._execute_request(deletion)
            self._check_response(outcome)
        except (HttpError, CloudProviderError) as err:
            details = (instance_id, err)
            raise InstanceError(
                "Could not stop instance `%s`: `%s`" % details)
Пример #12
0
    def stop_instance(self, instance_id):
        """Request deletion of the instance and validate the reply.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance can not be stopped
        """
        gce_client = self._connect()

        try:
            delete_request = gce_client.instances().delete(
                project=self._project_id,
                instance=instance_id,
                zone=self._zone,
            )
            reply = self._execute_request(delete_request)
            self._check_response(reply)
        except (HttpError, CloudProviderError) as failure:
            message = "Could not stop instance `%s`: `%s`" % (
                instance_id, failure)
            raise InstanceError(message)
Пример #13
0
    def list_instances(self, filter=None):
        """List instances on GCE, optionally filtering the results.

        :param str filter: Filter specification; see
            https://developers.google.com/compute/docs/reference/latest/instances/list
            for details.
        :return: list of instances - the ``items`` entry of the reply, or
                 an empty list when the reply is empty or has no ``items``
        :raises: `InstanceError` if the request fails
        """
        gce = self._connect()

        try:
            request = gce.instances().list(
                project=self._project_id, filter=filter, zone=self._zone)
            response = self._execute_request(request)
            self._check_response(response)
        except (HttpError, CloudProviderError) as e:
            # The placeholder is required: without it `% e` raises a
            # TypeError that would mask the real error.
            raise InstanceError("could not retrieve all instances on the "
                                "cloud: `%s`" % e)

        if response and 'items' in response:
            return response['items']
        return []
Пример #14
0
    def start_instance(
            self,
            # these are common to any
            # CloudProvider.start_instance() call
            key_name,
            public_key_path,
            private_key_path,
            security_group,
            flavor,
            image_id,
            image_userdata,
            username=None,
            # these params are specific to the
            # GoogleCloudProvider
            instance_name=None,
            boot_disk_type='pd-standard',
            boot_disk_size=10,
            **kwargs):
        """Starts a new instance with the given properties and returns
        the instance id.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance;
                             either a full ``http(s)://`` URL or a short
                             image name
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :param str instance_name: name of the instance; a random
                                  ``elasticluster-<uuid>`` name is generated
                                  when omitted
        :param str boot_disk_type: disk type name used to build the boot
                                   disk's ``diskType`` URL
        :param boot_disk_size: value sent as the boot disk's ``diskSizeGb``

        :return: str - instance id of the started instance (the instance
                 name)
        :raises: `InstanceError` if the instance could not be created
        """
        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)
        boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                             % (project_url, self._zone, boot_disk_type)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        if image_id.startswith(('http://', 'https://')):
            image_url = image_id
        else:
            # The image names and full resource URLs for several Google-
            # provided images (debian, centos, etc.) follow a consistent
            # pattern, and so elasticluster supports a short-hand of just
            # an image name, such as
            #   "debian-7-wheezy-v20150526".
            # The cloud project in this case is then "debian-cloud".
            #
            # Several images do not follow this convention, and so are
            # special-cased here:
            #   backports-debian -> debian-cloud
            #   ubuntu           -> ubuntu-os-cloud
            #   container-vm     -> google-containers
            if image_id.startswith('container-vm-'):
                os_cloud = 'google-containers'
            elif image_id.startswith('backports-debian-'):
                os_cloud = 'debian-cloud'
            elif image_id.startswith('ubuntu-'):
                os_cloud = 'ubuntu-os-cloud'
            else:
                # named `os_name` so the `os` module name is not shadowed
                os_name = image_id.split("-")[0]
                os_cloud = "%s-cloud" % os_name

            image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

        # construct the request body
        if instance_name is None:
            instance_name = 'elasticluster-%s' % uuid.uuid4()

        # `open` in a `with` block closes the handle deterministically
        # (and, unlike `file()`, also exists on Python 3)
        with open(public_key_path) as public_key_file:
            public_key_content = public_key_file.read()

        instance = {
            'name': instance_name,
            'machineType': machine_type_url,
            'disks': [{
                'autoDelete': 'true',
                'boot': 'true',
                'type': 'PERSISTENT',
                'initializeParams': {
                    'diskName': "%s-disk" % instance_name,
                    'diskType': boot_disk_type_url,
                    'diskSizeGb': boot_disk_size,
                    'sourceImage': image_url
                }
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url
            }],
            'serviceAccounts': [{
                'email': self._email,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": [{
                    "key": "sshKeys",
                    "value": "%s:%s" % (username, public_key_content)
                }]
            }
        }

        # create the instance
        gce = self._connect()
        request = gce.instances().insert(project=self._project_id,
                                         body=instance,
                                         zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_name
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #15
0
    def start_instance(
            self,
            # these are common to any
            # CloudProvider.start_instance() call
            key_name,
            public_key_path,
            private_key_path,
            security_group,
            flavor,
            image_id,
            image_userdata,
            username=None,
            # these params are specific to the
            # GoogleCloudProvider
            instance_name=None,
            **kwargs):
        """Starts a new instance with the given properties and returns
        the instance id.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance;
                             either a full ``http(s)://`` URL or a short
                             image name whose first dash-separated token
                             selects the ``<token>-cloud`` image project
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :param str instance_name: name of the instance; a random
                                  ``elasticluster-<uuid>`` name is generated
                                  when omitted

        :return: str - instance id of the started instance (the instance
                 name)
        :raises: `InstanceError` if the instance could not be created
        """
        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        if image_id.startswith(('http://', 'https://')):
            image_url = image_id
        else:
            # named `os_name` so the `os` module name is not shadowed
            os_name = image_id.split("-")[0]
            os_cloud = "%s-cloud" % os_name
            image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

        # construct the request body
        if instance_name is None:
            instance_name = 'elasticluster-%s' % uuid.uuid4()

        # `open` in a `with` block closes the handle deterministically
        # (and, unlike `file()`, also exists on Python 3)
        with open(public_key_path) as public_key_file:
            public_key_content = public_key_file.read()

        instance = {
            'name': instance_name,
            'machineType': machine_type_url,
            'disks': [{
                'autoDelete': 'true',
                'boot': 'true',
                'type': 'PERSISTENT',
                'initializeParams': {
                    'diskName': "%s-disk" % instance_name,
                    'sourceImage': image_url
                }
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url
            }],
            'serviceAccounts': [{
                'email': self._email,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": [{
                    "key": "sshKeys",
                    "value": "%s:%s" % (username, public_key_content)
                }]
            }
        }

        # create the instance
        gce = self._connect()
        request = gce.instances().insert(project=self._project_id,
                                         body=instance,
                                         zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_name
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #16
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       cluster_name,
                       username=None,
                       node_name=None,
                       **options):
        """
        Assemble VM template attributes from the arguments and create the VM.

        The attributes returned by `_parse_flavor` are completed with
        ``NAME``, ``OS``, ``DISK``, ``NIC`` and ``CONTEXT`` entries.  The VM
        is then created while holding `_api_lock`: through
        ``template.instantiate`` when the flavor yields a template id,
        through ``vm.allocate`` otherwise.

        :param key_name: not used by the code in this method
        :param str public_key_path: if truthy, the file content becomes the
                                    ``SSH_PUBLIC_KEY`` context entry
        :param private_key_path: not used by the code in this method
        :param str security_group: added to every NIC unless falsy or
                                   ``'default'``
        :param str flavor: passed to `_parse_flavor`
        :param image_id: numeric image id, or image name, optionally
                         prefixed as ``owner/name``
        :param str image_userdata: if truthy, becomes the ``START_SCRIPT``
                                   context entry
        :param cluster_name: not used by the code in this method
        :param str username: if truthy, becomes the ``USERNAME`` context
                             entry
        :param str node_name: if truthy, used as VM name and hostname
        :param options: ``network_ids`` (comma-separated string) is consumed
                        here; each item is a numeric id or a network name,
                        optionally prefixed as ``owner/name``

        :return: dict - ``{'instance_id': <id of the created VM>}``
        :raises: `InstanceError` on `pyone.OneException`
        """
        def _owner_and_name(qualified):
            # "owner/name" -> (owner, name); a bare name is owned by us
            if '/' in qualified:
                owner, name = qualified.split('/')
                return owner, name
            return self._username, qualified

        template_id, attributes = self._parse_flavor(flavor)

        if node_name:
            # this only sets the VM name for display purposes
            attributes['NAME'] = node_name

        # boot disk
        os_section = attributes.setdefault('OS', {})
        os_section.setdefault('BOOT', '')  # FIXME: should this be 'disk0'?

        boot_disk = attributes.setdefault('DISK', {})
        try:
            # `image_id` is numeric
            image_id = int(image_id)
            boot_disk['IMAGE_ID'] = image_id
        except (TypeError, ValueError):
            # `image_id` is the disk image name
            img_username, img_id = _owner_and_name(image_id)
            boot_disk['IMAGE'] = img_id
            boot_disk['IMAGE_UNAME'] = img_username

        # not attempting to merge flavor attributes into the `NIC`
        # part: network configuration should be part of either the ONE
        # template, or the ElastiCluster configuration
        nics = attributes['NIC'] = []
        requested_networks = options.pop('network_ids', '')
        network_ids = [
            chunk.strip()
            for chunk in requested_networks.split(',')
            if chunk.strip()
        ]
        for netid in network_ids:
            try:
                # numeric ID?
                nic = {'NETWORK_ID': int(netid)}
            except (TypeError, ValueError):
                net_username, net_id = _owner_and_name(netid)
                nic = {
                    'NETWORK': net_id,
                    'NETWORK_UNAME': net_username,
                }
            if security_group and security_group != 'default':
                nic['SECURITY_GROUP'] = security_group
            nics.append(nic)

        context = attributes.setdefault('CONTEXT', {})
        # this is needed to enable networking; having the `NIC`
        # lines in template seems not to be enough in ONE 5.6.1
        context['NETWORK'] = 'YES'
        if node_name:
            context['SET_HOSTNAME'] = node_name
        if username:
            context['USERNAME'] = username
        if public_key_path:
            with open(public_key_path) as pubkey:
                context['SSH_PUBLIC_KEY'] = pubkey.read()
        if image_userdata:
            # FIXME: should be base64-encoded and use `START_SCRIPT_BASE64`
            context['START_SCRIPT'] = image_userdata

        # create VM
        with self._api_lock:
            try:
                if template_id is None:
                    vm_id = self.server.vm.allocate(
                        self._make_template_str(attributes), False)
                else:
                    vm_id = self.server.template.instantiate(
                        template_id, (node_name or ''), False,
                        self._make_template_str(attributes))
                return {'instance_id': vm_id}
            except pyone.OneException as err:
                raise InstanceError("Error creating node `{0}`: {1}".format(
                    node_name, err))
Пример #17
0
    def start_instance(
            self,
            # these are common to any
            # CloudProvider.start_instance() call
            key_name,
            public_key_path,
            private_key_path,
            security_group,
            flavor,
            image_id,
            image_userdata,
            username=None,
            # these params are specific to the
            # GoogleCloudProvider
            node_name=None,
            boot_disk_type='pd-standard',
            boot_disk_size=10,
            tags=None,
            scheduling=None,
            accelerator_count=0,
            accelerator_type='default',
            allow_project_ssh_keys=True,
            min_cpu_platform=None,
            **kwargs):
        """
        Start a new instance with the given properties and return
        the instance id.

        ``key_name``, ``private_key_path`` and ``security_group`` belong
        to the common ``start_instance()`` signature; the body of this
        method does not reference them.  Extra keyword arguments are
        accepted and ignored.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key; the file
          content is stored in the ``ssh-keys`` metadata item
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image to use for the instance: either a
          full ``http(s)://`` URL (used as-is) or a short image name,
          from which the hosting image project is derived
        :param str image_userdata: command to execute after startup
          (sent as the ``startup-script`` metadata item when non-empty)
        :param str username: username for the given ssh key, default None
        :param str node_name: name of the instance; it is lower-cased
          and ``_`` is replaced by ``-``.  When empty, a name of the form
          ``elasticluster-<uuid4>`` is generated.
        :param str boot_disk_type: disk type name of the boot disk
        :param int boot_disk_size: boot disk size in GB (converted
          with ``int()``)
        :param str|Sequence tags: "Tags" to label the instance.
          Can be either a single string (individual tags are comma-separated),
          or a sequence of strings (each string being a single tag).
        :param str scheduling: scheduling option to use for the instance ("preemptible")
        :param int accelerator_count: Number of accelerators (e.g., GPUs) to make available in instance
        :param str accelerator_type: Type of accelerator to request.
          Only used if ``accelerator_count`` is > 0.  Can be one of:

          * Full URL specifying an accelerator type valid for the zone and project VMs are being created in.  For example, ``https://www.googleapis.com/compute/v1/projects/[PROJECT_ID]/zones/[ZONE]/acceleratorTypes/[ACCELERATOR_TYPE]``
          * An accelerator type name (any string which is not a valid URL).  This is internally prefixed with the string ``https://www.googleapis.com/compute/v1/projects/[PROJECT_ID]/zones/[ZONE]/acceleratorTypes/`` to form a full URL.
        :param bool allow_project_ssh_keys:
          When ``True`` (default), SSH login is allowed to a node
          using any of the project-wide SSH keys (if they are
          defined).  When ``False``, only the SSH key specified by
          ElastiCluster config's ``[login/*]`` section will be allowed
          to log in (instance-level key).
        :param str min_cpu_platform: require CPUs of this type or better (e.g., "Intel Skylake")

        :return: str - instance id of the started instance
        :raises InstanceError: if `scheduling` has an unknown value, or
          if the instance creation request fails
        :raises TypeError: if `tags` is neither a string, a sequence,
          nor ``None``
        """
        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)
        boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                           % (project_url, self._zone, boot_disk_type)
        # FIXME: `conf.py` should ensure that `boot_disk_size` has the right
        # type, so there would be no need to convert here
        boot_disk_size_gb = int(boot_disk_size)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        if image_id.startswith(('http://', 'https://')):
            image_url = image_id
        else:
            # The image names and full resource URLs for several Google-
            # provided images (debian, centos, etc.) follow a consistent
            # pattern, and so elasticluster supports a short-hand of just
            # an image name, such as
            #   "debian-7-wheezy-v20150526".
            # The cloud project in this case is then "debian-cloud".
            #
            # Several images do not follow this convention, and so are
            # special-cased here:
            #   backports-debian -> debian-cloud
            #   ubuntu           -> ubuntu-os-cloud
            #   container-vm     -> google-containers
            if image_id.startswith('container-vm-'):
                os_cloud = 'google-containers'
            elif image_id.startswith('backports-debian-'):
                os_cloud = 'debian-cloud'
            elif image_id.startswith('ubuntu-'):
                os_cloud = 'ubuntu-os-cloud'
            else:
                # named `os_family` (not `os`) to avoid shadowing the
                # standard library module name
                os_family = image_id.split("-")[0]
                os_cloud = "%s-cloud" % os_family

            image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

        scheduling_option = {}
        if scheduling == 'preemptible':
            scheduling_option['preemptible'] = True
        elif scheduling is not None:
            raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

        if isinstance(tags, types.StringTypes):
            tags = tags.split(',')
        elif isinstance(tags, collections.Sequence):
            # ok, nothing to do
            pass
        elif tags is not None:
            raise TypeError(
                "The `tags` argument to `gce.start_instance`"
                " should be a string or a list, got {T} instead".format(
                    T=type(tags)))

        with open(public_key_path, 'r') as f:
            public_key_content = f.read()

        compute_metadata = [
            {
                "key": "ssh-keys",
                "value": "%s:%s" % (username, public_key_content),
            },
            {
                "key": "block-project-ssh-keys",
                "value": (not allow_project_ssh_keys),
            },
        ]
        if image_userdata:
            compute_metadata.append({
                "key": "startup-script",
                "value": image_userdata,
            })

        # construct the request body
        if node_name:
            instance_id = node_name.lower().replace(
                '_', '-')  # GCE doesn't allow "_"
        else:
            instance_id = 'elasticluster-%s' % uuid.uuid4()

        instance = {
            'name': instance_id,
            'machineType': machine_type_url,
            'tags': {
                'items': tags,
            },
            'scheduling': scheduling_option,
            'disks': [{
                'autoDelete': 'true',
                'boot': 'true',
                'type': 'PERSISTENT',
                'initializeParams': {
                    'diskName': "%s-disk" % instance_id,
                    'diskType': boot_disk_type_url,
                    'diskSizeGb': boot_disk_size_gb,
                    'sourceImage': image_url
                }
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url
            }],
            'serviceAccounts': [{
                'email': self._email,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": compute_metadata,
            }
        }

        if min_cpu_platform is not None:
            instance['minCpuPlatform'] = min_cpu_platform

        # add accelerators/GPUs if requested
        if accelerator_count > 0:
            if accelerator_type.startswith(('https://', 'http://')):
                # use URL as-is
                accelerator_type_url = accelerator_type
            else:
                accelerator_type_url = (
                    'https://www.googleapis.com/compute/{api_version}/'
                    'projects/{project_id}/zones/{zone}/'
                    'acceleratorTypes/{accelerator_type}'.format(
                        api_version=GCE_API_VERSION,
                        project_id=self._project_id,
                        zone=self._zone,
                        accelerator_type=accelerator_type))
            log.debug(
                "VM instance `%s`:"
                " Requesting %d accelerator%s of type '%s'", instance_id,
                accelerator_count, ('s' if accelerator_count > 1 else ''),
                accelerator_type_url)
            instance['guestAccelerators'] = [{
                'acceleratorCount': accelerator_count,
                'acceleratorType': accelerator_type_url,
            }]
            # no live migration with GPUs,
            # see: https://cloud.google.com/compute/docs/gpus#restrictions
            instance['scheduling']['onHostMaintenance'] = 'TERMINATE'
            # Preemptible VMs cannot be automatically restarted, so only
            # ask for automatic restart when `preemptible` was not requested
            # (otherwise the two scheduling settings contradict each other).
            instance['scheduling']['automaticRestart'] = (
                not scheduling_option.get('preemptible', False))

        # create the instance
        gce = self._connect()
        request = gce.instances().insert(project=self._project_id,
                                         body=instance,
                                         zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_id
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #18
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       node_name=None,
                       network_ids=None,
                       price=None,
                       timeout=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties (as a spot instance
          request when a price is set, as a regular instance otherwise)

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
        :param str node_name: value of the ``Name`` tag set on the instance
        :param str network_ids: comma-separated list of subnets; one
          network interface is requested for each of them
        :param float price: Spot instance price (if 0, do not use spot
                            instances).  Defaults to ``self.price``.
        :param int timeout: Timeout (in seconds) waiting for spot instances;
                            only used if price > 0.  Defaults to
                            ``self.timeout``.

        :return: str - instance id of the started instance
        :raises ClusterError: if starting the instance fails with an
          error whose text mentions ``TooManyInstances``
        :raises InstanceError: if starting the instance fails for any
          other reason (including a spot request time-out)
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        # the `_check_keypair` method has to be called within a lock,
        # since it will upload the key if it does not exist and if this
        # happens for every node at the same time ec2 will throw an error
        # message (see issue #79)
        with BotoCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)

        log.debug("Checking security group `%s`.", security_group)
        security_group_id = self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        if network_ids:
            interfaces = []
            for subnet in network_ids.split(','):
                subnet_id = self._check_subnet(subnet)

                interfaces.append(
                    boto.ec2.networkinterface.NetworkInterfaceSpecification(
                        subnet_id=subnet_id,
                        groups=[security_group_id],
                        associate_public_ip_address=self.request_floating_ip))
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                *interfaces)

            # security groups are attached to the interfaces in this case
            security_groups = []
        else:
            interfaces = None
            security_groups = [security_group]

        # get defaults for `price` and `timeout` from class instance
        if price is None:
            price = self.price
        if timeout is None:
            timeout = self.timeout

        try:
            # start spot instance if bid is specified
            if price:
                log.info("Requesting spot instance with price `%s` ...", price)
                request = connection.request_spot_instances(
                    price,
                    image_id,
                    key_name=key_name,
                    security_groups=security_groups,
                    instance_type=flavor,
                    user_data=image_userdata,
                    network_interfaces=interfaces,
                    instance_profile_name=self._instance_profile)[-1]

                # wait until spot request is fulfilled (will wait
                # forever if no timeout is given)
                start_time = time.time()
                timeout = (float(timeout) if timeout else 0)
                log.info(
                    "Waiting for spot instance (will time out in %d seconds) ...",
                    timeout)
                while request.status.code != 'fulfilled':
                    if timeout and time.time() - start_time > timeout:
                        request.cancel()
                        raise RuntimeError('spot instance timed out')
                    time.sleep(self.POLL_INTERVAL)
                    # update request status
                    request = connection.get_all_spot_instance_requests(
                        request_ids=request.id)[-1]
            else:
                reservation = connection.run_instances(
                    image_id,
                    key_name=key_name,
                    security_groups=security_groups,
                    instance_type=flavor,
                    user_data=image_userdata,
                    network_interfaces=interfaces,
                    instance_profile_name=self._instance_profile)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            # Look for the marker in the error *text*: a membership test
            # on the exception object itself does not search its message.
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
        if price:
            vm = connection.get_only_instances(
                instance_ids=[request.instance_id])[-1]
        else:
            vm = reservation.instances[-1]
        vm.add_tag("Name", node_name)

        # cache instance object locally for faster access later on
        self._instances[vm.id] = vm

        return vm.id
Пример #19
0
    def start_instance(self,
                       # these are common to any
                       # CloudProvider.start_instance() call
                       key_name, public_key_path, private_key_path,
                       security_group, flavor, image_id, image_userdata,
                       username=None,
                       # these params are specific to the
                       # GoogleCloudProvider
                       node_name=None,
                       boot_disk_type='pd-standard',
                       boot_disk_size=10,
                       tags=None,
                       scheduling=None,
                       **kwargs):
        """Starts a new instance with the given properties and returns
        the instance id.

        ``key_name``, ``private_key_path``, ``security_group`` and
        ``image_userdata`` belong to the common ``start_instance()``
        signature; the body of this method does not reference them.
        Extra keyword arguments are accepted and ignored.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key; the file
          content is stored in the ``sshKeys`` metadata item
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image to use for the instance: either a
          full ``http(s)://`` URL (used as-is) or a short image name,
          from which the hosting image project is derived
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
        :param str node_name: name of the instance; it is lower-cased
          and ``_`` is replaced by ``-``.  When empty, a name of the form
          ``elasticluster-<uuid4>`` is generated.
        :param str boot_disk_type: disk type name of the boot disk
        :param int boot_disk_size: boot disk size in GB (converted
          with ``int()``)
        :param str|Sequence tags: "Tags" to label the instance.
          Can be either a single string (individual tags are comma-separated),
          or a sequence of strings (each string being a single tag).
        :param str scheduling: scheduling option to use for the instance ("preemptible")

        :return: str - instance id of the started instance
        :raises InstanceError: if `scheduling` has an unknown value, or
          if the instance creation request fails
        :raises TypeError: if `tags` is neither a string, a sequence,
          nor ``None``
        """
        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)
        boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                           % (project_url, self._zone, boot_disk_type)
        # the size may arrive as a string (e.g. from a configuration
        # file), so normalize it to an integer number of GB
        boot_disk_size_gb = int(boot_disk_size)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        if image_id.startswith(('http://', 'https://')):
            image_url = image_id
        else:
            # The image names and full resource URLs for several Google-
            # provided images (debian, centos, etc.) follow a consistent
            # pattern, and so elasticluster supports a short-hand of just
            # an image name, such as
            #   "debian-7-wheezy-v20150526".
            # The cloud project in this case is then "debian-cloud".
            #
            # Several images do not follow this convention, and so are
            # special-cased here:
            #   backports-debian -> debian-cloud
            #   ubuntu           -> ubuntu-os-cloud
            #   container-vm     -> google-containers
            if image_id.startswith('container-vm-'):
                os_cloud = 'google-containers'
            elif image_id.startswith('backports-debian-'):
                os_cloud = 'debian-cloud'
            elif image_id.startswith('ubuntu-'):
                os_cloud = 'ubuntu-os-cloud'
            else:
                # named `os_family` (not `os`) to avoid shadowing the
                # standard library module name
                os_family = image_id.split("-")[0]
                os_cloud = "%s-cloud" % os_family

            image_url = '%s%s/global/images/%s' % (
                GCE_URL, os_cloud, image_id)

        if scheduling is None:
            # use GCE's default
            scheduling_option = {}
        elif scheduling == 'preemptible':
            scheduling_option = {
                'preemptible': True
            }
        else:
            raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

        if isinstance(tags, types.StringTypes):
            tags = tags.split(',')
        elif isinstance(tags, collections.Sequence):
            # ok, nothing to do
            pass
        elif tags is not None:
            raise TypeError(
                "The `tags` argument to `gce.start_instance`"
                " should be a string or a list, got {T} instead"
                .format(T=type(tags)))

        # construct the request body
        if node_name:
            instance_id = node_name.lower().replace('_', '-')  # GCE doesn't allow "_"
        else:
            instance_id = 'elasticluster-%s' % uuid.uuid4()

        # read the key inside a `with` block so the file handle is
        # closed deterministically
        with open(public_key_path, 'r') as key_file:
            public_key_content = key_file.read()

        instance = {
            'name': instance_id,
            'machineType': machine_type_url,
            'tags': {
                'items': tags,
            },
            'scheduling': scheduling_option,
            'disks': [{
                'autoDelete': 'true',
                'boot': 'true',
                'type': 'PERSISTENT',
                'initializeParams': {
                    'diskName': "%s-disk" % instance_id,
                    'diskType': boot_disk_type_url,
                    'diskSizeGb': boot_disk_size_gb,
                    'sourceImage': image_url
                }
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url
            }],
            'serviceAccounts': [{
                'email': self._email,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": [
                    {
                        "key": "sshKeys",
                        "value": "%s:%s" % (username, public_key_content)
                    }
                ]
            }
        }

        # create the instance
        gce = self._connect()
        request = gce.instances().insert(
            project=self._project_id, body=instance, zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_id
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #20
0
    def start_instance(
            self,
            # these are common to any
            # CloudProvider.start_instance() call
            key_name,
            public_key_path,
            private_key_path,
            security_group,
            flavor,
            image_id,
            image_userdata,
            username=None,
            # these params are specific to the
            # GoogleCloudProvider
            node_name=None,
            boot_disk_type='pd-standard',
            boot_disk_size=10,
            tags=None,
            scheduling=None,
            **kwargs):
        """Starts a new instance with the given properties and returns
        the instance id.

        ``key_name``, ``private_key_path``, ``security_group`` and
        ``image_userdata`` belong to the common ``start_instance()``
        signature; the body of this method does not reference them.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key; the file
          content is stored in the ``sshKeys`` metadata item
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image to use for the instance: either a
          full ``http(s)://`` URL (used as-is) or a short image name,
          from which the hosting image project is derived
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
        :param str node_name: name of the instance; it is lower-cased
          and ``_`` is replaced by ``-``.  When empty, a name of the form
          ``elasticluster-<uuid4>`` is generated.
        :param str boot_disk_type: disk type name of the boot disk
        :param int boot_disk_size: boot disk size in GB (converted
          with ``int()``)
        :param str|Sequence tags: "Tags" to label the instance.
          Can be either a single string (individual tags are comma-separated),
          or a sequence of strings (each string being a single tag).
        :param str scheduling: scheduling option to use for the instance ("preemptible")

        The following optional keyword arguments are recognized; any
        other keyword argument is ignored:

        * ``accelerator``: accelerator type name; when given, the
          instance is requested with guest accelerators of that type
          and ``accelerator_count`` is required.
        * ``accelerator_count``: number of accelerators to request.
        * ``accelerator_script``: name of a built-in start-up script
          installing accelerator drivers.  Only ``centos7-cuda8`` is
          recognized; other values are ignored.

        :return: str - instance id of the started instance
        :raises InstanceError: if `scheduling` has an unknown value, if
          ``accelerator`` is given without ``accelerator_count``, or if
          the instance creation request fails
        :raises TypeError: if `tags` is neither a string, a sequence,
          nor ``None``
        """

        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)

        boot_disk_type_url = '%s/zones/%s/diskTypes/%s' \
                           % (project_url, self._zone, boot_disk_type)

        # Optional accelerator settings are looked up explicitly (rather
        # than detected through `locals()`), so that an incomplete
        # configuration is reported clearly instead of surfacing later
        # as an unbound local variable.
        accelerator = kwargs.get('accelerator')
        accelerator_count = kwargs.get('accelerator_count')
        accelerator_url = None
        if accelerator is not None:
            if accelerator_count is None:
                raise InstanceError(
                    "Accelerator type `%s` was requested"
                    " but no `accelerator_count` was given" % accelerator)
            accelerator_url = '%s/zones/%s/acceleratorTypes/%s' % (
                project_url, self._zone, accelerator)
            log.debug("accelerator_url is %s", accelerator_url)
            log.debug("accelerator_count is %s", accelerator_count)

        accelerator_script = None
        if kwargs.get('accelerator_script') == 'centos7-cuda8':
            accelerator_script = (
                "#!/bin/bash\n"
                "if ! rpm -q  cuda; then\n"
                "  curl -O http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-repo-rhel7-8.0.61-1.x86_64.rpm\n"
                "  rpm -i --force ./cuda-repo-rhel7-8.0.61-1.x86_64.rpm\n"
                "  yum clean all\n"
                "  yum install epel-release -y\n"
                "  yum update -y\n"
                "  yum install cuda -y\n"
                "fi\n"
                "sleep 30\n"
                "sudo modprobe nvidia\n")

        # FIXME: `conf.py` should ensure that `boot_disk_size` has the right
        # type, so there would be no need to convert here
        boot_disk_size_gb = int(boot_disk_size)
        subnetwork_url = '%s/regions/%s/subnetworks/%s' % (
            project_url, self._region, self._subnetwork)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        if image_id.startswith(('http://', 'https://')):
            image_url = image_id
        else:
            # The image names and full resource URLs for several Google-
            # provided images (debian, centos, etc.) follow a consistent
            # pattern, and so elasticluster supports a short-hand of just
            # an image name, such as
            #   "debian-7-wheezy-v20150526".
            # The cloud project in this case is then "debian-cloud".
            #
            # Several images do not follow this convention, and so are
            # special-cased here:
            #   backports-debian -> debian-cloud
            #   ubuntu           -> ubuntu-os-cloud
            #   container-vm     -> google-containers
            if image_id.startswith('container-vm-'):
                os_cloud = 'google-containers'
            elif image_id.startswith('backports-debian-'):
                os_cloud = 'debian-cloud'
            elif image_id.startswith('ubuntu-'):
                os_cloud = 'ubuntu-os-cloud'
            else:
                # named `os_family` (not `os`) to avoid shadowing the
                # standard library module name
                os_family = image_id.split("-")[0]
                os_cloud = "%s-cloud" % os_family

            image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

        if scheduling is None:
            # use GCE's default
            scheduling_option = {}
        elif scheduling == 'preemptible':
            scheduling_option = {'preemptible': True}
        else:
            raise InstanceError("Unknown scheduling option: '%s'" % scheduling)

        if isinstance(tags, types.StringTypes):
            tags = tags.split(',')
        elif isinstance(tags, collections.Sequence):
            # ok, nothing to do
            pass
        elif tags is not None:
            raise TypeError(
                "The `tags` argument to `gce.start_instance`"
                " should be a string or a list, got {T} instead".format(
                    T=type(tags)))

        # construct the request body
        if node_name:
            instance_id = node_name.lower().replace(
                '_', '-')  # GCE doesn't allow "_"
        else:
            instance_id = 'elasticluster-%s' % uuid.uuid4()

        with open(public_key_path, 'r') as f:
            public_key_content = f.read()

        instance = {
            'name': instance_id,
            'machineType': machine_type_url,
            'tags': {
                'items': tags,
            },
            'scheduling': scheduling_option,
            'disks': [{
                'autoDelete': 'true',
                'boot': 'true',
                'type': 'PERSISTENT',
                'initializeParams': {
                    'diskName': "%s-disk" % instance_id,
                    'diskType': boot_disk_type_url,
                    'diskSizeGb': boot_disk_size_gb,
                    'sourceImage': image_url
                }
            }],
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url,
                'subnetwork': subnetwork_url
            }],
            'serviceAccounts': [{
                'email': self._email,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": [{
                    "key": "sshKeys",
                    "value": "%s:%s" % (username, public_key_content)
                }]
            }
        }

        if accelerator_url is not None:
            instance["guestAccelerators"] = [{
                "acceleratorCount": accelerator_count,
                "acceleratorType": accelerator_url,
            }]
            # Extend the scheduling options chosen above instead of
            # replacing them, so a requested `preemptible` is kept.
            # Instances with accelerators must terminate on host
            # maintenance; automatic restart is only requested for
            # non-preemptible instances, since the two settings
            # contradict each other.
            instance["scheduling"]["onHostMaintenance"] = "TERMINATE"
            instance["scheduling"]["automaticRestart"] = (
                not scheduling_option.get('preemptible', False))

        if accelerator_script is not None:
            instance["metadata"]["items"].append({
                "key": "startup-script",
                "value": accelerator_script
            })
            log.debug("Accelerator setup:\n%s", accelerator_script)

        # create the instance
        gce = self._connect()
        log.debug("Instance creation request body: %s", instance)
        request = gce.instances().insert(project=self._project_id,
                                         body=instance,
                                         zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_id
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #21
0
    def start_instance(
        self,
        # these are common to any
        # CloudProvider.start_instance() call
        key_name,
        public_key_path,
        private_key_path,
        security_group,
        flavor,
        image_id,
        image_userdata,
        username=None,
        # these params are specific to the
        # GoogleCloudProvider
        instance_name=None):
        """
        Starts a new instance with the given properties and returns
        the instance id.

        ``key_name``, ``private_key_path``, ``security_group`` and
        ``image_userdata`` belong to the common ``start_instance()``
        signature; the body of this method does not reference them.

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key; the file
          content is stored in the ``sshKeys`` metadata item
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition
        :param str flavor: machine type to use for the instance
        :param str image_id: image name; the text before the first
          ``-`` selects the ``<name>-cloud`` image project
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
        :param str instance_name: name of the instance; when ``None``,
          a name of the form ``elasticluster-<uuid4>`` is generated

        :return: str - name of the started instance
        :raises InstanceError: if the instance creation request fails
        """
        # construct URLs
        project_url = '%s%s' % (GCE_URL, self._project_id)
        machine_type_url = '%s/zones/%s/machineTypes/%s' \
                           % (project_url, self._zone, flavor)
        network_url = '%s/global/networks/%s' % (project_url, self._network)
        # named `os_family` (not `os`) to avoid shadowing the standard
        # library module name
        os_family = image_id.split("-")[0]
        os_cloud = "%s-cloud" % os_family
        image_url = '%s%s/global/images/%s' % (GCE_URL, os_cloud, image_id)

        # construct the request body
        if instance_name is None:
            # TODO: it would be nice to have a way to name this
            # <clustername>-<nodetype>-NNN, e.g.,
            # "mycluster-compute-001", but we take an easy path to
            # uniqueness for now.
            instance_name = 'elasticluster-%s' % uuid.uuid4()

        # read the key inside a `with` block so the file handle is
        # closed deterministically
        with open(public_key_path, 'r') as key_file:
            public_key_content = key_file.read()

        instance = {
            'name': instance_name,
            'machineType': machine_type_url,
            'image': image_url,
            'networkInterfaces': [{
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT',
                    'name': 'External NAT'
                }],
                'network': network_url
            }],
            'serviceAccounts': [{
                'email': GCE_DEFAULT_SERVICE_EMAIL,
                'scopes': GCE_DEFAULT_SCOPES
            }],
            "metadata": {
                "kind": "compute#metadata",
                "items": [{
                    "key": "sshKeys",
                    "value": "%s:%s" % (username, public_key_content)
                }]
            }
        }

        # create the instance
        gce = self._connect()
        request = gce.instances().insert(project=self._project_id,
                                         body=instance,
                                         zone=self._zone)
        try:
            response = self._execute_request(request)
            response = self._wait_until_done(response)
            self._check_response(response)
            return instance_name
        except (HttpError, CloudProviderError) as e:
            log.error("Error creating instance `%s`", e)
            raise InstanceError("Error creating instance `%s`" % e)
Пример #22
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       node_name=None,
                       network_ids=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * if `network_ids` is given, build one network interface per
          listed subnet
        * run the instance with the given properties
        * tag the instance with its node name and cache it locally

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
                             (not used by this method)
        :param str node_name: value stored in the instance's `Name` tag
        :param str network_ids: comma-separated list of subnets; when
                                given, the instance is started with one
                                network interface per subnet instead of a
                                plain security group list
        :param kwargs: extra keyword arguments; accepted and ignored

        :return: str - instance id of the started instance

        :raises: `ClusterError` if starting the instance fails with an
                 error mentioning `TooManyInstances`
        :raises: `InstanceError` if starting the instance fails for any
                 other reason
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        # the `_check_keypair` method has to be called within a lock,
        # since it will upload the key if it does not exist and if this
        # happens for every node at the same time ec2 will throw an error
        # message (see issue #79)
        with BotoCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)

        log.debug("Checking security group `%s`.", security_group)
        security_group_id = self._check_security_group(security_group)
        # NOTE: `image_id` is handed to `run_instances` unchanged; the
        # lookup through `_find_image_id` is intentionally not performed.

        if network_ids:
            interfaces = []
            for subnet in network_ids.split(','):
                subnet_id = self._check_subnet(subnet)

                interfaces.append(
                    ec2.networkinterface.NetworkInterfaceSpecification(
                        subnet_id=subnet_id,
                        groups=[security_group_id],
                        associate_public_ip_address=self.request_floating_ip))
            interfaces = ec2.networkinterface.NetworkInterfaceCollection(
                *interfaces)

            # the security group is already attached to every interface
            # above, so no instance-level groups are passed
            security_groups = []
        else:
            interfaces = None
            security_groups = [security_group]

        try:
            reservation = connection.run_instances(
                image_id,
                key_name=key_name,
                security_groups=security_groups,
                instance_type=flavor,
                user_data=image_userdata,
                network_interfaces=interfaces)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            # An exception object cannot be searched with `in` directly
            # (it is not a string and, on Python 3, not even iterable),
            # so look for the marker in its rendered message instead.
            if "TooManyInstances" in str(ex):
                raise ClusterError(ex)
            raise InstanceError(ex)

        vm = reservation.instances[-1]
        vm.add_tag("Name", node_name)

        # cache instance object locally for faster access later on
        self._instances[vm.id] = vm

        return vm.id