Example #1
File: gce.py Project: dvischi/TissueMAPS
    def stop_instance(self, instance_id):
        """Stops the instance gracefully.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            request = gce.instances().delete(project=self._project_id,
                                        instance=instance_id, zone=self._zone)
            response = self._execute_request(request)
            self._check_response(response)
        except HttpError as e:
            # If the instance does not exist, we get a 404
            if e.resp.status == 404:
                raise InstanceNotFoundError(
                    "Instance `{instance_id}` was not found"
                    .format(instance_id=instance_id))
            else:
                raise InstanceError(
                    "Could not stop instance `{instance_id}`: `{e}`"
                    .format(instance_id=instance_id, e=e))
        except CloudProviderError as e:
            raise InstanceError(
                "Could not stop instance `{instance_id}`: `{e}`"
                .format(instance_id=instance_id, e=e))
Example #2
        def start_node(node_queue):
            try:
                while not node_queue.empty():
                    if not self.keep_running:
                        log.error("Aborting execution upon CTRL-C")
                        break
                    node = node_queue.get()
                    # TODO: the following check is not optimal yet. When a
                    # node is still in a starting state,
                    # it will start another node here,
                    # since the `is_alive` method will only check for
                    # running nodes (see issue #13)
                    if node.is_alive():
                        log.info("Not starting node %s which is "
                                 "already up&running.", node.name)
                    else:
                        log.info("starting node...")
                        try:
                            node.start()
                        except (InstanceError, SecurityGroupError,
                                KeypairError, ImageError) as e:
                            log.error("could not start node `%s` for reason "
                                      "`%s`" % (node.name, e))

            except Empty:
                # nothing to do if the queue turns out to be empty - the
                # nodes are then already started.
                pass
Example #3
    def _start_node(node):
        """Static method to start a specific node on a cloud

        :return: bool -- True on success, False otherwise
        """
        log.debug("_start_node: working on node %s" % node.name)
        # TODO: the following check is not optimal yet. When a
        # node is still in a starting state,
        # it will start another node here,
        # since the `is_alive` method will only check for
        # running nodes (see issue #13)
        if node.is_alive():
            log.info("Not starting node %s which is "
                     "already up&running.", node.name)
            return True
        else:
            try:
                node.start()
                log.info("_start_node: node has been started")
                return True
            except Exception as e:
                log.error("could not start node `%s` for reason "
                          "`%s`" % (node.name, e))
                return False
Example #4
    def pause_instance(self, instance_id):
        """Pauses the instance, retaining disk and config.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be paused

        :return: dict - information needed to restart instance.
        """

        if not instance_id:
            log.info("Instance to pause has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().stop(project=self._project_id,
                                           instance=instance_id,
                                           zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return {"instance_id": instance_id}
        except HttpError as e:
            log.error("Error stopping instance: `%s", e)
            raise InstanceError("Error stopping instance `%s`", e)
Example #5
    def resume_instance(self, paused_info):
        """Restarts a paused instance, retaining disk and config.

        :param dict paused_info: data returned by `pause_instance`
        :raises: `InstanceError` if instance cannot be resumed.
        """

        if not paused_info.get("instance_id"):
            log.info("Instance to stop has no instance id.")
            return

        gce = self._connect()

        try:
            request = gce.instances().start(project=self._project_id,
                                            instance=paused_info["instance_id"],
                                            zone=self._zone)
            operation = self._execute_request(request)
            response = self._wait_until_done(operation)
            self._check_response(response)
            return
        except HttpError as e:
            log.error("Error restarting instance: `%s", e)
            raise InstanceError("Error restarting instance `%s`", e)
Example #6
    def _stop_all_nodes(self, wait=False):
        """
        Terminate all cluster nodes. Return number of failures.
        """
        failed = 0
        for node in self.get_all_nodes():
            if not node.instance_id:
                log.warning(
                    "Node `%s` has no instance ID."
                    " Assuming it did not start correctly,"
                    " so removing it anyway from the cluster.", node.name)
                self.nodes[node.kind].remove(node)
                continue
            # Try to stop the node; when `wait` is true, `node.stop()` polls
            # until the instance has actually been terminated.
            try:
                node.stop(wait)

                self.nodes[node.kind].remove(node)
                log.debug(
                    "Removed node `%s` from cluster `%s`", node.name, self.name)
            except InstanceNotFoundError as err:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.",
                    node.name, node.instance_id)
            except Exception as err:
                failed += 1
                log.error(
                    "Could not stop node `%s` (instance ID `%s`): %s %s",
                    node.name, node.instance_id, err, err.__class__)
        return failed
Example #7
    def stop_instance(self, instance_id):
        """Stops the instance gracefully.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            request = gce.instances().delete(project=self._project_id,
                                        instance=instance_id, zone=self._zone)
            response = self._execute_request(request)
            self._check_response(response)
        except HttpError as e:
            # If the instance does not exist, we get a 404 - just log it, and
            # return without exception so the caller can remove the reference.
            if e.resp.status == 404:
                log.warning("Instance to stop `%s` was not found", instance_id)
            else:
                raise InstanceError("Could not stop instance `%s`: `%s`"
                                    % (instance_id, e))
        except CloudProviderError as e:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, e))
Example #8
 def __init_keystone_session_v2(self, check=False):
     """Create and return a session object using Keystone API v2."""
     from keystoneauth1 import loading as keystone_v2
     loader = keystone_v2.get_plugin_loader('password')
     auth = loader.load_from_options(
         auth_url=self._os_auth_url,
         username=self._os_username,
         password=self._os_password,
         project_name=self._os_tenant_name,
     )
     sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
     if check:
         log.debug("Checking that Keystone API v2 session works...")
         try:
             # if session is invalid, the following will raise some exception
             nova = nova_client.Client(self._compute_api_version, session=sess, cacert=self._os_cacert)
             nova.flavors.list()
         except keystoneauth1.exceptions.NotFound as err:
             log.warning("Creating Keystone v2 session failed: %s", err)
             return None
         except keystoneauth1.exceptions.ClientException as err:
             log.error("OpenStack server rejected request (likely configuration error?): %s", err)
             return None  # FIXME: should we be raising an error instead?
     # if we got to this point, v2 session is valid
     log.info("Using Keystone API v2 session to authenticate to OpenStack")
     return sess
Example #9
    def __prepare_key_pair(self, key_name, private_key_path, public_key_path, password):
        if not key_name:
            log.warning('user_key_name has not been defined, assuming password-based authentication')
            return

        if key_name in [k.name for k in self.driver.list_key_pairs()]:
            log.info('Key pair `%s` already exists, skipping import.', key_name)
            return

        if public_key_path:
            log.debug("importing public key from file %s ...", public_key_path)
            if not self.driver.import_key_pair_from_file(
                    name=key_name,
                    key_file_path=os.path.expandvars(os.path.expanduser(public_key_path))):
                raise KeypairError(
                    'Could not upload public key {p}'
                    .format(p=public_key_path))
        elif private_key_path:
            if not private_key_path.endswith('.pem'):
                raise KeypairError(
                    'can only work with .pem private keys,'
                    ' derive public key and set user_key_public')
            log.debug("deriving and importing public key from private key")
            self.__import_pem(key_name, private_key_path, password)
        else:
            pem_file_path = os.path.join(self.storage_path, key_name + '.pem')
            if not os.path.exists(pem_file_path):
                with open(pem_file_path, 'w') as new_key_file:
                    new_key_file.write(
                        self.driver.create_key_pair(name=key_name))
            self.__import_pem(key_name, pem_file_path, password)
Example #10
    def _init_az_api(self):
        """
        Initialise client objects for talking to Azure API.

        This is in a separate function so to be called by ``__init__``
        and ``__setstate__``.
        """
        with self.__lock:
            if self._resource_client is None:
                log.debug("Making Azure `ServicePrincipalcredentials` object"
                          " with tenant=%r, client_id=%r, secret=%r ...",
                          self.tenant_id, self.client_id,
                          ('<redacted>' if self.secret else None))
                credentials = ServicePrincipalCredentials(
                    tenant=self.tenant_id,
                    client_id=self.client_id,
                    secret=self.secret,
                )
                log.debug("Initializing Azure `ComputeManagementclient` ...")
                self._compute_client = ComputeManagementClient(credentials, self.subscription_id)
                log.debug("Initializing Azure `NetworkManagementclient` ...")
                self._network_client = NetworkManagementClient(credentials, self.subscription_id)
                log.debug("Initializing Azure `ResourceManagementclient` ...")
                self._resource_client = ResourceManagementClient(credentials, self.subscription_id)
                log.info("Azure API clients initialized.")
Example #11
 def verbose_add(fname, basedir='', comment=None):
     zipname = basedir + os.path.basename(fname)
     log.info("Adding '%s' as '%s'" % (fname, zipname))
     zipfile.write(fname, zipname)
     if comment:
         info = zipfile.getinfo(zipname)
         info.comment = comment
Example #12
    def start(self, min_nodes=None):
        """
        Starts up all the instances in the cloud.

        To speed things up, all
        instances are started in separate threads. To make sure
        ElastiCluster is not stopped during creation of an instance, it will
        override the SIGINT handler. As soon as the last started instance
        is returned and saved to the repository, SIGINT is handled as usual.

        A VM instance is considered 'up and running' as soon as an SSH
        connection can be established. If the startup timeout is reached before
        all instances are started, ElastiCluster stops the cluster and
        terminates all VM instances.

        This method is blocking and might take some time depending on the
        number of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        """

        nodes = self.get_all_nodes()

        log.info("Starting cluster nodes ...")
        if log.DO_NOT_FORK:
            nodes = self._start_nodes_sequentially(nodes)
        else:
            nodes = self._start_nodes_parallel(nodes, self.thread_pool_max_size)

        # checkpoint cluster state
        self.repository.save_or_update(self)

        not_started_nodes = self._check_starting_nodes(nodes, self.startup_timeout)

        # now that all nodes are up, checkpoint cluster state again
        self.repository.save_or_update(self)

        # Try to connect to each node to gather IP addresses and SSH host keys
        log.info("Checking SSH connection to nodes ...")
        pending_nodes = nodes - not_started_nodes
        self._gather_node_ip_addresses(pending_nodes, self.startup_timeout)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        min_nodes = self._compute_min_nodes(min_nodes)
        self._check_cluster_size(min_nodes)
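
The `min_nodes` argument maps each node kind to the minimum acceptable count. A hedged sketch of the call; the kind names and the `cluster` object are placeholders:

    # e.g. require at least one frontend and four compute nodes
    min_nodes = {"frontend": 1, "compute": 4}
    # cluster.start(min_nodes=min_nodes)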
Example #13
 def stop(self):
     """Destroys the instance launched on the cloud for this specific node.
     """
     log.info("shutting down instance `%s`", self.instance_id)
     self._cloud_provider.stop_instance(self.instance_id)
     # When an instance is terminated, the EC2 cloud provider will
     # basically return it as "running" state. Setting the
     # `instance_id` attribute to None will force `is_alive()`
     # method not to check with the cloud provider, and forever
     # forgetting about the instance id.
     self.instance_id = None
Example #14
 def start(self):
     """
     Starts an instance for this node on the cloud through the
     cloud provider. This method is non-blocking: as soon as the
     node id is returned from the cloud provider, it will return.
     """
     log.info("Starting node %s.", self.name)
     self.instance_id = self._cloud_provider.start_instance(
         self.user_key_name, self.user_key_public, self.security_group,
         self.flavor, self.image, self.image_userdata)
     log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
Example #15
    def get_stored_clusters(self):
        """
        Returns a list of all stored clusters.
        """
        allfiles = os.listdir(self._storage_dir)
        db_files = []
        for fname in allfiles:
            fpath = os.path.join(self._storage_dir, fname)
            if fname.endswith('.json') and os.path.isfile(fpath):
                db_files.append(fname[:-5])
            else:
                log.info("Ignoring invalid storage file %s", fpath)

        return db_files
Example #16
 def start(self):
     """Starts the node on the cloud using the given
     instance properties. This method is non-blocking, as soon
     as the node id is returned from the cloud provider, it will return.
     Therefore the `is_alive` and `update_ips` methods can be used to
     further gather details about the state of the node.
     """
     log.info("Starting node %s.", self.name)
     self.instance_id = self._cloud_provider.start_instance(
         self.user_key_name, self.user_key_public, self.user_key_private,
         self.security_group,
         self.flavor, self.image_id, self.image_userdata,
         username=self.image_user, node_name="%s-%s" % (self.cluster_name, self.name), **self.extra)
     log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
Example #17
    def _gather_node_ip_addresses(self, nodes, lapse):
        """
        Connect via SSH to each node.

        Return set of nodes that could not be reached within `lapse` seconds.
        """
        # for convenience, we might set this to ``None`` if the file cannot
        # be opened -- but we do not want to forget the cluster-wide
        # setting in case the error is transient
        known_hosts_path = self.known_hosts_file

        # Create the file if it's not present, otherwise the
        # following lines will raise an error
        try:
            fd = open(known_hosts_path, 'a')
            fd.close()
        except IOError as err:
            log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                        known_hosts_path, err)
            known_hosts_path = None

        keys = paramiko.hostkeys.HostKeys(known_hosts_path)

        with timeout(lapse, raise_timeout_error):
            try:
                while nodes:
                    for node in copy(nodes):
                        ssh = node.connect(keyfile=known_hosts_path)
                        if ssh:
                            log.info("Connection to node `%s` successful,"
                                     " using IP address %s to connect.",
                                     node.name, node.connection_ip())
                            # Add host keys to the keys object.
                            for host, key in ssh.get_host_keys().items():
                                for keytype, keydata in key.items():
                                    keys.add(host, keytype, keydata)
                            self._save_keys_to_known_hosts_file(keys)
                            nodes.remove(node)
                    if nodes:
                        time.sleep(self.polling_interval)

            except TimeoutError:
                log.error(
                    "Some nodes of the cluster were unreachable"
                    " within the given %d-seconds timeout: %s",
                    lapse, ', '.join(node.name for node in nodes))

        # return the set of nodes that could not be reached
        return nodes
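
The paramiko host-key handling used above, shown in isolation; the file path in the comments is a placeholder:

    import paramiko

    keys = paramiko.hostkeys.HostKeys()   # start with an empty key store
    # keys.load('/path/to/known_hosts')   # ...or seed it from an existing file
    # keys.add(hostname, keytype, key)    # done for every host key a node presents
    # keys.save('/path/to/known_hosts')   # persist, as _save_keys_to_known_hosts_file does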
Example #19
 def _add_key_to_sshagent(self, private_key_path):
     """Function to add a private key to the ssh-agent
     :param str private_key_path: path to the ssh private key file
     :raises KeyNotAccessible: If the password provided is empty (in other cases the ssh-add
             asks for the password again)
     """
     # This block avoid repetition of checks after it is done for the first instance
     if self._SSH_KEY_ACCESS_ERROR==True:
         raise KeyNotAccessible#("Unable to access key file `"+private_key_path+": Invalid password")      
     return_code=subprocess.call(['ssh-add', private_key_path])
     if return_code==0:
         log.info("Key %s suscessfully added to ssh-agent" % private_key_path)
     else: # This only happens if the password is empty
         self._SSH_KEY_ACCESS_ERROR=True # This avoid user entering the code right the second time
         raise KeyNotAccessible#("Unable to access key file `"+private_key_path+": Invalid password") 
Example #20
 def start(self):
     """
     Starts an instance for this node on the cloud through the
     cloud provider. This method is non-blocking: as soon as the
     node id is returned from the cloud provider, it will return.
     """
     log.info("Starting node %s.", self.name)
     self.instance_id = self._cloud_provider.start_instance(
         self.user_key_name,
         self.user_key_public,
         self.user_key_private,
         self.security_group,
         self.flavor,
         self.image,
         self.image_userdata,
         username=self.image_user)
     log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
Example #21
    def stop(self, wait=False):
        """
        Terminate the VM instance launched on the cloud for this specific node.
        """
        if self.instance_id is not None:
            log.info("Shutting down instance `%s` ...", self.instance_id)

            self._cloud_provider.stop_instance(self.instance_id)
            if wait:
                while self.is_alive():
                    time.sleep(1)
            # When an instance is terminated, the EC2 cloud provider will
            # basically return it as "running" state. Setting the
            # `instance_id` attribute to None will force `is_alive()`
            # method not to check with the cloud provider, and forever
            # forgetting about the instance id.
            self.instance_id = None
Example #22
    def get_ips(self, instance_id):
        """Retrieves the ip addresses (public) from the cloud
        provider by the given instance id.

        :param str instance_id: id of the instance
        :return: list (ips)
        :raises: InstanceError if the ip could not be retrieved.
        """
        if not instance_id:
            raise InstanceError("could not retrieve the ip address for node: "
                                "no associated instance id")
        gce = self._connect()
        instances = gce.instances()
        try:
            request = instances.get(instance=instance_id,
                                    project=self._project_id,
                                    zone=self._zone)
            response = self._execute_request(request)
            ip_public = None
            ip_private = None

            # If the instance is in status TERMINATED, then there will be
            # no IP addresses.
            if response and response['status'] in ('STOPPING', 'TERMINATED'):
                log.info("node '%s' state is '%s'; no IP address(es)" %
                         (instance_id, response['status']))
                return [None]

            if response and "networkInterfaces" in response:
                interfaces = response['networkInterfaces']
                if interfaces:
                    if "accessConfigs" in interfaces[0]:
                        ip_public = interfaces[0]['accessConfigs'][0]['natIP']
                        ip_private = interfaces[0]['networkIP']

            if ip_public and ip_private:
                return [ip_public, ip_private]
            else:
                raise InstanceError("could not retrieve the ip address for "
                                    "node `%s`, please check the node "
                                    "through the cloud provider interface" %
                                    instance_id)

        except (HttpError, CloudProviderError) as e:
            raise InstanceError('could not retrieve the ip address of `%s`: '
                                '`%s`' % (instance_id, e))
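
For orientation, these are the fields of the GCE `instances.get` response that the code above reads, rebuilt with placeholder values (abridged; a real response carries many more keys):

    response = {
        "status": "RUNNING",
        "networkInterfaces": [{
            "networkIP": "10.132.0.5",                   # private address
            "accessConfigs": [{"natIP": "35.195.0.1"}],  # public (NAT) address
        }],
    }
    interfaces = response["networkInterfaces"]
    ip_public = interfaces[0]["accessConfigs"][0]["natIP"]
    ip_private = interfaces[0]["networkIP"]
    print([ip_public, ip_private])   # -> ['35.195.0.1', '10.132.0.5']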
Example #23
    def stop(self, wait=False):
        """
        Terminate the VM instance launched on the cloud for this specific node.
        """
        if self.instance_id is not None:
            log.info("Shutting down node `%s` (VM instance `%s`) ...",
                     self.name, self.instance_id)

            self._cloud_provider.stop_instance(self.instance_id)
            if wait:
                while self.is_alive():
                    time.sleep(1)
            # When an instance is terminated, the EC2 cloud provider will
            # basically return it as "running" state. Setting the
            # `instance_id` attribute to None will force `is_alive()`
            # method not to check with the cloud provider, and forever
            # forgetting about the instance id.
            self.instance_id = None
Example #24
    def start(self):
        """
        Start the node on the cloud using the given instance properties.

        This method is non-blocking: as soon as the node id is returned from
        the cloud provider, it will return. The `is_alive`:meth: and
        `update_ips`:meth: methods should be used to further gather details
        about the state of the node.
        """
        log.info("Starting node %s ...", self.name)
        self.instance_id = self._cloud_provider.start_instance(
            self.user_key_name, self.user_key_public, self.user_key_private,
            self.security_group,
            self.flavor, self.image_id, self.image_userdata,
            username=self.image_user,
            node_name=("%s-%s" % (self.cluster_name, self.name)),
            **self.extra)
        log.debug("Node `%s` has instance ID `%s`", self.name, self.instance_id)
Example #25
File: gce.py Project: dvischi/TissueMAPS
    def get_ips(self, instance_id):
        """Retrieves the ip addresses (public) from the cloud
        provider by the given instance id.

        :param str instance_id: id of the instance
        :return: list (ips)
        :raises: InstanceError if the ip could not be retrieved.
        """
        if not instance_id:
            raise InstanceError("could not retrieve the ip address for node: "
                                "no associated instance id")
        gce = self._connect()
        instances = gce.instances()
        try:
            request = instances.get(instance=instance_id,
                                    project=self._project_id, zone=self._zone)
            response = self._execute_request(request)
            ip_public = None

            # If the instance is in status TERMINATED, then there will be
            # no IP addresses.
            if response and response['status'] in ('STOPPING', 'TERMINATED'):
                log.info("node '%s' state is '%s'; no IP address(es)" %
                         (instance_id, response['status']))
                return [None]

            if response and "networkInterfaces" in response:
                interfaces = response['networkInterfaces']
                if interfaces:
                    if "accessConfigs" in interfaces[0]:
                        ip_public = interfaces[0]['accessConfigs'][0]['natIP']

            if ip_public:
                return [ip_public]
            else:
                raise InstanceError("could not retrieve the ip address for "
                                    "node `%s`, please check the node "
                                    "through the cloud provider interface"
                                    % instance_id)

        except (HttpError, CloudProviderError) as e:
            raise InstanceError('could not retrieve the ip address of `%s`: '
                                '`%s`' % (instance_id, e))
Example #26
 def start(self):
     """Starts the node on the cloud using the given
     instance properties. This method is non-blocking, as soon
     as the node id is returned from the cloud provider, it will return.
     Therefore the `is_alive` and `update_ips` methods can be used to
     further gather details about the state of the node.
     """
     log.info("Starting node %s.", self.name)
     self.instance_id = self._cloud_provider.start_instance(
         self.user_key_name,
         self.user_key_public,
         self.user_key_private,
         self.security_group,
         self.flavor,
         self.image,
         self.image_userdata,
         username=self.image_user,
         node_name=self.name)
     log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
Example #27
    def get_all(self):
        """Retrieves all clusters from the persistent state.

        :return: list of :py:class:`elasticluster.cluster.Cluster`
        """
        file_ending = ClusterRepository.file_ending
        allfiles = os.listdir(self.storage_path)
        cluster_files = []
        for fname in allfiles:
            fpath = os.path.join(self.storage_path, fname)
            if fname.endswith('.%s' % file_ending) and os.path.isfile(fpath):
                cluster_files.append(fname[:-len(file_ending)-1])
            else:
                log.info("Ignoring invalid storage file %s", fpath)

        clusters = list()
        for cluster_file in cluster_files:
            clusters.append(self.get(cluster_file))
        return clusters
Example #28
    def __init_keystone_session_v3(self, check=False):
        """
        Return a new session object, created using Keystone API v3.

        .. note::

          Note that the only supported authN method is password authentication;
          token or other plug-ins are not currently supported.
        """
        try:
            # may fail on Python 2.6?
            from keystoneauth1.identity import v3 as keystone_v3
        except ImportError:
            log.warning("Cannot load Keystone API v3 library.")
            return None
        auth = keystone_v3.Password(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            user_domain_name=self._os_user_domain_name,
            project_domain_name=self._os_project_domain_name,
            project_name=self._os_tenant_name,
        )
        sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
        if check:
            log.debug("Checking that Keystone API v3 session works...")
            try:
                # if session is invalid, the following will raise some exception
                nova = nova_client.Client(self.compute_api_version,
                                          session=sess)
                nova.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v3 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error(
                    "OpenStack server rejected request (likely configuration error?): %s",
                    err)
                return None  # FIXME: should we be raising an error instead?
        # if we got to this point, v3 session is valid
        log.info("Using Keystone API v3 session to authenticate to OpenStack")
        return sess
Example #29
    def stop_instance(self, instance_id):
        """Stops the instance gracefully.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance cannot be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            request = gce.instances().delete(project=self._project_id,
                                        instance=instance_id, zone=self._zone)
            response = self._execute_request(request)
            self._check_response(response)
        except (HttpError, CloudProviderError) as e:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, e))
Example #30
    def __init_keystone_session_v3(self, check=False):
        """
        Return a new session object, created using Keystone API v3.

        .. note::

          Note that the only supported authN method is password authentication;
          token or other plug-ins are not currently supported.
        """
        try:
            # may fail on Python 2.6?
            from keystoneauth1.identity import v3 as keystone_v3
        except ImportError:
            log.warning("Cannot load Keystone API v3 library.")
            return None
        auth = keystone_v3.Password(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            user_domain_name=self._os_user_domain_name,
            project_domain_name=self._os_project_domain_name,
            project_name=self._os_tenant_name,
        )
        sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)
        if check:
            log.debug("Checking that Keystone API v3 session works...")
            try:
                # if session is invalid, the following will raise some exception
                nova = nova_client.Client(self._compute_api_version, session=sess)
                nova.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v3 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error("OpenStack server rejected request (likely configuration error?): %s", err)
                return None  # FIXME: should we be raising an error instead?
        # if we got to this point, v3 session is valid
        log.info("Using Keystone API v3 session to authenticate to OpenStack")
        return sess
Example #31
    def _start_node(node):
        """
        Start the given node VM.

        :return: bool -- True on success, False otherwise
        """
        log.debug("_start_node: working on node `%s`", node.name)
        # FIXME: the following check is not optimal yet. When a node is still
        # in a starting state, it will start another node here, since the
        # `is_alive` method will only check for running nodes (see issue #13)
        if node.is_alive():
            log.info("Not starting node `%s` which is already up.", node.name)
            return True
        else:
            try:
                node.start()
                log.info("Node `%s` has been started.", node.name)
                return True
            except Exception as err:
                log.exception("Could not start node `%s`: %s -- %s", node.name,
                              err, err.__class__)
                return False
Example #32
    def __prepare_key_pair(self, key_name, private_key_path, public_key_path, password):
        if not key_name:
            log.warning('user_key_name has not been defined, assuming password-based authentication')
            return

        try:
            list_key_pairs = self.__get_function_by_pattern('list_key_pairs')
        except AttributeError:
            raise UnsupportedError('key management not supported by provider')
        try:
            self.__get_function_or_ex_function('import_key_pair_from_file')
        except AttributeError:
            raise UnsupportedError('key import not supported by provider')
        try:
            self.__get_function_or_ex_function('create_key_pair')
        except AttributeError:
            raise UnsupportedError('key creation not supported by provider')

        if key_name in [k.name for k in list_key_pairs()]:
            log.info('Key pair (%s) already exists, skipping import.', key_name)
            return

        if public_key_path:
            log.debug("importing public key from path %s", public_key_path)
            key_import = self.__get_function_or_ex_function('import_key_pair_from_file')
            if not key_import(name=key_name, key_file_path=os.path.expandvars(os.path.expanduser(public_key_path))):
                raise KeypairError('failure during import of public key {p}'.format(p=public_key_path))
        elif private_key_path:
            if not private_key_path.endswith('.pem'):
                raise KeypairError('can only work with .pem private keys, derive public key and set user_key_public')
            log.debug("deriving and importing public key from private key")
            self.__import_pem(key_name, private_key_path, password)
        elif os.path.exists(os.path.join(self.storage_path, '{p}.pem'.format(p=key_name))):
            self.__import_pem(key_name, os.path.join(self.storage_path, '{}.pem'.format(key_name)), password)
        else:
            with open(os.path.join(self.storage_path, '{p}.pem'.format(p=key_name)), 'w') as new_key_file:
                new_key_file.write(self.__get_function_or_ex_function('create_key_pair')(name=key_name))
            self.__import_pem(key_name, os.path.join(self.storage_path, '{p}.pem'.format(p=key_name)), password)
Example #33
    def _start_node(node):
        """Static method to start a specific node on a cloud

        :return: bool -- True on success, False otherwise
        """
        log.debug("_start_node: working on node `%s`" % node.name)
        # TODO: the following check is not optimal yet. When a
        # node is still in a starting state,
        # it will start another node here,
        # since the `is_alive` method will only check for
        # running nodes (see issue #13)
        if node.is_alive():
            log.info("Not starting node `%s` which is "
                     "already up&running.", node.name)
            return True
        else:
            try:
                node.start()
                log.info("Node `%s` has been started.", node.name)
                return True
            except Exception as e:
                log.error("Could not start node `%s`: %s", node.name, e)
                return False
Example #34
    def _start_node(node):
        """
        Start the given node VM.

        :return: bool -- True on success, False otherwise
        """
        log.debug("_start_node: working on node `%s`", node.name)
        # FIXME: the following check is not optimal yet. When a node is still
        # in a starting state, it will start another node here, since the
        # `is_alive` method will only check for running nodes (see issue #13)
        if node.is_alive():
            log.info("Not starting node `%s` which is "
                     "already up&running.", node.name)
            return True
        else:
            try:
                node.start()
                log.info("Node `%s` has been started.", node.name)
                return True
            except Exception as err:
                log.exception("Could not start node `%s`: %s -- %s",
                              node.name, err, err.__class__)
                return False
Example #35
    def get_all(self):
        """Retrieves all clusters from the persistent state.

        :return: list of :py:class:`elasticluster.cluster.Cluster`
        """
        file_ending = PickleRepository.file_ending
        allfiles = os.listdir(self.storage_path)
        cluster_files = []
        for fname in allfiles:
            fpath = os.path.join(self.storage_path, fname)
            if fname.endswith('.%s' % file_ending) and os.path.isfile(fpath):
                cluster_files.append(fname[:-len(file_ending)-1])
            else:
                log.info("Ignoring invalid storage file %s", fpath)

        clusters = list()
        for cluster_file in cluster_files:
            try:
                cluster = self.get(cluster_file)
                clusters.append(cluster)
            except (ImportError, AttributeError) as ex:
                log.error("Unable to load cluster %s: `%s`", cluster_file, ex)
                log.error("If cluster %s was created with a previous version of elasticluster, you may need to run `elasticluster migrate %s %s` to update it.", cluster_file, self.storage_path, cluster_file)
        return clusters
Example #36
    def __detect_os_identity_api_version(self):
        """
        Return preferred OpenStack Identity API version (either one of the two strings ``'2'`` or ``'3'``) or ``None``.

        The following auto-detection strategies are tried (in this order):

        #. Read the environmental variable `OS_IDENTITY_API_VERSION` and check if its value is one of the two strings ``'2'`` or ``'3'``;
        #. Check if a version tag like ``/v3`` or ``/v2.0`` ends the OpenStack auth URL.

        If none of the above worked, return ``None``.

        For more information on ``OS_IDENTITY_API_VERSION``, please see
        `<https://docs.openstack.org/developer/python-openstackclient/authentication.html>`_.
        """
        ver = os.getenv('OS_IDENTITY_API_VERSION', '')
        if ver == '3':
            log.info(
                "Using OpenStack Identity API v3"
                " because of environmental variable setting `OS_IDENTITY_API_VERSION=3`"
            )
            return '3'
        elif ver == '2' or ver.startswith('2.'):
            log.info(
                "Using OpenStack Identity API v2"
                " because of environmental variable setting `OS_IDENTITY_API_VERSION=2`"
            )
            return '2'
        elif self._os_auth_url.endswith('/v3'):
            log.info(
                "Using OpenStack Identity API v3 because of `/v3` ending in auth URL;"
                " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v2 instead."
            )
            return '3'
        elif self._os_auth_url.endswith('/v2.0'):
            log.info(
                "Using OpenStack Identity API v2 because of `/v2.0` ending in auth URL;"
                " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v3 instead."
            )
            return '2'
        else:
            # auto-detection failed, need to probe
            return None
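
The URL-suffix half of the heuristic, extracted into a stand-alone helper for illustration (the helper name is made up):

    def guess_identity_api_version(auth_url):
        """Return '3', '2', or None based only on the auth URL suffix."""
        if auth_url.endswith('/v3'):
            return '3'
        if auth_url.endswith('/v2.0'):
            return '2'
        return None

    assert guess_identity_api_version('https://keystone.example.org:5000/v3') == '3'
    assert guess_identity_api_version('https://keystone.example.org:5000/v2.0') == '2'
    assert guess_identity_api_version('https://keystone.example.org:5000') is None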
Example #37
    def _get_credentials(self):
        if self._client_id and self._client_secret:
            flow = OAuth2WebServerFlow(self._client_id, self._client_secret,
                                       GCE_SCOPE)
            # The `Storage` object holds the credentials that your
            # application needs to authorize access to the user's
            # data. The name of the credentials file is provided. If the
            # file does not exist, it is created. This object can only
            # hold credentials for a single user. It stores the access
            # privileges for the application, so a user only has to grant
            # access through the web interface once.
            storage_path = os.path.join(self._storage_path,
                                        self._client_id + '.oauth.dat')
            storage = Storage(storage_path)

            credentials = storage.get()
            if credentials is not None and not credentials.invalid:
                return credentials
            else:
                log.info("Determined that provided credentials are not valid.")

        try:
            # Next, check to see if there is a set of application
            # default credentials to use.
            log.info(
                "Attempting to use Google Application Default Credentials.")
            return GoogleCredentials.get_application_default()
        except ApplicationDefaultCredentialsError:
            log.info(
                "Failed to use Google Application Default Credentials, falling back to config."
            )
            log.debug("(Original traceback follows.)", exc_info=True)

        try:
            # Finally, try to start a browser to have the user authenticate with Google
            args = argparser.parse_args([])
            args.noauth_local_webserver = self._noauth_local_webserver
            return run_flow(flow, storage, flags=args)
        except Exception as err:
            log.error("Could not run authentication flow: %s", err)
            log.debug("(Original traceback follows.)", exc_info=True)
        raise CredentialsError(
            "No method to obtain GCE credentials was successful!  Either "
            "set up Application Default Credentials using gcloud, or "
            "provide a client id and client secret from an oauth flow, "
            "or go through the oauth flow that elasticluster runs.")
Example #38
    def _get_credentials(self):
        if self._client_id and self._client_secret:
            flow = OAuth2WebServerFlow(self._client_id, self._client_secret,
                                       GCE_SCOPE)
            # The `Storage` object holds the credentials that your
            # application needs to authorize access to the user's
            # data. The name of the credentials file is provided. If the
            # file does not exist, it is created. This object can only
            # hold credentials for a single user. It stores the access
            # privileges for the application, so a user only has to grant
            # access through the web interface once.
            storage_path = os.path.join(self._storage_path,
                                        self._client_id + '.oauth.dat')
            storage = Storage(storage_path)

            credentials = storage.get()
            if credentials is not None and not credentials.invalid:
                return credentials
            else:
                log.info("Determined that provided credentials are not valid.")

        try:
            # Next, check to see if there is a set of application
            # default credentials to use.
            log.info("Attempting to use Google Application Default Credentials.")
            return GoogleCredentials.get_application_default()
        except ApplicationDefaultCredentialsError:
            log.info("Failed to use Google Application Default Credentials, falling back to config.")
            log.debug("(Original traceback follows.)", exc_info=True)

        try:
            # Finally, try to start a browser to have the user authenticate with Google
            args = argparser.parse_args([])
            args.noauth_local_webserver = self._noauth_local_webserver
            return run_flow(flow, storage, flags=args)
        except Exception as err:
            log.error("Could not run authentication flow: %s", err)
            log.debug("(Original traceback follows.)", exc_info=True)
        raise CredentialsError("No method to obtain GCE credentials was successful!  Either "
                               "set up Application Default Credentials using gcloud, or "
                               "provide a client id and client secret from an oauth flow, "
                               "or go through the oauth flow that elasticluster runs.")
Example #39
    def start_instance(self, key_name, public_key_path, private_key_path,
                       security_group, flavor, image_id, image_userdata,
                       username=None, node_name=None, **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :return: str - instance id of the started instance
        """
        vm_start_args = {}

        log.debug("Checking keypair `%s` ...", key_name)
        with OpenStackCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)
        vm_start_args['key_name'] = key_name

        security_groups = [sg.strip() for sg in security_group.split(',')]
        self._check_security_groups(security_groups)
        vm_start_args['security_groups'] = security_groups


        # Check if the image id is present.
        if image_id not in [img.id for img in self._get_images()]:
            raise ImageError(
                    "No image found with ID `{0}` in project `{1}` of cloud {2}"
                    .format(image_id, self._os_tenant_name, self._os_auth_url))
        vm_start_args['userdata'] = image_userdata

        # Check if the flavor exists
        flavors = [fl for fl in self._get_flavors() if fl.name == flavor]
        if not flavors:
            raise FlavorError(
                "No flavor found with name `{0}` in project `{1}` of cloud {2}"
                .format(flavor, self._os_tenant_name, self._os_auth_url))
        flavor = flavors[0]

        network_ids = [net_id.strip()
                       for net_id in kwargs.pop('network_ids', '').split(',')]
        if network_ids:
            nics = [{'net-id': net_id, 'v4-fixed-ip': ''}
                    for net_id in network_ids ]
            log.debug("Specifying networks for node %s: %s",
                      node_name, ', '.join([nic['net-id'] for nic in nics]))
        else:
            nics = None
        vm_start_args['nics'] = nics

        if 'boot_disk_size' in kwargs:
            # check if the backing volume is already there
            volume_name = '{name}-{id}'.format(name=node_name, id=image_id)
            if volume_name in [v.name for v in self._get_volumes()]:
                raise ImageError(
                    "Volume `{0}` already exists in project `{1}` of cloud {2}"
                    .format(volume_name, self._os_tenant_name, self._os_auth_url))

            log.info('Creating volume `%s` to use as VM disk ...', volume_name)
            try:
                bds = int(kwargs['boot_disk_size'])
                if bds < 1:
                    raise ValueError('non-positive int')
            except (ValueError, TypeError):
                raise ConfigurationError(
                    "Invalid `boot_disk_size` specified:"
                    " should be a positive integer, got {0} instead"
                    .format(kwargs['boot_disk_size']))
            volume = self.cinder_client.volumes.create(
                size=bds, name=volume_name, imageRef=image_id,
                volume_type=kwargs.pop('boot_disk_type'))

            # wait for volume to come up
            volume_available = False
            while not volume_available:
                for v in self._get_volumes():
                    if v.name == volume_name and v.status == 'available':
                        volume_available = True
                        break
                sleep(1)  # FIXME: hard-coded waiting time

            # ok, use volume as VM disk
            vm_start_args['block_device_mapping'] = {
                # FIXME: is it possible that `vda` is not the boot disk? e.g. if
                # a non-paravirtualized kernel is being used?  should we allow
                # to set the boot device as an image parameter?
                'vda': ('{id}:::{delete_on_terminate}'
                        .format(id=volume.id, delete_on_terminate=1)),
            }

        # due to some `nova_client.servers.create()` implementation weirdness,
        # the first three args need to be spelt out explicitly and cannot be
        # conflated into `**vm_start_args`
        vm = self.nova_client.servers.create(node_name, image_id, flavor, **vm_start_args)

        # allocate and attach a floating IP, if requested
        if self.request_floating_ip:
            # We need to list the floating IPs for this instance
            try:
                # python-novaclient <8.0.0
                floating_ips = [ip for ip in self.nova_client.floating_ips.list()
                                if ip.instance_id == vm.id]
            except AttributeError:
                floating_ips = self.neutron_client.list_floatingips(id=vm.id)
            # allocate new floating IP if none given
            if not floating_ips:
                self._allocate_address(vm, network_ids)

        self._instances[vm.id] = vm

        return vm.id
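
A hedged usage sketch of the call: the argument names mirror the signature above, but every value (and the `provider` object itself) is a placeholder rather than a working configuration:

    vm_kwargs = dict(
        key_name='elasticluster',
        public_key_path='~/.ssh/id_rsa.pub',
        private_key_path='~/.ssh/id_rsa',
        security_group='default',
        flavor='m1.small',
        image_id='REPLACE-WITH-IMAGE-UUID',   # must exist in the target project
        image_userdata='',
        username='ubuntu',
        node_name='mycluster-compute001',
    )
    # instance_id = provider.start_instance(**vm_kwargs)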
Example #40
    def _build_inventory(self, cluster):
        """
        Builds the inventory for the given cluster and returns its path

        :param cluster: cluster to build inventory for
        :type cluster: :py:class:`elasticluster.cluster.Cluster`
        """
        inventory_data = defaultdict(list)

        for node in cluster.get_all_nodes():
            if node.preferred_ip is None:
                log.warning("Ignoring node `{0}`: No IP address.".format(
                    node.name))
                continue
            if node.kind not in self.groups:
                # FIXME: should this raise a `ConfigurationError` instead?
                log.warning("Ignoring node `{0}`:"
                            " Node kind `{1}` not defined in cluster!".format(
                                node.name, node.kind))
                continue

            extra_vars = ['ansible_user=%s' % node.image_user]

            ip_addr, port = parse_ip_address_and_port(node.preferred_ip)
            if port != 22:
                extra_vars.append('ansible_port=%s' % port)

            # write additional `ansible_*` variables to inventory;
            # `ansible_python_interpreter` gets special treatment
            # since we need to tell script `install-py2.sh` that
            # it should create a wrapper script for running `eatmydata python`
            extra_conf = self.extra_conf.copy()
            ansible_python_interpreter = extra_conf.pop(
                'ansible_python_interpreter', '/usr/bin/python')
            extra_vars.append(
                'ansible_python_interpreter={python}{eatmydata}'.format(
                    python=ansible_python_interpreter,
                    eatmydata=('+eatmydata' if self.use_eatmydata else '')))
            extra_vars.extend('%s=%s' % (k, v) for k, v in extra_conf.items()
                              if k.startswith('ansible_'))

            if node.kind in self.environment:
                extra_vars.extend(
                    '%s=%s' % (k, v)
                    for k, v in self.environment[node.kind].items())

            for group in self.groups[node.kind]:
                inventory_data[group].append(
                    (node.name, ip_addr, ' '.join(extra_vars)))

        if not inventory_data:
            log.info("No inventory file was created.")
            return None

        # create a temporary file to pass to ansible, since the
        # api is not stable yet...
        if self._storage_path_tmp:
            if not self._storage_path:
                self._storage_path = tempfile.mkdtemp()
            elasticluster.log.warning("Writing inventory file to tmp dir `%s`",
                                      self._storage_path)

        inventory_path = os.path.join(self._storage_path,
                                      (cluster.name + '.inventory'))
        log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)
        with open(inventory_path, 'w+') as inventory_file:
            for section, hosts in inventory_data.items():
                # Ansible throws an error "argument of type 'NoneType' is not
                # iterable" if a section is empty, so ensure we have something
                # to write in there
                if hosts:
                    inventory_file.write("\n[" + section + "]\n")
                    for host in hosts:
                        hostline = "{0} ansible_host={1} {2}\n".format(*host)
                        inventory_file.write(hostline)
        return inventory_path
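
For reference, each host line written above follows the `{name} ansible_host={ip} {extra_vars}` pattern, with one `[section]` per Ansible group. A generated inventory might therefore look roughly like this (group names, host names, addresses and the `ubuntu` user are made-up examples):

[frontend]
frontend001 ansible_host=172.16.0.11 ansible_user=ubuntu ansible_python_interpreter=/usr/bin/python

[compute]
compute001 ansible_host=172.16.0.12 ansible_user=ubuntu ansible_python_interpreter=/usr/bin/python ansible_port=2222
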
예제 #41
0
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        repo = creator.create_repository()
        tmpdir = tempfile.mkdtemp()
        log.debug("Using temporary directory %s" % tmpdir)
        tmpconf = make_creator(self.params.config, storage_path=tmpdir)
        tmprepo = tmpconf.create_repository()

        rc = 0
        # Read the zip file.
        try:
            with ZipFile(self.params.file, 'r') as zipfile:
                # Find main cluster file
                # create cluster object from it
                log.debug("ZIP file %s opened" % self.params.file)
                cluster = None
                zipfile.extractall(tmpdir)
                newclusters = tmprepo.get_all()
                cluster = newclusters[0]
                cur_clusternames = [c.name for c in repo.get_all()]
                oldname = cluster.name
                newname = self.params.rename
                if self.params.rename:
                    cluster.name = self.params.rename
                    for node in cluster.get_all_nodes():
                        node.cluster_name = cluster.name
                if cluster.name in cur_clusternames:
                    raise Exception(
                        "A cluster with name %s already exists. Use "
                        "option --rename to rename the cluster to be "
                        "imported." % cluster.name)

                # Save the cluster in the new position
                cluster.repository = repo
                repo.save_or_update(cluster)
                dest = cluster.repository.storage_path

                # Copy the known hosts
                srcfile = os.path.join(tmpdir, oldname + '.known_hosts')
                destfile = os.path.join(dest, cluster.name + '.known_hosts')
                shutil.copy(srcfile, destfile)

                # Copy the ssh keys, if present
                for attr in ('user_key_public', 'user_key_private'):
                    keyfile = getattr(cluster, attr)
                    keybase = os.path.basename(keyfile)
                    srcfile = os.path.join(tmpdir, keybase)
                    if os.path.isfile(srcfile):
                        log.info("Importing key file %s" % keybase)
                        destfile = os.path.join(dest, keybase)
                        shutil.copy(srcfile, destfile)
                        setattr(cluster, attr, destfile)

                    for node in cluster.get_all_nodes():
                        nodekeyfile = getattr(node, attr)
                        # Check if it's different from the main key
                        if nodekeyfile != keyfile \
                           and os.path.isfile(nodekeyfile):
                            destdir = os.path.join(dest, cluster.name,
                                                   node.kind, node.name)
                            nodekeybase = os.path.basename(nodekeyfile)
                            log.info("Importing key file %s for node %s" %
                                     (nodekeybase, node.name))
                            if not os.path.isdir(destdir):
                                os.makedirs(destdir)
                            # Path to key in zip file
                            srcfile = os.path.join(tmpdir, oldname, node.kind,
                                                   node.name, nodekeybase)
                            destfile = os.path.join(destdir, nodekeybase)
                            shutil.copy(srcfile, destfile)
                        # Always save the correct destfile
                        setattr(node, attr, destfile)

                repo.save_or_update(cluster)
                if not cluster:
                    log.error("ZIP file %s does not contain a valid cluster." %
                              self.params.file)
                    rc = 2

                # Check if a cluster already exists.
                # if not, unzip the needed files, and update ssh key path if needed.
        except Exception as ex:
            log.error("Unable to import from zipfile %s: %s" %
                      (self.params.file, ex))
            rc = 1
        finally:
            if os.path.isdir(tmpdir):
                log.info("Cleaning up temporary directory %s", tmpdir)
                shutil.rmtree(tmpdir)

        if rc == 0:
            print("Successfully imported cluster from ZIP %s to %s" %
                  (self.params.file, repo.storage_path))
        sys.exit(rc)
예제 #42
0
    def _allocate_address_neutron(self, instance, network_ids):
        """
        Allocates a floating/public ip address to the given instance,
        using the OpenStack Network ('Neutron') API.

        :param instance: instance to assign address to
        :param list network_ids:
          List of IDs (as strings) of the networks on which to
          request allocation of the floating IP.

        :return: public ip address
        """
        self._init_os_api()
        with OpenStackCloudProvider.__node_start_lock:
            # Note: to return *all* addresses, all parameters to
            # `neutron_client.list_floatingips()` should be left out;
            # setting them to `None` (e.g., `fixed_ip_address=None`)
            # results in an empty list...
            free_ips = [
                ip for ip in
                self.neutron_client.list_floatingips().get('floatingips')
                if (ip['floating_network_id'] in network_ids
                    # keep only unallocated IP addrs
                    and ip['fixed_ip_address'] is None
                    and ip['port_id'] is None)
            ]
            if free_ips:
                floating_ip = free_ips.pop()
                log.debug("Using existing floating IP %r", floating_ip)
            else:
                # FIXME: OpenStack Network API v2 requires that we specify
                # a network ID along with the request for a floating IP.
                # However, ElastiCluster configuration allows for multiple
                # networks to be connected to a VM, but does not give any
                # hint as to which one(s) should be used for such requests.
                # So we try them all, ignoring errors until one request
                # succeeds and hope that it's OK. One can imagine
                # scenarios where this is *not* correct, but: (1) these
                # scenarios are unlikely, and (2) the old novaclient code
                # above has not even had the concept of multiple networks
                # for floating IPs and no-one has complained in 5 years...
                for network_id in network_ids:
                    log.debug(
                        "Trying to allocate floating IP on network %s ...", network_id)
                    try:
                        floating_ip = self.neutron_client.create_floatingip({
                            'floatingip': {
                                'floating_network_id': network_id,
                            }}).get('floatingip')
                        log.debug(
                            "Allocated IP address %s on network %s",
                            floating_ip['floating_ip_address'], network_id)
                        break  # stop at first network where we get a floating IP
                    except BadNeutronRequest as err:
                        raise RuntimeError(
                            "Failed allocating floating IP on network {0}: {1}"
                            .format(network_id, err))
            if floating_ip.get('floating_ip_address', None) is None:
                raise RuntimeError(
                    "Could not allocate floating IP for VM {0}"
                    .format(instance.name))
            # wait until at least one interface is up
            interfaces = []
            # FIXME: no timeout!
            while not interfaces:
                interfaces = instance.interface_list()
                sleep(2)  ## FIXME: hard-coded value
            # get port ID
            for interface in interfaces:
                log.debug(
                    "Instance %s (ID: %s):"
                    " Checking if floating IP can be attached to interface %r ...",
                    instance.name, instance.id, interface)
                # if interface.net_id not in network_ids:
                #     log.debug(
                #         "Instance %s (ID: %s):"
                #         " Skipping interface %r:"
                #         " not attached to any of the requested networks.",
                #         instance.name, instance.id, interface)
                #     continue
                port_id = interface.port_id
                if port_id is None:
                    log.debug(
                        "Instance %s (ID: %s):"
                        " Skipping interface %r: no port ID!",
                        instance.name, instance.id, interface)
                    continue
                log.debug(
                    "Instance `%s` (ID: %s):"
                    " will assign floating IP to port ID %s (state: %s),"
                    " already running IP addresses %r",
                    instance.name, instance.id,
                    port_id, interface.port_state,
                    [item['ip_address'] for item in interface.fixed_ips])
                if interface.port_state != 'ACTIVE':
                    log.warn(
                        "Instance `%s` (ID: %s):"
                        " port `%s` is in state %s (epected 'ACTIVE' instead)",
                        instance.name, instance.id,
                        port_id, interface.port_state)
                break
            else:
                raise RuntimeError(
                    "Could not find port on network(s) {0}"
                    " for instance {1} (ID: {2}) to bind a floating IP to."
                    .format(network_ids, instance.name, instance.id))
            # assign floating IP to port
            floating_ip = self.neutron_client.update_floatingip(
                floating_ip['id'], {
                    'floatingip': {
                        'port_id': port_id,
                    },
                }
            ).get('floatingip')
            ip_address = floating_ip['floating_ip_address']
            log.debug("Assigned IP address %s to port %s", ip_address, port_id)

            log.info("Waiting 300s until floating IP %s is ACTIVE", ip_address)
            for i in range(300):
                _floating_ip = self.neutron_client.show_floatingip(floating_ip['id'])
                if _floating_ip['floatingip']['status'] != 'DOWN':
                    break
                sleep(1)

            # Invalidate cache for this VM, as we just assigned a new IP
            if instance.id in self._cached_instances:
                del self._cached_instances[instance.id]
        return ip_address
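
Stripped of the retry, waiting and port-discovery logic, the Neutron calls used above reduce to an allocate-then-bind pair. A minimal sketch, assuming `neutron_client` is an authenticated `neutronclient.v2_0.client.Client` and that the network and port IDs below are placeholders:

# placeholder IDs; in the method above they come from `network_ids`
# and from `interface.port_id`
network_id = '11111111-2222-3333-4444-555555555555'
port_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'

# 1. allocate a floating IP on the chosen external network
fip = neutron_client.create_floatingip(
    {'floatingip': {'floating_network_id': network_id}})['floatingip']

# 2. bind it to the VM's network port
fip = neutron_client.update_floatingip(
    fip['id'], {'floatingip': {'port_id': port_id}})['floatingip']

# 3. this address should now reach the instance
print(fip['floating_ip_address'])
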
예제 #43
0
 def missing_host_key(self, client, hostname, key):
     log.info('Ignoring unknown %s host key for %s: %s' %
              (key.get_name(), hostname, hexlify(key.get_fingerprint())))
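
This method implements paramiko's `MissingHostKeyPolicy` interface. A small self-contained sketch of how such a policy is typically attached to an `SSHClient` (the class name, host and credentials below are illustrative):

import paramiko
from binascii import hexlify

class IgnoreUnknownHostKeys(paramiko.MissingHostKeyPolicy):
    """Accept unknown host keys after logging them, as in the method above."""
    def missing_host_key(self, client, hostname, key):
        print('Ignoring unknown %s host key for %s: %s' %
              (key.get_name(), hostname, hexlify(key.get_fingerprint())))

client = paramiko.SSHClient()
client.set_missing_host_key_policy(IgnoreUnknownHostKeys())
# client.connect('node001.example.org', username='ubuntu',
#                key_filename='~/.ssh/id_rsa')
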
예제 #44
0
    def start(self, min_nodes=None):
        """Starts up all the instances in the cloud. To speed things up all
        instances are started in a seperate thread. To make sure
        elasticluster is not stopped during creation of an instance, it will
        overwrite the sigint handler. As soon as the last started instance
        is returned and saved to the repository, sigint is executed as usual.
        An instance is up and running as soon as a ssh connection can be
        established. If the startup timeout is reached before all instances
        are started, the cluster will stop and destroy all instances.

        This method is blocking and might take some time depending on the
        amount of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        """

        # To not mess up the cluster management we start the nodes in a
        # different thread. In this case the main thread receives the sigint
        # and communicates to the `start_node` thread. The nodes to work on
        # are passed in a managed queue.
        self.keep_running = True

        def sigint_handler(signal, frame):
            """
            Makes sure the cluster is stored, before the sigint results in
            exiting during the node startup.
            """
            log.error("user interruption: saving cluster before exit.")
            self.keep_running = False

        nodes = self.get_all_nodes()

        if log.DO_NOT_FORK:
            # Start the nodes sequentially without forking, in order
            # to ease the debugging
            for node in nodes:
                self._start_node(node)
                self.repository.save_or_update(self)
        else:
            # Create one thread for each node to start
            num_threads = min(len(nodes), self.thread_pool_max_size)
            thread_pool = Pool(processes=num_threads)
            log.debug("Created pool of %d threads" % num_threads)
            # Intercept Ctrl-c
            signal.signal(signal.SIGINT, sigint_handler)

            # This is blocking
            result = thread_pool.map_async(self._start_node, nodes)

            while not result.ready():
                result.wait(1)
                if not self.keep_running:
                    # the user did abort the start of the cluster. We
                    # finish the current start of a node and save the
                    # status to the storage, so we don't have
                    # unmanaged instances laying around
                    log.error("Aborting upon Ctrl-C")
                    thread_pool.close()
                    thread_pool.join()
                    self.repository.save_or_update(self)
                    sys.exit(1)

        # dump the cluster here, so we don't loose any knowledge
        self.repository.save_or_update(self)

        signal.alarm(0)

        def sigint_reset(signal, frame):
            sys.exit(1)
        signal.signal(signal.SIGINT, sigint_reset)

        # check if all nodes are running, stop all nodes if the
        # timeout is reached
        def timeout_handler(signum, frame):
            raise TimeoutError("problems occured while starting the nodes, "
                               "timeout `%i`", Cluster.startup_timeout)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)

        starting_nodes = self.get_all_nodes()
        try:
            while starting_nodes:
                starting_nodes = [n for n in starting_nodes
                                  if not n.is_alive()]
                if starting_nodes:
                    time.sleep(10)
        except TimeoutError as timeout:
            # FIXME: this is wrong: the reason why `node.is_alive()` fails could be caused by a network error, and we shouldn't just delete the nodes.

            log.error("Not all nodes were started correctly within the given"
                      " timeout `%s`" % Cluster.startup_timeout)
            log.error("Please check if image, keypair, and network configuration is correct and try again.")
            # for node in starting_nodes:
            #     log.error("Stopping node `%s`, since it could not start "
            #               "within the given timeout" % node.name)
            #     node.stop()
            #     self.remove_node(node)

        signal.alarm(0)

        # If we reached this point, we should have IP addresses for
        # the nodes, so update the storage file again.
        self.repository.save_or_update(self)

        # Try to connect to each node. Run the setup action only when
        # we successfully connect to all of them.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)
        pending_nodes = self.get_all_nodes()[:]

        if not os.path.exists(self.known_hosts_file):
            # Create the file if it's not present, otherwise the
            # following lines will raise an error
            try:
                fd = open(self.known_hosts_file, 'a')
                fd.close()
            except IOError as err:
                log.warning("Error while opening known_hosts file `%s`: `%s`"
                            " NOT using known_hosts_file.",
                            self.known_hosts_file, err)
        try:
            keys = paramiko.hostkeys.HostKeys(self.known_hosts_file)
        except IOError:
            keys = paramiko.hostkeys.HostKeys()
            log.warning("Ignoring error while opening known_hosts file %s" % self.known_hosts_file)

        try:
            while pending_nodes:
                for node in pending_nodes[:]:
                    ssh = node.connect(keyfile=self.known_hosts_file)
                    if ssh:
                        log.info("Connection to node %s (%s) successful.",
                                 node.name, node.connection_ip())
                        # Add host keys to the keys object.
                        for host, key in ssh.get_host_keys().items():
                            for ktype, keydata in key.items():
                                keys.add(host, ktype, keydata)
                        pending_nodes.remove(node)
                    self._save_keys_to_known_hosts_file(keys)
                if pending_nodes:
                    time.sleep(5)

        except TimeoutError:
            # remove the pending nodes from the cluster
            log.error("Could not connect to all the nodes of the "
                      "cluster within the given timeout `%s`."
                      % Cluster.startup_timeout)
            for node in pending_nodes:
                log.error("Stopping node `%s`, since we could not connect to"
                          " it within the timeout." % node.name)
                self.remove_node(node, stop=True)

        signal.alarm(0)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # Save host keys
        self._save_keys_to_known_hosts_file(keys)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        if not min_nodes:
            # the node minimum is implicit if not specified.
            min_nodes = dict((key, len(self.nodes[key])) for key in
                             self.nodes.iterkeys())
        else:
            # check that each group has a minimum value
            for group, nodes in self.nodes.iteritems():
                if group not in min_nodes:
                    min_nodes[group] = len(nodes)

        self._check_cluster_size(min_nodes)
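
Both the startup and the SSH-connection phases above bound their polling loops with the same `signal.alarm` pattern. A condensed, self-contained sketch of that pattern (the `TimeoutError` subclass, the 5-second deadline and the `nodes_ready` predicate are illustrative stand-ins; SIGALRM is Unix-only):

import signal
import time

class TimeoutError(Exception):
    """Raised by the alarm handler when the deadline expires."""
    pass

def timeout_handler(signum, frame):
    raise TimeoutError("polling loop timed out")

def nodes_ready():
    # stand-in for a real check such as "all nodes answer `is_alive()`"
    return False

signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(5)                # deliver SIGALRM after 5 seconds
try:
    while not nodes_ready():
        time.sleep(1)
except TimeoutError:
    print("timed out waiting for nodes")   # `start()` logs and carries on here
finally:
    signal.alarm(0)            # always cancel any pending alarm
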
예제 #45
0
    def start(self, min_nodes=None):
        """Starts up all the instances in the cloud. To speed things up all
        instances are started in a seperate thread. To make sure
        elasticluster is not stopped during creation of an instance, it will
        overwrite the sigint handler. As soon as the last started instance
        is returned and saved to the repository, sigint is executed as usual.
        An instance is up and running as soon as a ssh connection can be
        established. If the startup timeout is reached before all instances
        are started, the cluster will stop and destroy all instances.

        This method is blocking and might take some time depending on the
        amount of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        """

        # To not mess up the cluster management we start the nodes in a
        # different thread. In this case the main thread receives the sigint
        # and communicates to the `start_node` thread. The nodes to work on
        # are passed in a managed queue.
        self.keep_running = True

        def sigint_handler(signal, frame):
            """
            Makes sure the cluster is stored, before the sigint results in
            exiting during the node startup.
            """
            log.error("user interruption: saving cluster before exit.")
            self.keep_running = False

        nodes = self.get_all_nodes()
        thread_pool = Pool(processes=len(nodes))
        log.debug("Created pool of %d threads" % len(nodes))
        signal.signal(signal.SIGINT, sigint_handler)

        # This is blocking
        result = thread_pool.map_async(self._start_node, nodes)

        while not result.ready():
            result.wait(1)
            if not self.keep_running:
                # the user did abort the start of the cluster. We finish the
                #  current start of a node and save the status to the
                # storage, so we don't have not managed instances laying
                # around
                log.error("Aborting upon Ctrl-C")
                thread_pool.close()
                thread_pool.join()
                self.repository.save_or_update(self)
                sys.exit(1)

        # dump the cluster here, so we don't loose any knowledge
        self.repository.save_or_update(self)

        signal.alarm(0)

        def sigint_reset(signal, frame):
            sys.exit(1)

        signal.signal(signal.SIGINT, sigint_reset)

        # check if all nodes are running, stop all nodes if the
        # timeout is reached
        def timeout_handler(signum, frame):
            raise TimeoutError(
                "problems occured while starting the nodes, "
                "timeout `%i`", Cluster.startup_timeout)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)

        starting_nodes = self.get_all_nodes()
        try:
            while starting_nodes:
                starting_nodes = [
                    n for n in starting_nodes if not n.is_alive()
                ]
                if starting_nodes:
                    time.sleep(10)
        except TimeoutError as timeout:
            log.error("Not all nodes were started correctly within the given"
                      " timeout `%s`" % Cluster.startup_timeout)
            for node in starting_nodes:
                log.error("Stopping node `%s`, since it could not start "
                          "within the given timeout" % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # If we reached this point, we should have IP addresses for
        # the nodes, so update the storage file again.
        self.repository.save_or_update(self)

        # Try to connect to each node. Run the setup action only when
        # we successfully connect to all of them.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)
        pending_nodes = self.get_all_nodes()[:]

        try:
            while pending_nodes:
                for node in pending_nodes[:]:
                    if node.connect():
                        log.info("Connection to node %s (%s) successful.",
                                 node.name, node.connection_ip())
                        pending_nodes.remove(node)
                if pending_nodes:
                    time.sleep(5)

        except TimeoutError:
            # remove the pending nodes from the cluster
            log.error("Could not connect to all the nodes of the "
                      "cluster within the given timeout `%s`." %
                      Cluster.startup_timeout)
            for node in pending_nodes:
                log.error("Stopping node `%s`, since we could not connect to"
                          " it within the timeout." % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        if not min_nodes:
            # the node minimum is implicit if not specified.
            min_nodes = dict(
                (key, len(self.nodes[key])) for key in self.nodes.iterkeys())
        else:
            # check that each group has a minimum value
            for group, nodes in self.nodes.iteritems():
                if group not in min_nodes:
                    min_nodes[group] = len(nodes)

        self._check_cluster_size(min_nodes)
예제 #46
0
    def execute(self):
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        repo = creator.create_repository()
        tmpdir = tempfile.mkdtemp()
        log.debug("Using temporary directory %s" % tmpdir)
        tmpconf = make_creator(self.params.config, storage_path=tmpdir)
        tmprepo = tmpconf.create_repository()

        rc = 0
        # Read the zip file.
        try:
            with ZipFile(self.params.file, 'r') as zipfile:
                # Find main cluster file
                # create cluster object from it
                log.debug("ZIP file %s opened" % self.params.file)
                cluster = None
                zipfile.extractall(tmpdir)
                newclusters = tmprepo.get_all()
                cluster = newclusters[0]
                cur_clusternames = [c.name for c in repo.get_all()]
                oldname = cluster.name
                newname = self.params.rename
                if self.params.rename:
                    cluster.name = self.params.rename
                    for node in cluster.get_all_nodes():
                        node.cluster_name = cluster.name
                if cluster.name in cur_clusternames:
                    raise Exception(
                        "A cluster with name %s already exists. Use "
                        "option --rename to rename the cluster to be "
                        "imported." % cluster.name)

                # Save the cluster in the new position
                cluster.repository = repo
                repo.save_or_update(cluster)
                dest = cluster.repository.storage_path

                # Copy the known hosts
                srcfile = os.path.join(tmpdir, oldname+'.known_hosts')
                destfile = os.path.join(dest, cluster.name+'.known_hosts')
                shutil.copy(srcfile, destfile)

                # Copy the ssh keys, if present
                for attr in ('user_key_public', 'user_key_private'):
                    keyfile = getattr(cluster, attr)
                    keybase = os.path.basename(keyfile)
                    srcfile = os.path.join(tmpdir, keybase)
                    if os.path.isfile(srcfile):
                        log.info("Importing key file %s" % keybase)
                        destfile = os.path.join(dest, keybase)
                        shutil.copy(srcfile, destfile)
                        setattr(cluster, attr, destfile)

                    for node in cluster.get_all_nodes():
                        nodekeyfile = getattr(node, attr)
                        # Check if it's different from the main key
                        if nodekeyfile != keyfile \
                           and os.path.isfile(nodekeyfile):
                            destdir = os.path.join(dest,
                                                   cluster.name,
                                                   node.kind,
                                                   node.name)
                            nodekeybase = os.path.basename(nodekeyfile)
                            log.info("Importing key file %s for node %s" %
                                     (nodekeybase, node.name))
                            if not os.path.isdir(destdir):
                                os.makedirs(destdir)
                            # Path to key in zip file
                            srcfile = os.path.join(tmpdir,
                                                   oldname,
                                                   node.kind,
                                                   node.name,
                                                   nodekeybase)
                            destfile = os.path.join(destdir, nodekeybase)
                            shutil.copy(srcfile, destfile)
                        # Always save the correct destfile
                        setattr(node, attr, destfile)

                repo.save_or_update(cluster)
                if not cluster:
                    log.error("ZIP file %s does not contain a valid cluster."
                              % self.params.file)
                    rc = 2

                # Check if a cluster already exists.
                # if not, unzip the needed files, and update ssh key path if needed.
        except Exception as ex:
            log.error("Unable to import from zipfile %s: %s"
                      % (self.params.file, ex))
            rc = 1
        finally:
            if os.path.isdir(tmpdir):
                log.info("Cleaning up temporary directory %s", tmpdir)
                shutil.rmtree(tmpdir)

        if rc == 0:
            print("Successfully imported cluster from ZIP %s to %s"
                  % (self.params.file, repo.storage_path))
        sys.exit(rc)
예제 #47
0
 def stop(self):
     log.info("shutting down instance `%s`", self.instance_id)
     self._cloud_provider.stop_instance(self.instance_id)
예제 #48
0
 def stop_instance(self, instance_id):
     instance = self.__get_instance(instance_id)
     if not instance:
         return
     log.info('stopping %s', instance.name)
     instance.destroy()
예제 #49
0
    def start(self):
        """
        Starts the cluster with the properties given in the
        constructor. It will create the nodes through the configurator
        and delegate all the work to them. After the identifiers of
        all instances are available, it will save the cluster through
        the cluster storage.
        """

        # To not mess up the cluster management we start the nodes in a
        # different thread. In this case the main thread receives the sigint
        # and communicates to the `start_node` thread. The nodes to work on
        # are passed in a managed queue.
        self.keep_running = True

        def sigint_handler(signal, frame):
            """
            Makes sure the cluster is stored, before the sigint results in
            exiting during the node startup.
            """
            log.error("user interruption: saving cluster before exit.")
            self.keep_running = False

        nodes = self.get_all_nodes()
        thread_pool = Pool(processes=len(nodes))
        log.debug("Created pool of %d threads" % len(nodes))
        signal.signal(signal.SIGINT, sigint_handler)

        # This is blocking
        result = thread_pool.map_async(self._start_node, nodes)

        while not result.ready():
            result.wait(1)
            if not self.keep_running:
                # the user did abort the start of the cluster. We finish the
                #  current start of a node and save the status to the
                # storage, so we don't have not managed instances laying
                # around
                log.error("Aborting upon Ctrl-C")
                thread_pool.close()
                thread_pool.join()
                self._storage.dump_cluster(self)
                sys.exit(1)

        # dump the cluster here, so we don't loose any knowledge
        self._storage.dump_cluster(self)

        signal.alarm(0)

        def sigint_reset(signal, frame):
            sys.exit(1)

        signal.signal(signal.SIGINT, sigint_reset)

        # check if all nodes are running, stop all nodes if the
        # timeout is reached
        def timeout_handler(signum, frame):
            raise TimeoutError(
                "problems occured while starting the nodes, "
                "timeout `%i`", Cluster.startup_timeout)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)

        starting_nodes = self.get_all_nodes()
        try:
            while starting_nodes:
                starting_nodes = [
                    n for n in starting_nodes if not n.is_alive()
                ]
                if starting_nodes:
                    time.sleep(10)
        except TimeoutError as timeout:
            log.error("Not all nodes were started correctly within the given"
                      " timeout `%s`" % Cluster.startup_timeout)
            for node in starting_nodes:
                log.error("Stopping node `%s`, since it could not start "
                          "within the given timeout" % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # If we reached this point, we should have IP addresses for
        # the nodes, so update the storage file again.
        self._storage.dump_cluster(self)

        # Try to connect to each node. Run the setup action only when
        # we successfully connect to all of them.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)
        pending_nodes = self.get_all_nodes()[:]

        try:
            while pending_nodes:
                for node in pending_nodes[:]:
                    if node.connect():
                        log.info("Connection to node %s (%s) successful.",
                                 node.name, node.ip_public)
                        pending_nodes.remove(node)
                if pending_nodes:
                    time.sleep(5)

        except TimeoutError:
            # remove the pending nodes from the cluster
            log.error("Could not connect to all the nodes of the "
                      "cluster within the given timeout `%s`." %
                      Cluster.startup_timeout)
            for node in pending_nodes:
                log.error("Stopping node `%s`, since we could not connect to"
                          " it within the timeout." % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        self._check_cluster_size()
예제 #50
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       node_name=None,
                       network_ids=None,
                       price=None,
                       timeout=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None
        :param float price: Spot instance price (if 0, do not use spot instances).
        :param int timeout: Timeout (in seconds) waiting for spot instances;
                            only used if price > 0.

        :return: str - instance id of the started instance
        """
        connection = self._connect()

        log.debug("Checking keypair `%s`.", key_name)
        # the `_check_keypair` method has to be called within a lock,
        # since it will upload the key if it does not exist and if this
        # happens for every node at the same time ec2 will throw an error
        # message (see issue #79)
        with BotoCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)

        log.debug("Checking security group `%s`.", security_group)
        security_group_id = self._check_security_group(security_group)
        # image_id = self._find_image_id(image_id)

        if network_ids:
            interfaces = []
            for subnet in network_ids.split(','):
                subnet_id = self._check_subnet(subnet)

                interfaces.append(
                    boto.ec2.networkinterface.NetworkInterfaceSpecification(
                        subnet_id=subnet_id,
                        groups=[security_group_id],
                        associate_public_ip_address=self.request_floating_ip))
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                *interfaces)

            security_groups = []
        else:
            interfaces = None
            security_groups = [security_group]

        # get defaults for `price` and `timeout` from class instance
        if price is None:
            price = self.price
        if timeout is None:
            timeout = self.timeout

        try:
            #start spot instance if bid is specified
            if price:
                log.info("Requesting spot instance with price `%s` ...", price)
                request = connection.request_spot_instances(
                    price,
                    image_id,
                    key_name=key_name,
                    security_groups=security_groups,
                    instance_type=flavor,
                    user_data=image_userdata,
                    network_interfaces=interfaces,
                    instance_profile_name=self._instance_profile)[-1]

                # wait until spot request is fulfilled (will wait
                # forever if no timeout is given)
                start_time = time.time()
                timeout = (float(timeout) if timeout else 0)
                log.info(
                    "Waiting for spot instance (will time out in %d seconds) ...",
                    timeout)
                while request.status.code != 'fulfilled':
                    if timeout and time.time() - start_time > timeout:
                        request.cancel()
                        raise RuntimeError('spot instance timed out')
                    time.sleep(self.POLL_INTERVAL)
                    # update request status
                    request = connection.get_all_spot_instance_requests(
                        request_ids=request.id)[-1]
            else:
                reservation = connection.run_instances(
                    image_id,
                    key_name=key_name,
                    security_groups=security_groups,
                    instance_type=flavor,
                    user_data=image_userdata,
                    network_interfaces=interfaces,
                    instance_profile_name=self._instance_profile)
        except Exception as ex:
            log.error("Error starting instance: %s", ex)
            if "TooManyInstances" in ex:
                raise ClusterError(ex)
            else:
                raise InstanceError(ex)
        if price:
            vm = connection.get_only_instances(
                instance_ids=[request.instance_id])[-1]
        else:
            vm = reservation.instances[-1]
        vm.add_tag("Name", node_name)

        # cache instance object locally for faster access later on
        self._instances[vm.id] = vm

        return vm.id
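
A hedged usage sketch of this EC2 variant, assuming `provider` is an already-configured `BotoCloudProvider`; every literal below (key names, paths, AMI ID, flavor, prices) is a made-up placeholder:

instance_id = provider.start_instance(
    key_name='elasticluster-key',
    public_key_path='~/.ssh/id_rsa.pub',
    private_key_path='~/.ssh/id_rsa',
    security_group='default',
    flavor='t2.micro',
    image_id='ami-0123456789abcdef0',
    image_userdata='',
    node_name='compute001',
    price=0.05,        # a price > 0 requests a spot instance
    timeout=600)       # give up on the spot request after 10 minutes
print("started instance %s" % instance_id)
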
예제 #51
0
    def start_instance(self,
                       key_name,
                       public_key_path,
                       private_key_path,
                       security_group,
                       flavor,
                       image_id,
                       image_userdata,
                       username=None,
                       node_name=None,
                       **kwargs):
        """Starts a new instance on the cloud using the given properties.
        The following tasks are done to start an instance:

        * establish a connection to the cloud web service
        * check ssh keypair and upload it if it does not yet exist. This is
          a locked process, since this function might be called in multiple
          threads and we only want the key to be stored once.
        * check if the security group exists
        * run the instance with the given properties

        :param str key_name: name of the ssh key to connect
        :param str public_key_path: path to ssh public key
        :param str private_key_path: path to ssh private key
        :param str security_group: firewall rule definition to apply on the
                                   instance
        :param str flavor: machine type to use for the instance
        :param str image_id: image type (os) to use for the instance
        :param str image_userdata: command to execute after startup
        :param str username: username for the given ssh key, default None

        :return: str - instance id of the started instance
        """
        self._init_os_api()

        vm_start_args = {}

        log.debug("Checking keypair `%s` ...", key_name)
        with OpenStackCloudProvider.__node_start_lock:
            self._check_keypair(key_name, public_key_path, private_key_path)
        vm_start_args['key_name'] = key_name

        security_groups = [sg.strip() for sg in security_group.split(',')]
        self._check_security_groups(security_groups)
        vm_start_args['security_groups'] = security_groups

        # Check if the image id is present.
        if image_id not in [img.id for img in self._get_images()]:
            raise ImageError(
                "No image found with ID `{0}` in project `{1}` of cloud {2}".
                format(image_id, self._os_tenant_name, self._os_auth_url))
        vm_start_args['userdata'] = image_userdata

        # Check if the flavor exists
        flavors = [fl for fl in self._get_flavors() if fl.name == flavor]
        if not flavors:
            raise FlavorError(
                "No flavor found with name `{0}` in project `{1}` of cloud {2}"
                .format(flavor, self._os_tenant_name, self._os_auth_url))
        flavor = flavors[0]

        network_ids = [
            net_id.strip()
            for net_id in kwargs.pop('network_ids', '').split(',')
        ]
        if network_ids:
            nics = [{
                'net-id': net_id,
                'v4-fixed-ip': ''
            } for net_id in network_ids]
            log.debug("Specifying networks for node %s: %s", node_name,
                      ', '.join([nic['net-id'] for nic in nics]))
        else:
            nics = None
        vm_start_args['nics'] = nics

        if 'boot_disk_size' in kwargs:
            # check if the backing volume is already there
            volume_name = '{name}-{id}'.format(name=node_name, id=image_id)
            if volume_name in [v.name for v in self._get_volumes()]:
                raise ImageError(
                    "Volume `{0}` already exists in project `{1}` of cloud {2}"
                    .format(volume_name, self._os_tenant_name,
                            self._os_auth_url))

            log.info('Creating volume `%s` to use as VM disk ...', volume_name)
            try:
                bds = int(kwargs['boot_disk_size'])
                if bds < 1:
                    raise ValueError('non-positive int')
            except (ValueError, TypeError):
                raise ConfigurationError(
                    "Invalid `boot_disk_size` specified:"
                    " should be a positive integer, got {0} instead".format(
                        kwargs['boot_disk_size']))
            volume = self.cinder_client.volumes.create(
                size=bds,
                name=volume_name,
                imageRef=image_id,
                volume_type=kwargs.pop('boot_disk_type'))

            # wait for volume to come up
            volume_available = False
            while not volume_available:
                for v in self._get_volumes():
                    if v.name == volume_name and v.status == 'available':
                        volume_available = True
                        break
                sleep(1)  # FIXME: hard-coded waiting time

            # ok, use volume as VM disk
            vm_start_args['block_device_mapping'] = {
                # FIXME: is it possible that `vda` is not the boot disk? e.g. if
                # a non-paravirtualized kernel is being used?  should we allow
                # to set the boot device as an image parameter?
                'vda':
                ('{id}:::{delete_on_terminate}'.format(id=volume.id,
                                                       delete_on_terminate=1)),
            }

        # due to some `nova_client.servers.create()` implementation weirdness,
        # the first three args need to be spelt out explicitly and cannot be
        # conflated into `**vm_start_args`
        vm = self.nova_client.servers.create(node_name, image_id, flavor,
                                             **vm_start_args)

        # allocate and attach a floating IP, if requested
        if self.request_floating_ip:
            # We need to list the floating IPs for this instance
            try:
                # python-novaclient <8.0.0
                floating_ips = [
                    ip for ip in self.nova_client.floating_ips.list()
                    if ip.instance_id == vm.id
                ]
            except AttributeError:
                floating_ips = self.neutron_client.list_floatingips(id=vm.id)
            # allocate new floating IP if none given
            if not floating_ips:
                self._allocate_address(vm, network_ids)

        self._instances[vm.id] = vm

        return vm.id
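
A corresponding usage sketch for this OpenStack variant, highlighting the `network_ids`, `boot_disk_size` and `boot_disk_type` keyword arguments consumed above; `provider` is assumed to be a configured `OpenStackCloudProvider` and all IDs and names are placeholders:

instance_id = provider.start_instance(
    key_name='elasticluster-key',
    public_key_path='~/.ssh/id_rsa.pub',
    private_key_path='~/.ssh/id_rsa',
    security_group='default,ssh',   # comma-separated list, split above
    flavor='m1.small',
    image_id='2b2d1d54-aaaa-bbbb-cccc-000000000000',
    image_userdata='',
    node_name='compute001',
    network_ids='11111111-2222-3333-4444-555555555555',
    boot_disk_size='50',            # in GB; triggers the Cinder volume path
    boot_disk_type='ceph')          # volume type handed to Cinder
print("started instance %s" % instance_id)
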
예제 #52
0
    def start(self, min_nodes=None, max_concurrent_requests=0):
        """
        Starts up all the instances in the cloud.

        To speed things up, all
        instances are started in separate threads. To make sure
        ElastiCluster is not stopped during creation of an instance, it
        overrides the SIGINT handler. As soon as the last started instance
        is returned and saved to the repository, SIGINT is handled as usual.

        A VM instance is considered 'up and running' as soon as an SSH
        connection can be established. If the startup timeout is reached before
        all instances are started, ElastiCluster stops the cluster and
        terminates all VM instances.

        This method is blocking and might take some time depending on the
        amount of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        :param int max_concurrent_requests:
          Issue at most this number of requests to start
          VMs; if 1 or less, start nodes one at a time (sequentially).
          The special value ``0`` means run 4 threads for each available
          processor.
        """

        nodes = self.get_all_nodes()

        log.info("Starting cluster nodes ...")
        if max_concurrent_requests == 0:
            try:
                max_concurrent_requests = 4 * get_num_processors()
            except RuntimeError:
                log.warning("Cannot determine number of processors!"
                            " will start nodes sequentially...")
                max_concurrent_requests = 1
        if max_concurrent_requests > 1:
            nodes = self._start_nodes_parallel(nodes, max_concurrent_requests)
        else:
            nodes = self._start_nodes_sequentially(nodes)

        # checkpoint cluster state
        self.repository.save_or_update(self)

        not_started_nodes = self._check_starting_nodes(nodes,
                                                       self.startup_timeout)

        # now that all nodes are up, checkpoint cluster state again
        self.repository.save_or_update(self)

        # Try to connect to each node to gather IP addresses and SSH host keys
        log.info("Checking SSH connection to nodes ...")
        pending_nodes = nodes - not_started_nodes
        self._gather_node_ip_addresses(pending_nodes, self.startup_timeout)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        min_nodes = self._compute_min_nodes(min_nodes)
        self._check_cluster_size(min_nodes)
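
A short usage sketch for this newer `start()` signature; the `cluster` object and the `compute` node kind are hypothetical:

# start the whole cluster, issuing at most 8 concurrent VM requests, and
# accept the result as long as at least 2 'compute' nodes come up
cluster.start(min_nodes={'compute': 2}, max_concurrent_requests=8)

# max_concurrent_requests=0 (the default) means 4 threads per available
# processor; a value of 1 or less starts nodes one at a time
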
예제 #53
0
    def _build_inventory(self, cluster):
        """
        Builds the inventory for the given cluster and returns its path

        :param cluster: cluster to build inventory for
        :type cluster: :py:class:`elasticluster.cluster.Cluster`
        """
        inventory_data = defaultdict(list)

        for node in cluster.get_all_nodes():
            if node.preferred_ip is None:
                log.warning(
                    "Ignoring node `{0}`: No IP address."
                    .format(node.name))
                continue
            if node.kind not in self.groups:
                # FIXME: should this raise a `ConfigurationError` instead?
                log.warning(
                    "Ignoring node `{0}`:"
                    " Node kind `{1}` not defined in cluster!"
                    .format(node.name, node.kind))
                continue

            extra_vars = ['ansible_user=%s' % node.image_user]

            ip_addr, port = parse_ip_address_and_port(node.preferred_ip)
            if port != 22:
                extra_vars.append('ansible_port=%s' % port)

            if node.kind in self.environment:
                extra_vars.extend('%s=%s' % (k, v) for k, v in
                                  self.environment[node.kind].items())
            for group in self.groups[node.kind]:
                inventory_data[group].append(
                    (node.name, ip_addr, str.join(' ', extra_vars)))

        if not inventory_data:
            log.info("No inventory file was created.")
            return None

        # create a temporary file to pass to ansible, since the
        # api is not stable yet...
        if self._storage_path_tmp:
            if not self._storage_path:
                self._storage_path = tempfile.mkdtemp()
            elasticluster.log.warning(
                "Writing inventory file to tmp dir `%s`", self._storage_path)

        inventory_path = os.path.join(
            self._storage_path, (cluster.name + '.inventory'))
        log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)
        with open(inventory_path, 'w+') as inventory_file:
            for section, hosts in inventory_data.items():
                # Ansible throws an error "argument of type 'NoneType' is not
                # iterable" if a section is empty, so ensure we have something
                # to write in there
                if hosts:
                    inventory_file.write("\n[" + section + "]\n")
                    for host in hosts:
                        hostline = "{0} ansible_host={1} {2}\n".format(*host)
                        inventory_file.write(hostline)
        return inventory_path
예제 #54
0
 def missing_host_key(self, client, hostname, key):
     log.info('Ignoring unknown %s host key for %s: %s' %
              (key.get_name(), hostname, hexlify(key.get_fingerprint())))
예제 #55
0
    def start(self, min_nodes=None):
        """Starts up all the instances in the cloud. To speed things up all
        instances are started in a seperate thread. To make sure
        elasticluster is not stopped during creation of an instance, it will
        overwrite the sigint handler. As soon as the last started instance
        is returned and saved to the repository, sigint is executed as usual.
        An instance is up and running as soon as a ssh connection can be
        established. If the startup timeout is reached before all instances
        are started, the cluster will stop and destroy all instances.

        This method is blocking and might take some time depending on the
        amount of instances to start.

        :param min_nodes: minimum number of nodes to start in case the quota
                          is reached before all instances are up
        :type min_nodes: dict [node_kind] = number
        """

        # To not mess up the cluster management we start the nodes in a
        # different thread. In this case the main thread receives the sigint
        # and communicates to the `start_node` thread. The nodes to work on
        # are passed in a managed queue.
        self.keep_running = True

        def sigint_handler(signal, frame):
            """
            Makes sure the cluster is stored, before the sigint results in
            exiting during the node startup.
            """
            log.error("user interruption: saving cluster before exit.")
            self.keep_running = False

        nodes = self.get_all_nodes()

        if log.DO_NOT_FORK:
            # Start the nodes sequentially without forking, to ease debugging
            for node in nodes:
                self._start_node(node)
                self.repository.save_or_update(self)
        else:
            # Create one thread per node, capped at `thread_pool_max_size`
            pool_size = min(len(nodes), self.thread_pool_max_size)
            thread_pool = Pool(processes=pool_size)
            log.debug("Created pool of %d threads" % pool_size)
            # Intercept Ctrl-c
            signal.signal(signal.SIGINT, sigint_handler)

            # `map_async` returns immediately; we poll the result below so
            # that Ctrl-C can interrupt the wait
            result = thread_pool.map_async(self._start_node, nodes)

            while not result.ready():
                result.wait(1)
                if not self.keep_running:
                    # the user aborted the cluster start: let the node
                    # currently starting finish, then save the status to
                    # storage so we don't leave unmanaged instances
                    # lying around
                    log.error("Aborting upon Ctrl-C")
                    thread_pool.close()
                    thread_pool.join()
                    self.repository.save_or_update(self)
                    sys.exit(1)

        # dump the cluster here, so we don't lose any knowledge
        self.repository.save_or_update(self)

        signal.alarm(0)

        def sigint_reset(signum, frame):
            sys.exit(1)

        signal.signal(signal.SIGINT, sigint_reset)

        # check if all nodes are running, stop all nodes if the
        # timeout is reached
        def timeout_handler(signum, frame):
            raise TimeoutError(
                "problems occurred while starting the nodes, "
                "timeout `%i`" % Cluster.startup_timeout)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)

        starting_nodes = self.get_all_nodes()
        try:
            while starting_nodes:
                starting_nodes = [
                    n for n in starting_nodes if not n.is_alive()
                ]
                if starting_nodes:
                    time.sleep(10)
        except TimeoutError:
            # FIXME: this is wrong: `node.is_alive()` could also fail
            # because of a network error, in which case we should not
            # simply delete the nodes.
            log.error("Not all nodes were started correctly within the given"
                      " timeout `%s`" % Cluster.startup_timeout)
            log.error("Please check if the image, keypair, and network "
                      "configuration is correct and try again.")
            # for node in starting_nodes:
            #     log.error("Stopping node `%s`, since it could not start "
            #               "within the given timeout" % node.name)
            #     node.stop()
            #     self.remove_node(node)

        signal.alarm(0)

        # If we reached this point, we should have IP addresses for
        # the nodes, so update the storage file again.
        self.repository.save_or_update(self)

        # Try to connect to each node. Run the setup action only when
        # we successfully connect to all of them.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)
        pending_nodes = self.get_all_nodes()[:]

        if not os.path.exists(self.known_hosts_file):
            # Create the file if it's not present, otherwise the
            # following lines will raise an error
            try:
                fd = open(self.known_hosts_file, 'a')
                fd.close()
            except IOError as err:
                log.warning(
                    "Error while opening known_hosts file `%s`: `%s`;"
                    " NOT using known_hosts file.", self.known_hosts_file, err)
        try:
            keys = paramiko.hostkeys.HostKeys(self.known_hosts_file)
        except IOError:
            keys = paramiko.hostkeys.HostKeys()
            log.warning("Ignoring error while opening known_hosts file %s" %
                        self.known_hosts_file)

        try:
            while pending_nodes:
                for node in pending_nodes[:]:
                    ssh = node.connect(keyfile=self.known_hosts_file)
                    if ssh:
                        log.info("Connection to node %s (%s) successful.",
                                 node.name, node.connection_ip())
                        # Add host keys to the keys object.
                        for host, key in ssh.get_host_keys().items():
                            for ktype, keydata in key.items():
                                keys.add(host, ktype, keydata)
                        pending_nodes.remove(node)
                if pending_nodes:
                    time.sleep(5)

        except TimeoutError:
            # remove the pending nodes from the cluster
            log.error("Could not connect to all the nodes of the "
                      "cluster within the given timeout `%s`." %
                      Cluster.startup_timeout)
            for node in pending_nodes:
                log.error("Stopping node `%s`, since we could not connect to"
                          " it within the timeout." % node.name)
                self.remove_node(node, stop=True)

        signal.alarm(0)

        # It might be possible that the node.connect() call updated
        # the `preferred_ip` attribute, so, let's save the cluster
        # again.
        self.repository.save_or_update(self)

        # Save host keys
        try:
            keys.save(self.known_hosts_file)
        except IOError:
            log.warning("Ignoring error while saving known_hosts file %s" %
                        self.known_hosts_file)

        # A lot of things can go wrong when starting the cluster. To ensure
        # the cluster matches the size the user asked for, check that each
        # node group meets its minimum node count.
        if not min_nodes:
            # if not specified, the minimum for each group is its current size
            min_nodes = dict(
                (key, len(self.nodes[key])) for key in self.nodes.iterkeys())
        else:
            # make sure every node group has a minimum value
            for group, group_nodes in self.nodes.iteritems():
                if group not in min_nodes:
                    min_nodes[group] = len(group_nodes)

        self._check_cluster_size(min_nodes)
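
The thread-pool and SIGINT handling in `start()` can be exercised in isolation; this is a standalone sketch of the pattern, not elasticluster code, and the pool size, item count, and work function are made up:

# Run work items in a thread pool, let the main thread catch SIGINT, and
# poll the async result so the handler can request a clean shutdown.
import signal
import time
from multiprocessing.dummy import Pool  # thread-backed Pool


keep_running = True


def sigint_handler(signum, frame):
    global keep_running
    keep_running = False


def start_one(item):
    time.sleep(1)  # stands in for the real work, e.g. starting a node
    return item


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    pool = Pool(processes=4)
    result = pool.map_async(start_one, range(8))
    while not result.ready():
        result.wait(1)
        if not keep_running:
            pool.close()
            pool.join()
            raise SystemExit(1)
    print(result.get())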