def stop_instance(self, instance_id):
    """Delete the instance identified by `instance_id`.

    Nothing is done (apart from logging) when `instance_id` is empty.

    :param str instance_id: instance identifier
    :raises: `InstanceNotFoundError` if the request fails with HTTP
             status 404
    :raises: `InstanceError` on any other `HttpError`, or on a
             `CloudProviderError`
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return

    gce = self._connect()

    try:
        delete_request = gce.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_request))
    except HttpError as err:
        # a 404 status means that the instance does not exist
        if e_is_not_found(err) if False else err.resp.status == 404:
            raise InstanceNotFoundError(
                "Instance `{instance_id}` was not found"
                .format(instance_id=instance_id))
        raise InstanceError(
            "Could not stop instance `{instance_id}`: `{e}`"
            .format(instance_id=instance_id, e=err))
    except CloudProviderError as err:
        raise InstanceError(
            "Could not stop instance `{instance_id}`: `{e}`"
            .format(instance_id=instance_id, e=err))
def start_node(node_queue):
    """Start the nodes taken from `node_queue` until the queue is drained.

    ``self`` is not a parameter: it is looked up in the enclosing scope.
    Processing stops early as soon as ``self.keep_running`` is false.
    Nodes whose `is_alive()` returns true are skipped.  Start-up errors
    of the kinds listed in the ``except`` clause are logged and do not
    interrupt the loop.

    :param node_queue: queue holding the node objects still to be started
    """
    try:
        while not node_queue.empty():
            if not self.keep_running:
                log.error("Aborting execution upon CTRL-C")
                break
            # Fetch without blocking: another worker may have taken the
            # last item between the `empty()` check above and this call.
            # A blocking `get()` would then wait forever and the
            # `except Empty` handler below could never run.
            node = node_queue.get_nowait()
            # TODO: the following check is not optimal yet. When a
            # node is still in a starting state,
            # it will start another node here,
            # since the `is_alive` method will only check for
            # running nodes (see issue #13)
            if node.is_alive():
                log.info("Not starting node %s which is "
                         "already up&running.", node.name)
            else:
                log.info("starting node...")
                try:
                    node.start()
                except (InstanceError, SecurityGroupError,
                        KeypairError, ImageError) as e:
                    log.error("could not start node `%s` for reason "
                              "`%s`", node.name, e)
    except Empty:
        # nothing to do if the queue turns out to be empty - the
        # nodes are then already started.
        pass
def _start_node(node):
    """Start `node` unless it already reports being alive.

    :return: ``True`` if the node is already alive or has been started;
             the caught `KeypairError` instance if `node.start()` raised
             one; ``None`` if it raised any other exception
    """
    log.debug("_start_node: working on node %s" % node.name)
    # TODO: the following check is not optimal yet. When a
    # node is still in a starting state,
    # it will start another node here,
    # since the `is_alive` method will only check for
    # running nodes (see issue #13)
    if node.is_alive():
        log.info("Not starting node %s which is already up&running.",
                 node.name)
        return True

    try:
        node.start()
        log.info("_start_node: node has been started")
        return True
    except KeypairError as key_err:
        # key pair problems are handed back to the caller as a value
        return key_err
    except Exception as err:
        log.error("could not start node `%s` for reason `%s`"
                  % (node.name, err))
        return None
def pause_instance(self, instance_id):
    """Pauses the instance, retaining disk and config.

    Issues a ``stop`` request for the instance, waits for the returned
    operation to complete and checks the response.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if the request fails with an `HttpError`
    :return: dict with the single key ``instance_id`` (the information
             needed to restart the instance); ``None`` if `instance_id`
             is empty
    """
    if not instance_id:
        log.info("Instance to pause has no instance id.")
        return

    gce = self._connect()

    try:
        request = gce.instances().stop(project=self._project_id,
                                       instance=instance_id,
                                       zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return {"instance_id": instance_id}
    except HttpError as e:
        log.error("Error stopping instance: `%s`", e)
        # Interpolate the error into the message: passing `e` as a second
        # constructor argument would leave a literal `%s` in the text.
        raise InstanceError("Error stopping instance `%s`" % e)
def resume_instance(self, paused_info):
    """Restarts a paused instance, retaining disk and config.

    Issues a ``start`` request for the instance, waits for the returned
    operation to complete and checks the response.

    :param dict paused_info: mapping with key ``instance_id``, as
                             returned by `pause_instance`
    :raises: `InstanceError` if the request fails with an `HttpError`
    :return: ``None``
    """
    if not paused_info.get("instance_id"):
        log.info("Instance to resume has no instance id.")
        return

    gce = self._connect()

    try:
        request = gce.instances().start(project=self._project_id,
                                        instance=paused_info["instance_id"],
                                        zone=self._zone)
        operation = self._execute_request(request)
        response = self._wait_until_done(operation)
        self._check_response(response)
        return
    except HttpError as e:
        log.error("Error restarting instance: `%s`", e)
        # Interpolate the error into the message: passing `e` as a second
        # constructor argument would leave a literal `%s` in the text.
        raise InstanceError("Error restarting instance `%s`" % e)
def _stop_all_nodes(self, wait=False):
    """
    Terminate all cluster nodes.  Return number of failures.

    Nodes lacking an instance ID are removed from ``self.nodes`` without
    being stopped.  A node whose `stop()` raises `InstanceNotFoundError`
    is only logged: it is not removed and not counted as a failure.

    :param bool wait: handed over to each node's `stop()` method
    :return: count of nodes whose `stop()` raised any other exception
    """
    failures = 0
    for node in self.get_all_nodes():
        if not node.instance_id:
            log.warning(
                "Node `%s` has no instance ID."
                " Assuming it did not start correctly,"
                " so removing it anyway from the cluster.",
                node.name)
            self.nodes[node.kind].remove(node)
            continue

        try:
            node.stop(wait)
            # only forget about the node once it has been stopped
            self.nodes[node.kind].remove(node)
            log.debug("Removed node `%s` from cluster `%s`",
                      node.name, self.name)
        except InstanceNotFoundError:
            log.info(
                "Node `%s` (instance ID `%s`) was not found;"
                " assuming it has already been terminated.",
                node.name, node.instance_id)
        except Exception as err:
            failures += 1
            log.error(
                "Could not stop node `%s` (instance ID `%s`): %s %s",
                node.name, node.instance_id, err, err.__class__)
    return failures
def stop_instance(self, instance_id):
    """Delete the instance identified by `instance_id`.

    Nothing is done (apart from logging) when `instance_id` is empty.
    An HTTP 404 answer is logged as a warning and is not an error.

    :param str instance_id: instance identifier
    :raises: `InstanceError` on an `HttpError` with status other than
             404, or on a `CloudProviderError`
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return

    gce = self._connect()

    try:
        delete_request = gce.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_request))
    except HttpError as err:
        if err.resp.status != 404:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, err))
        # If the instance does not exist, we get a 404 - just log it, and
        # return without exception so the caller can remove the reference.
        log.warning("Instance to stop `%s` was not found" % instance_id)
    except CloudProviderError as err:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, err))
def __init_keystone_session_v2(self, check=False):
    """
    Create and return a session object using Keystone API v2.

    Authentication uses the ``password`` plugin loader.

    :param bool check: if true, probe the session by listing flavors
                       through a Nova client before returning it
    :return: the session object; ``None`` if `check` is true and the
             probe raised ``NotFound`` or ``ClientException``
    """
    from keystoneauth1 import loading as keystone_v2

    password_loader = keystone_v2.get_plugin_loader('password')
    auth = password_loader.load_from_options(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        project_name=self._os_tenant_name,
    )
    sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v2 session works...")
        try:
            # an invalid session makes the following calls raise
            probe = nova_client.Client(self._compute_api_version,
                                       session=sess,
                                       cacert=self._os_cacert)
            probe.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v2 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error("OpenStack server rejected request (likely configuration error?): %s", err)
            return None  # FIXME: should we be raising an error instead?

    # if we got to this point, v2 session is valid
    log.info("Using Keystone API v2 session to authenticate to OpenStack")
    return sess
def __prepare_key_pair(self, key_name, private_key_path, public_key_path, password):
    """Make sure a key pair named `key_name` is known to the driver.

    Nothing is done when `key_name` is empty or when the driver already
    lists a key pair with that name.  Otherwise, the first applicable
    source is used:

    1. `public_key_path`: the public key file is imported;
    2. `private_key_path`: must end in ``.pem``; it is handed to
       ``__import_pem``;
    3. neither: ``<storage_path>/<key_name>.pem`` is used, after being
       written from the result of the driver's ``create_key_pair`` if
       the file does not exist yet.

    :param str key_name: name of the key pair
    :param str private_key_path: path to the private key file, or empty
    :param str public_key_path: path to the public key file, or empty
    :param password: passed on to ``__import_pem``
    :raises: `KeypairError` if the public key import fails or the private
             key file name does not end in ``.pem``
    """
    if not key_name:
        # `Logger.warn` is a deprecated alias; the rest of the file uses
        # `warning`
        log.warning('user_key_name has not been defined, assuming password-based authentication')
        return

    if key_name in [k.name for k in self.driver.list_key_pairs()]:
        log.info('Key pair `%s` already exists, skipping import.', key_name)
        return

    if public_key_path:
        log.debug("importing public key from file %s ...", public_key_path)
        imported = self.driver.import_key_pair_from_file(
            name=key_name,
            key_file_path=os.path.expandvars(
                os.path.expanduser(public_key_path)))
        if not imported:
            raise KeypairError(
                'Could not upload public key {p}'
                .format(p=public_key_path))
    elif private_key_path:
        if not private_key_path.endswith('.pem'):
            raise KeypairError(
                'can only work with .pem private keys,'
                ' derive public key and set user_key_public')
        log.debug("deriving and importing public key from private key")
        self.__import_pem(key_name, private_key_path, password)
    else:
        pem_file_path = os.path.join(self.storage_path, key_name + '.pem')
        if not os.path.exists(pem_file_path):
            with open(pem_file_path, 'w') as new_key_file:
                new_key_file.write(
                    self.driver.create_key_pair(name=key_name))
        self.__import_pem(key_name, pem_file_path, password)
def _init_az_api(self):
    """
    Initialise client objects for talking to Azure API.

    Kept as a separate function so that it can be called by both
    ``__init__`` and ``__setstate__``.  The whole body runs while holding
    ``self.__lock``; if ``self._resource_client`` is already set, nothing
    is created.
    """
    with self.__lock:
        if self._resource_client is not None:
            # clients were already created by an earlier call
            return

        # never log the actual secret, only whether one is set
        shown_secret = '<redacted>' if self.secret else None
        log.debug("Making Azure `ServicePrincipalcredentials` object"
                  " with tenant=%r, client_id=%r, secret=%r ...",
                  self.tenant_id, self.client_id, shown_secret)
        credentials = ServicePrincipalCredentials(
            tenant=self.tenant_id,
            client_id=self.client_id,
            secret=self.secret,
        )

        log.debug("Initializing Azure `ComputeManagementclient` ...")
        self._compute_client = ComputeManagementClient(
            credentials, self.subscription_id)

        log.debug("Initializing Azure `NetworkManagementclient` ...")
        self._network_client = NetworkManagementClient(
            credentials, self.subscription_id)

        log.debug("Initializing Azure `ResourceManagementclient` ...")
        self._resource_client = ResourceManagementClient(
            credentials, self.subscription_id)

        log.info("Azure API clients initialized.")
def verbose_add(fname, basedir='', comment=None):
    """Write file `fname` into the archive, logging the operation.

    ``zipfile`` is not a parameter: it is whatever object that name is
    bound to in the surrounding scope (presumably an open archive, since
    its `write` and `getinfo` methods are called -- confirm at the
    definition site).

    :param str fname: path of the file to add
    :param str basedir: prefix prepended to the base name of `fname` to
                        form the member name
    :param comment: if true-ish, stored as the ``comment`` attribute of
                    the new member's info object
    """
    member_name = basedir + os.path.basename(fname)
    log.info("Adding '%s' as '%s'" % (fname, member_name))
    zipfile.write(fname, member_name)
    if not comment:
        return
    member_info = zipfile.getinfo(member_name)
    member_info.comment = comment
def start(self, min_nodes=None):
    """
    Start all cluster nodes and verify the resulting cluster size.

    The steps performed, in order:

    1. start every node returned by `get_all_nodes()` -- one after the
       other if ``log.DO_NOT_FORK`` is set, otherwise in parallel with
       ``self.thread_pool_max_size`` as pool size;
    2. let `_check_starting_nodes` determine which nodes did not start
       within ``self.startup_timeout``;
    3. try an SSH connection to all the other nodes, to gather IP
       addresses and SSH host keys;
    4. check the cluster size against the minimum node counts.

    The cluster is saved to ``self.repository`` after each of the first
    three steps.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up
    :type min_nodes: dict [node_kind] = number
    """
    all_nodes = self.get_all_nodes()

    log.info("Starting cluster nodes ...")
    if log.DO_NOT_FORK:
        started = self._start_nodes_sequentially(all_nodes)
    else:
        started = self._start_nodes_parallel(
            all_nodes, self.thread_pool_max_size)

    # checkpoint cluster state
    self.repository.save_or_update(self)

    not_started = self._check_starting_nodes(started, self.startup_timeout)

    # checkpoint cluster state again, now that node states are known
    self.repository.save_or_update(self)

    # Try to connect to each node to gather IP addresses and SSH host keys
    log.info("Checking SSH connection to nodes ...")
    to_contact = started - not_started
    self._gather_node_ip_addresses(to_contact, self.startup_timeout)

    # Connecting to a node may have updated its `preferred_ip`
    # attribute, so save the cluster once more.
    self.repository.save_or_update(self)

    # Many things can go wrong while starting a cluster; make sure that
    # what is up matches the minimum size requested for each node group.
    required = self._compute_min_nodes(min_nodes)
    self._check_cluster_size(required)
def stop(self):
    """Ask the cloud provider to stop this node's instance, then forget
    the instance ID.
    """
    vm_id = self.instance_id
    log.info("shutting down instance `%s`", vm_id)
    self._cloud_provider.stop_instance(vm_id)
    # The EC2 cloud provider basically keeps returning a terminated
    # instance as "running".  Resetting `instance_id` to None forces
    # `is_alive()` not to ask the cloud provider, and makes the node
    # forget about the instance for good.
    self.instance_id = None
def start(self):
    """
    Start an instance for this node through the cloud provider.

    The value returned by the provider's `start_instance` is stored in
    ``self.instance_id``.  This method is non-blocking: it returns as
    soon as the cloud provider has returned the instance ID.
    """
    log.info("Starting node %s.", self.name)
    provider = self._cloud_provider
    self.instance_id = provider.start_instance(
        self.user_key_name,
        self.user_key_public,
        self.security_group,
        self.flavor,
        self.image,
        self.image_userdata)
    log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
def get_stored_clusters(self):
    """
    Returns a list of all stored clusters.

    A cluster is any regular file in ``self._storage_dir`` whose name
    ends in ``.json``; the returned names have that suffix removed.
    Every other directory entry is logged and skipped.
    """
    suffix = '.json'
    cluster_names = []
    for entry in os.listdir(self._storage_dir):
        entry_path = os.path.join(self._storage_dir, entry)
        if not (entry.endswith(suffix) and os.path.isfile(entry_path)):
            log.info("Ignoring invalid storage file %s", entry_path)
            continue
        cluster_names.append(entry[:-len(suffix)])
    return cluster_names
def start(self):
    """Start the node on the cloud using the given instance properties.

    The instance is named ``<cluster_name>-<name>`` and the entries of
    ``self.extra`` are passed on as additional keyword arguments.  The
    value returned by the provider's `start_instance` is stored in
    ``self.instance_id``.

    This method is non-blocking: it returns as soon as the cloud provider
    has returned the instance ID.  Use `is_alive` and `update_ips` to
    gather further details about the state of the node.
    """
    log.info("Starting node %s.", self.name)
    vm_name = "%s-%s" % (self.cluster_name, self.name)
    provider = self._cloud_provider
    self.instance_id = provider.start_instance(
        self.user_key_name,
        self.user_key_public,
        self.user_key_private,
        self.security_group,
        self.flavor,
        self.image_id,
        self.image_userdata,
        username=self.image_user,
        node_name=vm_name,
        **self.extra)
    log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
def _gather_node_ip_addresses(self, nodes, lapse):
    """
    Connect via SSH to each node.

    Nodes are polled every ``self.polling_interval`` seconds; each node
    that could be connected to is removed from `nodes` (the argument is
    modified in place) and its SSH host keys are saved to the "known
    hosts" file.

    Return set of nodes that could not be reached with `lapse` seconds.
    """
    # A local copy of the path is used: it may be reset to ``None`` if
    # the file cannot be opened, but the cluster-wide setting must not be
    # forgotten in case the error is transient.
    known_hosts_path = self.known_hosts_file

    # Opening in append mode creates the file if it is not present;
    # otherwise the following lines would raise an error.
    try:
        open(known_hosts_path, 'a').close()
    except IOError as err:
        log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                    known_hosts_path, err)
        known_hosts_path = None

    host_keys = paramiko.hostkeys.HostKeys(known_hosts_path)

    with timeout(lapse, raise_timeout_error):
        try:
            while nodes:
                # loop over a copy, as `nodes` is changed in the loop body
                for node in copy(nodes):
                    ssh = node.connect(keyfile=known_hosts_path)
                    if not ssh:
                        continue
                    log.info("Connection to node `%s` successful,"
                             " using IP address %s to connect.",
                             node.name, node.connection_ip())
                    # record all host keys seen on this connection
                    for host, key in ssh.get_host_keys().items():
                        for keytype, keydata in key.items():
                            host_keys.add(host, keytype, keydata)
                    self._save_keys_to_known_hosts_file(host_keys)
                    nodes.remove(node)
                if nodes:
                    time.sleep(self.polling_interval)
        except TimeoutError:
            log.error(
                "Some nodes of the cluster were unreachable"
                " within the given %d-seconds timeout: %s",
                lapse, ', '.join(node.name for node in nodes))

    # whatever is left in `nodes` could not be reached
    return nodes
def _add_key_to_sshagent(self, private_key_path):
    """Add a private key to the ssh-agent by running ``ssh-add``.

    Once a call has failed, ``self._SSH_KEY_ACCESS_ERROR`` is set and
    every later call raises immediately without running ``ssh-add``
    again.

    :param str private_key_path: path to the ssh private key file
    :raises KeyNotAccessible: if ``ssh-add`` exits with a non-zero status
        (per the original author, this happens when the password provided
        is empty; in other cases ``ssh-add`` asks for the password again),
        or if an earlier call already failed
    """
    # Remember an earlier failure, so that the check (and the password
    # prompt) is not repeated after it was done for the first instance.
    if self._SSH_KEY_ACCESS_ERROR:
        raise KeyNotAccessible

    return_code = subprocess.call(['ssh-add', private_key_path])
    if return_code != 0:
        self._SSH_KEY_ACCESS_ERROR = True
        raise KeyNotAccessible

    log.info("Key %s suscessfully added to ssh-agent", private_key_path)
def start(self):
    """
    Start an instance for this node through the cloud provider.

    The value returned by the provider's `start_instance` is stored in
    ``self.instance_id``.  This method is non-blocking: it returns as
    soon as the cloud provider has returned the instance ID.
    """
    log.info("Starting node %s.", self.name)
    provider = self._cloud_provider
    self.instance_id = provider.start_instance(
        self.user_key_name,
        self.user_key_public,
        self.user_key_private,
        self.security_group,
        self.flavor,
        self.image,
        self.image_userdata,
        username=self.image_user)
    log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
def stop(self, wait=False):
    """
    Terminate the VM instance launched on the cloud for this specific node.

    Does nothing if the node has no instance ID.

    :param bool wait: if true, poll `is_alive()` once per second and
                      only return when it reports false
    """
    if self.instance_id is None:
        return

    log.info("Shutting down instance `%s` ...", self.instance_id)
    self._cloud_provider.stop_instance(self.instance_id)
    while wait and self.is_alive():
        time.sleep(1)

    # The EC2 cloud provider basically keeps returning a terminated
    # instance as "running".  Resetting `instance_id` to None forces
    # `is_alive()` not to ask the cloud provider, and makes the node
    # forget about the instance for good.
    self.instance_id = None
def get_ips(self, instance_id):
    """Retrieve the public and private IP address of an instance.

    Addresses are read from the first network interface in the API
    response: ``natIP`` of its first access config, and ``networkIP``.

    :param str instance_id: id of the instance
    :return: ``[public_ip, private_ip]``; ``[None]`` if the instance
             status is ``STOPPING`` or ``TERMINATED``
    :raises: InstanceError if `instance_id` is empty, if either address
             is missing, or if the request fails
    """
    if not instance_id:
        raise InstanceError("could not retrieve the ip address for node: "
                            "no associated instance id")

    gce = self._connect()
    instances = gce.instances()
    try:
        get_request = instances.get(instance=instance_id,
                                    project=self._project_id,
                                    zone=self._zone)
        response = self._execute_request(get_request)
        ip_public = None

        # An instance which is stopping or terminated has no IP
        # addresses.
        if response and response['status'] in ('STOPPING', 'TERMINATED'):
            log.info("node '%s' state is '%s'; no IP address(es)"
                     % (instance_id, response['status']))
            return [None]

        if response and "networkInterfaces" in response:
            interfaces = response['networkInterfaces']
            if interfaces:
                first_nic = interfaces[0]
                if "accessConfigs" in first_nic:
                    ip_public = first_nic['accessConfigs'][0]['natIP']
                ip_private = first_nic['networkIP']

        # `ip_private` is only looked at when `ip_public` is set, which
        # implies that the assignment above has been executed
        if not (ip_public and ip_private):
            raise InstanceError("could not retrieve the ip address for "
                                "node `%s`, please check the node "
                                "through the cloud provider interface"
                                % instance_id)
        return [ip_public, ip_private]

    except (HttpError, CloudProviderError) as err:
        raise InstanceError('could not retrieve the ip address of `%s`: '
                            '`%s`' % (instance_id, err))
def stop(self, wait=False):
    """
    Terminate the VM instance launched on the cloud for this specific node.

    Does nothing if the node has no instance ID.

    :param bool wait: if true, poll `is_alive()` once per second and
                      only return when it reports false
    """
    if self.instance_id is None:
        return

    log.info("Shutting down node `%s` (VM instance `%s`) ...",
             self.name, self.instance_id)
    self._cloud_provider.stop_instance(self.instance_id)
    while wait and self.is_alive():
        time.sleep(1)

    # The EC2 cloud provider basically keeps returning a terminated
    # instance as "running".  Resetting `instance_id` to None forces
    # `is_alive()` not to ask the cloud provider, and makes the node
    # forget about the instance for good.
    self.instance_id = None
def start(self):
    """
    Start the node on the cloud using the given instance properties.

    The instance is named ``<cluster_name>-<name>`` and the entries of
    ``self.extra`` are passed on as additional keyword arguments.  The
    value returned by the provider's `start_instance` is stored in
    ``self.instance_id``.

    This method is non-blocking: it returns as soon as the cloud provider
    has returned the instance ID.  The `is_alive`:meth: and
    `update_ips`:meth: methods should be used to gather further details
    about the state of the node.
    """
    log.info("Starting node %s ...", self.name)
    vm_name = "%s-%s" % (self.cluster_name, self.name)
    provider = self._cloud_provider
    self.instance_id = provider.start_instance(
        self.user_key_name,
        self.user_key_public,
        self.user_key_private,
        self.security_group,
        self.flavor,
        self.image_id,
        self.image_userdata,
        username=self.image_user,
        node_name=vm_name,
        **self.extra)
    log.debug("Node `%s` has instance ID `%s`", self.name, self.instance_id)
def get_ips(self, instance_id):
    """Retrieve the public IP address of an instance.

    The address is the ``natIP`` of the first access config of the first
    network interface in the API response.

    :param str instance_id: id of the instance
    :return: ``[public_ip]``; ``[None]`` if the instance status is
             ``STOPPING`` or ``TERMINATED``
    :raises: InstanceError if `instance_id` is empty, if no public
             address is found, or if the request fails
    """
    if not instance_id:
        raise InstanceError("could not retrieve the ip address for node: "
                            "no associated instance id")

    gce = self._connect()
    instances = gce.instances()
    try:
        get_request = instances.get(instance=instance_id,
                                    project=self._project_id,
                                    zone=self._zone)
        response = self._execute_request(get_request)
        ip_public = None

        # An instance which is stopping or terminated has no IP
        # addresses.
        if response and response['status'] in ('STOPPING', 'TERMINATED'):
            log.info("node '%s' state is '%s'; no IP address(es)"
                     % (instance_id, response['status']))
            return [None]

        if response and "networkInterfaces" in response:
            interfaces = response['networkInterfaces']
            if interfaces and "accessConfigs" in interfaces[0]:
                ip_public = interfaces[0]['accessConfigs'][0]['natIP']

        if not ip_public:
            raise InstanceError("could not retrieve the ip address for "
                                "node `%s`, please check the node "
                                "through the cloud provider interface"
                                % instance_id)
        return [ip_public]

    except (HttpError, CloudProviderError) as err:
        raise InstanceError('could not retrieve the ip address of `%s`: '
                            '`%s`' % (instance_id, err))
def start(self):
    """Start the node on the cloud using the given instance properties.

    The node's own name is used as instance name.  The value returned by
    the provider's `start_instance` is stored in ``self.instance_id``.

    This method is non-blocking: it returns as soon as the cloud provider
    has returned the instance ID.  Use `is_alive` and `update_ips` to
    gather further details about the state of the node.
    """
    log.info("Starting node %s.", self.name)
    provider = self._cloud_provider
    self.instance_id = provider.start_instance(
        self.user_key_name,
        self.user_key_public,
        self.user_key_private,
        self.security_group,
        self.flavor,
        self.image,
        self.image_userdata,
        username=self.image_user,
        node_name=self.name)
    log.debug("Node %s has instance_id: `%s`", self.name, self.instance_id)
def get_all(self):
    """Retrieves all clusters from the persistent state.

    Cluster names are taken from the regular files in
    ``self.storage_path`` whose name ends in
    ``.<ClusterRepository.file_ending>``; every other directory entry is
    logged and skipped.  Each cluster is then loaded with `get()`.

    :return: list of :py:class:`elasticluster.cluster.Cluster`
    """
    suffix = '.%s' % ClusterRepository.file_ending
    cluster_names = []
    for entry in os.listdir(self.storage_path):
        entry_path = os.path.join(self.storage_path, entry)
        if not (entry.endswith(suffix) and os.path.isfile(entry_path)):
            log.info("Ignoring invalid storage file %s", entry_path)
            continue
        # strip the dot and the file ending to get the cluster name
        cluster_names.append(entry[:-len(suffix)])

    return [self.get(name) for name in cluster_names]
def __init_keystone_session_v3(self, check=False):
    """
    Return a new session object, created using Keystone API v3.

    .. note::

      Note that the only supported authN method is password authentication;
      token or other plug-ins are not currently supported.

    :param bool check: if true, probe the session by listing flavors
                       through a Nova client before returning it
    :return: the session object; ``None`` if the Keystone v3 module
             cannot be imported, or if `check` is true and the probe
             raised ``NotFound`` or ``ClientException``
    """
    try:
        # may fail on Python 2.6?
        from keystoneauth1.identity import v3 as keystone_v3
    except ImportError:
        log.warning("Cannot load Keystone API v3 library.")
        return None

    auth = keystone_v3.Password(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        user_domain_name=self._os_user_domain_name,
        project_domain_name=self._os_project_domain_name,
        project_name=self._os_tenant_name,
    )
    sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v3 session works...")
        try:
            # an invalid session makes the following calls raise
            probe = nova_client.Client(self.compute_api_version,
                                       session=sess)
            probe.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v3 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error(
                "OpenStack server rejected request (likely configuration error?): %s",
                err)
            return None  # FIXME: should we be raising an error instead?

    # if we got to this point, v3 session is valid
    log.info("Using Keystone API v3 session to authenticate to OpenStack")
    return sess
def stop_instance(self, instance_id):
    """Delete the instance identified by `instance_id`.

    Nothing is done (apart from logging) when `instance_id` is empty.

    :param str instance_id: instance identifier
    :raises: `InstanceError` if the request fails with an `HttpError` or
             a `CloudProviderError`
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return

    gce = self._connect()

    try:
        delete_request = gce.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_request))
    except (HttpError, CloudProviderError) as err:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, err))
def __init_keystone_session_v3(self, check=False):
    """
    Return a new session object, created using Keystone API v3.

    .. note::

      Note that the only supported authN method is password authentication;
      token or other plug-ins are not currently supported.

    :param bool check: if true, probe the session by listing flavors
                       through a Nova client before returning it
    :return: the session object; ``None`` if the Keystone v3 module
             cannot be imported, or if `check` is true and the probe
             raised ``NotFound`` or ``ClientException``
    """
    try:
        # may fail on Python 2.6?
        from keystoneauth1.identity import v3 as keystone_v3
    except ImportError:
        log.warning("Cannot load Keystone API v3 library.")
        return None

    auth = keystone_v3.Password(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        user_domain_name=self._os_user_domain_name,
        project_domain_name=self._os_project_domain_name,
        project_name=self._os_tenant_name,
    )
    sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v3 session works...")
        try:
            # an invalid session makes the following calls raise
            probe = nova_client.Client(self._compute_api_version,
                                       session=sess)
            probe.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v3 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error("OpenStack server rejected request (likely configuration error?): %s", err)
            return None  # FIXME: should we be raising an error instead?

    # if we got to this point, v3 session is valid
    log.info("Using Keystone API v3 session to authenticate to OpenStack")
    return sess
def _start_node(node):
    """
    Start the given node VM.

    A node that already reports being alive is left alone and counts as
    success.  Any exception raised while starting is logged together
    with its traceback.

    :return: bool -- True on success, False otherwise
    """
    log.debug("_start_node: working on node `%s`", node.name)
    # FIXME: the following check is not optimal yet. When a node is still
    # in a starting state, it will start another node here, since the
    # `is_alive` method will only check for running nodes (see issue #13)
    if node.is_alive():
        log.info("Not starting node `%s` which is already up.", node.name)
        return True

    try:
        node.start()
        log.info("Node `%s` has been started.", node.name)
    except Exception as err:
        log.exception("Could not start node `%s`: %s -- %s",
                      node.name, err, err.__class__)
        return False
    else:
        return True
def __prepare_key_pair(self, key_name, private_key_path, public_key_path, password):
    """Make sure a key pair named `key_name` is known to the provider.

    Nothing is done when `key_name` is empty.  Otherwise the provider
    functions for listing, importing and creating key pairs are looked
    up first; then, unless a key pair with that name is already listed,
    the first applicable source is used:

    1. `public_key_path`: the public key file is imported;
    2. `private_key_path`: must end in ``.pem``; it is handed to
       ``__import_pem``;
    3. neither: ``<storage_path>/<key_name>.pem`` is used, after being
       written from the result of the provider's ``create_key_pair``
       function if the file does not exist yet.

    :param str key_name: name of the key pair
    :param str private_key_path: path to the private key file, or empty
    :param str public_key_path: path to the public key file, or empty
    :param password: passed on to ``__import_pem``
    :raises: `UnsupportedError` if one of the three provider functions
             cannot be found
    :raises: `KeypairError` if the public key import fails or the private
             key file name does not end in ``.pem``
    """
    if not key_name:
        # `Logger.warn` is a deprecated alias; the rest of the file uses
        # `warning`
        log.warning('user_key_name has not been defined, assuming password based authentication')
        return

    # Look up all needed provider functions upfront (in the same order as
    # they are checked), and keep the results for use further below.
    try:
        list_key_pairs = self.__get_function_by_pattern('list_key_pairs')
    except AttributeError:
        raise UnsupportedError('key management not supported by provider')

    try:
        import_key_pair = self.__get_function_or_ex_function('import_key_pair_from_file')
    except AttributeError:
        raise UnsupportedError('key import not supported by provider')

    try:
        create_key_pair = self.__get_function_or_ex_function('create_key_pair')
    except AttributeError:
        raise UnsupportedError('key creation not supported by provider')

    if key_name in [k.name for k in list_key_pairs()]:
        log.info('Key pair (%s) already exists, skipping import.', key_name)
        return

    if public_key_path:
        log.debug("importing public key from path %s", public_key_path)
        key_file_path = os.path.expandvars(os.path.expanduser(public_key_path))
        if not import_key_pair(name=key_name, key_file_path=key_file_path):
            raise KeypairError('failure during import of public key {p}'.format(p=public_key_path))
    elif private_key_path:
        if not private_key_path.endswith('.pem'):
            raise KeypairError('can only work with .pem private keys, derive public key and set user_key_public')
        log.debug("deriving and importing public key from private key")
        self.__import_pem(key_name, private_key_path, password)
    else:
        # Build the path once, with an explicitly named format field
        # (auto-numbered `{}` fields are not available on Python 2.6).
        pem_file_path = os.path.join(self.storage_path, '{p}.pem'.format(p=key_name))
        if not os.path.exists(pem_file_path):
            with open(pem_file_path, 'w') as new_key_file:
                new_key_file.write(create_key_pair(name=key_name))
        self.__import_pem(key_name, pem_file_path, password)
def _start_node(node):
    """Start `node` unless it already reports being alive.

    :return: ``True`` if the node is already alive or has been started;
             ``None`` if `node.start()` raised an exception (which is
             logged)
    """
    log.debug("_start_node: working on node `%s`" % node.name)
    # TODO: the following check is not optimal yet. When a
    # node is still in a starting state,
    # it will start another node here,
    # since the `is_alive` method will only check for
    # running nodes (see issue #13)
    if node.is_alive():
        log.info("Not starting node `%s` which is already up&running.",
                 node.name)
        return True

    try:
        node.start()
        log.info("Node `%s` has been started.", node.name)
        return True
    except Exception as err:
        log.error("Could not start node `%s`: %s", node.name, err)
        return None
def _start_node(node):
    """
    Start the given node VM.

    A node that already reports being alive is left alone and counts as
    success.  Any exception raised while starting is logged together
    with its traceback.

    :return: bool -- True on success, False otherwise
    """
    log.debug("_start_node: working on node `%s`", node.name)
    # FIXME: the following check is not optimal yet. When a node is still
    # in a starting state, it will start another node here, since the
    # `is_alive` method will only check for running nodes (see issue #13)
    if node.is_alive():
        log.info("Not starting node `%s` which is already up&running.",
                 node.name)
        return True

    try:
        node.start()
        log.info("Node `%s` has been started.", node.name)
    except Exception as err:
        log.exception("Could not start node `%s`: %s -- %s",
                      node.name, err, err.__class__)
        return False
    else:
        return True
def get_all(self):
    """Retrieves all clusters from the persistent state.

    Cluster names are taken from the regular files in
    ``self.storage_path`` whose name ends in
    ``.<PickleRepository.file_ending>``; every other directory entry is
    logged and skipped.  A cluster whose loading raises `ImportError` or
    `AttributeError` is logged and left out of the result.

    :return: list of :py:class:`elasticluster.cluster.Cluster`
    """
    suffix = '.%s' % PickleRepository.file_ending
    cluster_names = []
    for entry in os.listdir(self.storage_path):
        entry_path = os.path.join(self.storage_path, entry)
        if not (entry.endswith(suffix) and os.path.isfile(entry_path)):
            log.info("Ignoring invalid storage file %s", entry_path)
            continue
        # strip the dot and the file ending to get the cluster name
        cluster_names.append(entry[:-len(suffix)])

    loaded = []
    for name in cluster_names:
        try:
            loaded.append(self.get(name))
        except (ImportError, AttributeError) as ex:
            log.error("Unable to load cluster %s: `%s`", name, ex)
            log.error("If cluster %s was created with a previous version of elasticluster, you may need to run `elasticluster migrate %s %s` to update it.", name, self.storage_path, name)
    return loaded
def __detect_os_identity_api_version(self):
    """
    Return preferred OpenStack Identity API version (either one of the two
    strings ``'2'`` or ``'3'``) or ``None``.

    The following auto-detection strategies are tried (in this order):

    #. Read the environmental variable `OS_IDENTITY_API_VERSION`: value
       ``3`` selects v3; value ``2``, or any value starting with ``2.``,
       selects v2;
    #. Check if a version tag like ``/v3`` or ``/v2.0`` ends the
       OpenStack auth URL.

    If none of the above worked, return ``None``.

    For more information on ``OS_IDENTITY_API_VERSION``, please see
    `<https://docs.openstack.org/developer/python-openstackclient/authentication.html>`_.
    """
    # 1. explicit setting in the environment wins
    env_version = os.getenv('OS_IDENTITY_API_VERSION', '')
    if env_version == '3':
        log.info(
            "Using OpenStack Identity API v3"
            " because of environmental variable setting `OS_IDENTITY_API_VERSION=3`"
        )
        return '3'
    if env_version == '2' or env_version.startswith('2.'):
        log.info(
            "Using OpenStack Identity API v2"
            " because of environmental variable setting `OS_IDENTITY_API_VERSION=2`"
        )
        return '2'

    # 2. otherwise look at how the auth URL ends
    auth_url = self._os_auth_url
    if auth_url.endswith('/v3'):
        log.info(
            "Using OpenStack Identity API v3 because of `/v3` ending in auth URL;"
            " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v2 instead."
        )
        return '3'
    if auth_url.endswith('/v2.0'):
        log.info(
            "Using OpenStack Identity API v2 because of `/v2.0` ending in auth URL;"
            " set environmental variable OS_IDENTITY_API_VERSION to force use of Identity API v3 instead."
        )
        return '2'

    # auto-detection failed, need to probe
    return None
def _get_credentials(self):
    """Return credentials for the GCE API, trying three sources in order.

    1. If a client id and client secret are set: credentials stored in
       ``<storage_path>/<client_id>.oauth.dat``, provided they exist and
       are not marked invalid.
    2. Google Application Default Credentials.
    3. If a client id and client secret are set: the OAuth flow run by
       `run_flow`, which stores its result in the file from step 1.

    :return: the first credentials object obtained
    :raises: `CredentialsError` if none of the sources yields credentials
    """
    # Both stay ``None`` without client id/secret; step 3 is then skipped
    # explicitly, instead of failing on unbound names inside its `try`.
    flow = None
    storage = None

    if self._client_id and self._client_secret:
        flow = OAuth2WebServerFlow(self._client_id, self._client_secret,
                                   GCE_SCOPE)
        # The `Storage` object holds the credentials that your
        # application needs to authorize access to the user's
        # data. The name of the credentials file is provided. If the
        # file does not exist, it is created. This object can only
        # hold credentials for a single user. It stores the access
        # privileges for the application, so a user only has to grant
        # access through the web interface once.
        storage_path = os.path.join(self._storage_path,
                                    self._client_id + '.oauth.dat')
        storage = Storage(storage_path)

        credentials = storage.get()
        if credentials is not None and not credentials.invalid:
            return credentials
        else:
            log.info("Determined that provided credentials are not valid.")

    try:
        # Next, check to see if there is a set of application
        # default credentials to use.
        log.info(
            "Attempting to use Google Application Default Credentials.")
        return GoogleCredentials.get_application_default()
    except ApplicationDefaultCredentialsError:
        log.info(
            "Failed to use Google Application Default Credentials, falling back to config."
        )
        log.debug("(Original traceback follows.)", exc_info=True)

    if flow is None:
        log.debug("No client id and client secret configured,"
                  " skipping the OAuth authentication flow.")
    else:
        try:
            # Finally, try to start a browser to have the user authenticate with Google
            args = argparser.parse_args([])
            args.noauth_local_webserver = self._noauth_local_webserver
            return run_flow(flow, storage, flags=args)
        except Exception as err:
            log.error("Could not run authentication flow: %s", err)
            log.debug("(Original traceback follows.)", exc_info=True)

    raise CredentialsError(
        "No method to obtain GCE credentials was successful! Either "
        "set up Application Default Credentials using gcloud, or "
        "provide a client id and client secret from an oauth flow, "
        "or go through the oauth flow that elasticluster runs.")
def _get_credentials(self):
    """
    Obtain credentials for GCE, trying several methods in turn.

    The following strategies are attempted, in this order:

    #. if both a client id and a client secret are set, load stored
       credentials from the file ``<client id>.oauth.dat`` in the
       storage directory, and use them if they are not marked invalid;
    #. Google Application Default Credentials;
    #. if both a client id and a client secret are set, run the OAuth2
       flow so that the user can authenticate with Google.

    :return: the credentials object from the first strategy that worked
    :raises CredentialsError: if no strategy yielded credentials
    """
    # Both stay `None` when no client id/secret is configured; the
    # OAuth2 flow below is skipped in that case instead of failing on
    # an unbound local variable.
    flow = None
    storage = None

    if self._client_id and self._client_secret:
        flow = OAuth2WebServerFlow(self._client_id, self._client_secret,
                                   GCE_SCOPE)
        # The `Storage` object holds the credentials that your
        # application needs to authorize access to the user's
        # data. The name of the credentials file is provided. If the
        # file does not exist, it is created. This object can only
        # hold credentials for a single user. It stores the access
        # privileges for the application, so a user only has to grant
        # access through the web interface once.
        storage_path = os.path.join(self._storage_path,
                                    self._client_id + '.oauth.dat')
        storage = Storage(storage_path)

        credentials = storage.get()
        if credentials is not None and not credentials.invalid:
            return credentials
        log.info("Determined that provided credentials are not valid.")

    try:
        # Next, check to see if there is a set of application
        # default credentials to use.
        log.info(
            "Attempting to use Google Application Default Credentials.")
        return GoogleCredentials.get_application_default()
    except ApplicationDefaultCredentialsError:
        log.info(
            "Failed to use Google Application Default Credentials,"
            " falling back to config.")
        log.debug("(Original traceback follows.)", exc_info=True)

    if flow is None:
        log.error("Could not run authentication flow: %s",
                  "no client id and client secret were provided")
    else:
        try:
            # Finally, try to start a browser to have the user
            # authenticate with Google
            args = argparser.parse_args([])
            args.noauth_local_webserver = self._noauth_local_webserver
            return run_flow(flow, storage, flags=args)
        except Exception as err:
            # deliberately broad: any failure here just means this
            # strategy did not work, and we report it below
            log.error("Could not run authentication flow: %s", err)
            log.debug("(Original traceback follows.)", exc_info=True)

    raise CredentialsError(
        "No method to obtain GCE credentials was successful! Either "
        "set up Application Default Credentials using gcloud, or "
        "provide a client id and client secret from an oauth flow, "
        "or go through the oauth flow that elasticluster runs.")
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, node_name=None, **kwargs):
    """Starts a new instance on the cloud using the given properties.

    The following tasks are done to start an instance:

    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check that the security groups, the image and the flavor exist
    * optionally create a volume to be used as boot disk
    * run the instance with the given properties
    * optionally allocate a floating IP for the instance

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: comma-separated list of security group
      names to apply on the instance
    :param str flavor: name of the machine type to use for the instance
    :param str image_id: ID of the image (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
      (not used by this method)
    :param str node_name: name to give to the new instance
    :param kwargs: optional settings; the ones read here are
      `network_ids` (comma-separated network IDs), `boot_disk_size`
      (positive integer; triggers creation of a boot volume) and
      `boot_disk_type` (volume type, only read together with
      `boot_disk_size`).

    :return: str - instance id of the started instance

    :raises ImageError: if the image ID is unknown, or the boot volume
      name is already taken
    :raises FlavorError: if no flavor with the given name exists
    :raises ConfigurationError: if `boot_disk_size` is not a positive
      integer
    """
    vm_start_args = {}

    log.debug("Checking keypair `%s` ...", key_name)
    with OpenStackCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)
    vm_start_args['key_name'] = key_name

    security_groups = [sg.strip() for sg in security_group.split(',')]
    self._check_security_groups(security_groups)
    vm_start_args['security_groups'] = security_groups

    # Check if the image id is present.
    if image_id not in [img.id for img in self._get_images()]:
        raise ImageError(
            "No image found with ID `{0}` in project `{1}` of cloud {2}"
            .format(image_id, self._os_tenant_name, self._os_auth_url))
    vm_start_args['userdata'] = image_userdata

    # Check if the flavor exists
    flavors = [fl for fl in self._get_flavors() if fl.name == flavor]
    if not flavors:
        raise FlavorError(
            "No flavor found with name `{0}` in project `{1}` of cloud {2}"
            .format(flavor, self._os_tenant_name, self._os_auth_url))
    flavor = flavors[0]

    # Drop empty items: `''.split(',')` is `['']`, which would otherwise
    # be taken as "one network with an empty ID".
    network_ids = [
        net_id.strip()
        for net_id in (kwargs.pop('network_ids', '') or '').split(',')
        if net_id.strip()
    ]
    if network_ids:
        nics = [{'net-id': net_id, 'v4-fixed-ip': ''}
                for net_id in network_ids]
        log.debug("Specifying networks for node %s: %s",
                  node_name, ', '.join([nic['net-id'] for nic in nics]))
    else:
        nics = None
    vm_start_args['nics'] = nics

    if 'boot_disk_size' in kwargs:
        # check if the backing volume is already there
        volume_name = '{name}-{id}'.format(name=node_name, id=image_id)
        if volume_name in [v.name for v in self._get_volumes()]:
            raise ImageError(
                "Volume `{0}` already exists in project `{1}` of cloud {2}"
                .format(volume_name,
                        self._os_tenant_name, self._os_auth_url))

        log.info('Creating volume `%s` to use as VM disk ...', volume_name)
        try:
            bds = int(kwargs['boot_disk_size'])
            if bds < 1:
                raise ValueError('non-positive int')
        except (ValueError, TypeError):
            raise ConfigurationError(
                "Invalid `boot_disk_size` specified:"
                " should be a positive integer, got {0} instead"
                .format(kwargs['boot_disk_size']))
        volume = self.cinder_client.volumes.create(
            size=bds, name=volume_name, imageRef=image_id,
            # the volume type is optional: do not fail with `KeyError`
            # when only `boot_disk_size` was given
            volume_type=kwargs.pop('boot_disk_type', None))

        # wait for volume to come up
        # FIXME: no timeout; loops forever if the volume never becomes
        # 'available'
        volume_available = False
        while not volume_available:
            for v in self._get_volumes():
                if v.name == volume_name and v.status == 'available':
                    volume_available = True
                    break
            sleep(1)  # FIXME: hard-coded waiting time

        # ok, use volume as VM disk
        vm_start_args['block_device_mapping'] = {
            # FIXME: is it possible that `vda` is not the boot disk? e.g. if
            # a non-paravirtualized kernel is being used?  should we allow
            # to set the boot device as an image parameter?
            'vda': ('{id}:::{delete_on_terminate}'
                    .format(id=volume.id, delete_on_terminate=1)),
        }

    # due to some `nova_client.servers.create()` implementation weirdness,
    # the first three args need to be spelt out explicitly and cannot be
    # conflated into `**vm_start_args`
    vm = self.nova_client.servers.create(
        node_name, image_id, flavor, **vm_start_args)

    # allocate and attach a floating IP, if requested
    if self.request_floating_ip:
        # We need to list the floating IPs for this instance
        try:
            # python-novaclient <8.0.0
            floating_ips = [ip for ip in self.nova_client.floating_ips.list()
                            if ip.instance_id == vm.id]
        except AttributeError:
            # NOTE(review): if `list_floatingips()` returns a dict with
            # a `floatingips` key, this value is always truthy and no
            # new address is ever allocated below -- confirm against
            # the neutron client API before relying on this branch.
            floating_ips = self.neutron_client.list_floatingips(id=vm.id)
        # allocate new floating IP if none given
        if not floating_ips:
            self._allocate_address(vm, network_ids)

    self._instances[vm.id] = vm

    return vm.id
def _build_inventory(self, cluster):
    """
    Write an Ansible inventory file for `cluster` and return its path.

    Nodes without a preferred IP address, or whose kind is not listed
    in `self.groups`, are skipped with a warning.  Every other node is
    added to each inventory group associated with its kind.

    :param cluster: cluster to build inventory for
    :type cluster: :py:class:`elasticluster.cluster.Cluster`

    :return: path to the inventory file, or ``None`` if no node could
      be added to the inventory (no file is written then).
    """
    hosts_by_group = defaultdict(list)

    for node in cluster.get_all_nodes():
        if node.preferred_ip is None:
            log.warning(
                "Ignoring node `{0}`: No IP address.".format(node.name))
            continue
        if node.kind not in self.groups:
            # FIXME: should this raise a `ConfigurationError` instead?
            log.warning(
                "Ignoring node `{0}`: Node kind `{1}` not defined in cluster!"
                .format(node.name, node.kind))
            continue

        host_vars = ['ansible_user=%s' % node.image_user]

        ip_addr, port = parse_ip_address_and_port(node.preferred_ip)
        if port != 22:
            host_vars.append('ansible_port=%s' % port)

        # write additional `ansible_*` variables to inventory;
        # `ansible_python_interpreter` gets special treatment
        # since we need to tell script `install-py2.sh` that
        # it should create a wrapper script for running `eatmydata python`
        remaining_conf = self.extra_conf.copy()
        interpreter = remaining_conf.pop(
            'ansible_python_interpreter', '/usr/bin/python')
        if self.use_eatmydata:
            suffix = '+eatmydata'
        else:
            suffix = ''
        host_vars.append(
            'ansible_python_interpreter={python}{eatmydata}'.format(
                python=interpreter, eatmydata=suffix))
        for key, value in remaining_conf.items():
            if key.startswith('ansible_'):
                host_vars.append('%s=%s' % (key, value))

        # per-kind environment settings, if any
        if node.kind in self.environment:
            for key, value in self.environment[node.kind].items():
                host_vars.append('%s=%s' % (key, value))

        entry = (node.name, ip_addr, ' '.join(host_vars))
        for group in self.groups[node.kind]:
            hosts_by_group[group].append(entry)

    if not hosts_by_group:
        log.info("No inventory file was created.")
        return None

    # create a temporary file to pass to ansible, since the
    # api is not stable yet...
    if self._storage_path_tmp:
        if not self._storage_path:
            self._storage_path = tempfile.mkdtemp()
        elasticluster.log.warning(
            "Writing inventory file to tmp dir `%s`", self._storage_path)

    inventory_path = os.path.join(
        self._storage_path, cluster.name + '.inventory')
    log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)
    with open(inventory_path, 'w+') as inventory_file:
        for section, hosts in hosts_by_group.items():
            # Ansible throws an error "argument of type 'NoneType' is not
            # iterable" if a section is empty, so only write sections
            # that have at least one host
            if not hosts:
                continue
            inventory_file.write("\n[" + section + "]\n")
            for host in hosts:
                inventory_file.write(
                    "{0} ansible_host={1} {2}\n".format(*host))
    return inventory_path
def execute(self):
    """
    Import a cluster from a ZIP archive into the configured repository.

    The archive `self.params.file` is extracted into a temporary
    directory; the first cluster found there is (optionally renamed to
    `self.params.rename` and then) saved into the repository, together
    with its known-hosts file and any SSH key files found in the
    archive.  The temporary directory is always removed at the end.

    This method does not return: it terminates the process through
    `sys.exit` with one of these codes:

    * ``0`` -- the cluster was imported;
    * ``1`` -- an error occurred (including the case where a cluster
      with the same name already exists);
    * ``2`` -- the archive does not contain any cluster.
    """
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    repo = creator.create_repository()
    tmpdir = tempfile.mkdtemp()
    log.debug("Using temporary directory %s" % tmpdir)
    tmpconf = make_creator(self.params.config, storage_path=tmpdir)
    tmprepo = tmpconf.create_repository()

    rc = 0
    try:
        # NOTE: the archive content is trusted here; `extractall` should
        # not be fed archives from untrusted sources without inspection.
        with ZipFile(self.params.file, 'r') as archive:
            log.debug("ZIP file %s opened" % self.params.file)
            archive.extractall(tmpdir)

        newclusters = tmprepo.get_all()
        if not newclusters:
            # check *before* indexing, otherwise an empty archive ends
            # up as a generic `IndexError` with exit code 1
            log.error("ZIP file %s does not contain a valid cluster."
                      % self.params.file)
            rc = 2
        else:
            cluster = newclusters[0]
            cur_clusternames = [c.name for c in repo.get_all()]
            oldname = cluster.name
            if self.params.rename:
                cluster.name = self.params.rename
                for node in cluster.get_all_nodes():
                    node.cluster_name = cluster.name
            if cluster.name in cur_clusternames:
                raise Exception(
                    "A cluster with name %s already exists. Use "
                    "option --rename to rename the cluster to be "
                    "imported." % cluster.name)

            # Save the cluster in the new position
            cluster.repository = repo
            repo.save_or_update(cluster)
            dest = cluster.repository.storage_path

            # Copy the known hosts
            srcfile = os.path.join(tmpdir, oldname + '.known_hosts')
            destfile = os.path.join(dest, cluster.name + '.known_hosts')
            shutil.copy(srcfile, destfile)

            # Copy the ssh keys, if present
            for attr in ('user_key_public', 'user_key_private'):
                keyfile = getattr(cluster, attr)
                keybase = os.path.basename(keyfile)
                srcfile = os.path.join(tmpdir, keybase)
                if os.path.isfile(srcfile):
                    log.info("Importing key file %s" % keybase)
                    destfile = os.path.join(dest, keybase)
                    shutil.copy(srcfile, destfile)
                    setattr(cluster, attr, destfile)

                for node in cluster.get_all_nodes():
                    nodekeyfile = getattr(node, attr)
                    # Check if it's different from the main key
                    if nodekeyfile != keyfile \
                       and os.path.isfile(nodekeyfile):
                        destdir = os.path.join(dest,
                                               cluster.name,
                                               node.kind,
                                               node.name)
                        nodekeybase = os.path.basename(nodekeyfile)
                        log.info("Importing key file %s for node %s"
                                 % (nodekeybase, node.name))
                        if not os.path.isdir(destdir):
                            os.makedirs(destdir)
                        # Path to key in zip file
                        srcfile = os.path.join(tmpdir,
                                               oldname,
                                               node.kind,
                                               node.name,
                                               nodekeybase)
                        destfile = os.path.join(destdir, nodekeybase)
                        shutil.copy(srcfile, destfile)
                    # Always save the correct destfile
                    setattr(node, attr, destfile)

            # persist the updated key paths
            repo.save_or_update(cluster)
    except Exception as ex:
        # top-level boundary of the command: report and exit non-zero
        log.error("Unable to import from zipfile %s: %s"
                  % (self.params.file, ex))
        rc = 1
    finally:
        if os.path.isdir(tmpdir):
            shutil.rmtree(tmpdir)
            log.info("Cleaning up directory %s" % tmpdir)

    if rc == 0:
        print("Successfully imported cluster from ZIP %s to %s"
              % (self.params.file, repo.storage_path))
    sys.exit(rc)
def _allocate_address_neutron(self, instance, network_ids):
    """
    Allocates a floating/public ip address to the given instance,
    using the OpenStack Network ('Neutron') API.

    An unassigned floating IP on one of the given networks is re-used
    if there is one; otherwise a new one is requested.  The address is
    then bound to the first interface of the instance that has a port
    ID.

    :param instance: instance to assign address to
    :param list network_ids:
      List of IDs (as strings) of networks where to
      request allocation of the floating IP.

    :return: public ip address

    :raises RuntimeError: if no floating IP could be allocated, or no
      interface with a port ID was found on the instance.
    """
    self._init_os_api()
    # the lock is held while picking and binding the address
    with OpenStackCloudProvider.__node_start_lock:
        # Note: to return *all* addresses, all parameters to
        # `neutron_client.list_floatingips()` should be left out;
        # setting them to `None` (e.g., `fixed_ip_address=None`)
        # results in an empty list...
        free_ips = [
            ip for ip in
            self.neutron_client.list_floatingips().get('floatingips')
            if (ip['floating_network_id'] in network_ids
                # keep only unallocated IP addrs
                and ip['fixed_ip_address'] is None
                and ip['port_id'] is None)
        ]
        if free_ips:
            floating_ip = free_ips.pop()
            log.debug("Using existing floating IP %r", floating_ip)
        else:
            # FIXME: OpenStack Network API v2 requires that we specify
            # a network ID along with the request for a floating IP.
            # However, ElastiCluster configuration allows for multiple
            # networks to be connected to a VM, but does not give any
            # hint as to which one(s) should be used for such requests.
            # So we walk the list in order and stop at the first
            # network that gives us an address.  Note that a
            # `BadNeutronRequest` aborts the whole attempt instead of
            # moving on to the next network.
            floating_ip = None  # stays `None` if `network_ids` is empty
            for network_id in network_ids:
                log.debug(
                    "Trying to allocate floating IP on network %s ...",
                    network_id)
                try:
                    floating_ip = self.neutron_client.create_floatingip({
                        'floatingip': {
                            'floating_network_id': network_id,
                        }}).get('floatingip')
                    log.debug(
                        "Allocated IP address %s on network %s",
                        floating_ip['floating_ip_address'], network_id)
                    break  # stop at first network where we get a floating IP
                except BadNeutronRequest as err:
                    raise RuntimeError(
                        "Failed allocating floating IP on network {0}: {1}"
                        .format(network_id, err))
            if (floating_ip is None
                    or floating_ip.get('floating_ip_address', None) is None):
                raise RuntimeError(
                    "Could not allocate floating IP for VM {0}"
                    .format(instance.id))

        # wait until at least one interface is up
        interfaces = []
        # FIXME: no timeout!
        while not interfaces:
            interfaces = instance.interface_list()
            sleep(2)  # FIXME: hard-coded value

        # get port ID
        for interface in interfaces:
            log.debug(
                "Instance %s (ID: %s):"
                " Checking if floating IP can be attached to interface %r ...",
                instance.name, instance.id, interface)
            port_id = interface.port_id
            if port_id is None:
                log.debug(
                    "Instance %s (ID: %s):"
                    " Skipping interface %r: no port ID!",
                    instance.name, instance.id, interface)
                continue
            log.debug(
                "Instance `%s` (ID: %s):"
                " will assign floating IP to port ID %s (state: %s),"
                " already running IP addresses %r",
                instance.name, instance.id,
                port_id, interface.port_state,
                [item['ip_address'] for item in interface.fixed_ips])
            if interface.port_state != 'ACTIVE':
                log.warning(
                    "Instance `%s` (ID: %s):"
                    " port `%s` is in state %s (epected 'ACTIVE' instead)",
                    instance.name, instance.id,
                    port_id, interface.port_state)
            # the first interface with a port ID is the one we use
            break
        else:
            # loop ended without `break`: no usable interface
            raise RuntimeError(
                "Could not find port on network(s) {0}"
                " for instance {1} (ID: {2}) to bind a floating IP to."
                .format(network_ids, instance.name, instance.id))

        # assign floating IP to port
        floating_ip = self.neutron_client.update_floatingip(
            floating_ip['id'], {
                'floatingip': {
                    'port_id': port_id,
                },
            }
        ).get('floatingip')
        ip_address = floating_ip['floating_ip_address']
        log.debug("Assigned IP address %s to port %s", ip_address, port_id)

    log.info("Waiting 300s until floating IP %s is ACTIVE", ip_address)
    # poll once per second; give up silently after 300 attempts
    for _ in range(300):
        _floating_ip = self.neutron_client.show_floatingip(floating_ip['id'])
        if _floating_ip['floatingip']['status'] != 'DOWN':
            break
        sleep(1)

    # Invalidate cache for this VM, as we just assigned a new IP
    if instance.id in self._cached_instances:
        del self._cached_instances[instance.id]

    return ip_address
def missing_host_key(self, client, hostname, key):
    """
    Log that an unknown host key is being ignored.

    Nothing else is done with the key: no exception is raised and the
    key is not stored by this method.

    :param client: not used
    :param hostname: name of the host that presented the key
    :param key: the host key; its name and fingerprint are logged
    """
    key_type = key.get_name()
    fingerprint = hexlify(key.get_fingerprint())
    message = ('Ignoring unknown %s host key for %s: %s'
               % (key_type, hostname, fingerprint))
    log.info(message)
def start(self, min_nodes=None):
    """
    Start all the instances of the cluster and wait until they are
    reachable over SSH.

    Unless `log.DO_NOT_FORK` is set, nodes are started concurrently
    through a worker pool.  While the pool is running, SIGINT is
    intercepted so that the cluster is saved to the repository before
    the program exits.

    After the start requests have been issued, this method waits (up
    to `Cluster.startup_timeout` seconds) for all nodes to be alive;
    if the timeout expires, an error is logged but nodes are left
    untouched.  It then waits (again up to `Cluster.startup_timeout`
    seconds) for an SSH connection to each node; nodes that cannot be
    reached within the timeout are stopped and removed from the
    cluster.  Host keys collected along the way are saved to the
    known-hosts file.

    This method is blocking and might take some time depending on the
    amount of instances to start.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up; kinds
                      not mentioned default to the current number of
                      nodes of that kind
    :type min_nodes: dict [node_kind] = number
    """
    # To not mess up the cluster management we start the nodes in
    # separate workers.  The main thread receives the SIGINT and
    # records it in `self.keep_running`.
    self.keep_running = True

    def sigint_handler(signum, frame):
        """
        Makes sure the cluster is stored, before the sigint results in
        exiting during the node startup.
        """
        log.error("user interruption: saving cluster before exit.")
        self.keep_running = False

    nodes = self.get_all_nodes()

    if log.DO_NOT_FORK:
        # Start the nodes sequentially without forking, in order
        # to ease the debugging
        for node in nodes:
            self._start_node(node)
            self.repository.save_or_update(self)
    else:
        # Create the workers that will start the nodes
        pool_size = min(len(nodes), self.thread_pool_max_size)
        thread_pool = Pool(processes=pool_size)
        log.debug("Created pool of %d threads" % pool_size)

        # Intercept Ctrl-c
        signal.signal(signal.SIGINT, sigint_handler)

        # `map_async` returns at once; poll so we can react to Ctrl-C
        result = thread_pool.map_async(self._start_node, nodes)
        while not result.ready():
            result.wait(1)
            if not self.keep_running:
                # the user did abort the start of the cluster. We
                # finish the current start of a node and save the
                # status to the storage, so we don't have
                # unmanaged instances laying around
                log.error("Aborting upon Ctrl-C")
                thread_pool.close()
                thread_pool.join()
                self.repository.save_or_update(self)
                sys.exit(1)

    # dump the cluster here, so we don't loose any knowledge
    self.repository.save_or_update(self)

    signal.alarm(0)

    def sigint_reset(signum, frame):
        sys.exit(1)

    signal.signal(signal.SIGINT, sigint_reset)

    # check if all nodes are running, until the timeout is reached
    def timeout_handler(signum, frame):
        raise TimeoutError(
            "problems occured while starting the nodes, "
            "timeout `%s`" % Cluster.startup_timeout)

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)

    starting_nodes = self.get_all_nodes()
    try:
        while starting_nodes:
            starting_nodes = [n for n in starting_nodes
                              if not n.is_alive()]
            if starting_nodes:
                time.sleep(10)
    except TimeoutError:
        # FIXME: this is wrong: the reason why `node.is_alive()`
        # fails could be caused by a network error, and we shouldn't
        # just delete the nodes.  So, for now, nodes that are not
        # alive are only reported and *not* stopped or removed.
        log.error("Not all nodes were started correctly within the given"
                  " timeout `%s`" % Cluster.startup_timeout)
        log.error("Please check if image, keypair, and network configuration is correct and try again.")

    signal.alarm(0)

    # If we reached this point, we should have IP addresses for
    # the nodes, so update the storage file again.
    self.repository.save_or_update(self)

    # Try to connect to each node. Run the setup action only when
    # we successfully connect to all of them.
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)
    pending_nodes = self.get_all_nodes()[:]

    if not os.path.exists(self.known_hosts_file):
        # Create the file if it's not present, otherwise the
        # following lines will raise an error
        try:
            fd = open(self.known_hosts_file, 'a')
            fd.close()
        except IOError as err:
            log.warning("Error while opening known_hosts file `%s`: `%s`"
                        " NOT using known_hosts_file.",
                        self.known_hosts_file, err)

    try:
        keys = paramiko.hostkeys.HostKeys(self.known_hosts_file)
    except IOError:
        keys = paramiko.hostkeys.HostKeys()
        log.warning("Ignoring error while opening known_hosts file %s"
                    % self.known_hosts_file)

    try:
        while pending_nodes:
            # iterate over a copy: connected nodes are removed below
            for node in pending_nodes[:]:
                ssh = node.connect(keyfile=self.known_hosts_file)
                if ssh:
                    log.info("Connection to node %s (%s) successful.",
                             node.name, node.connection_ip())
                    # Add host keys to the keys object.
                    for host, key in ssh.get_host_keys().items():
                        for ktype, keydata in key.items():
                            keys.add(host, ktype, keydata)
                    pending_nodes.remove(node)
                self._save_keys_to_known_hosts_file(keys)
            if pending_nodes:
                time.sleep(5)
    except TimeoutError:
        # remove the pending nodes from the cluster
        log.error("Could not connect to all the nodes of the "
                  "cluster within the given timeout `%s`."
                  % Cluster.startup_timeout)
        for node in pending_nodes:
            log.error("Stopping node `%s`, since we could not connect to"
                      " it within the timeout." % node.name)
            self.remove_node(node, stop=True)

    signal.alarm(0)

    # It might be possible that the node.connect() call updated
    # the `preferred_ip` attribute, so, let's save the cluster
    # again.
    self.repository.save_or_update(self)

    # Save host keys
    self._save_keys_to_known_hosts_file(keys)

    # A lot of things could go wrong when starting the cluster. To
    # ensure a stable cluster fitting the needs of the user in terms of
    # cluster size, we check the minimum nodes within the node groups to
    # match the current setup.
    if not min_nodes:
        # the node minimum is implicit if not specified.
        min_nodes = dict((kind, len(members))
                         for kind, members in self.nodes.items())
    else:
        # check that each group has a minimum value; work on a copy
        # so the caller's dictionary is left alone.  Note this has to
        # walk `self.nodes` (kind -> nodes), not the flat node list.
        min_nodes = dict(min_nodes)
        for kind, members in self.nodes.items():
            if kind not in min_nodes:
                min_nodes[kind] = len(members)

    self._check_cluster_size(min_nodes)
def start(self, min_nodes=None):
    """
    Start all the instances of the cluster and wait until they are
    reachable.

    Nodes are started concurrently through a worker pool.  While the
    pool is running, SIGINT is intercepted so that the cluster is
    saved to the repository before the program exits.

    After the start requests have been issued, this method waits (up
    to `Cluster.startup_timeout` seconds) for all nodes to be alive,
    and then (again up to `Cluster.startup_timeout` seconds) for a
    connection to each node.  Nodes that miss either deadline are
    stopped and removed from the cluster.

    This method is blocking and might take some time depending on the
    amount of instances to start.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up; kinds
                      not mentioned default to the current number of
                      nodes of that kind
    :type min_nodes: dict [node_kind] = number
    """
    # To not mess up the cluster management we start the nodes in
    # separate workers.  The main thread receives the SIGINT and
    # records it in `self.keep_running`.
    self.keep_running = True

    def sigint_handler(signum, frame):
        """
        Makes sure the cluster is stored, before the sigint results in
        exiting during the node startup.
        """
        log.error("user interruption: saving cluster before exit.")
        self.keep_running = False

    nodes = self.get_all_nodes()
    thread_pool = Pool(processes=len(nodes))
    log.debug("Created pool of %d threads" % len(nodes))

    signal.signal(signal.SIGINT, sigint_handler)

    # `map_async` returns at once; poll so we can react to Ctrl-C
    result = thread_pool.map_async(self._start_node, nodes)
    while not result.ready():
        result.wait(1)
        if not self.keep_running:
            # the user did abort the start of the cluster. We finish the
            # current start of a node and save the status to the
            # storage, so we don't have not managed instances laying
            # around
            log.error("Aborting upon Ctrl-C")
            thread_pool.close()
            thread_pool.join()
            self.repository.save_or_update(self)
            sys.exit(1)

    # dump the cluster here, so we don't loose any knowledge
    self.repository.save_or_update(self)

    signal.alarm(0)

    def sigint_reset(signum, frame):
        sys.exit(1)

    signal.signal(signal.SIGINT, sigint_reset)

    # check if all nodes are running, stop all nodes if the
    # timeout is reached
    def timeout_handler(signum, frame):
        raise TimeoutError(
            "problems occured while starting the nodes, "
            "timeout `%s`" % Cluster.startup_timeout)

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)

    starting_nodes = self.get_all_nodes()
    try:
        while starting_nodes:
            starting_nodes = [n for n in starting_nodes
                              if not n.is_alive()]
            if starting_nodes:
                time.sleep(10)
    except TimeoutError:
        log.error("Not all nodes were started correctly within the given"
                  " timeout `%s`" % Cluster.startup_timeout)
        for node in starting_nodes:
            log.error("Stopping node `%s`, since it could not start "
                      "within the given timeout" % node.name)
            node.stop()
            self.remove_node(node)

    signal.alarm(0)

    # If we reached this point, we should have IP addresses for
    # the nodes, so update the storage file again.
    self.repository.save_or_update(self)

    # Try to connect to each node. Run the setup action only when
    # we successfully connect to all of them.
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)
    pending_nodes = self.get_all_nodes()[:]

    try:
        while pending_nodes:
            # iterate over a copy: connected nodes are removed below
            for node in pending_nodes[:]:
                if node.connect():
                    log.info("Connection to node %s (%s) successful.",
                             node.name, node.connection_ip())
                    pending_nodes.remove(node)
            if pending_nodes:
                time.sleep(5)
    except TimeoutError:
        # remove the pending nodes from the cluster
        log.error("Could not connect to all the nodes of the "
                  "cluster within the given timeout `%s`."
                  % Cluster.startup_timeout)
        for node in pending_nodes:
            log.error("Stopping node `%s`, since we could not connect to"
                      " it within the timeout." % node.name)
            node.stop()
            self.remove_node(node)

    signal.alarm(0)

    # It might be possible that the node.connect() call updated
    # the `preferred_ip` attribute, so, let's save the cluster
    # again.
    self.repository.save_or_update(self)

    # A lot of things could go wrong when starting the cluster. To
    # ensure a stable cluster fitting the needs of the user in terms of
    # cluster size, we check the minimum nodes within the node groups to
    # match the current setup.
    if not min_nodes:
        # the node minimum is implicit if not specified.
        min_nodes = dict((kind, len(members))
                         for kind, members in self.nodes.items())
    else:
        # check that each group has a minimum value; work on a copy
        # so the caller's dictionary is left alone.  Note this has to
        # walk `self.nodes` (kind -> nodes), not the flat node list.
        min_nodes = dict(min_nodes)
        for kind, members in self.nodes.items():
            if kind not in min_nodes:
                min_nodes[kind] = len(members)

    self._check_cluster_size(min_nodes)
def execute(self):
    """
    Import a cluster from a ZIP archive into the configured repository.

    The archive `self.params.file` is extracted into a temporary
    directory; the first cluster found there is (optionally renamed to
    `self.params.rename` and then) saved into the repository, together
    with its known-hosts file and any SSH key files found in the
    archive.  The temporary directory is always removed at the end.

    This method does not return: it terminates the process through
    `sys.exit` with one of these codes:

    * ``0`` -- the cluster was imported;
    * ``1`` -- an error occurred (including the case where a cluster
      with the same name already exists);
    * ``2`` -- the archive does not contain any cluster.
    """
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    repo = creator.create_repository()
    tmpdir = tempfile.mkdtemp()
    log.debug("Using temporary directory %s" % tmpdir)
    tmpconf = make_creator(self.params.config, storage_path=tmpdir)
    tmprepo = tmpconf.create_repository()

    rc = 0
    try:
        # NOTE: the archive content is trusted here; `extractall` should
        # not be fed archives from untrusted sources without inspection.
        with ZipFile(self.params.file, 'r') as archive:
            log.debug("ZIP file %s opened" % self.params.file)
            archive.extractall(tmpdir)

        newclusters = tmprepo.get_all()
        if not newclusters:
            # check *before* indexing, otherwise an empty archive ends
            # up as a generic `IndexError` with exit code 1
            log.error("ZIP file %s does not contain a valid cluster."
                      % self.params.file)
            rc = 2
        else:
            cluster = newclusters[0]
            cur_clusternames = [c.name for c in repo.get_all()]
            oldname = cluster.name
            if self.params.rename:
                cluster.name = self.params.rename
                for node in cluster.get_all_nodes():
                    node.cluster_name = cluster.name
            if cluster.name in cur_clusternames:
                raise Exception(
                    "A cluster with name %s already exists. Use "
                    "option --rename to rename the cluster to be "
                    "imported." % cluster.name)

            # Save the cluster in the new position
            cluster.repository = repo
            repo.save_or_update(cluster)
            dest = cluster.repository.storage_path

            # Copy the known hosts
            srcfile = os.path.join(tmpdir, oldname + '.known_hosts')
            destfile = os.path.join(dest, cluster.name + '.known_hosts')
            shutil.copy(srcfile, destfile)

            # Copy the ssh keys, if present
            for attr in ('user_key_public', 'user_key_private'):
                keyfile = getattr(cluster, attr)
                keybase = os.path.basename(keyfile)
                srcfile = os.path.join(tmpdir, keybase)
                if os.path.isfile(srcfile):
                    log.info("Importing key file %s" % keybase)
                    destfile = os.path.join(dest, keybase)
                    shutil.copy(srcfile, destfile)
                    setattr(cluster, attr, destfile)

                for node in cluster.get_all_nodes():
                    nodekeyfile = getattr(node, attr)
                    # Check if it's different from the main key
                    if nodekeyfile != keyfile \
                       and os.path.isfile(nodekeyfile):
                        destdir = os.path.join(dest,
                                               cluster.name,
                                               node.kind,
                                               node.name)
                        nodekeybase = os.path.basename(nodekeyfile)
                        log.info("Importing key file %s for node %s"
                                 % (nodekeybase, node.name))
                        if not os.path.isdir(destdir):
                            os.makedirs(destdir)
                        # Path to key in zip file
                        srcfile = os.path.join(tmpdir,
                                               oldname,
                                               node.kind,
                                               node.name,
                                               nodekeybase)
                        destfile = os.path.join(destdir, nodekeybase)
                        shutil.copy(srcfile, destfile)
                    # Always save the correct destfile
                    setattr(node, attr, destfile)

            # persist the updated key paths
            repo.save_or_update(cluster)
    except Exception as ex:
        # top-level boundary of the command: report and exit non-zero
        log.error("Unable to import from zipfile %s: %s"
                  % (self.params.file, ex))
        rc = 1
    finally:
        if os.path.isdir(tmpdir):
            shutil.rmtree(tmpdir)
            log.info("Cleaning up directory %s" % tmpdir)

    if rc == 0:
        print("Successfully imported cluster from ZIP %s to %s"
              % (self.params.file, repo.storage_path))
    sys.exit(rc)
def stop(self):
    """
    Ask the cloud provider to stop the instance with ID
    `self.instance_id`, logging the request first.
    """
    log.info("shutting down instance `%s`", self.instance_id)
    provider = self._cloud_provider
    provider.stop_instance(self.instance_id)
def stop_instance(self, instance_id):
    """
    Destroy the instance with the given ID.

    Nothing is done if the instance lookup gives no result.

    :param instance_id: identifier passed to the instance lookup
    """
    instance = self.__get_instance(instance_id)
    if instance:
        log.info('stopping %s', instance.name)
        instance.destroy()
def start(self):
    """
    Start all the nodes of the cluster and wait until they are
    reachable.

    Nodes are started concurrently through a worker pool.  While the
    pool is running, SIGINT is intercepted so that the cluster is
    dumped to the storage before the program exits.

    After the start requests have been issued, this method waits (up
    to `Cluster.startup_timeout` seconds) for all nodes to be alive,
    and then (again up to `Cluster.startup_timeout` seconds) for a
    connection to each node.  Nodes that miss either deadline are
    stopped and removed from the cluster.  Finally, the cluster size
    is checked.
    """
    # To not mess up the cluster management we start the nodes in
    # separate workers.  The main thread receives the SIGINT and
    # records it in `self.keep_running`.
    self.keep_running = True

    def sigint_handler(signum, frame):
        """
        Makes sure the cluster is stored, before the sigint results in
        exiting during the node startup.
        """
        log.error("user interruption: saving cluster before exit.")
        self.keep_running = False

    nodes = self.get_all_nodes()
    thread_pool = Pool(processes=len(nodes))
    log.debug("Created pool of %d threads" % len(nodes))

    signal.signal(signal.SIGINT, sigint_handler)

    # `map_async` returns at once; poll so we can react to Ctrl-C
    result = thread_pool.map_async(self._start_node, nodes)
    while not result.ready():
        result.wait(1)
        if not self.keep_running:
            # the user did abort the start of the cluster. We finish the
            # current start of a node and save the status to the
            # storage, so we don't have not managed instances laying
            # around
            log.error("Aborting upon Ctrl-C")
            thread_pool.close()
            thread_pool.join()
            self._storage.dump_cluster(self)
            sys.exit(1)

    # dump the cluster here, so we don't loose any knowledge
    self._storage.dump_cluster(self)

    signal.alarm(0)

    def sigint_reset(signum, frame):
        sys.exit(1)

    signal.signal(signal.SIGINT, sigint_reset)

    # check if all nodes are running, stop all nodes if the
    # timeout is reached
    def timeout_handler(signum, frame):
        raise TimeoutError(
            "problems occured while starting the nodes, "
            "timeout `%s`" % Cluster.startup_timeout)

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)

    starting_nodes = self.get_all_nodes()
    try:
        while starting_nodes:
            starting_nodes = [n for n in starting_nodes
                              if not n.is_alive()]
            if starting_nodes:
                time.sleep(10)
    except TimeoutError:
        log.error("Not all nodes were started correctly within the given"
                  " timeout `%s`" % Cluster.startup_timeout)
        for node in starting_nodes:
            log.error("Stopping node `%s`, since it could not start "
                      "within the given timeout" % node.name)
            node.stop()
            self.remove_node(node)

    signal.alarm(0)

    # If we reached this point, we should have IP addresses for
    # the nodes, so update the storage file again.
    self._storage.dump_cluster(self)

    # Try to connect to each node. Run the setup action only when
    # we successfully connect to all of them.
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)
    pending_nodes = self.get_all_nodes()[:]

    try:
        while pending_nodes:
            # iterate over a copy: connected nodes are removed below
            for node in pending_nodes[:]:
                if node.connect():
                    log.info("Connection to node %s (%s) successful.",
                             node.name, node.ip_public)
                    pending_nodes.remove(node)
            if pending_nodes:
                time.sleep(5)
    except TimeoutError:
        # remove the pending nodes from the cluster
        log.error("Could not connect to all the nodes of the "
                  "cluster within the given timeout `%s`."
                  % Cluster.startup_timeout)
        for node in pending_nodes:
            log.error("Stopping node `%s`, since we could not connect to"
                      " it within the timeout." % node.name)
            node.stop()
            self.remove_node(node)

    signal.alarm(0)

    # A lot of things could go wrong when starting the cluster. To
    # ensure a stable cluster fitting the needs of the user in terms of
    # cluster size, we check the minimum nodes within the node groups to
    # match the current setup.
    self._check_cluster_size()
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, node_name=None, network_ids=None,
                   price=None, timeout=None, **kwargs):
    """Starts a new instance on the cloud using the given properties.

    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check if the security group exists
    * run the instance with the given properties (either as a regular
      on-demand instance, or as a spot instance if `price` is set)

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: firewall rule definition to apply on the
                               instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
                         (not used by this method)
    :param str node_name: value for the `Name` tag set on the instance
    :param str network_ids: comma-separated list of subnets; when given,
                            one network interface per subnet is requested
    :param float price: Spot instance price (if 0, do not use spot
                        instances). Defaults to `self.price`.
    :param int timeout: Timeout (in seconds) waiting for spot instances;
                        only used if price > 0. Defaults to `self.timeout`.

    :return: str - instance id of the started instance
    :raises ClusterError: if the error message reported by the cloud
                          mentions `TooManyInstances`
    :raises InstanceError: on any other failure to start the instance
                           (including a spot request timeout)
    """
    connection = self._connect()

    log.debug("Checking keypair `%s`.", key_name)
    # the `_check_keypair` method has to be called within a lock,
    # since it will upload the key if it does not exist and if this
    # happens for every node at the same time ec2 will throw an error
    # message (see issue #79)
    with BotoCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)

    log.debug("Checking security group `%s`.", security_group)
    security_group_id = self._check_security_group(security_group)

    if network_ids:
        # With explicit network interfaces, the security group is attached
        # to each interface (by ID) and must not be passed separately.
        interfaces = []
        for subnet in network_ids.split(','):
            subnet_id = self._check_subnet(subnet)
            interfaces.append(
                boto.ec2.networkinterface.NetworkInterfaceSpecification(
                    subnet_id=subnet_id,
                    groups=[security_group_id],
                    associate_public_ip_address=self.request_floating_ip))
        interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
            *interfaces)
        security_groups = []
    else:
        interfaces = None
        security_groups = [security_group]

    # get defaults for `price` and `timeout` from class instance
    if price is None:
        price = self.price
    if timeout is None:
        timeout = self.timeout

    try:
        # start spot instance if bid is specified
        if price:
            log.info("Requesting spot instance with price `%s` ...", price)
            request = connection.request_spot_instances(
                price, image_id,
                key_name=key_name,
                security_groups=security_groups,
                instance_type=flavor,
                user_data=image_userdata,
                network_interfaces=interfaces,
                instance_profile_name=self._instance_profile)[-1]

            # wait until spot request is fulfilled (will wait
            # forever if no timeout is given)
            start_time = time.time()
            timeout = (float(timeout) if timeout else 0)
            log.info(
                "Waiting for spot instance (will time out in %d seconds) ...",
                timeout)
            while request.status.code != 'fulfilled':
                if timeout and time.time() - start_time > timeout:
                    request.cancel()
                    raise RuntimeError('spot instance timed out')
                time.sleep(self.POLL_INTERVAL)
                # update request status
                request = connection.get_all_spot_instance_requests(
                    request_ids=request.id)[-1]
        else:
            reservation = connection.run_instances(
                image_id,
                key_name=key_name,
                security_groups=security_groups,
                instance_type=flavor,
                user_data=image_userdata,
                network_interfaces=interfaces,
                instance_profile_name=self._instance_profile)
    except Exception as ex:
        log.error("Error starting instance: %s", ex)
        # Match on the error *text*: testing membership on the exception
        # object itself is a `TypeError` on Python 3 and never does a
        # substring match on Python 2.
        if "TooManyInstances" in str(ex):
            raise ClusterError(ex)
        else:
            raise InstanceError(ex)

    if price:
        vm = connection.get_only_instances(
            instance_ids=[request.instance_id])[-1]
    else:
        vm = reservation.instances[-1]
    vm.add_tag("Name", node_name)

    # cache instance object locally for faster access later on
    self._instances[vm.id] = vm

    return vm.id
def start_instance(self, key_name, public_key_path, private_key_path,
                   security_group, flavor, image_id, image_userdata,
                   username=None, node_name=None, **kwargs):
    """Starts a new instance on the cloud using the given properties.

    The following tasks are done to start an instance:

    * establish a connection to the cloud web service
    * check ssh keypair and upload it if it does not yet exist. This is
      a locked process, since this function might be called in multiple
      threads and we only want the key to be stored once.
    * check if the security group(s), image and flavor exist
    * optionally create a boot volume from the image
    * run the instance with the given properties
    * optionally allocate a floating IP

    :param str key_name: name of the ssh key to connect
    :param str public_key_path: path to ssh public key
    :param str private_key_path: path to ssh private key
    :param str security_group: comma-separated list of security group
                               names to apply on the instance
    :param str flavor: machine type to use for the instance
    :param str image_id: image type (os) to use for the instance
    :param str image_userdata: command to execute after startup
    :param str username: username for the given ssh key, default None
                         (not used by this method)
    :param str node_name: name given to the VM (and used to derive the
                          boot volume name)
    :param kwargs: recognized optional keys are `network_ids`
                   (comma-separated network IDs), `boot_disk_size`
                   (positive integer) and `boot_disk_type`

    :return: str - instance id of the started instance
    :raises ImageError: if the image is not found, or the boot volume
                        already exists
    :raises FlavorError: if no flavor with the given name exists
    :raises ConfigurationError: if `boot_disk_size` is not a positive int
    """
    self._init_os_api()

    vm_start_args = {}

    log.debug("Checking keypair `%s` ...", key_name)
    with OpenStackCloudProvider.__node_start_lock:
        self._check_keypair(key_name, public_key_path, private_key_path)
    vm_start_args['key_name'] = key_name

    security_groups = [sg.strip() for sg in security_group.split(',')]
    self._check_security_groups(security_groups)
    vm_start_args['security_groups'] = security_groups

    # Check if the image id is present.
    if image_id not in [img.id for img in self._get_images()]:
        raise ImageError(
            "No image found with ID `{0}` in project `{1}` of cloud {2}"
            .format(image_id, self._os_tenant_name, self._os_auth_url))
    vm_start_args['userdata'] = image_userdata

    # Check if the flavor exists
    flavors = [fl for fl in self._get_flavors() if fl.name == flavor]
    if not flavors:
        raise FlavorError(
            "No flavor found with name `{0}` in project `{1}` of cloud {2}"
            .format(flavor, self._os_tenant_name, self._os_auth_url))
    flavor = flavors[0]

    # `''.split(',')` yields `['']`, so blank entries must be dropped;
    # otherwise a NIC with an empty `net-id` would be requested whenever
    # no network is configured.
    network_ids = [
        net_id.strip()
        for net_id in (kwargs.pop('network_ids', '') or '').split(',')
        if net_id.strip()
    ]
    if network_ids:
        nics = [{'net-id': net_id, 'v4-fixed-ip': ''}
                for net_id in network_ids]
        log.debug("Specifying networks for node %s: %s",
                  node_name, ', '.join([nic['net-id'] for nic in nics]))
    else:
        nics = None
    vm_start_args['nics'] = nics

    if 'boot_disk_size' in kwargs:
        # check if the backing volume is already there
        volume_name = '{name}-{id}'.format(name=node_name, id=image_id)
        if volume_name in [v.name for v in self._get_volumes()]:
            raise ImageError(
                "Volume `{0}` already exists in project `{1}` of cloud {2}"
                .format(volume_name,
                        self._os_tenant_name, self._os_auth_url))

        log.info('Creating volume `%s` to use as VM disk ...', volume_name)
        try:
            bds = int(kwargs['boot_disk_size'])
            if bds < 1:
                raise ValueError('non-positive int')
        except (ValueError, TypeError):
            raise ConfigurationError(
                "Invalid `boot_disk_size` specified:"
                " should be a positive integer, got {0} instead".format(
                    kwargs['boot_disk_size']))
        volume = self.cinder_client.volumes.create(
            size=bds, name=volume_name, imageRef=image_id,
            # `boot_disk_type` is optional: do not fail with `KeyError`
            # when only the size was specified
            volume_type=kwargs.pop('boot_disk_type', None))

        # wait for volume to come up
        volume_available = False
        while not volume_available:
            for v in self._get_volumes():
                if v.name == volume_name and v.status == 'available':
                    volume_available = True
                    break
            sleep(1)  # FIXME: hard-coded waiting time

        # ok, use volume as VM disk
        vm_start_args['block_device_mapping'] = {
            # FIXME: is it possible that `vda` is not the boot disk? e.g. if
            # a non-paravirtualized kernel is being used?  should we allow
            # to set the boot device as an image parameter?
            'vda': ('{id}:::{delete_on_terminate}'
                    .format(id=volume.id, delete_on_terminate=1)),
        }

    # due to some `nova_client.servers.create()` implementation weirdness,
    # the first three args need to be spelt out explicitly and cannot be
    # conflated into `**vm_start_args`
    vm = self.nova_client.servers.create(
        node_name, image_id, flavor, **vm_start_args)

    # allocate and attach a floating IP, if requested
    if self.request_floating_ip:
        # We need to list the floating IPs for this instance
        try:
            # python-novaclient <8.0.0
            floating_ips = [
                ip for ip in self.nova_client.floating_ips.list()
                if ip.instance_id == vm.id
            ]
        except AttributeError:
            # NOTE(review): presumably the neutron client returns a
            # (always truthy) dict here and `id` filters on the floating
            # IP's own ID rather than the VM's -- TODO confirm, since that
            # would make the allocation below unreachable on this path.
            floating_ips = self.neutron_client.list_floatingips(id=vm.id)
        # allocate new floating IP if none given
        if not floating_ips:
            self._allocate_address(vm, network_ids)

    self._instances[vm.id] = vm

    return vm.id
def start(self, min_nodes=None, max_concurrent_requests=0):
    """
    Bring up every VM instance belonging to this cluster.

    Nodes are started either concurrently or one after the other,
    depending on `max_concurrent_requests`.  The cluster state is saved
    to the repository after each phase (start requests issued, nodes
    checked for being up, SSH connections checked), and finally the
    resulting cluster size is checked against the required minimum.

    This call blocks until all phases have completed.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up
    :type min_nodes: dict [node_kind] = number
    :param int max_concurrent_requests:
      Upper bound on simultaneous VM start requests; any value of 1 or
      less (other than 0) starts nodes sequentially.  The special value
      ``0`` selects 4 threads per available processor, falling back to
      sequential start if the processor count cannot be determined.
    """
    all_nodes = self.get_all_nodes()

    log.info("Starting cluster nodes ...")
    concurrency = max_concurrent_requests
    if concurrency == 0:
        try:
            concurrency = 4 * get_num_processors()
        except RuntimeError:
            log.warning("Cannot determine number of processors!"
                        " will start nodes sequentially...")
            concurrency = 1

    run_in_parallel = concurrency > 1
    if run_in_parallel:
        started = self._start_nodes_parallel(all_nodes, concurrency)
    else:
        started = self._start_nodes_sequentially(all_nodes)

    # first checkpoint: start requests have been issued
    self.repository.save_or_update(self)

    failed = self._check_starting_nodes(started, self.startup_timeout)

    # second checkpoint: nodes have been checked for being up
    self.repository.save_or_update(self)

    # Connect to every node that did come up, in order to collect
    # IP addresses and SSH host keys
    log.info("Checking SSH connection to nodes ...")
    reachable_candidates = started - failed
    self._gather_node_ip_addresses(reachable_candidates,
                                   self.startup_timeout)

    # third checkpoint: connecting to a node may have changed its
    # `preferred_ip` attribute
    self.repository.save_or_update(self)

    # Many things can go wrong while starting a cluster; verify that
    # each node group still holds at least the required minimum number
    # of nodes.
    self._check_cluster_size(self._compute_min_nodes(min_nodes))
def _build_inventory(self, cluster):
    """
    Write an Ansible inventory file for `cluster`; return the file path.

    Nodes lacking a preferred IP address, and nodes whose kind is not
    listed in `self.groups`, are left out (with a warning).  If no node
    ends up in any group, no file is written and `None` is returned.

    :param cluster: cluster to build inventory for
    :type cluster: :py:class:`elasticluster.cluster.Cluster`
    """
    hosts_by_group = defaultdict(list)

    for node in cluster.get_all_nodes():
        if node.preferred_ip is None:
            log.warning(
                "Ignoring node `{0}`: No IP address."
                .format(node.name))
            continue
        if node.kind not in self.groups:
            # FIXME: should this raise a `ConfigurationError` instead?
            log.warning(
                "Ignoring node `{0}`:"
                " Node kind `{1}` not defined in cluster!"
                .format(node.name, node.kind))
            continue

        host_vars = ['ansible_user=%s' % node.image_user]
        address, port = parse_ip_address_and_port(node.preferred_ip)
        if port != 22:
            host_vars.append('ansible_port=%s' % port)
        if node.kind in self.environment:
            for key, value in self.environment[node.kind].items():
                host_vars.append('%s=%s' % (key, value))

        # the same (name, address, variables) record goes into every
        # group associated with this kind of node
        record = (node.name, address, ' '.join(host_vars))
        for group in self.groups[node.kind]:
            hosts_by_group[group].append(record)

    if not hosts_by_group:
        log.info("No inventory file was created.")
        return None

    # The inventory is handed over to Ansible through a file (the
    # Ansible API is not stable yet); pick a temporary directory for
    # it if so configured.
    if self._storage_path_tmp:
        if not self._storage_path:
            self._storage_path = tempfile.mkdtemp()
        elasticluster.log.warning(
            "Writing inventory file to tmp dir `%s`", self._storage_path)

    inventory_path = os.path.join(
        self._storage_path, cluster.name + '.inventory')
    log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)

    with open(inventory_path, 'w+') as inventory_file:
        for section, hosts in hosts_by_group.items():
            # An empty section makes Ansible fail with "argument of
            # type 'NoneType' is not iterable", so only write sections
            # that list at least one host.
            if not hosts:
                continue
            inventory_file.write(''.join(("\n[", section, "]\n")))
            for name, addr, variables in hosts:
                inventory_file.write(
                    "{0} ansible_host={1} {2}\n".format(
                        name, addr, variables))

    return inventory_path
def start(self, min_nodes=None):
    """Starts up all the instances in the cloud.

    To speed things up, nodes are started through a pool of workers
    (unless `log.DO_NOT_FORK` is set, in which case they are started
    sequentially to ease debugging).  To make sure elasticluster is not
    stopped during creation of an instance, the SIGINT handler is
    overridden while start requests are in flight; once they have all
    returned and the cluster has been saved to the repository, SIGINT
    simply exits.

    After the start requests, this method waits (up to
    `Cluster.startup_timeout` seconds, enforced through SIGALRM) for all
    nodes to report being alive, then (again up to
    `Cluster.startup_timeout` seconds) for an SSH connection to each node
    to succeed.  Nodes that cannot be connected to within the timeout are
    stopped and removed from the cluster.  Host keys gathered from the
    SSH connections are saved into `self.known_hosts_file`.

    This method is blocking and might take some time depending on the
    amount of instances to start.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up; groups not
                      mentioned default to their current size.  The
                      passed dictionary is not modified.
    :type min_nodes: dict [node_kind] = number
    """
    # To not mess up the cluster management we start the nodes in a
    # different thread. In this case the main thread receives the sigint
    # and communicates to the `start_node` thread. The nodes to work on
    # are passed in a managed queue.
    self.keep_running = True

    def sigint_handler(signum, frame):
        """
        Makes sure the cluster is stored, before the sigint results in
        exiting during the node startup.
        """
        log.error("user interruption: saving cluster before exit.")
        self.keep_running = False

    nodes = self.get_all_nodes()

    if log.DO_NOT_FORK:
        # Start the nodes sequentially without forking, in order
        # to ease the debugging
        for node in nodes:
            self._start_node(node)
            self.repository.save_or_update(self)
    else:
        # `Pool` refuses to be created with 0 workers, so use at least
        # one even if the cluster has no nodes.
        pool_size = max(1, min(len(nodes), self.thread_pool_max_size))
        thread_pool = Pool(processes=pool_size)
        log.debug("Created pool of %d threads" % pool_size)

        # Intercept Ctrl-c
        signal.signal(signal.SIGINT, sigint_handler)

        # Poll for completion so that a Ctrl-C can be noticed
        result = thread_pool.map_async(self._start_node, nodes)

        while not result.ready():
            result.wait(1)
            if not self.keep_running:
                # the user did abort the start of the cluster. We
                # finish the current start of a node and save the
                # status to the storage, so we don't have
                # unmanaged instances laying around
                log.error("Aborting upon Ctrl-C")
                thread_pool.close()
                thread_pool.join()
                self.repository.save_or_update(self)
                sys.exit(1)

        # all start requests returned: release the pool's workers
        thread_pool.close()
        thread_pool.join()

        # dump the cluster here, so we don't loose any knowledge
        self.repository.save_or_update(self)

    signal.alarm(0)

    def sigint_reset(signum, frame):
        sys.exit(1)

    signal.signal(signal.SIGINT, sigint_reset)

    # check if all nodes are running; give up waiting if the
    # timeout is reached
    def timeout_handler(signum, frame):
        raise TimeoutError(
            "problems occured while starting the nodes, "
            "timeout `%i`" % Cluster.startup_timeout)

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)

    starting_nodes = self.get_all_nodes()
    try:
        while starting_nodes:
            starting_nodes = [n for n in starting_nodes
                              if not n.is_alive()]
            if starting_nodes:
                time.sleep(10)
    except TimeoutError:
        # FIXME: the reason why `node.is_alive()` fails could be a
        # network error, so nodes are deliberately *not* stopped nor
        # removed here; we only report the problem.
        log.error("Not all nodes were started correctly within the given"
                  " timeout `%s`" % Cluster.startup_timeout)
        log.error(
            "Please check if image, keypair, and network configuration is correct and try again."
        )

    signal.alarm(0)

    # If we reached this point, we should have IP addresses for
    # the nodes, so update the storage file again.
    self.repository.save_or_update(self)

    # Try to connect to each node. Run the setup action only when
    # we successfully connect to all of them.
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(Cluster.startup_timeout)
    pending_nodes = self.get_all_nodes()[:]

    if not os.path.exists(self.known_hosts_file):
        # Create the file if it's not present, otherwise the
        # following lines will raise an error
        try:
            with open(self.known_hosts_file, 'a'):
                pass
        except IOError as err:
            log.warning(
                "Error while opening known_hosts file `%s`: `%s`"
                " NOT using known_hosts_file.",
                self.known_hosts_file, err)

    try:
        keys = paramiko.hostkeys.HostKeys(self.known_hosts_file)
    except IOError:
        keys = paramiko.hostkeys.HostKeys()
        log.warning("Ignoring error while opening known_hosts file %s"
                    % self.known_hosts_file)

    try:
        while pending_nodes:
            # iterate over a copy, since connected nodes are removed
            # from `pending_nodes` within the loop
            for node in pending_nodes[:]:
                ssh = node.connect(keyfile=self.known_hosts_file)
                if ssh:
                    log.info("Connection to node %s (%s) successful.",
                             node.name, node.connection_ip())
                    # Add host keys to the keys object.
                    for host, key in ssh.get_host_keys().items():
                        for ktype, keydata in key.items():
                            keys.add(host, ktype, keydata)
                    pending_nodes.remove(node)
            if pending_nodes:
                time.sleep(5)
    except TimeoutError:
        # remove the pending nodes from the cluster
        log.error("Could not connect to all the nodes of the "
                  "cluster within the given timeout `%s`."
                  % Cluster.startup_timeout)
        for node in pending_nodes:
            log.error("Stopping node `%s`, since we could not connect to"
                      " it within the timeout." % node.name)
            self.remove_node(node, stop=True)

    signal.alarm(0)

    # It might be possible that the node.connect() call updated
    # the `preferred_ip` attribute, so, let's save the cluster
    # again.
    self.repository.save_or_update(self)

    # Save host keys
    try:
        keys.save(self.known_hosts_file)
    except IOError:
        log.warning("Ignoring error while saving known_hosts file %s"
                    % self.known_hosts_file)

    # A lot of things could go wrong when starting the cluster. To
    # ensure a stable cluster fitting the needs of the user in terms of
    # cluster size, we check the minimum nodes within the node groups to
    # match the current setup.
    if not min_nodes:
        # the node minimum is implicit if not specified.
        min_nodes = dict(
            (group, len(members))
            for group, members in self.nodes.items())
    else:
        # Work on a copy so the caller's dictionary is left untouched,
        # and make sure each group has a minimum value.  The per-group
        # mapping is `self.nodes`; the local `nodes` is a flat list.
        min_nodes = dict(min_nodes)
        for group, members in self.nodes.items():
            if group not in min_nodes:
                min_nodes[group] = len(members)

    self._check_cluster_size(min_nodes)