def stop(self, force=False):
    """Stop every node of the cluster, then delete or persist the cluster.

    Nodes that stop without error, and nodes that never received an
    instance id, are removed from ``self.nodes``.  If no node is left
    afterwards, the setup provider's ``cleanup`` is called and the
    cluster is deleted from the repository.  If some nodes remain, the
    outcome depends on `force`.

    :param bool force: when some nodes could not be stopped, delete the
        cluster from the repository anyway instead of saving its
        current state
    """
    for node in self.get_all_nodes():
        if not node.instance_id:
            log.debug("Not stopping node with no instance id. It seems "
                      "like node `%s` did not start correctly."
                      % node.name)
            self.nodes[node.kind].remove(node)
            continue

        try:
            node.stop()
            self.nodes[node.kind].remove(node)
            log.debug("Removed node with instance id %s from %s"
                      % (node.instance_id, node.kind))
        except:
            # Deliberately bare: Boto does not always raise an
            # `Exception` subclass.
            log.error("could not stop instance `%s`, it might "
                      "already be down.", node.instance_id)

    leftovers = self.get_all_nodes()
    if leftovers and not force:
        # Keep the record so that the stop can be retried later.
        log.warning("Not all instances have been terminated. "
                    "Please rerun the `elasticluster stop %s`", self.name)
        self.repository.save_or_update(self)
        return

    if leftovers:
        log.warning("Not all instances have been terminated. However, "
                    "as requested, the cluster has been force-removed.")
    else:
        log.debug("Removing cluster %s.", self.name)
    self._setup_provider.cleanup(self)
    self.repository.delete(self)
def setup(self, extra_args=tuple()):
    """
    Configure the cluster nodes through the setup provider.

    The work is delegated to ``self._setup_provider.setup_cluster``.
    Any `Exception` raised by the provider is logged and turned into a
    ``False`` result instead of propagating.

    :param list extra_args:
      Additional arguments forwarded unchanged to the setup provider.

    :return: the provider's return value, or ``False`` if it raised;
      a falsy result also logs a hint to re-run the setup command
    """
    try:
        outcome = self._setup_provider.setup_cluster(self, extra_args)
    except Exception as err:
        log.error(
            "The cluster hosts are up and running,"
            " but %s failed to set the cluster up: %s",
            self._setup_provider.HUMAN_READABLE_NAME, err)
        outcome = False

    if outcome:
        return outcome

    log.warning(
        "Cluster `%s` not yet configured. Please, re-run "
        "`elasticluster setup %s` and/or check your configuration",
        self.name, self.name)
    return outcome
def update(self):
    """Refresh the IP addresses of every node in the cluster.

    Calls ``update_ips()`` on each node returned by
    ``self.get_all_nodes()``.  An `InstanceError` raised for one node is
    logged as a warning and does not stop the remaining nodes from
    being updated.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except InstanceError as ex:
            log.warning(
                "Ignoring error updating information on node %s: %s",
                node, ex)
def get_frontend_node(self):
    """Pick the node that should be used as the cluster frontend.

    If ``self.ssh_to`` is set it must name a key of ``self.nodes``; the
    first node of that class is returned when the class is not empty.
    Otherwise (no preference, or the preferred class is empty) the
    first node of the first non-empty class, in sorted key order, is
    returned.

    :return: the selected node
    :raise: `NodeNotFound` if ``ssh_to`` names an unknown class, or if
        the cluster has no nodes at all
    """
    preferred = self.ssh_to
    if preferred:
        if preferred not in self.nodes:
            raise NodeNotFound(
                "Invalid ssh_to `%s`. Please check your "
                "configuration file." % preferred)
        candidates = self.nodes[preferred]
        if candidates:
            return candidates[0]
        log.warning(
            "preferred `ssh_to` `%s` is empty: unable to "
            "get the choosen frontend node from that class.",
            preferred)

    # No usable preference: fall back to the default selection.
    for kind in sorted(self.nodes):
        members = self.nodes[kind]
        if members:
            return members[0]

    # Every class is empty.
    raise NodeNotFound("Unable to find a valid frontend: "
                       "cluster has no nodes!")
def resume_cluster(self, cluster, extra_args=tuple()):
    """
    Configure a resumed cluster, preferring the resume playbook.

    When ``self._resume_playbook_path`` is set, that playbook is run
    via ``self._run_playbook``.  When it is ``None``, a warning is
    logged and the regular ``self.setup_cluster`` is used instead.

    :param cluster: cluster to configure
    :param list extra_args:
      Additional arguments forwarded unchanged to the delegated call.

    :return: whatever the delegated call returns (presumably ``True``
      on success and ``False`` otherwise -- confirm against
      `setup_cluster` and `_run_playbook`).
    """
    playbook = self._resume_playbook_path
    if playbook is None:
        log.warning("No resume playbook is available - falling back to the setup "
                    "playbook, which could be slow.")
        return self.setup_cluster(cluster, extra_args)
    return self._run_playbook(cluster, playbook, extra_args)
def stop_instance(self, instance_id):
    """Issue a delete request for the given instance.

    A falsy `instance_id` is logged and ignored.  A 404 reply is logged
    as a warning and treated as success, so that the caller can drop
    its reference to the instance.

    :param str instance_id: instance identifier
    :raises: `InstanceError` on any other `HttpError`, or on a
        `CloudProviderError`
    """
    if not instance_id:
        log.info("Instance to stop has no instance id")
        return

    gce = self._connect()
    try:
        delete_request = gce.instances().delete(
            project=self._project_id,
            instance=instance_id,
            zone=self._zone)
        self._check_response(self._execute_request(delete_request))
    except HttpError as err:
        if err.resp.status != 404:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, err))
        # The instance does not exist: just log it, and return
        # without raising.
        log.warning("Instance to stop `%s` was not found" % instance_id)
    except CloudProviderError as err:
        raise InstanceError("Could not stop instance `%s`: `%s`"
                            % (instance_id, err))
def stop(self, force=False, wait=False):
    """
    Stop all nodes of this cluster and delete its saved data.

    :param bool force:
      delete the saved data even if some nodes could not be stopped;
      without it, the cluster state is saved instead so that the stop
      can be retried.
    :param bool wait:
      forwarded unchanged to ``self._stop_all_nodes``.
    """
    log.debug("Stopping cluster `%s` ...", self.name)

    failures = self._stop_all_nodes(wait)
    if not failures:
        self._delete_saved_data()
        return

    if not force:
        self.repository.save_or_update(self)
        log.warning(
            "Not all cluster nodes have been terminated."
            " Fix errors above and re-run `elasticluster stop %s`",
            self.name)
        return

    self._delete_saved_data()
    log.warning("Not all cluster nodes have been terminated."
                " However, as requested, data about the cluster"
                " has been removed from local storage.")
def __init_keystone_session_v2(self, check=False):
    """Build a Keystone session using the ``password`` auth plugin.

    :param bool check: when true, validate the session by listing Nova
        flavors before returning it
    :return: the session object, or ``None`` if `check` was requested
        and the validation request failed
    """
    from keystoneauth1 import loading as keystone_v2

    plugin_loader = keystone_v2.get_plugin_loader('password')
    credentials = dict(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        project_name=self._os_tenant_name,
    )
    auth = plugin_loader.load_from_options(**credentials)
    sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v2 session works...")
        try:
            # An invalid session makes this request raise.
            nova_client.Client(
                self.compute_api_version,
                session=sess,
                cacert=self._os_cacert).flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v2 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error(
                "OpenStack server rejected request (likely configuration error?): %s",
                err)
            # FIXME: should we be raising an error instead?
            return None

    # Either no check was requested or the check passed.
    log.info("Using Keystone API v2 session to authenticate to OpenStack")
    return sess
def execute(self):
    """Print the cluster templates found in the configuration file.

    Prints the total number of templates, narrows the list with the
    patterns in ``self.params.clusters`` (if any), and then, for each
    remaining template, creates the cluster and prints its name and the
    number of nodes per node kind.  A template that raises
    `ConfigurationError` is reported with a warning and skipped.
    """
    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage)
    config = configurator.cluster_conf

    print("""%d cluster templates found in configuration file."""
          % len(config))

    templates = config.keys()
    # NOTE(review): each pattern filters the result of the previous
    # one, so a template is kept only if it matches *all* patterns --
    # confirm this is intended rather than "matches any".
    for pattern in self.params.clusters:
        templates = [t for t in templates if fnmatch(t, pattern)]

    if self.params.clusters:
        print("""%d cluter templates found matching pattern(s) '%s'"""
              % (len(templates), ", ".join(self.params.clusters)))

    for template in templates:
        try:
            cluster = configurator.create_cluster(template, template)
            print(""" name: %s""" % template)
            for nodekind in cluster.nodes:
                print("%s nodes: %d"
                      % (nodekind, len(cluster.nodes[nodekind])))
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except ConfigurationError as ex:
            log.warning("unable to load cluster `%s`: %s", template, ex)
def resume_cluster(self, cluster, extra_args=tuple()):
    """
    Configure a resumed cluster, preferring the resume playbook.

    When ``self._resume_playbook_path`` is set, that playbook is run
    via ``self._run_playbook``.  When it is ``None``, a warning is
    logged and the regular ``self.setup_cluster`` is used instead.

    :param cluster: cluster to configure
    :param list extra_args:
      Additional arguments forwarded unchanged to the delegated call.

    :return: whatever the delegated call returns (presumably ``True``
      on success and ``False`` otherwise -- confirm against
      `setup_cluster` and `_run_playbook`).
    """
    resume_playbook = self._resume_playbook_path
    have_resume_playbook = resume_playbook is not None
    if not have_resume_playbook:
        log.warning(
            "No resume playbook is available - falling back to the setup "
            "playbook, which could be slow.")
        return self.setup_cluster(cluster, extra_args)
    return self._run_playbook(cluster, resume_playbook, extra_args)
def _stop_all_nodes(self, wait=False):
    """
    Stop every node of the cluster and count the failures.

    Nodes without an instance ID are removed from ``self.nodes``
    without being stopped.  A node whose stop raises
    `InstanceNotFoundError` is only logged; it is not counted as a
    failure.  Any other exception is logged and counted.

    :param bool wait: forwarded unchanged to ``node.stop``
    :return: number of nodes that could not be stopped
    """
    failures = 0
    for node in self.get_all_nodes():
        if node.instance_id:
            try:
                node.stop(wait)
                self.nodes[node.kind].remove(node)
                log.debug(
                    "Removed node `%s` from cluster `%s`",
                    node.name, self.name)
            except InstanceNotFoundError:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.",
                    node.name, node.instance_id)
            except Exception as err:
                failures += 1
                log.error(
                    "Could not stop node `%s` (instance ID `%s`): %s %s",
                    node.name, node.instance_id, err, err.__class__)
        else:
            log.warning(
                "Node `%s` has no instance ID."
                " Assuming it did not start correctly,"
                " so removing it anyway from the cluster.",
                node.name)
            self.nodes[node.kind].remove(node)
    return failures
def _stop_all_nodes(self, wait=False):
    """
    Stop every node of the cluster and count the failures.

    Nodes without an instance ID are removed from ``self.nodes``
    without being stopped.  A node whose stop raises
    `InstanceNotFoundError` is only logged; it is not counted as a
    failure.  Any other exception is logged and counted.

    :param bool wait: forwarded unchanged to ``node.stop``
    :return: number of nodes that could not be stopped
    """
    error_count = 0
    for node in self.get_all_nodes():
        has_instance = bool(node.instance_id)
        if not has_instance:
            log.warning(
                "Node `%s` has no instance ID."
                " Assuming it did not start correctly,"
                " so removing it anyway from the cluster.", node.name)
            self.nodes[node.kind].remove(node)
        else:
            try:
                node.stop(wait)
                self.nodes[node.kind].remove(node)
                log.debug("Removed node `%s` from cluster `%s`",
                          node.name, self.name)
            except InstanceNotFoundError:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.",
                    node.name, node.instance_id)
            except Exception as err:
                error_count += 1
                log.error("Could not stop node `%s` (instance ID `%s`): %s %s",
                          node.name, node.instance_id, err, err.__class__)
    return error_count
def stop(self, force=False):
    """
    Stop every node of the cluster, then delete or dump the cluster.

    Nodes that stop without error, and nodes that never received an
    instance id, are removed from ``self.nodes``.  If no node is left,
    the setup provider's ``cleanup`` is called and the cluster is
    deleted from storage.  If nodes remain and `force` is false, the
    cluster is dumped to storage instead; with `force` it is deleted
    anyway.
    """
    for node in self.get_all_nodes():
        if not node.instance_id:
            log.debug("Not stopping node with no instance id. It seems "
                      "like node `%s` did not start correctly."
                      % node.name)
            self.nodes[node.type].remove(node)
            continue

        try:
            node.stop()
            self.nodes[node.type].remove(node)
            log.debug("Removed node with instance id %s from %s"
                      % (node.instance_id, node.type))
        except:
            # Deliberately bare: Boto does not always raise an
            # `Exception` subclass.
            log.error("could not stop instance `%s`, it might "
                      "already be down.", node.instance_id)

    leftovers = self.get_all_nodes()
    if leftovers and not force:
        log.warning("Not all instances have been terminated. "
                    "Please rerun the `elasticluster stop %s`", self.name)
        self._storage.dump_cluster(self)
        return

    if leftovers:
        log.warning("Not all instances have been terminated. However, "
                    "as requested, the cluster has been force-removed.")
    else:
        log.debug("Removing cluster %s.", self.name)
    self._setup_provider.cleanup()
    self._storage.delete_cluster(self.name)
def inspect_slurm_cluster(ssh, node_information):
    """Fill `node_information` with data gathered from a SLURM cluster.

    Runs ``sinfo -hNel`` and ``scontrol -o show part`` through
    ``ssh.exec_command`` and stores these keys in `node_information`:
    ``num_nodes``, ``max_cores``, ``max_cores_per_job``,
    ``max_memory_per_core`` and ``max_walltime``.

    ``sinfo`` lines that do not match ``slurm_sinfo_regexp`` are
    reported with a warning and skipped.  If no line matches at all,
    the numeric values are all 0.  ``max_walltime`` defaults to
    ``'672hours'`` and is replaced by the partition's ``MaxTime``
    unless that is ``UNLIMITED``.

    :param ssh: object providing ``exec_command(cmd)``, returning a
        ``(stdin, stdout, stderr)`` triple
    :param dict node_information: mapping to update in place
    :return: the same `node_information` mapping
    """
    (_in, _out, _err) = ssh.exec_command("sinfo -hNel")

    # One entry per parsed line:
    # [num_nodes, num_cores, memory, memory_per_core]
    nodes = []
    for line in _out:
        match = slurm_sinfo_regexp.match(line)
        if not match:
            log.warning(
                "Unable to parse output of sinfo: following line doesn't match node regexp: '%s'",
                line.strip())
            continue
        num_nodes = int(match.group('num'))
        num_cores = int(match.group('cpus')) * num_nodes
        memory = int(match.group('memory')) * num_nodes
        memory_per_core = float(match.group('memory')) / num_cores
        nodes.append([num_nodes, num_cores, memory, memory_per_core])

    node_information['num_nodes'] = sum(i[0] for i in nodes)
    node_information['max_cores'] = sum(i[1] for i in nodes)
    # `max()` of an empty sequence raises ValueError; report 0 when no
    # node line could be parsed.
    node_information['max_cores_per_job'] = (
        max(i[1] for i in nodes) if nodes else 0)
    # Column 3 holds the per-core figure; column 2 is the total memory.
    node_information['max_memory_per_core'] = (
        max(i[3] for i in nodes) if nodes else 0)

    (_in, _out, _err) = ssh.exec_command("scontrol -o show part")
    # Assuming only one partition
    line = _out.read()
    match = slurm_scontrol_maxtime_regexp.match(line)
    node_information['max_walltime'] = '672hours'
    if match:
        maxtime = match.group('MaxTime')
        if maxtime != 'UNLIMITED':
            node_information['max_walltime'] = maxtime

    return node_information
def __init_keystone_session_v2(self, check=False):
    """Build a Keystone session using the ``password`` auth plugin.

    :param bool check: when true, validate the session by listing Nova
        flavors before returning it
    :return: the session object, or ``None`` if `check` was requested
        and the validation request failed
    """
    from keystoneauth1 import loading as keystone_v2

    plugin_loader = keystone_v2.get_plugin_loader('password')
    credentials = dict(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        project_name=self._os_tenant_name,
    )
    auth = plugin_loader.load_from_options(**credentials)
    sess = keystoneauth1.session.Session(auth=auth, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v2 session works...")
        try:
            # An invalid session makes this request raise.
            nova_client.Client(
                self._compute_api_version,
                session=sess,
                cacert=self._os_cacert).flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v2 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error("OpenStack server rejected request (likely configuration error?): %s", err)
            # FIXME: should we be raising an error instead?
            return None

    # Either no check was requested or the check passed.
    log.info("Using Keystone API v2 session to authenticate to OpenStack")
    return sess
def setup(self):
    """Configure the cluster nodes through the setup provider.

    The work is delegated to ``self._setup_provider.setup_cluster``.
    Any `Exception` raised by the provider is logged and turned into a
    ``False`` result instead of propagating.

    :return: the provider's return value, or ``False`` if it raised;
        a falsy result also logs a hint to re-run the setup command
    """
    try:
        configured = self._setup_provider.setup_cluster(self)
    except Exception as e:
        log.error(
            "the setup provider was not able to setup the cluster, "
            "but the cluster is running by now. Setup provider error "
            "message: `%s`",
            str(e),
        )
        configured = False

    if configured:
        return configured

    log.warning(
        "Cluster `%s` not yet configured. Please, re-run "
        "`elasticluster setup %s` and/or check your configuration",
        self.name,
        self.name,
    )
    return configured
def pre_run(self):
    """Prepare logging, storage and configuration before the subcommand.

    In order: handle ``--version`` (print and exit 0); run the base
    class ``pre_run``; route Python warnings through logging; set the
    log level from ``self.params.verbose``; create the storage
    directory if missing; deploy the default configuration file or
    exit if a user-supplied one is missing; finally call ``pre_run`` on
    the selected subcommand.  Failures are written to standard error
    and end the process with exit status 1.
    """
    # Hack around http://bugs.python.org/issue9253 ?
    if "--version" in sys.argv:
        import pkg_resources
        dist = pkg_resources.get_distribution("elasticluster")
        print("elasticluster version %s" % dist.version)
        sys.exit(0)

    cli.app.CommandLineApp.pre_run(self)

    # print *all* Python warnings through the logging subsystem
    warnings.resetwarnings()
    warnings.simplefilter('once')
    utils.redirect_warnings(logger='gc3.elasticluster')

    # Each unit of verbosity lowers the threshold by 10 starting from
    # WARNING; the level never goes below 1.
    verbosity = max(0, self.params.verbose)
    loglevel = max(1, logging.WARNING - 10 * verbosity)
    coloredlogs.install(logger=log, level=loglevel)

    # In debug mode, avoid forking
    if self.params.verbose > 3:
        log.DO_NOT_FORK = True
        log.raiseExceptions = True

    storage_dir = self.params.storage
    if not os.path.isdir(storage_dir):
        # `os.makedirs` also creates any missing parent directory.
        try:
            os.makedirs(storage_dir)
        except OSError as ex:
            sys.stderr.write("Unable to create storage directory: "
                             "%s\n" % (str(ex)))
            sys.exit(1)

    config_path = self.params.config
    if not os.path.isfile(config_path):
        if config_path == self.default_configuration_file:
            # Default location: deploy the bundled template there.
            config_dir = os.path.dirname(config_path)
            if not os.path.exists(config_dir):
                os.mkdir(config_dir)
            template = resource_filename(
                'elasticluster', 'share/etc/config.template')
            log.warning("Deploying default configuration file to %s.",
                        config_path)
            shutil.copyfile(template, config_path)
        elif not os.path.isfile(config_path):
            # Exit if the supplied configuration file does not exist.
            sys.stderr.write(
                "Unable to read configuration file `%s`.\n" % config_path)
            sys.exit(1)

    assert self.params.func, ("No subcommand defined in `ElastiCluster.setup()")
    try:
        self.params.func.pre_run()
    except (RuntimeError, ConfigurationError) as ex:
        sys.stderr.write(str(ex).strip())
        sys.stderr.write('\n')
        sys.exit(1)
def execute(self):
    """
    Load the cluster and build a GC3Pie configuration snippet.

    The snippet is appended to the file named by ``self.params.append``
    when that option is set, and printed to standard output otherwise.
    A cluster that cannot be loaded, or a file that cannot be written,
    is reported through the log and the method returns normally.
    """
    log.warning(
        "Command `elasticluster gc3pie-config` is DEPRECATED"
        " and will be removed in release 1.4 of ElastiCluster")
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    from elasticluster.gc3pie_config import create_gc3pie_config_snippet

    if self.params.append:
        path = os.path.expanduser(self.params.append)
        try:
            # `with` closes the file even if building or writing the
            # snippet raises.
            with open(path, 'a') as fd:
                fd.write(create_gc3pie_config_snippet(cluster))
        except IOError as ex:
            log.error("Unable to write configuration to file %s: %s",
                      path, ex)
    else:
        print(create_gc3pie_config_snippet(cluster))
def inspect_slurm_cluster(ssh, node_information):
    """Fill `node_information` with data gathered from a SLURM cluster.

    Runs ``sinfo -hNel`` and ``scontrol -o show part`` through
    ``ssh.exec_command`` and stores these keys in `node_information`:
    ``num_nodes``, ``max_cores``, ``max_cores_per_job``,
    ``max_memory_per_core`` and ``max_walltime``.

    ``sinfo`` lines that do not match ``slurm_sinfo_regexp`` are
    reported with a warning and skipped.  If no line matches at all,
    the numeric values are all 0.  ``max_walltime`` defaults to
    ``'672hours'`` and is replaced by the partition's ``MaxTime``
    unless that is ``UNLIMITED``.

    :param ssh: object providing ``exec_command(cmd)``, returning a
        ``(stdin, stdout, stderr)`` triple
    :param dict node_information: mapping to update in place
    :return: the same `node_information` mapping
    """
    (_in, _out, _err) = ssh.exec_command("sinfo -hNel")

    # One entry per parsed line:
    # [num_nodes, num_cores, memory, memory_per_core]
    nodes = []
    for line in _out:
        match = slurm_sinfo_regexp.match(line)
        if not match:
            log.warning(
                "Unable to parse output of sinfo: following line doesn't match node regexp: '%s'",
                line.strip())
            continue
        num_nodes = int(match.group('num'))
        num_cores = int(match.group('cpus')) * num_nodes
        memory = int(match.group('memory')) * num_nodes
        memory_per_core = float(match.group('memory')) / num_cores
        nodes.append([num_nodes, num_cores, memory, memory_per_core])

    node_information['num_nodes'] = sum(i[0] for i in nodes)
    node_information['max_cores'] = sum(i[1] for i in nodes)
    # `max()` of an empty sequence raises ValueError; report 0 when no
    # node line could be parsed.
    node_information['max_cores_per_job'] = (
        max(i[1] for i in nodes) if nodes else 0)
    # Column 3 holds the per-core figure; column 2 is the total memory.
    node_information['max_memory_per_core'] = (
        max(i[3] for i in nodes) if nodes else 0)

    (_in, _out, _err) = ssh.exec_command("scontrol -o show part")
    # Assuming only one partition
    line = _out.read()
    match = slurm_scontrol_maxtime_regexp.match(line)
    node_information['max_walltime'] = '672hours'
    if match:
        maxtime = match.group('MaxTime')
        if maxtime != 'UNLIMITED':
            node_information['max_walltime'] = maxtime

    return node_information
def execute(self):
    """
    Load the cluster and build a GC3Pie configuration snippet.

    The snippet is appended to the file named by ``self.params.append``
    when that option is set, and printed to standard output otherwise.
    A cluster that cannot be loaded, or a file that cannot be written,
    is reported through the log and the method returns normally.
    """
    log.warning("Command `elasticluster gc3pie-config` is DEPRECATED"
                " and will be removed in release 1.4 of ElastiCluster")
    creator = make_creator(self.params.config,
                           storage_path=self.params.storage)
    cluster_name = self.params.cluster
    try:
        cluster = creator.load_cluster(cluster_name)
    except (ClusterNotFound, ConfigurationError) as ex:
        log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
        return

    from elasticluster.gc3pie_config import create_gc3pie_config_snippet

    if self.params.append:
        path = os.path.expanduser(self.params.append)
        try:
            # `with` closes the file even if building or writing the
            # snippet raises.
            with open(path, 'a') as fd:
                fd.write(create_gc3pie_config_snippet(cluster))
        except IOError as ex:
            log.error("Unable to write configuration to file %s: %s",
                      path, ex)
    else:
        print(create_gc3pie_config_snippet(cluster))
def stop(self, force=False, wait=False):
    """
    Stop all nodes of this cluster and delete its saved data.

    :param bool force:
      delete the saved data even if some nodes could not be stopped;
      without it, the cluster state is saved instead so that the stop
      can be retried.
    :param bool wait:
      forwarded unchanged to ``self._stop_all_nodes``.
    """
    log.debug("Stopping cluster `%s` ...", self.name)

    failed_count = self._stop_all_nodes(wait)
    all_stopped = not failed_count
    if all_stopped:
        self._delete_saved_data()
    elif force:
        self._delete_saved_data()
        log.warning(
            "Not all cluster nodes have been terminated."
            " However, as requested, data about the cluster"
            " has been removed from local storage.")
    else:
        self.repository.save_or_update(self)
        log.warning(
            "Not all cluster nodes have been terminated."
            " Fix errors above and re-run `elasticluster stop %s`",
            self.name)
def execute(self):
    """Print the cluster templates found in the configuration file.

    Prints the total number of templates, narrows the list with the
    patterns in ``self.params.clusters`` (if any), and then, for each
    remaining template, creates the cluster and prints its name and the
    number of nodes per node kind.  A template that raises
    `ConfigurationError` is reported with a warning and skipped.
    """
    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage)
    config = configurator.cluster_conf

    print("""%d cluster templates found in configuration file."""
          % len(config))

    templates = config.keys()
    # NOTE(review): each pattern filters the result of the previous
    # one, so a template is kept only if it matches *all* patterns --
    # confirm this is intended rather than "matches any".
    for pattern in self.params.clusters:
        templates = [t for t in templates if fnmatch(t, pattern)]

    if self.params.clusters:
        print("""%d cluter templates found matching pattern(s) '%s'"""
              % (len(templates), ", ".join(self.params.clusters)))

    for template in templates:
        try:
            cluster = configurator.create_cluster(template, template)
            print(""" name: %s""" % template)
            for nodekind in cluster.nodes:
                print("%s nodes: %d" % (
                    nodekind,
                    len(cluster.nodes[nodekind])))
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except ConfigurationError as ex:
            log.warning("unable to load cluster `%s`: %s", template, ex)
def update(self):
    """Refresh the IP addresses of every node in the cluster.

    Calls ``update_ips()`` on each node returned by
    ``self.get_all_nodes()``.  An `InstanceError` raised for one node is
    logged as a warning and does not stop the remaining nodes from
    being updated.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except InstanceError as ex:
            log.warning(
                "Ignoring error updating information on node %s: %s",
                node, ex)
def _gather_node_ip_addresses(self, nodes, lapse, ssh_timeout, remake=False): """ Connect via SSH to each node. Return set of nodes that could not be reached with `lapse` seconds. """ # for convenience, we might set this to ``None`` if the file cannot # be opened -- but we do not want to forget the cluster-wide # setting in case the error is transient known_hosts_path = self.known_hosts_file # If run with remake=True, deletes known_hosts_file so that it will # be recreated. Prevents "Invalid host key" errors if remake and os.path.isfile(known_hosts_path): os.remove(known_hosts_path) # Create the file if it's not present, otherwise the # following lines will raise an error try: fd = open(known_hosts_path, 'a') fd.close() except IOError as err: log.warning("Error opening SSH 'known hosts' file `%s`: %s", known_hosts_path, err) known_hosts_path = None keys = paramiko.hostkeys.HostKeys(known_hosts_path) with timeout(lapse, raise_timeout_error): try: while nodes: for node in copy(nodes): ssh = node.connect( keyfile=known_hosts_path, timeout=ssh_timeout) if ssh: log.info("Connection to node `%s` successful," " using IP address %s to connect.", node.name, node.connection_ip()) # Add host keys to the keys object. for host, key in ssh.get_host_keys().items(): for keytype, keydata in key.items(): keys.add(host, keytype, keydata) self._save_keys_to_known_hosts_file(keys) nodes.remove(node) if nodes: time.sleep(self.polling_interval) except TimeoutError: log.error( "Some nodes of the cluster were unreachable" " within the given %d-seconds timeout: %s", lapse, ', '.join(node.name for node in nodes)) # return list of nodes return nodes
def update(self):
    """Update the connection information of all nodes of this cluster.

    Public IP addresses, for example, may not be available
    immediately, so calling this method again later can help.

    Calls ``update_ips()`` on each node returned by
    ``self.get_all_nodes()``.  An `InstanceError` raised for one node is
    logged as a warning and does not stop the remaining nodes from
    being updated.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except InstanceError as ex:
            log.warning(
                "Ignoring error updating information on node %s: %s",
                node, ex)
def pre_run(self):
    """Prepare logging, storage and configuration before the subcommand.

    In order: handle ``--version`` (print and exit 0); run the base
    class ``pre_run``; set the log level from ``self.params.verbose``;
    create the storage directory if missing; deploy the default
    configuration file or exit if a user-supplied one is missing;
    finally call ``pre_run`` on the selected subcommand.  Failures are
    written to standard error and end the process with exit status 1.
    """
    # Hack around http://bugs.python.org/issue9253 ?
    if "--version" in sys.argv:
        import pkg_resources
        dist = pkg_resources.get_distribution("elasticluster")
        print("elasticluster version %s" % dist.version)
        sys.exit(0)

    cli.app.CommandLineApp.pre_run(self)

    # Each unit of verbosity lowers the threshold by 10 starting from
    # WARNING; the level never goes below 1.
    verbosity = max(0, self.params.verbose)
    loglevel = max(1, logging.WARNING - 10 * verbosity)
    coloredlogs.install(logger=log, level=loglevel)

    # In debug mode, avoid forking
    if self.params.verbose > 3:
        log.DO_NOT_FORK = True
        log.raiseExceptions = True

    storage_dir = self.params.storage
    if not os.path.isdir(storage_dir):
        # `os.makedirs` also creates any missing parent directory.
        try:
            os.makedirs(storage_dir)
        except OSError as ex:
            sys.stderr.write("Unable to create storage directory: "
                             "%s\n" % (str(ex)))
            sys.exit(1)

    config_path = self.params.config
    if not os.path.isfile(config_path):
        if config_path == self.default_configuration_file:
            # Default location: deploy the bundled template there.
            config_dir = os.path.dirname(config_path)
            if not os.path.exists(config_dir):
                os.mkdir(config_dir)
            template = resource_filename('elasticluster',
                                         'share/etc/config.template')
            log.warning("Deploying default configuration file to %s.",
                        config_path)
            shutil.copyfile(template, config_path)
        elif not os.path.isfile(config_path):
            # Exit if the supplied configuration file does not exist.
            sys.stderr.write(
                "Unable to read configuration file `%s`.\n" % config_path)
            sys.exit(1)

    assert self.params.func, (
        "No subcommand defined in `ElastiCluster.setup()")
    try:
        self.params.func.pre_run()
    except (RuntimeError, ConfigurationError) as ex:
        sys.stderr.write(str(ex).strip())
        sys.stderr.write('\n')
        sys.exit(1)
def update(self):
    """Update the connection information of all nodes of this cluster.

    Public IP addresses, for example, may not be available
    immediately, so calling this method again later can help.

    Calls ``update_ips()`` on each node returned by
    ``self.get_all_nodes()``.  An `InstanceError` raised for one node is
    logged as a warning and does not stop the remaining nodes from
    being updated.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except InstanceError as ex:
            log.warning(
                "Ignoring error updating information on node %s: %s",
                node, ex)
def cleanup(self):
    """
    Delete the inventory file, if there is one.

    Nothing happens when ``self.inventory_path`` is unset or does not
    exist.  An `OSError` raised while deleting the file is logged as a
    warning and otherwise ignored.
    """
    if not self.inventory_path:
        return
    if not os.path.exists(self.inventory_path):
        return
    try:
        os.unlink(self.inventory_path)
    # `except X as name` works on Python 2.6+ and Python 3; the old
    # comma form is a syntax error on Python 3.
    except OSError as ex:
        log.warning(
            "AnsibileProvider: Ignoring error while deleting "
            "inventory file %s: %s", self.inventory_path, ex)
def _check_keypair(self, name, public_key_path, private_key_path):
    """Validate the local private key and import the key pair if missing.

    The private key file must load as an unencrypted DSA or RSA key.
    If no key pair called `name` exists on the cloud, the content of
    the public key file is imported under that name.

    :param str name: name of the key pair on the cloud
    :param str public_key_path: path to the public key file; ``~`` is
        expanded
    :param str private_key_path: path to the private key file
    :raises: `KeypairError` if the private key is password-protected
        or is neither a DSA nor an RSA key, or if the key pair could
        not be imported
    """
    connection = self._connect()
    keypairs = dict((k.name, k) for k in connection.get_all_key_pairs())

    encrypted_key_msg = (
        "Key `%s` is encrypted with a password. Please, use "
        "an unencrypted key or use ssh-agent" % private_key_path)

    # decide if dsa or rsa key is provided; the loaded key objects are
    # not needed afterwards, only the fact that loading succeeded
    is_dsa_key = False
    try:
        DSSKey.from_private_key_file(private_key_path)
        is_dsa_key = True
    except PasswordRequiredException:
        raise KeypairError(encrypted_key_msg)
    except SSHException:
        try:
            RSAKey.from_private_key_file(private_key_path)
        except PasswordRequiredException:
            raise KeypairError(encrypted_key_msg)
        except SSHException:
            raise KeypairError('File `%s` is neither a valid DSA key '
                               'or RSA key.' % private_key_path)

    # create keys that don't exist yet
    if name not in keypairs:
        log.warning(
            "Keypair `%s` not found on resource `%s`, Creating a new one",
            name, self._url)
        with open(os.path.expanduser(public_key_path)) as f:
            key_material = f.read()
        try:
            # check for DSA on amazon
            if "amazon" in self._ec2host and is_dsa_key:
                log.error(
                    "Apparently, amazon does not support DSA keys. "
                    "Please specify a valid RSA key.")
                raise KeypairError(
                    "Apparently, amazon does not support DSA keys. "
                    "Please specify a valid RSA key.")
            connection.import_key_pair(name, key_material)
        # `except X as name` works on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except Exception as ex:
            # Arguments follow the placeholders: key file first, then
            # the key pair name.
            log.error(
                "Could not import key `%s` with name `%s` to `%s`",
                public_key_path, name, self._url)
            raise KeypairError(
                "could not create keypair `%s`: %s" % (name, ex))
def get_stored_clusters(self):
    """
    Return the names of all clusters found in the storage directory.

    A cluster name is the file name of a regular ``.json`` file with
    the extension removed.  Every other directory entry is reported
    with a warning and skipped.
    """
    storage_dir = self._storage_dir
    cluster_names = []
    for entry in os.listdir(storage_dir):
        full_path = os.path.join(storage_dir, entry)
        is_cluster_file = entry.endswith('.json') and os.path.isfile(full_path)
        if not is_cluster_file:
            log.warning("Ignoring invalid storage file %s", full_path)
            continue
        # Strip the 5-character ".json" suffix.
        cluster_names.append(entry[:-5])
    return cluster_names
def _gather_node_ip_addresses(self, nodes, lapse): """ Connect via SSH to each node. Return set of nodes that could not be reached with `lapse` seconds. """ # for convenience, we might set this to ``None`` if the file cannot # be opened -- but we do not want to forget the cluster-wide # setting in case the error is transient known_hosts_path = self.known_hosts_file # Create the file if it's not present, otherwise the # following lines will raise an error try: fd = open(known_hosts_path, 'a') fd.close() except IOError as err: log.warning("Error opening SSH 'known hosts' file `%s`: %s", known_hosts_path, err) known_hosts_path = None keys = paramiko.hostkeys.HostKeys(known_hosts_path) with timeout(lapse, raise_timeout_error): try: while nodes: for node in copy(nodes): ssh = node.connect(keyfile=known_hosts_path) if ssh: log.info("Connection to node `%s` successful," " using IP address %s to connect.", node.name, node.connection_ip()) # Add host keys to the keys object. for host, key in ssh.get_host_keys().items(): for keytype, keydata in key.items(): keys.add(host, keytype, keydata) self._save_keys_to_known_hosts_file(keys) nodes.remove(node) if nodes: time.sleep(self.polling_interval) except TimeoutError: log.error( "Some nodes of the cluster were unreachable" " within the given %d-seconds timeout: %s", lapse, ', '.join(node.name for node in nodes)) # return list of nodes return nodes
def update(self):
    """Refresh connection information for all nodes and save the cluster.

    For each node, ``update_ips()`` is called; if the node then has IP
    addresses but its ``preferred_ip`` is unset or not among them,
    ``connect()`` is called as well.  An `InstanceError` raised for one
    node is logged as a warning and the loop continues.  After all
    nodes are processed the cluster is saved to the repository.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
            # No usable preferred address: try to connect.
            # NOTE(review): presumably `connect()` is what selects a
            # new `preferred_ip` -- confirm against the node class.
            stale_preference = node.ips and (
                not node.preferred_ip
                or node.preferred_ip not in node.ips)
            if stale_preference:
                node.connect()
        except InstanceError as ex:
            log.warning("Ignoring error updating information on node %s: %s",
                        node, str(ex))
    self.repository.save_or_update(self)
def use(self, kind, name):
    """
    Record that node name `name` of the given `kind` is taken.

    The numerical index parsed out of `name` is removed from the free
    indices of `kind`.  If it is higher than the current top index,
    the indices strictly between the old top and the new one are added
    to the free set and the top is raised to the new index.  A name
    whose index is not a number is reported with a warning.
    """
    try:
        index = int(self.parse(name)['index'], 10)
        free_indices = self._free[kind]
        if index in free_indices:
            free_indices.remove(index)
        previous_top = self._top[kind]
        if index > previous_top:
            # The skipped-over indices become available.
            free_indices.update(range(previous_top + 1, index))
            self._top[kind] = index
    except ValueError:
        log.warning(
            "Cannot extract numerical index"
            " from node name `%s`!", name)
def use(self, kind, name):
    """
    Record that node name `name` of the given `kind` is taken.

    The numerical index parsed out of `name` is removed from the free
    indices of `kind`.  If it is higher than the current top index,
    the indices strictly between the old top and the new one are added
    to the free set and the top is raised to the new index.  A name
    whose index is not a number is reported with a warning.
    """
    try:
        index = int(self._parse(name)['index'], 10)
        free_indices = self._free[kind]
        if index in free_indices:
            free_indices.remove(index)
        previous_top = self._top[kind]
        if index > previous_top:
            # The skipped-over indices become available.
            free_indices.update(range(previous_top + 1, index))
            self._top[kind] = index
    except ValueError:
        log.warning(
            "Cannot extract numerical index"
            " from node name `%s`!", name)
def read_login_section(self, name):
    """
    Read the ``login/<name>`` configuration section.

    Environment variables and ``~`` are expanded in the
    ``user_key_private`` and ``user_key_public`` entries.  A warning is
    logged if either file does not exist; the section is returned in
    any case.

    :return: dictionary with the properties of the section
    """
    config = self._read_section("login/" + name)

    for key in ("user_key_private", "user_key_public"):
        config[key] = os.path.expanduser(os.path.expandvars(config[key]))

    both_present = (os.path.exists(config["user_key_private"])
                    and os.path.exists(config["user_key_public"]))
    if not both_present:
        log.warning(
            "The key files don't exist. Please check your "
            "configuration file `user_key_public`, "
            "`user_key_private`."
        )
    return config
def _ensure_sshagent(cls):
    """Start an ssh-agent unless ``SSH_AUTH_SOCK`` is already set.

    The ``ssh-agent`` command is run and its output is scanned for
    ``NAME=value;`` assignments; ``SSH_AUTH_SOCK`` and
    ``SSH_AGENT_PID`` are copied into ``os.environ``.

    :raises: `SSHAgentError` if the ``ssh-agent`` command exits with a
        non-zero status
    """
    if 'SSH_AUTH_SOCK' in os.environ:
        return

    try:
        # `universal_newlines=True` yields text on Python 3 as well,
        # where the default output is bytes and cannot be split on a
        # `str` separator.
        output = subprocess.check_output(['ssh-agent'],
                                         universal_newlines=True)
    except subprocess.CalledProcessError:
        raise SSHAgentError

    for output_line in output.split('\n'):
        # Raw string: `=` and `;` need no escaping, and `\=` / `\;` are
        # invalid escape sequences in a normal string literal.
        match = re.match(r'(^.*)=([^;]*);.*$', output_line)
        if match:
            var_name, var_value = match.group(1, 2)
            if var_name in ('SSH_AUTH_SOCK', 'SSH_AGENT_PID'):
                os.environ[str(var_name)] = str(var_value)
    log.warning('ssh-agent started')
def check_config_or_copy_template(self):
    """Make sure a configuration file exists, deploying the default one.

    Nothing is done when the configuration file exists or when a
    directory named like the configuration file plus ``.d`` exists.
    Otherwise, if the configured path is the default one, the bundled
    template is copied there (its parent directory is created first if
    missing); for any other path an error is written to standard error
    and the process exits with status 1.
    """
    config_path = self.params.config

    # A configuration file or a `<config>.d` directory is enough.
    if os.path.isfile(config_path) or os.path.isdir(config_path + '.d'):
        return

    if config_path == self.default_configuration_file:
        # Copy the default configuration file to the default location.
        config_dir = os.path.dirname(config_path)
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        template = resource_filename(
            'elasticluster', 'share/etc/config.template')
        log.warning("Deploying default configuration file to %s.",
                    config_path)
        shutil.copyfile(template, config_path)
    elif not os.path.isfile(config_path):
        # Exit if the supplied configuration file does not exist.
        sys.stderr.write(
            "Unable to read configuration file `%s`.\n" % config_path)
        sys.exit(1)
def stop(self, force=False):
    """
    Stop every node of this cluster and remove the cluster's saved state.

    Nodes are removed from ``self.nodes`` as they are stopped; nodes
    without an instance id are just removed.  If no node is left, the
    setup provider's cleanup runs and the cluster is deleted from the
    repository.  If some nodes could not be stopped, the cluster is
    saved again, unless `force` is set.

    :param bool force: when some nodes could not be stopped, still run
        the setup provider cleanup and delete the cluster from the
        repository
    """
    for node in self.get_all_nodes():
        if node.instance_id:
            try:
                node.stop()
                self.nodes[node.kind].remove(node)
                log.debug("Removed node with instance id %s from %s"
                          % (node.instance_id, node.kind))
            except:  # Boto does not always raises an `Exception` class!
                # the node stays in `self.nodes`, so it is counted as
                # "not terminated" below
                log.error(
                    "could not stop instance `%s`, it might "
                    "already be down.", node.instance_id)
        else:
            log.debug("Not stopping node with no instance id. It seems "
                      "like node `%s` did not start correctly."
                      % node.name)
            self.nodes[node.kind].remove(node)
    if not self.get_all_nodes():
        log.debug("Removing cluster %s.", self.name)
        self._setup_provider.cleanup(self)
        self.repository.delete(self)
    elif not force:
        log.warning(
            "Not all instances have been terminated. "
            "Please rerun the `elasticluster stop %s`", self.name)
        self.repository.save_or_update(self)
    else:
        log.warning("Not all instances have been terminated. However, "
                    "as requested, the cluster has been force-removed.")
        self._setup_provider.cleanup(self)
        self.repository.delete(self)

    # NOTE(review): the original indentation was lost; this step is
    # assumed to run on every code path, not only in the `else` branch
    # above -- confirm.
    # Remove also ssh known hosts
    if os.path.exists(self.known_hosts_file):
        os.remove(self.known_hosts_file)
def __init_keystone_session_v3(self, check=False):
    """
    Build a session object that authenticates through Keystone API v3.

    .. note::

      Password authentication is the only supported authN method;
      tokens or other plug-ins are not currently supported.

    :param bool check: if true, issue a Nova "list flavors" request
        through the new session to verify that it actually works
    :return: the session object, or ``None`` if the v3 library cannot be
        imported or (with `check` set) the verification request fails
    """
    try:
        # may fail on Python 2.6?
        from keystoneauth1.identity import v3 as keystone_v3
    except ImportError:
        log.warning("Cannot load Keystone API v3 library.")
        return None

    credentials = keystone_v3.Password(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        user_domain_name=self._os_user_domain_name,
        project_domain_name=self._os_project_domain_name,
        project_name=self._os_tenant_name,
    )
    session = keystoneauth1.session.Session(
        auth=credentials, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v3 session works...")
        try:
            # an invalid session makes this request raise some exception
            probe = nova_client.Client(
                self.compute_api_version, session=session)
            probe.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v3 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error(
                "OpenStack server rejected request (likely configuration error?): %s",
                err)
            return None  # FIXME: should we be raising an error instead?

    # reaching this point means the v3 session is usable
    log.info("Using Keystone API v3 session to authenticate to OpenStack")
    return session
def cleanup(self, cluster):
    """
    Delete the inventory file written for `cluster`, if there is one.

    When the storage directory is flagged as temporary
    (``self._storage_path_tmp``) and is empty after the deletion, the
    directory itself is removed too.  ``OSError`` raised while deleting
    is logged and ignored.

    :param cluster: cluster to clear up inventory file for
    :type cluster: :py:class:`elasticluster.cluster.Cluster`
    """
    storage_dir = self._storage_path
    if not storage_dir or not os.path.exists(storage_dir):
        return

    inventory_path = os.path.join(storage_dir, cluster.name + '.inventory')
    if not os.path.exists(inventory_path):
        return

    try:
        os.unlink(inventory_path)
        # a temporary directory is dropped as soon as nothing is left in it
        if self._storage_path_tmp and not os.listdir(self._storage_path):
            shutil.rmtree(self._storage_path)
    except OSError as ex:
        log.warning(
            "AnsibileProvider: Ignoring error while deleting "
            "inventory file %s: %s", inventory_path, ex)
def cleanup(self, cluster):
    """
    Delete the inventory file written for `cluster`, if there is one.

    The file is looked up in ``self._storage_path`` under the name
    ``<AnsibleSetupProvider.inventory_file_ending>.<cluster name>``.
    When the storage directory is flagged as temporary
    (``self._storage_path_tmp``) and is empty after the deletion, the
    directory itself is removed too.  ``OSError`` raised while deleting
    is logged and ignored.

    :param cluster: cluster to clear up inventory file for
    :type cluster: :py:class:`elasticluster.cluster.Cluster`
    """
    if self._storage_path and os.path.exists(self._storage_path):
        fname = "%s.%s" % (AnsibleSetupProvider.inventory_file_ending,
                           cluster.name)
        inventory_path = os.path.join(self._storage_path, fname)

        if os.path.exists(inventory_path):
            try:
                os.unlink(inventory_path)
                if self._storage_path_tmp:
                    # only remove the temporary directory once it is empty
                    if not os.listdir(self._storage_path):
                        shutil.rmtree(self._storage_path)
            # `except ... as ...` is valid on Python 2.6+ and Python 3
            except OSError as ex:
                log.warning(
                    "AnsibileProvider: Ignoring error while deleting "
                    "inventory file %s: %s", inventory_path, ex
                )
def update(self):
    """
    Refresh the connection information of every node in this cluster.

    Public IP addresses, for example, may not be available immediately
    after a node starts, so calling this method later can help.  Errors
    of type `InstanceError` on a node are logged and ignored.  The
    cluster is saved to the repository once all nodes were visited.
    """
    for node in self.get_all_nodes():
        try:
            node.update_ips()
            if not node.ips:
                continue
            # Keep the current `preferred_ip` as long as it is still one
            # of the node's addresses...
            if node.preferred_ip and node.preferred_ip in node.ips:
                continue
            # ...otherwise connect to the node, which gives it a chance
            # to pick a new preferred address.
            node.connect()
        except InstanceError as ex:
            log.warning("Ignoring error updating information on node %s: %s",
                        node, str(ex))
    # NOTE(review): the original layout was lost; the save is assumed to
    # happen once, after the loop -- confirm.
    self.repository.save_or_update(self)
def execute(self):
    """
    Print a summary of the cluster templates selected on the command line.

    The list of all configured templates is filtered, in turn, by each
    glob pattern in ``self.params.clusters``.  For every remaining
    template a cluster object is created and its name, image id, flavor,
    cloud and per-kind node counts are printed.  Templates that fail to
    load with `ConfigurationError` are reported with a warning and
    skipped.
    """
    templates = Configuration.Instance().list_cluster_templates()
    for pattern in self.params.clusters:
        templates = [t for t in templates if fnmatch(t, pattern)]

    print("""%d cluster templates found.""" % len(templates))
    for template in templates:
        try:
            cluster = Configurator().create_cluster(template)
            # NOTE(review): the template text is kept exactly as found;
            # confirm whether line breaks between the fields were lost.
            print(""" name: %s image id: %s flavor: %s cloud: %s""" % (
                template,
                cluster.extra['image_id'],
                cluster.extra['flavor'],
                cluster._cloud))
            for nodetype in cluster.nodes:
                print("%s nodes: %d" % (
                    nodetype, len(cluster.nodes[nodetype])))
        # `except ... as ...` is valid on Python 2.6+ and Python 3
        except ConfigurationError as ex:
            log.warning("unable to load cluster `%s`: %s", template, ex)
def _check_keypair(self, name, path):
    """
    Make sure a keypair called `name` is registered on the cloud.

    If no keypair with that name is returned by the connection, the
    contents of the file at `path` are imported under that name.

    :param str name: name of the keypair
    :param str path: path to the key file to import (``~`` is expanded)
    :raises KeypairError: if importing the key fails
    """
    connection = self._connect()
    known_names = set(k.name for k in connection.get_all_key_pairs())
    if name in known_names:
        return

    # create keys that don't exist yet
    log.warning(
        "Keypair `%s` not found on resource `%s`, Creating a new one",
        name, self._url)
    with open(os.path.expanduser(path)) as f:
        key_material = f.read()
    try:
        # TODO check if given key is a public key file
        connection.import_key_pair(name, key_material)
    except Exception as ex:
        # arguments follow the message: key file first, keypair name second
        log.error(
            "Could not import key `%s` with name `%s` to `%s`",
            path, name, self._url)
        raise KeypairError(
            "could not create keypair `%s`: %s" % (name, ex))
def __init_keystone_session_v3(self, check=False):
    """
    Build a session object that authenticates through Keystone API v3.

    .. note::

      Password authentication is the only supported authN method;
      tokens or other plug-ins are not currently supported.

    :param bool check: if true, issue a Nova "list flavors" request
        through the new session to verify that it actually works
    :return: the session object, or ``None`` if the v3 library cannot be
        imported or (with `check` set) the verification request fails
    """
    try:
        # may fail on Python 2.6?
        from keystoneauth1.identity import v3 as keystone_v3
    except ImportError:
        log.warning("Cannot load Keystone API v3 library.")
        return None

    credentials = keystone_v3.Password(
        auth_url=self._os_auth_url,
        username=self._os_username,
        password=self._os_password,
        user_domain_name=self._os_user_domain_name,
        project_domain_name=self._os_project_domain_name,
        project_name=self._os_tenant_name,
    )
    session = keystoneauth1.session.Session(
        auth=credentials, verify=self._os_cacert)

    if check:
        log.debug("Checking that Keystone API v3 session works...")
        try:
            # an invalid session makes this request raise some exception
            probe = nova_client.Client(
                self._compute_api_version, session=session)
            probe.flavors.list()
        except keystoneauth1.exceptions.NotFound as err:
            log.warning("Creating Keystone v3 session failed: %s", err)
            return None
        except keystoneauth1.exceptions.ClientException as err:
            log.error("OpenStack server rejected request (likely configuration error?): %s", err)
            return None  # FIXME: should we be raising an error instead?

    # reaching this point means the v3 session is usable
    log.info("Using Keystone API v3 session to authenticate to OpenStack")
    return session
def execute(self):
    """
    Print a summary of the cluster templates selected on the command line.

    A configurator is built from ``self.params.config`` and
    ``self.params.storage``.  The total number of configured templates
    is printed first; the template names are then filtered, in turn, by
    each glob pattern in ``self.params.clusters``.  For every remaining
    template a cluster object is created and its name, cloud and
    per-kind node counts are printed.  Templates that fail to load with
    `ConfigurationError` are reported with a warning and skipped.
    """
    configurator = Configurator.fromConfig(
        self.params.config, storage_path=self.params.storage)
    config = configurator.cluster_conf

    print("""%d cluster templates found.""" % len(config))
    templates = config.keys()
    for pattern in self.params.clusters:
        templates = [t for t in templates if fnmatch(t, pattern)]

    for template in templates:
        try:
            cluster = configurator.create_cluster(template, template)
            # NOTE(review): the template text is kept exactly as found;
            # confirm whether line breaks between the fields were lost.
            print(""" name: %s cloud: %s""" % (template, cluster._cloud))
            for nodetype in cluster.nodes:
                print("%s nodes: %d" % (
                    nodetype, len(cluster.nodes[nodetype])))
        # `except ... as ...` is valid on Python 2.6+ and Python 3
        except ConfigurationError as ex:
            log.warning("unable to load cluster `%s`: %s", template, ex)
def _check_keypair(self, name, public_key_path, private_key_path):
    """
    Make sure a keypair called `name` is registered on the cloud.

    If the cloud reports that no such keypair exists, a new one is
    created from the contents of the public key file.  An existing
    keypair is left alone; no fingerprint comparison is done here, and
    `private_key_path` is currently not used.

    :param str name: name of the ssh key
    :param str public_key_path: path to the ssh public key file
    :param str private_key_path: path to the ssh private key file

    :raises KeypairError: if the key could not be uploaded
    """
    # Check if a keypair `name` exists on the cloud.
    try:
        self.client.keypairs.get(name)
    except NotFound:
        log.warning(
            "Keypair `%s` not found on resource `%s`, Creating a new one",
            name, self._os_auth_url)
    else:
        return

    # Create a new keypair
    with open(os.path.expanduser(public_key_path)) as f:
        key_material = f.read()
    try:
        self.client.keypairs.create(name, key_material)
    except Exception as ex:
        # arguments follow the message: key file first, keypair name second
        log.error(
            "Could not import key `%s` with name `%s` to `%s`",
            public_key_path, name, self._os_auth_url)
        raise KeypairError(
            "could not create keypair `%s`: %s" % (name, ex))
def setup(self): """Configure the cluster nodes with the specified This is delegated to the provided :py:class:`elasticluster.providers.AbstractSetupProvider` :return: bool - True on success, False otherwise """ try: # setup the cluster using the setup provider ret = self._setup_provider.setup_cluster(self) except Exception as e: log.error( "the setup provider was not able to setup the cluster, " "but the cluster is running by now. Setup provider error " "message: `%s`", str(e)) ret = False if not ret: log.warning( "Cluster `%s` not yet configured. Please, re-run " "`elasticluster setup %s` and/or check your configuration", self.name, self.name) return ret
def stop(self, force=False):
    """
    Terminate all instances of this cluster and delete the cluster storage.

    Nodes are removed from ``self.nodes`` as they are stopped; nodes
    without an instance id are just removed.  If some node could not be
    stopped, the cluster is dumped to storage again so that the command
    can be re-run -- unless `force` is set, in which case the cluster is
    deleted from storage anyway.

    :param bool force: delete the stored cluster even if some instances
        could not be terminated
    """
    for node in self.get_all_nodes():
        if not node.instance_id:
            log.debug("Not stopping node with no instance id. It seems "
                      "like node `%s` did not start correctly.", node.name)
            self.nodes[node.type].remove(node)
            continue
        try:
            node.stop()
            self.nodes[node.type].remove(node)
            log.debug("Removed node with instance id %s from %s",
                      node.instance_id, node.type)
        except:  # Boto does not always raises an `Exception` class!
            log.error(
                "could not stop instance `%s`, it might "
                "already be down.", node.instance_id)

    leftovers = self.get_all_nodes()
    if leftovers and not force:
        # keep the cluster around so that `stop` can be tried again
        log.warning(
            "Not all instances have been terminated. "
            "Please rerun the `elasticluster stop %s`", self.name)
        self._storage.dump_cluster(self)
        return

    if leftovers:
        log.warning("Not all instances have been terminated. However, "
                    "as requested, the cluster has been force-removed.")
    else:
        log.debug("Removing cluster %s.", self.name)
    self._setup_provider.cleanup()
    self._storage.delete_cluster(self.name)
def _check_keypair(self, name, public_key_path, private_key_path):
    """
    Validate the local private key and make sure keypair `name` exists.

    The private key file is first loaded as a DSA key and, failing
    that, as an RSA key.  A password-protected key cannot be checked;
    this only logs a warning.  If no keypair called `name` is returned
    by the connection, the contents of the public key file are imported
    under that name.

    :param str name: name of the ssh key
    :param str public_key_path: path to the ssh public key file
    :param str private_key_path: path to the ssh private key file

    :raises KeypairError: if the private key is neither a valid DSA nor
        RSA key, or if the key could not be imported
    """
    connection = self._connect()
    known_names = set(k.name for k in connection.get_all_key_pairs())

    # decide if dsa or rsa key is provided; the key objects themselves
    # are not needed, loading them is only a validity check
    is_dsa_key = False
    try:
        DSSKey.from_private_key_file(private_key_path)
        is_dsa_key = True
    except PasswordRequiredException:
        log.warning(
            "Unable to check key file `%s` because it is encrypted with a "
            "password. Please, ensure that you added it to the SSH agent "
            "with `ssh-add %s`", private_key_path, private_key_path)
    except SSHException:
        try:
            RSAKey.from_private_key_file(private_key_path)
        except PasswordRequiredException:
            log.warning(
                "Unable to check key file `%s` because it is encrypted with a "
                "password. Please, ensure that you added it to the SSH agent "
                "with `ssh-add %s`", private_key_path, private_key_path)
        except SSHException:
            raise KeypairError('File `%s` is neither a valid DSA key '
                               'or RSA key.' % private_key_path)

    if name in known_names:
        return

    # create keys that don't exist yet
    log.warning(
        "Keypair `%s` not found on resource `%s`, Creating a new one",
        name, self._url)
    with open(os.path.expanduser(public_key_path)) as f:
        key_material = f.read()
    try:
        # check for DSA on amazon
        if "amazon" in self._ec2host and is_dsa_key:
            log.error(
                "Apparently, amazon does not support DSA keys. "
                "Please specify a valid RSA key.")
            # raised inside the `try` on purpose: it is reported and
            # wrapped by the handler below, like any other import error
            raise KeypairError(
                "Apparently, amazon does not support DSA keys. "
                "Please specify a valid RSA key.")
        connection.import_key_pair(name, key_material)
    except Exception as ex:
        # arguments follow the message: key file first, keypair name second
        log.error(
            "Could not import key `%s` with name `%s` to `%s`",
            public_key_path, name, self._url)
        raise KeypairError(
            "could not create keypair `%s`: %s" % (name, ex))
def get_ssh_to_node(self, ssh_to=None):
    """
    Return target node for SSH/SFTP connections.

    The lookup proceeds as follows, stopping at the first hit:

    1. `ssh_to` (defaulting to the cluster's configured ``ssh_to``) is
       tried as a node name;
    2. if it parses as a node name of some class, or already is a class
       name, the first node of that class is returned;
    3. the first node of class ``ssh``, ``login``, ``frontend`` or
       ``master`` (in this order) is returned;
    4. the first node of the first non-empty class, in alphabetic order
       of class names, is returned.

    :return: :py:class:`Node`
    :raise: :py:class:`elasticluster.exceptions.NodeNotFound`
      if no valid frontend node is found
    :raise: `ConfigurationError` if the requested node class does not
      exist in this cluster
    """
    target = self.ssh_to if ssh_to is None else ssh_to

    if target:
        # first try to interpret `target` as a node name
        try:
            return self.get_node_by_name(target)
        except NodeNotFound:
            pass
        # next, reduce a node name to the name of its class
        try:
            parts = self._naming_policy.parse(target)
            log.warning(
                "Node `%s` not found."
                " Trying to find other node in class `%s` ...",
                target, parts['kind'])
            target = parts['kind']
        except ValueError:
            # it's already a class name
            pass

    # try getting first node of kind `target`
    if target:
        try:
            candidates = self.nodes[target]
        except KeyError:
            raise ConfigurationError(
                "Invalid configuration item `ssh_to={ssh_to}` in cluster `{name}`:"
                " node class `{ssh_to}` does not exist in this cluster."
                .format(ssh_to=target, name=self.name))
        try:
            return candidates[0]
        except IndexError:
            log.warning(
                "Chosen `ssh_to` class `%s` is empty: unable to "
                "get the choosen frontend node from that class.",
                target)

    # Here `ssh_to` was not set or the preferred class was empty:
    # try the "natural" `ssh_to` values.
    for kind in ('ssh', 'login', 'frontend', 'master'):
        try:
            return self.nodes[kind][0]
        except (KeyError, IndexError):
            pass

    # ... if all else fails, return first node
    for kind in sorted(self.nodes.keys()):
        members = self.nodes[kind]
        if members:
            return members[0]

    # Uh-oh, no nodes in this cluster!
    raise NodeNotFound("Unable to find a valid frontend:"
                       " cluster has no nodes!")
def _save_keys_to_known_hosts_file(self, keys):
    """
    Save `keys` to the file named by ``self.known_hosts_file``.

    This is best-effort: an ``IOError`` raised while saving is logged
    as a warning and otherwise ignored.

    :param keys: object whose ``save(path)`` method writes the host keys
    """
    try:
        keys.save(self.known_hosts_file)
    except IOError:
        log.warning("Ignoring error saving known_hosts file: %s",
                    self.known_hosts_file)
def start(self, min_nodes=None, max_concurrent_requests=0):
    """
    Start all the nodes of this cluster.

    Nodes are started either in parallel or one at a time, depending on
    `max_concurrent_requests`.  The method then waits (up to
    ``self.startup_timeout``) for the nodes to come up, tries to
    connect to each started node to gather IP addresses and SSH host
    keys, and finally checks the resulting cluster size against
    `min_nodes`.  The cluster state is saved to the repository after
    each of these phases.

    This method is blocking and might take some time depending on the
    amount of instances to start.

    :param min_nodes: minimum number of nodes to start in case the quota
                      is reached before all instances are up
    :type min_nodes: dict [node_kind] = number
    :param int max_concurrent_requests:
      Issue at most this number of requests to start
      VMs; if 1 or less, start nodes one at a time (sequentially).
      The special value ``0`` means run 4 threads for each available
      processor.
    """
    all_nodes = self.get_all_nodes()

    log.info("Starting cluster nodes ...")
    concurrency = max_concurrent_requests
    if concurrency == 0:
        try:
            concurrency = 4 * get_num_processors()
        except RuntimeError:
            log.warning("Cannot determine number of processors!"
                        " will start nodes sequentially...")
            concurrency = 1

    if concurrency > 1:
        started = self._start_nodes_parallel(all_nodes, concurrency)
    else:
        started = self._start_nodes_sequentially(all_nodes)

    # checkpoint cluster state
    self.repository.save_or_update(self)

    never_came_up = self._check_starting_nodes(started, self.startup_timeout)

    # all nodes had their chance to come up: checkpoint again
    self.repository.save_or_update(self)

    # Try to connect to each node to gather IP addresses and SSH host keys
    log.info("Checking SSH connection to nodes ...")
    reachable = started - never_came_up
    self._gather_node_ip_addresses(reachable, self.startup_timeout)

    # Connecting to a node may have updated its `preferred_ip`
    # attribute, so save the cluster once more.
    self.repository.save_or_update(self)

    # Many things can go wrong while starting a cluster.  To ensure the
    # cluster matches the user's needs in terms of size, compare the
    # nodes actually available in each group with the required minimum.
    required = self._compute_min_nodes(min_nodes)
    self._check_cluster_size(required)
def _check_keypair(self, name, public_key_path, private_key_path):
    """
    Validate the local keypair and make sure it is registered on the cloud.

    The private key file is first loaded as a DSA key and, failing
    that, as an RSA key; a password-protected key cannot be loaded and
    only triggers a warning.  If a keypair called `name` already exists
    on the cloud and the local key could be loaded, their fingerprints
    are compared.  If no such keypair exists, a new one is created from
    the contents of the public key file.

    :param str name: name of the ssh key
    :param str public_key_path: path to the ssh public key file
    :param str private_key_path: path to the ssh private key file

    :raises: `KeypairError` if key is not a valid RSA or DSA key,
             the key could not be uploaded or the fingerprint does not
             match to the one uploaded to the cloud.
    """
    self._init_os_api()

    encrypted_key_msg = (
        "Unable to check key file `{0}` because it is encrypted with a "
        "password. Please, ensure that you added it to the SSH agent "
        "with `ssh-add {1}`".format(private_key_path, private_key_path))

    # Read key. We do it as first thing because we need it either
    # way, to check the fingerprint of the remote keypair if it
    # exists already, or to create a new keypair.
    pkey = None
    try:
        pkey = DSSKey.from_private_key_file(private_key_path)
    except PasswordRequiredException:
        warn(encrypted_key_msg)
    except SSHException:
        try:
            pkey = RSAKey.from_private_key_file(private_key_path)
        except PasswordRequiredException:
            warn(encrypted_key_msg)
        except SSHException:
            raise KeypairError('File `%s` is neither a valid DSA key '
                               'or RSA key.' % private_key_path)

    try:
        # Check if a keypair `name` exists on the cloud.
        keypair = self.nova_client.keypairs.get(name)

        # Check if it has the correct keypair, but only if we can read the local key
        if pkey:
            # colon-separated lowercase hex; going through `bytearray`
            # yields integers on both Python 2 and Python 3
            fingerprint = ':'.join(
                '%02x' % octet for octet in bytearray(pkey.get_fingerprint()))
            if fingerprint != keypair.fingerprint:
                raise KeypairError("Keypair `%s` is present but has "
                                   "different fingerprint. Aborting!"
                                   % name)
        else:
            warn(
                "Unable to check if the keypair is using the correct key.")
    except NotFound:
        log.warning(
            "Keypair `%s` not found on resource `%s`, Creating a new one",
            name, self._os_auth_url)

        # Create a new keypair
        with open(os.path.expanduser(public_key_path)) as f:
            key_material = f.read()
        try:
            self.nova_client.keypairs.create(name, key_material)
        except Exception as ex:
            # arguments follow the message: key file first, name second
            log.error(
                "Could not import key `%s` with name `%s` to `%s`",
                public_key_path, name, self._os_auth_url)
            raise KeypairError("could not create keypair `%s`: %s"
                               % (name, ex))
def _build_inventory(self, cluster):
    """
    Write the Ansible inventory file for `cluster` and return its path.

    Every node with a preferred IP address and a node kind listed in
    ``self.groups`` is added to each inventory group configured for its
    kind, together with its ``ansible_*`` variables.  Other nodes are
    skipped with a warning.

    :param cluster: cluster to build inventory for
    :type cluster: :py:class:`elasticluster.cluster.Cluster`
    :return: path of the inventory file, or ``None`` if no node
        qualified and hence no file was written
    """
    # maps inventory group name -> list of `(name, ip, vars)` host tuples
    inventory_data = defaultdict(list)

    for node in cluster.get_all_nodes():
        if node.preferred_ip is None:
            log.warning("Ignoring node `{0}`: No IP address.".format(
                node.name))
            continue
        if node.kind not in self.groups:
            # FIXME: should this raise a `ConfigurationError` instead?
            log.warning("Ignoring node `{0}`:"
                        " Node kind `{1}` not defined in cluster!".format(
                            node.name, node.kind))
            continue

        extra_vars = ['ansible_user=%s' % node.image_user]

        ip_addr, port = parse_ip_address_and_port(node.preferred_ip)
        # the port is only written out when it differs from 22
        if port != 22:
            extra_vars.append('ansible_port=%s' % port)

        # write additional `ansible_*` variables to inventory;
        # `ansible_python_interpreter` gets special treatment
        # since we need to tell script `install-py2.sh` that
        # it should create a wrapper script for running `eatmydata python`
        # (work on a copy: `pop` below must not alter `self.extra_conf`)
        extra_conf = self.extra_conf.copy()
        ansible_python_interpreter = extra_conf.pop(
            'ansible_python_interpreter', '/usr/bin/python')
        extra_vars.append(
            'ansible_python_interpreter={python}{eatmydata}'.format(
                python=ansible_python_interpreter,
                eatmydata=('+eatmydata' if self.use_eatmydata else '')))
        extra_vars.extend('%s=%s' % (k, v)
                          for k, v in extra_conf.items()
                          if k.startswith('ansible_'))

        # per-kind variables are appended after the `ansible_*` ones
        if node.kind in self.environment:
            extra_vars.extend(
                '%s=%s' % (k, v)
                for k, v in self.environment[node.kind].items())

        for group in self.groups[node.kind]:
            inventory_data[group].append(
                (node.name, ip_addr, ' '.join(extra_vars)))

    if not inventory_data:
        log.info("No inventory file was created.")
        return None

    # create a temporary file to pass to ansible, since the
    # api is not stable yet...
    if self._storage_path_tmp:
        if not self._storage_path:
            self._storage_path = tempfile.mkdtemp()
        # NOTE(review): the original indentation was lost; this warning
        # is assumed to be emitted whenever temporary storage is in use,
        # not only when the directory was just created -- confirm.
        elasticluster.log.warning("Writing inventory file to tmp dir `%s`", self._storage_path)

    inventory_path = os.path.join(
        self._storage_path, (cluster.name + '.inventory'))
    log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)
    with open(inventory_path, 'w+') as inventory_file:
        for section, hosts in inventory_data.items():
            # Ansible throws an error "argument of type 'NoneType' is not
            # iterable" if a section is empty, so ensure we have something
            # to write in there
            if hosts:
                inventory_file.write("\n[" + section + "]\n")
                for host in hosts:
                    hostline = "{0} ansible_host={1} {2}\n".format(*host)
                    inventory_file.write(hostline)
    return inventory_path