示例#1
0
    def stop(self, force=False):
        """Terminate every node of this cluster and drop its stored state.

        Nodes without an instance id are assumed to have never started
        and are simply removed from the cluster.  When no node is left
        afterwards, the setup provider is cleaned up and the cluster is
        deleted from the repository.  Otherwise the cluster is saved
        again so the operation can be retried, unless `force` is set.

        :param bool force: delete the cluster from the repository even
                           if some instances could not be stopped
        """
        for node in self.get_all_nodes():
            if not node.instance_id:
                log.debug("Not stopping node with no instance id. It seems "
                          "like node `%s` did not start correctly."
                          % node.name)
                self.nodes[node.kind].remove(node)
                continue

            try:
                node.stop()
                self.nodes[node.kind].remove(node)
                log.debug("Removed node with instance id %s from %s"
                          % (node.instance_id, node.kind))
            except:
                # Deliberately bare: Boto does not always raise
                # subclasses of `Exception`.
                log.error("could not stop instance `%s`, it might "
                          "already be down.", node.instance_id)

        leftover = self.get_all_nodes()
        if leftover and not force:
            # Keep the cluster around so that `stop` can be retried.
            log.warning("Not all instances have been terminated. "
                        "Please rerun the `elasticluster stop %s`", self.name)
            self.repository.save_or_update(self)
            return

        if leftover:
            log.warning("Not all instances have been terminated. However, "
                        "as requested, the cluster has been force-removed.")
        else:
            log.debug("Removing cluster %s.", self.name)
        self._setup_provider.cleanup(self)
        self.repository.delete(self)
    def setup(self, extra_args=tuple()):
        """
        Configure the cluster nodes.

        Actual action is delegated to the
        :py:class:`elasticluster.providers.AbstractSetupProvider` that
        was provided at construction time.

        :param list extra_args:
          List of additional command-line arguments
          that are appended to each invocation of the setup program.

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self, extra_args)
        except Exception as err:
            log.error(
                "The cluster hosts are up and running,"
                " but %s failed to set the cluster up: %s",
                self._setup_provider.HUMAN_READABLE_NAME, err)
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name, self.name)

        return ret
示例#3
0
 def update(self):
     """Refresh the IP information of every node in the cluster.

     Calls `update_ips()` on each node returned by `get_all_nodes()`.
     An `InstanceError` raised by one node is logged as a warning and
     does not prevent the remaining nodes from being updated.
     """
     for node in self.get_all_nodes():
         try:
             node.update_ips()
         except InstanceError as ex:
             # `except X, name` is Python-2-only syntax; the `as` form
             # works on Python 2.6+ and Python 3 alike.
             log.warning("Ignoring error updating information on node %s: %s",
                         node, str(ex))
示例#4
0
    def get_frontend_node(self):
        """Pick the node that should be used as the cluster frontend.

        If `ssh_to` is set, the first node of that class is returned.
        When `ssh_to` is unset, or its class has no nodes, the first
        node of the first non-empty class (in sorted order) is used.

        :return: :py:class:`Node`
        :raise: :py:class:`elasticluster.exceptions.NodeNotFound` if
                `ssh_to` names an unknown class, or if the cluster has
                no nodes at all
        """
        preferred = self.ssh_to
        if preferred:
            if preferred not in self.nodes:
                raise NodeNotFound(
                    "Invalid ssh_to `%s`. Please check your "
                    "configuration file." % preferred)
            candidates = self.nodes[preferred]
            if candidates:
                return candidates[0]
            log.warning(
                "preferred `ssh_to` `%s` is empty: unable to "
                "get the choosen frontend node from that class.",
                preferred)

        # No usable preference: fall back to the first non-empty class.
        for kind in sorted(self.nodes.keys()):
            members = self.nodes[kind]
            if members:
                return members[0]

        # Every class is empty.
        raise NodeNotFound("Unable to find a valid frontend: "
                           "cluster has no nodes!")
    def resume_cluster(self, cluster, extra_args=tuple()):
        """
        Configure a cluster again after it has been paused.

        Runs the resume playbook when one is configured
        (``self._resume_playbook_path`` is not ``None``).  Otherwise a
        warning is logged and the regular `setup_cluster` is used.

        :param cluster: cluster to configure
        :type cluster: :py:class:`elasticluster.cluster.Cluster`

        :param list extra_args:
          List of additional command-line arguments, forwarded
          unchanged to the playbook run.

        :return: the result of the playbook run or of `setup_cluster`;
                 ``True`` on success, ``False`` otherwise.

        :raises: `ConfigurationError` if the playbook can not be found
                 or is corrupt.
        """
        playbook = self._resume_playbook_path
        if playbook is None:
            log.warning("No resume playbook is available - falling back to the setup "
                        "playbook, which could be slow.")
            return self.setup_cluster(cluster, extra_args)
        return self._run_playbook(cluster, playbook, extra_args)
示例#6
0
    def stop_instance(self, instance_id):
        """Delete the given instance via the client from `self._connect()`.

        A falsy `instance_id` is logged and ignored.  An HTTP 404 from
        the delete request is only logged, so that the caller can still
        drop its reference to the instance.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance can not be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            delete_request = gce.instances().delete(
                project=self._project_id,
                instance=instance_id,
                zone=self._zone)
            self._check_response(self._execute_request(delete_request))
        except HttpError as http_err:
            # Anything but "not found" is a real failure.
            if http_err.resp.status != 404:
                raise InstanceError("Could not stop instance `%s`: `%s`"
                                    % (instance_id, http_err))
            log.warning("Instance to stop `%s` was not found" % instance_id)
        except CloudProviderError as provider_err:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, provider_err))
    def stop(self, force=False, wait=False):
        """
        Stop all nodes of this cluster and, if possible, forget the cluster.

        The saved data is deleted when every node could be stopped, or
        when `force` is set.  Otherwise the cluster is saved again so
        that the command can be re-run.

        :param bool force:
          remove cluster from storage even if not all nodes could be stopped.
        :param bool wait:
          passed on unchanged to `_stop_all_nodes`.
        """
        log.debug("Stopping cluster `%s` ...", self.name)

        if not self._stop_all_nodes(wait):
            # Clean shutdown: nothing left to remember.
            self._delete_saved_data()
            return

        if not force:
            self.repository.save_or_update(self)
            log.warning(
                "Not all cluster nodes have been terminated."
                " Fix errors above and re-run `elasticluster stop %s`",
                self.name)
            return

        self._delete_saved_data()
        log.warning("Not all cluster nodes have been terminated."
                    " However, as requested, data about the cluster"
                    " has been removed from local storage.")
示例#8
0
 def __init_keystone_session_v2(self, check=False):
     """Build a session object for the Keystone v2 API.

     :param bool check: if true, verify the session by asking Nova for
                        its flavor list before returning it
     :return: the session, or ``None`` when the verification request
              fails with a `keystoneauth1` ``NotFound`` or
              ``ClientException`` error
     """
     from keystoneauth1 import loading as keystone_v2
     password_loader = keystone_v2.get_plugin_loader('password')
     credentials = password_loader.load_from_options(
         auth_url=self._os_auth_url,
         username=self._os_username,
         password=self._os_password,
         project_name=self._os_tenant_name,
     )
     session = keystoneauth1.session.Session(
         auth=credentials, verify=self._os_cacert)
     if check:
         log.debug("Checking that Keystone API v2 session works...")
         try:
             # An invalid session makes this request raise.
             client = nova_client.Client(
                 self.compute_api_version,
                 session=session,
                 cacert=self._os_cacert)
             client.flavors.list()
         except keystoneauth1.exceptions.NotFound as exc:
             log.warning("Creating Keystone v2 session failed: %s", exc)
             return None
         except keystoneauth1.exceptions.ClientException as exc:
             log.error(
                 "OpenStack server rejected request (likely configuration error?): %s",
                 exc)
             return None  # FIXME: should we be raising an error instead?
     # Either no check was requested or the check succeeded.
     log.info("Using Keystone API v2 session to authenticate to OpenStack")
     return session
    def execute(self):
        """List the cluster templates defined in the configuration file.

        Prints the total number of templates, narrows the list down
        with the shell-style patterns in `self.params.clusters` (if
        any) and, for every remaining template, prints its name and the
        number of nodes of each kind.  Templates that fail to load with
        a `ConfigurationError` are reported with a warning and skipped.
        """
        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)
        config = configurator.cluster_conf

        print("""%d cluster templates found in configuration file.""" %
              len(config))
        patterns = self.params.clusters
        templates = list(config.keys())
        # NOTE(review): the filters are applied one after the other, so
        # a template is kept only if it matches *every* pattern --
        # confirm this is intended rather than "matches any pattern".
        for pattern in patterns:
            templates = [t for t in templates if fnmatch(t, pattern)]

        if patterns:
            print("%d cluster templates found matching pattern(s) '%s'" %
                  (len(templates), ", ".join(patterns)))

        for template in templates:
            try:
                cluster = configurator.create_cluster(template, template)
                print("\nname:     %s" % template)
                for nodekind in cluster.nodes:
                    print("%s nodes: %d" %
                          (nodekind, len(cluster.nodes[nodekind])))
            except ConfigurationError as ex:
                # `except X, name` is Python-2-only syntax; `as` works
                # on Python 2.6+ and Python 3 alike.
                log.warning("unable to load cluster `%s`: %s", template, ex)
示例#10
0
    def stop_instance(self, instance_id):
        """Issue a delete request for the given instance.

        Nothing is done (apart from logging) when `instance_id` is
        falsy.  If the delete request fails with HTTP status 404, the
        instance is treated as already gone and only a warning is
        logged.

        :param str instance_id: instance identifier
        :raises: `InstanceError` if instance can not be stopped
        """
        if not instance_id:
            log.info("Instance to stop has no instance id")
            return

        gce = self._connect()

        try:
            pending = gce.instances().delete(
                project=self._project_id,
                instance=instance_id,
                zone=self._zone)
            outcome = self._execute_request(pending)
            self._check_response(outcome)
        except HttpError as http_err:
            not_found = (http_err.resp.status == 404)
            if not not_found:
                raise InstanceError("Could not stop instance `%s`: `%s`"
                                    % (instance_id, http_err))
            # Return normally so the caller can remove its reference.
            log.warning("Instance to stop `%s` was not found" % instance_id)
        except CloudProviderError as provider_err:
            raise InstanceError("Could not stop instance `%s`: `%s`"
                                % (instance_id, provider_err))
示例#11
0
    def resume_cluster(self, cluster, extra_args=tuple()):
        """
        Bring a paused cluster back into a configured state.

        When ``self._resume_playbook_path`` is set, that playbook is
        run.  When it is ``None``, a warning is logged and the work is
        delegated to `setup_cluster` instead.

        :param cluster: cluster to configure
        :type cluster: :py:class:`elasticluster.cluster.Cluster`

        :param list extra_args:
          List of additional command-line arguments
          that are appended to each invocation of the setup program.

        :return: ``True`` on success, ``False`` otherwise. Please note, if nothing
                 has to be configured, then ``True`` is returned.

        :raises: `ConfigurationError` if the playbook can not be found
                 or is corrupt.
        """
        resume_playbook = self._resume_playbook_path
        have_resume_playbook = resume_playbook is not None
        if have_resume_playbook:
            return self._run_playbook(cluster, resume_playbook, extra_args)

        log.warning(
            "No resume playbook is available - falling back to the setup "
            "playbook, which could be slow.")
        return self.setup_cluster(cluster, extra_args)
示例#12
0
    def _stop_all_nodes(self, wait=False):
        """
        Stop every node in the cluster and count the ones that failed.

        Nodes without an instance ID are dropped from the cluster
        without calling `stop`.  A node whose `stop` raises
        `InstanceNotFoundError` is only logged: it is neither counted
        as a failure nor removed from the cluster here.

        :param bool wait: forwarded to each node's `stop` method
        :return: number of nodes whose `stop` raised any other exception
        """
        failures = 0
        for node in self.get_all_nodes():
            if node.instance_id:
                try:
                    node.stop(wait)
                    self.nodes[node.kind].remove(node)
                    log.debug(
                        "Removed node `%s` from cluster `%s`", node.name, self.name)
                except InstanceNotFoundError:
                    log.info(
                        "Node `%s` (instance ID `%s`) was not found;"
                        " assuming it has already been terminated.",
                        node.name, node.instance_id)
                except Exception as exc:
                    failures += 1
                    log.error(
                        "Could not stop node `%s` (instance ID `%s`): %s %s",
                        node.name, node.instance_id, exc, exc.__class__)
            else:
                log.warning(
                    "Node `%s` has no instance ID."
                    " Assuming it did not start correctly,"
                    " so removing it anyway from the cluster.", node.name)
                self.nodes[node.kind].remove(node)
        return failures
    def _stop_all_nodes(self, wait=False):
        """
        Ask each cluster node to stop; return how many could not be stopped.

        A node is removed from ``self.nodes`` either when it has no
        instance ID or when its `stop` call succeeds.  An
        `InstanceNotFoundError` is logged at info level and does not
        count as a failure; any other exception does.

        :param bool wait: forwarded to each node's `stop` method
        :return: count of nodes whose `stop` raised an exception other
                 than `InstanceNotFoundError`
        """
        error_count = 0
        for member in self.get_all_nodes():
            same_kind = self.nodes[member.kind] if not member.instance_id else None
            if same_kind is not None:
                log.warning(
                    "Node `%s` has no instance ID."
                    " Assuming it did not start correctly,"
                    " so removing it anyway from the cluster.", member.name)
                same_kind.remove(member)
                continue

            try:
                member.stop(wait)
                self.nodes[member.kind].remove(member)
                log.debug("Removed node `%s` from cluster `%s`", member.name,
                          self.name)
            except InstanceNotFoundError:
                log.info(
                    "Node `%s` (instance ID `%s`) was not found;"
                    " assuming it has already been terminated.", member.name,
                    member.instance_id)
            except Exception as problem:
                error_count += 1
                log.error("Could not stop node `%s` (instance ID `%s`): %s %s",
                          member.name, member.instance_id, problem,
                          problem.__class__)
        return error_count
示例#14
0
 def stop(self, force=False):
     """
     Stop all instances belonging to this cluster and remove the
     cluster from storage.

     Nodes without an instance id are removed from the cluster without
     being stopped.  If some nodes remain afterwards, the cluster is
     dumped to storage again so the command can be re-run, unless
     `force` is set, in which case it is deleted anyway.

     :param bool force: delete the stored cluster even if some
                        instances could not be stopped
     """
     for node in self.get_all_nodes():
         if not node.instance_id:
             log.debug("Not stopping node with no instance id. It seems "
                       "like node `%s` did not start correctly."
                       % node.name)
             self.nodes[node.type].remove(node)
             continue

         try:
             node.stop()
             self.nodes[node.type].remove(node)
             log.debug("Removed node with instance id %s from %s"
                       % (node.instance_id, node.type))
         except:
             # Deliberately bare: Boto does not always raise
             # subclasses of `Exception`.
             log.error("could not stop instance `%s`, it might "
                       "already be down.", node.instance_id)

     still_present = self.get_all_nodes()
     if still_present and not force:
         log.warning("Not all instances have been terminated. "
                     "Please rerun the `elasticluster stop %s`", self.name)
         self._storage.dump_cluster(self)
         return

     if still_present:
         log.warning("Not all instances have been terminated. However, "
                     "as requested, the cluster has been force-removed.")
     else:
         log.debug("Removing cluster %s.", self.name)
     self._setup_provider.cleanup()
     self._storage.delete_cluster(self.name)
示例#15
0
def inspect_slurm_cluster(ssh, node_information):
    """Fill `node_information` with figures read from a SLURM cluster.

    Runs ``sinfo -hNel`` and ``scontrol -o show part`` through
    `ssh.exec_command` and stores the results under the keys
    ``num_nodes``, ``max_cores``, ``max_cores_per_job``,
    ``max_memory_per_core`` and ``max_walltime``.

    :param ssh: object whose `exec_command` returns a 3-tuple; the
                second item is iterated line by line and read
    :param node_information: mapping that is updated in place
    :return: the same `node_information` object
    :raises ValueError: from `max()` if no ``sinfo`` line matched
    """
    (_in, _out, _err) = ssh.exec_command("sinfo -hNel")

    # One row per parsed line: [nodes, cores, memory, memory per core].
    rows = []
    for raw in _out:
        found = slurm_sinfo_regexp.match(raw)
        if not found:
            log.warning("Unable to parse output of sinfo: following line doesn't match node regexp: '%s'" % raw.strip())
            continue
        count = int(found.group('num'))
        cores = int(found.group('cpus')) * count
        total_memory = int(found.group('memory')) * count
        per_core = float(found.group('memory')) / cores
        rows.append([count, cores, total_memory, per_core])

    core_counts = [row[1] for row in rows]
    node_information['num_nodes'] = sum(row[0] for row in rows)
    node_information['max_cores'] = sum(core_counts)
    node_information['max_cores_per_job'] = max(core_counts)
    # NOTE(review): this reads column 2 (memory), not column 3 (memory
    # per core), which is otherwise unused -- confirm this is intended.
    node_information['max_memory_per_core'] = max(row[2] for row in rows)

    (_in, _out, _err) = ssh.exec_command("scontrol -o show part")
    # Assuming only one partition
    partition = slurm_scontrol_maxtime_regexp.match(_out.read())
    # Default, used when the limit is missing or 'UNLIMITED'.
    node_information['max_walltime'] = '672hours'
    if partition:
        limit = partition.group('MaxTime')
        if limit != 'UNLIMITED':
            node_information['max_walltime'] = limit

    return node_information
示例#16
0
    def setup(self, extra_args=tuple()):
        """
        Configure the cluster nodes.

        Actual action is delegated to the
        :py:class:`elasticluster.providers.AbstractSetupProvider` that
        was provided at construction time.

        :param list extra_args:
          List of additional command-line arguments
          that are appended to each invocation of the setup program.

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self, extra_args)
        except Exception as err:
            log.error(
                "The cluster hosts are up and running,"
                " but %s failed to set the cluster up: %s",
                self._setup_provider.HUMAN_READABLE_NAME, err)
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name, self.name)

        return ret
示例#17
0
 def __init_keystone_session_v2(self, check=False):
     """Create a Keystone API v2 session from the stored credentials.

     :param bool check: if true, validate the session by listing the
                        Nova flavors before returning it
     :return: the session object, or ``None`` if the validation request
              raised a `keystoneauth1` ``NotFound`` or
              ``ClientException`` error
     """
     from keystoneauth1 import loading as keystone_v2
     plugin = keystone_v2.get_plugin_loader('password')
     identity = plugin.load_from_options(
         auth_url=self._os_auth_url,
         username=self._os_username,
         password=self._os_password,
         project_name=self._os_tenant_name,
     )
     session = keystoneauth1.session.Session(
         auth=identity, verify=self._os_cacert)
     if check:
         log.debug("Checking that Keystone API v2 session works...")
         try:
             # An invalid session makes this request raise.
             compute = nova_client.Client(
                 self._compute_api_version,
                 session=session,
                 cacert=self._os_cacert)
             compute.flavors.list()
         except keystoneauth1.exceptions.NotFound as exc:
             log.warning("Creating Keystone v2 session failed: %s", exc)
             return None
         except keystoneauth1.exceptions.ClientException as exc:
             log.error("OpenStack server rejected request (likely configuration error?): %s", exc)
             return None  # FIXME: should we be raising an error instead?
     # Either no check was requested or the check succeeded.
     log.info("Using Keystone API v2 session to authenticate to OpenStack")
     return session
示例#18
0
    def setup(self):
        """Configure the cluster nodes with the specified  This
        is delegated to the provided :py:class:`elasticluster.providers.AbstractSetupProvider`

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self)
        except Exception as e:
            log.error(
                "the setup provider was not able to setup the cluster, "
                "but the cluster is running by now. Setup provider error "
                "message: `%s`",
                str(e),
            )
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name,
                self.name,
            )

        return ret
示例#19
0
    def pre_run(self):
        """Prepare the application before the selected subcommand runs.

        In order: handle ``--version``, run the parent class' `pre_run`,
        route Python warnings through logging, set the log level from
        ``self.params.verbose``, make sure the storage directory and
        the configuration file exist, and finally call the subcommand's
        own `pre_run`.

        Exits the process with status 1 if the storage directory cannot
        be created, if an explicitly given configuration file does not
        exist, or if the subcommand's `pre_run` raises `RuntimeError`
        or `ConfigurationError`.
        """
        # Hack around http://bugs.python.org/issue9253 ?
        if "--version" in sys.argv:
            import pkg_resources
            version = pkg_resources.get_distribution("elasticluster").version
            print("elasticluster version %s" % version)
            sys.exit(0)

        cli.app.CommandLineApp.pre_run(self)

        # print *all* Python warnings through the logging subsystem
        warnings.resetwarnings()
        warnings.simplefilter('once')
        utils.redirect_warnings(logger='gc3.elasticluster')

        # Set verbosity level: every unit of `verbose` lowers the
        # threshold by 10, starting from WARNING and never below 1.
        loglevel = max(1, logging.WARNING - 10 * max(0, self.params.verbose))
        coloredlogs.install(logger=log, level=loglevel)

        # In debug mode (verbosity above 3), avoid forking
        if self.params.verbose > 3:
            log.DO_NOT_FORK = True
            log.raiseExceptions = True

        if not os.path.isdir(self.params.storage):
            # Create the storage directory if we can; `os.makedirs`
            # also creates any missing parent directories.
            try:
                os.makedirs(self.params.storage)
            except OSError as ex:
                sys.stderr.write("Unable to create storage directory: "
                                 "%s\n" % (str(ex)))
                sys.exit(1)

        # If no configuration file was specified and default does not exists...
        if not os.path.isfile(self.params.config):
            if self.params.config == self.default_configuration_file:
                # Copy the default configuration file to the user's home.
                # `os.mkdir` only creates the last path component, so the
                # parent of the configuration directory must already exist.
                if not os.path.exists(os.path.dirname(self.params.config)):
                    os.mkdir(os.path.dirname(self.params.config))
                template = resource_filename(
                    'elasticluster', 'share/etc/config.template')
                log.warning("Deploying default configuration file to %s.",
                            self.params.config)
                shutil.copyfile(template, self.params.config)
            else:
                # Exit if supplied configuration file does not exists.
                # (This re-checks the condition already tested above.)
                if not os.path.isfile(self.params.config):
                    sys.stderr.write(
                        "Unable to read configuration file `%s`.\n" %
                        self.params.config)
                    sys.exit(1)

        # NOTE: `assert` is stripped under `python -O`, so this check
        # does not run in optimized mode.
        assert self.params.func, ("No subcommand defined in `ElastiCluster.setup()")
        try:
            self.params.func.pre_run()
        except (RuntimeError, ConfigurationError) as ex:
            # Report the problem on stderr without a traceback.
            sys.stderr.write(str(ex).strip())
            sys.stderr.write('\n')
            sys.exit(1)
示例#20
0
    def get_frontend_node(self):
        """Return the node to be used as the cluster's frontend.

        The first node of the class named by `ssh_to` is preferred.
        If `ssh_to` is not set, or that class is empty, the classes are
        scanned in sorted order and the first node of the first
        non-empty one is returned.

        :return: :py:class:`Node`
        :raise: :py:class:`elasticluster.exceptions.NodeNotFound` if
                `ssh_to` is not a known class, or if no class has
                any node
        """
        wanted = self.ssh_to
        if wanted and wanted not in self.nodes:
            raise NodeNotFound(
                "Invalid ssh_to `%s`. Please check your "
                "configuration file." % wanted)

        if wanted:
            group = self.nodes[wanted]
            if group:
                return group[0]
            log.warning(
                "preferred `ssh_to` `%s` is empty: unable to "
                "get the choosen frontend node from that class.",
                wanted)

        # Default logic: first node of the first non-empty class.
        for name in sorted(self.nodes.keys()):
            group = self.nodes[name]
            if group:
                return group[0]

        # Uh-oh, no nodes in this cluster.
        raise NodeNotFound("Unable to find a valid frontend: "
                           "cluster has no nodes!")
示例#21
0
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.

        The snippet is appended to the file named by
        ``self.params.append`` when that is set, and printed to
        standard output otherwise.  Failure to load the cluster or to
        write the file is logged as an error; no exception is
        propagated for those cases.
        """
        log.warning(
            "Command `elasticluster gc3pie-config` is DEPRECATED"
            " and will be removed in release 1.4 of ElastiCluster")
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                # `with` closes the file even when `write` raises; the
                # explicit `close()` used before was skipped on error.
                with open(path, 'a') as fd:
                    fd.write(create_gc3pie_config_snippet(cluster))
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s",
                          path, ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
示例#22
0
def inspect_slurm_cluster(ssh, node_information):
    """
    Fill `node_information` with figures gathered from a SLURM cluster.

    Runs ``sinfo -hNel`` and ``scontrol -o show part`` through
    ``ssh.exec_command`` and stores the keys ``num_nodes``, ``max_cores``,
    ``max_cores_per_job``, ``max_memory_per_core`` and ``max_walltime``.

    :param ssh: object whose ``exec_command(cmd)`` returns a
      ``(stdin, stdout, stderr)`` triple
    :param dict node_information: mapping that is updated in place
    :return: the same `node_information` object
    """
    _, sinfo_out, _ = ssh.exec_command("sinfo -hNel")

    # One row per parsed line: [nodes, cores, memory, memory per core]
    rows = []
    for line in sinfo_out:
        found = slurm_sinfo_regexp.match(line)
        if not found:
            log.warning(
                "Unable to parse output of sinfo: following line doesn't match node regexp: '%s'"
                % line.strip())
            continue
        count = int(found.group('num'))
        cores = int(found.group('cpus')) * count
        mem = int(found.group('memory')) * count
        mem_per_core = float(found.group('memory')) / cores
        rows.append([count, cores, mem, mem_per_core])

    node_information['num_nodes'] = sum(row[0] for row in rows)
    node_information['max_cores'] = sum(row[1] for row in rows)
    node_information['max_cores_per_job'] = max(row[1] for row in rows)
    # NOTE(review): this takes the total memory column (index 2), not the
    # per-core value stored at index 3 -- confirm which one is intended.
    node_information['max_memory_per_core'] = max(row[2] for row in rows)

    _, part_out, _ = ssh.exec_command("scontrol -o show part")
    # Assuming only one partition
    found = slurm_scontrol_maxtime_regexp.match(part_out.read())
    # Fallback used when no limit is reported or the limit is UNLIMITED
    node_information['max_walltime'] = '672hours'
    if found:
        limit = found.group('MaxTime')
        if limit != 'UNLIMITED':
            node_information['max_walltime'] = limit

    return node_information
示例#23
0
    def execute(self):
        """
        Load the cluster and build a GC3Pie configuration snippet.

        The snippet is appended to the file named by ``self.params.append``
        when that option is set; otherwise it is printed to standard
        output.  Failures to load the cluster or to write the file are
        logged and the method returns normally.
        """
        log.warning("Command `elasticluster gc3pie-config` is DEPRECATED"
                    " and will be removed in release 1.4 of ElastiCluster")
        creator = make_creator(self.params.config,
                               storage_path=self.params.storage)
        cluster_name = self.params.cluster
        try:
            cluster = creator.load_cluster(cluster_name)
        except (ClusterNotFound, ConfigurationError) as ex:
            log.error("Listing nodes from cluster %s: %s", cluster_name, ex)
            return

        from elasticluster.gc3pie_config import create_gc3pie_config_snippet

        if self.params.append:
            path = os.path.expanduser(self.params.append)
            try:
                # `with` closes the file even when building or writing the
                # snippet fails half-way through.
                with open(path, 'a') as fd:
                    fd.write(create_gc3pie_config_snippet(cluster))
            except IOError as ex:
                log.error("Unable to write configuration to file %s: %s", path,
                          ex)
        else:
            print(create_gc3pie_config_snippet(cluster))
示例#24
0
    def stop(self, force=False, wait=False):
        """
        Stop every node of the cluster and discard its saved data.

        :param bool force:
          discard the locally stored cluster data even when some nodes
          could not be stopped.
        :param bool wait:
          handed over unchanged to ``_stop_all_nodes``.
        """
        log.debug("Stopping cluster `%s` ...", self.name)

        leftovers = self._stop_all_nodes(wait)

        # Everything went down cleanly: nothing left to remember.
        if not leftovers:
            self._delete_saved_data()
            return

        # Some nodes survived and the caller did not insist: keep the
        # cluster on record so that the command can be run again.
        if not force:
            self.repository.save_or_update(self)
            log.warning(
                "Not all cluster nodes have been terminated."
                " Fix errors above and re-run `elasticluster stop %s`",
                self.name)
            return

        self._delete_saved_data()
        log.warning(
            "Not all cluster nodes have been terminated."
            " However, as requested, data about the cluster"
            " has been removed from local storage.")
示例#25
0
    def execute(self):

        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)
        config = configurator.cluster_conf

        print("""%d cluster templates found in configuration file.""" % len(config))
        templates = config.keys()
        for pattern in self.params.clusters:
            templates = [t for t in templates if fnmatch(t, pattern)]

        if self.params.clusters:
            print("""%d cluter templates found matching pattern(s) '%s'""" % (len(templates), str.join(", ", self.params.clusters)))

        for template in templates:
            try:
                cluster = configurator.create_cluster(template, template)
                print("""
name:     %s""" % template)
                for nodekind in cluster.nodes:
                    print("%s nodes: %d" % (
                        nodekind,
                        len(cluster.nodes[nodekind])))
            except ConfigurationError, ex:
                log.warning("unable to load cluster `%s`: %s", template, ex)
示例#26
0
 def update(self):
     """
     Ask every node of the cluster to refresh its IP addresses.

     An ``InstanceError`` raised by a node is logged as a warning and
     does not stop the remaining nodes from being updated.
     """
     for node in self.get_all_nodes():
         try:
             node.update_ips()
         # `as` form works on Python 2.6+ and Python 3 alike
         except InstanceError as ex:
             log.warning(
                 "Ignoring error updating information on node %s: %s", node,
                 str(ex))
示例#27
0
    def _gather_node_ip_addresses(self, nodes, lapse, ssh_timeout, remake=False):
        """
        Connect via SSH to each node, recording the host keys seen.

        Nodes are removed from `nodes` (the caller's collection is
        mutated) as soon as a connection to them succeeds; the loop
        keeps retrying the others, sleeping ``self.polling_interval``
        between rounds.

        :param nodes: collection of nodes to connect to
        :param lapse: overall time budget handed to ``timeout()``
        :param ssh_timeout: value passed as ``timeout`` to ``node.connect``
        :param bool remake:
          delete the existing 'known hosts' file before starting
        :return: the nodes that could not be reached within `lapse` seconds
        """
        # for convenience, we might set this to ``None`` if the file cannot
        # be opened -- but we do not want to forget the cluster-wide
        # setting in case the error is transient
        known_hosts_path = self.known_hosts_file

        # If run with remake=True, deletes known_hosts_file so that it will
        # be recreated. Prevents "Invalid host key" errors
        if remake and os.path.isfile(known_hosts_path):
            os.remove(known_hosts_path)

        # Create the file if it's not present, otherwise the
        # following lines will raise an error
        try:
            fd = open(known_hosts_path, 'a')
            fd.close()
        except IOError as err:
            log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                        known_hosts_path, err)
            known_hosts_path = None

        keys = paramiko.hostkeys.HostKeys(known_hosts_path)

        # NOTE(review): `timeout` and `raise_timeout_error` are defined
        # elsewhere; presumably they raise `TimeoutError` inside this
        # block once `lapse` seconds have elapsed -- confirm.
        with timeout(lapse, raise_timeout_error):
            try:
                while nodes:
                    # iterate over a copy: `nodes` is modified in the loop
                    for node in copy(nodes):
                        ssh = node.connect(
                            keyfile=known_hosts_path,
                            timeout=ssh_timeout)
                        if ssh:
                            log.info("Connection to node `%s` successful,"
                                     " using IP address %s to connect.",
                                     node.name, node.connection_ip())
                            # Add host keys to the keys object.
                            for host, key in ssh.get_host_keys().items():
                                for keytype, keydata in key.items():
                                    keys.add(host, keytype, keydata)
                            self._save_keys_to_known_hosts_file(keys)
                            nodes.remove(node)
                    # only wait when there is still something to retry
                    if nodes:
                        time.sleep(self.polling_interval)

            except TimeoutError:
                log.error(
                    "Some nodes of the cluster were unreachable"
                    " within the given %d-seconds timeout: %s",
                    lapse, ', '.join(node.name for node in nodes))

        # whatever is left in `nodes` could not be contacted
        return nodes
示例#28
0
 def update(self):
     """Update all connection information of the nodes of this cluster.

     Public IP addresses, for example, may not be available immediately
     after a node starts, so calling this method again later can help.
     An ``InstanceError`` raised by a node is logged and ignored.
     """
     for node in self.get_all_nodes():
         try:
             node.update_ips()
         # `as` form works on Python 2.6+ and Python 3 alike
         except InstanceError as ex:
             log.warning("Ignoring error updating information on node %s: %s", node, str(ex))
示例#29
0
    def pre_run(self):
        """
        Prepare the application before the selected subcommand runs.

        Handles ``--version``, sets the logging verbosity, makes sure the
        storage directory and the configuration file exist, and finally
        calls the subcommand's own ``pre_run()``.  The process exits
        with status 1 when the storage directory cannot be created, the
        given configuration file is missing, or the subcommand's
        ``pre_run()`` raises ``RuntimeError`` or ``ConfigurationError``.
        """
        # Hack around http://bugs.python.org/issue9253 ?
        if "--version" in sys.argv:
            import pkg_resources
            dist = pkg_resources.get_distribution("elasticluster")
            print("elasticluster version %s" % dist.version)
            sys.exit(0)

        cli.app.CommandLineApp.pre_run(self)

        # Every `-v` lowers the threshold by 10, but never below 1
        verbosity = max(0, self.params.verbose)
        coloredlogs.install(
            logger=log,
            level=max(1, logging.WARNING - 10 * verbosity))

        # In debug mode, avoid forking
        if self.params.verbose > 3:
            log.DO_NOT_FORK = True
            log.raiseExceptions = True

        # Create the storage directory when it is missing
        if not os.path.isdir(self.params.storage):
            try:
                os.makedirs(self.params.storage)
            except OSError as ex:
                sys.stderr.write("Unable to create storage directory: "
                                 "%s\n" % (str(ex)))
                sys.exit(1)

        if not os.path.isfile(self.params.config):
            if self.params.config != self.default_configuration_file:
                # A configuration file was named explicitly but cannot
                # be found: give up.
                if not os.path.isfile(self.params.config):
                    sys.stderr.write(
                        "Unable to read configuration file `%s`.\n" %
                        self.params.config)
                    sys.exit(1)
            else:
                # Default location: deploy the bundled template there
                config_dir = os.path.dirname(self.params.config)
                if not os.path.exists(config_dir):
                    os.mkdir(config_dir)
                template = resource_filename('elasticluster',
                                             'share/etc/config.template')
                log.warning("Deploying default configuration file to %s.",
                            self.params.config)
                shutil.copyfile(template, self.params.config)

        assert self.params.func, (
            "No subcommand defined in `ElastiCluster.setup()")
        try:
            self.params.func.pre_run()
        except (RuntimeError, ConfigurationError) as ex:
            sys.stderr.write(str(ex).strip())
            sys.stderr.write('\n')
            sys.exit(1)
示例#30
0
 def update(self):
     """Update all connection information of the nodes of this cluster.

     Public IP addresses, for example, may not be available immediately
     after a node starts, so calling this method again later can help.
     An ``InstanceError`` raised by a node is logged and ignored.
     """
     for node in self.get_all_nodes():
         try:
             node.update_ips()
         # `as` form works on Python 2.6+ and Python 3 alike
         except InstanceError as ex:
             log.warning(
                 "Ignoring error updating information on node %s: %s", node,
                 str(ex))
 def cleanup(self):
     """
     Delete inventory file.

     Nothing happens when ``self.inventory_path`` is unset or the file
     does not exist.  An ``OSError`` raised while deleting is logged as
     a warning and otherwise ignored.
     """
     if self.inventory_path:
         if os.path.exists(self.inventory_path):
             try:
                 os.unlink(self.inventory_path)
             # `as` form works on Python 2.6+ and Python 3 alike
             except OSError as ex:
                 log.warning(
                     "AnsibileProvider: Ignoring error while deleting "
                     "inventory file %s: %s", self.inventory_path, ex)
示例#32
0
    def _check_keypair(self, name, public_key_path, private_key_path):
        """
        Validate the private key and import the public key if missing.

        The private key file must load as an unencrypted DSA or RSA key.
        When no keypair called `name` is listed by the connection, the
        content of the public key file is imported under that name.

        :param str name: name of the keypair on the remote resource
        :param str public_key_path: path to the public key file
        :param str private_key_path: path to the private key file

        :raises KeypairError: if the private key is password-protected,
          is neither a DSA nor an RSA key, or the import fails
        """
        connection = self._connect()
        keypairs = connection.get_all_key_pairs()
        keypairs = dict((k.name, k) for k in keypairs)

        # decide if dsa or rsa key is provided; the loaded key object is
        # not needed, only whether loading succeeds
        is_dsa_key = False
        try:
            DSSKey.from_private_key_file(private_key_path)
            is_dsa_key = True
        except PasswordRequiredException:
            raise KeypairError(
                "Key `%s` is encrypted with a password. Please, use "
                "an unencrypted key or use ssh-agent" %
                private_key_path)
        except SSHException:
            try:
                RSAKey.from_private_key_file(private_key_path)
            except PasswordRequiredException:
                raise KeypairError(
                    "Key `%s` is encrypted with a password. Please, use "
                    "an unencrypted key or use ssh-agent" %
                    private_key_path)
            except SSHException:
                raise KeypairError('File `%s` is neither a valid DSA key '
                                   'or RSA key.' % private_key_path)

        # create keys that don't exist yet
        if name not in keypairs:
            log.warning(
                "Keypair `%s` not found on resource `%s`, Creating a new one",
                name, self._url)
            with open(os.path.expanduser(public_key_path)) as f:
                key_material = f.read()
                try:
                    # check for DSA on amazon; the error raised here is
                    # caught and re-wrapped by the handler below
                    if "amazon" in self._ec2host and is_dsa_key:
                        log.error(
                            "Apparently, amazon does not support DSA keys. "
                            "Please specify a valid RSA key.")
                        raise KeypairError(
                            "Apparently, amazon does not support DSA keys. "
                            "Please specify a valid RSA key.")

                    connection.import_key_pair(name, key_material)
                # `as` form works on Python 2.6+ and Python 3 alike
                except Exception as ex:
                    # message placeholders are: key file, keypair name, URL
                    log.error(
                        "Could not import key `%s` with name `%s` to `%s`",
                        public_key_path, name, self._url)
                    raise KeypairError(
                        "could not create keypair `%s`: %s" % (name, ex))
示例#33
0
    def get_stored_clusters(self):
        """
        Return the names of all clusters found in the storage directory.

        A cluster is any regular file whose name ends in ``.json``; its
        name is the file name without that suffix.  Every other
        directory entry is reported with a warning and left out.
        """
        suffix = '.json'
        names = []
        for entry in os.listdir(self._storage_dir):
            full_path = os.path.join(self._storage_dir, entry)
            if not (entry.endswith(suffix) and os.path.isfile(full_path)):
                log.warning("Ignoring invalid storage file %s", full_path)
                continue
            names.append(entry[:-len(suffix)])
        return names
示例#34
0
    def _gather_node_ip_addresses(self, nodes, lapse):
        """
        Connect via SSH to each node, recording the host keys seen.

        Nodes are removed from `nodes` (the caller's collection is
        mutated) as soon as a connection to them succeeds; the loop
        keeps retrying the others, sleeping ``self.polling_interval``
        between rounds.

        :param nodes: collection of nodes to connect to
        :param lapse: overall time budget handed to ``timeout()``
        :return: the nodes that could not be reached within `lapse` seconds
        """
        # for convenience, we might set this to ``None`` if the file cannot
        # be opened -- but we do not want to forget the cluster-wide
        # setting in case the error is transient
        known_hosts_path = self.known_hosts_file

        # Create the file if it's not present, otherwise the
        # following lines will raise an error
        try:
            fd = open(known_hosts_path, 'a')
            fd.close()
        except IOError as err:
            log.warning("Error opening SSH 'known hosts' file `%s`: %s",
                        known_hosts_path, err)
            known_hosts_path = None

        keys = paramiko.hostkeys.HostKeys(known_hosts_path)

        # NOTE(review): `timeout` and `raise_timeout_error` are defined
        # elsewhere; presumably they raise `TimeoutError` inside this
        # block once `lapse` seconds have elapsed -- confirm.
        with timeout(lapse, raise_timeout_error):
            try:
                while nodes:
                    # iterate over a copy: `nodes` is modified in the loop
                    for node in copy(nodes):
                        ssh = node.connect(keyfile=known_hosts_path)
                        if ssh:
                            log.info("Connection to node `%s` successful,"
                                     " using IP address %s to connect.",
                                     node.name, node.connection_ip())
                            # Add host keys to the keys object.
                            for host, key in ssh.get_host_keys().items():
                                for keytype, keydata in key.items():
                                    keys.add(host, keytype, keydata)
                            self._save_keys_to_known_hosts_file(keys)
                            nodes.remove(node)
                    # only wait when there is still something to retry
                    if nodes:
                        time.sleep(self.polling_interval)

            except TimeoutError:
                log.error(
                    "Some nodes of the cluster were unreachable"
                    " within the given %d-seconds timeout: %s",
                    lapse, ', '.join(node.name for node in nodes))

        # whatever is left in `nodes` could not be contacted
        return nodes
示例#35
0
    def update(self):
        """Refresh the connection information of every node, then save.

        Public IP addresses, for example, may not be available right
        after a node starts, so calling this method again later can
        help.  An ``InstanceError`` raised while handling a node is
        logged and ignored; the cluster is saved to the repository at
        the end in any case.
        """
        for node in self.get_all_nodes():
            try:
                node.update_ips()

                # Reconnect when the node has addresses but its preferred
                # one is unset or no longer among them, so that a new
                # preferred address can be chosen.
                stale = (not node.preferred_ip
                         or node.preferred_ip not in node.ips)
                if node.ips and stale:
                    node.connect()
            except InstanceError as ex:
                log.warning("Ignoring error updating information on node %s: %s", node, str(ex))
        self.repository.save_or_update(self)
 def use(self, kind, name):
     """
     Record that the node name `name` of type `kind` is taken.

     The numerical index parsed out of `name` is removed from the free
     indices of `kind`.  When it lies above the highest index seen so
     far, the indices skipped in between become free and the highest
     index is raised.  A warning is logged when no index can be
     extracted.
     """
     try:
         index = int(self.parse(name)['index'], 10)
         free = self._free[kind]
         if index in free:
             free.remove(index)
         highest = self._top[kind]
         if highest < index:
             # everything between the old top and the new one is unused
             free.update(range(highest + 1, index))
             self._top[kind] = index
     except ValueError:
         log.warning(
             "Cannot extract numerical index"
             " from node name `%s`!", name)
示例#37
0
 def use(self, kind, name):
     """
     Record that the node name `name` of type `kind` is taken.

     The numerical index parsed out of `name` is removed from the free
     indices of `kind`.  When it lies above the highest index seen so
     far, the indices skipped in between become free and the highest
     index is raised.  A warning is logged when no index can be
     extracted.
     """
     try:
         index = int(self._parse(name)['index'], 10)
         free = self._free[kind]
         if index in free:
             free.remove(index)
         highest = self._top[kind]
         if highest < index:
             # everything between the old top and the new one is unused
             free.update(range(highest + 1, index))
             self._top[kind] = index
     except ValueError:
         log.warning(
             "Cannot extract numerical index"
             " from node name `%s`!", name)
示例#38
0
    def read_login_section(self, name):
        """
        Return the properties of the ``login/<name>`` configuration section.

        Environment variables and a leading ``~`` in the
        ``user_key_private`` and ``user_key_public`` entries are
        expanded.  A warning is logged when either key file is missing;
        the dictionary is returned regardless.
        """
        config = self._read_section("login/" + name)
        for option in ("user_key_private", "user_key_public"):
            expanded = os.path.expandvars(config[option])
            config[option] = os.path.expanduser(expanded)

        both_present = (os.path.exists(config["user_key_private"])
                        and os.path.exists(config["user_key_public"]))
        if not both_present:
            log.warning(
                "The key files don't exist. Please check your "
                "configuration file `user_key_public`, "
                "`user_key_private`."
            )

        return config
示例#39
0
 def _ensure_sshagent(cls):
     """Start an ``ssh-agent`` unless ``SSH_AUTH_SOCK`` is already set.

     The ``SSH_AUTH_SOCK`` and ``SSH_AGENT_PID`` assignments printed by
     the ``ssh-agent`` command are copied into ``os.environ``.

     :raises SSHAgentError: if ``ssh-agent`` exits with a non-zero status
     """
     if 'SSH_AUTH_SOCK' in os.environ:
         return
     try:
         # universal_newlines=True yields text on Python 3 as well;
         # without it the output is bytes and cannot be split on '\n'.
         output = subprocess.check_output(['ssh-agent'],
                                          universal_newlines=True)
     except subprocess.CalledProcessError:
         raise SSHAgentError
     for output_line in output.split('\n'):
         # lines of interest look like `NAME=value; export NAME;`
         match = re.match(r'(^.*)=([^;]*);.*$', output_line)
         if match:
             var_name, var_value = match.group(1, 2)
             if var_name in ('SSH_AUTH_SOCK', 'SSH_AGENT_PID'):
                 os.environ[str(var_name)] = str(var_value)
     log.warning('ssh-agent started')
     return
示例#40
0
 def check_config_or_copy_template(self):
     """
     Make sure a configuration is available, deploying the template if not.

     Nothing is done when the configuration file exists or a directory
     named like the file plus ``.d`` exists.  Otherwise the bundled
     template is copied to the default location, or, if a different
     file was requested, an error is written to standard error and the
     process exits with status 1.
     """
     config_file = self.params.config
     if os.path.isfile(config_file) or os.path.isdir(config_file + '.d'):
         return

     if config_file != self.default_configuration_file:
         # Exit if supplied configuration file does not exists.
         if not os.path.isfile(self.params.config):
             sys.stderr.write(
                 "Unable to read configuration file `%s`.\n" %
                 self.params.config)
             sys.exit(1)
         return

     # Copy the default configuration file to the user's home
     config_dir = os.path.dirname(self.params.config)
     if not os.path.exists(config_dir):
         os.mkdir(config_dir)
     template = resource_filename(
         'elasticluster', 'share/etc/config.template')
     log.warning("Deploying default configuration file to %s.",
                 self.params.config)
     shutil.copyfile(template, self.params.config)
示例#41
0
    def stop(self, force=False):
        """Stop all instances of this cluster and delete it from the
        repository.

        Nodes that were stopped, or that never got an instance id, are
        dropped from ``self.nodes``.  The SSH 'known hosts' file is
        removed at the end if it exists.

        :param bool force: delete the cluster from the repository even
          if some instances could not be stopped
        """
        for node in self.get_all_nodes():
            if not node.instance_id:
                log.debug("Not stopping node with no instance id. It seems "
                          "like node `%s` did not start correctly." %
                          node.name)
                self.nodes[node.kind].remove(node)
                continue
            try:
                node.stop()
                self.nodes[node.kind].remove(node)
                log.debug("Removed node with instance id %s from %s" %
                          (node.instance_id, node.kind))
            except:
                # Boto does not always raises an `Exception` class!
                log.error(
                    "could not stop instance `%s`, it might "
                    "already be down.", node.instance_id)

        survivors = self.get_all_nodes()
        if survivors and not force:
            # keep the cluster on record so that `stop` can be retried
            log.warning(
                "Not all instances have been terminated. "
                "Please rerun the `elasticluster stop %s`", self.name)
            self.repository.save_or_update(self)
        else:
            if survivors:
                log.warning("Not all instances have been terminated. However, "
                            "as requested, the cluster has been force-removed.")
            else:
                log.debug("Removing cluster %s.", self.name)
            self._setup_provider.cleanup(self)
            self.repository.delete(self)

        # Remove also ssh known hosts
        if os.path.exists(self.known_hosts_file):
            os.remove(self.known_hosts_file)
示例#42
0
    def __init_keystone_session_v3(self, check=False):
        """
        Build a session that authenticates through Keystone API v3.

        .. note::

          Password authentication is the only authN method supported;
          tokens and other plug-ins are not handled.

        :param bool check:
          when true, list the Nova flavors through the new session to
          make sure that it actually works.
        :return:
          the session object, or ``None`` if the Keystone v3 library
          cannot be imported or the check fails.
        """
        try:
            # may fail on Python 2.6?
            from keystoneauth1.identity import v3 as keystone_v3
        except ImportError:
            log.warning("Cannot load Keystone API v3 library.")
            return None

        credentials = keystone_v3.Password(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            user_domain_name=self._os_user_domain_name,
            project_domain_name=self._os_project_domain_name,
            project_name=self._os_tenant_name,
        )
        session = keystoneauth1.session.Session(
            auth=credentials, verify=self._os_cacert)

        if check:
            log.debug("Checking that Keystone API v3 session works...")
            try:
                # an invalid session makes the following calls raise
                compute = nova_client.Client(self.compute_api_version,
                                             session=session)
                compute.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v3 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error(
                    "OpenStack server rejected request (likely configuration error?): %s",
                    err)
                return None  # FIXME: should we be raising an error instead?

        # reaching this point means the v3 session is usable
        log.info("Using Keystone API v3 session to authenticate to OpenStack")
        return session
示例#43
0
    def cleanup(self, cluster):
        """Delete the inventory file of `cluster`, if there is one.

        When the storage directory is flagged as temporary
        (``self._storage_path_tmp``) and is empty afterwards, it is
        removed as well.  An ``OSError`` raised along the way is logged
        and ignored.

        :param cluster: cluster to clear up inventory file for
        :type cluster: :py:class:`elasticluster.cluster.Cluster`
        """
        storage_dir = self._storage_path
        if not storage_dir or not os.path.exists(storage_dir):
            return

        inventory_path = os.path.join(storage_dir,
                                      cluster.name + '.inventory')
        if not os.path.exists(inventory_path):
            return

        try:
            os.unlink(inventory_path)
            # a temporary storage directory goes away once it is empty
            if self._storage_path_tmp and not os.listdir(storage_dir):
                shutil.rmtree(storage_dir)
        except OSError as ex:
            log.warning(
                "AnsibileProvider: Ignoring error while deleting "
                "inventory file %s: %s", inventory_path, ex)
示例#44
0
    def cleanup(self, cluster):
        """Delete the inventory file of `cluster`, if there is one.

        When the storage directory is flagged as temporary
        (``self._storage_path_tmp``) and is empty afterwards, it is
        removed as well.  An ``OSError`` raised along the way is logged
        and ignored.

        :param cluster: cluster to clear up inventory file for
        :type cluster: :py:class:`elasticluster.cluster.Cluster`
        """
        if self._storage_path and os.path.exists(self._storage_path):
            fname = "%s.%s" % (AnsibleSetupProvider.inventory_file_ending, cluster.name)
            inventory_path = os.path.join(self._storage_path, fname)

            if os.path.exists(inventory_path):
                try:
                    os.unlink(inventory_path)
                    if self._storage_path_tmp:
                        # a temporary directory goes away once it is empty
                        if len(os.listdir(self._storage_path)) == 0:
                            shutil.rmtree(self._storage_path)
                # `as` form works on Python 2.6+ and Python 3 alike
                except OSError as ex:
                    log.warning(
                        "AnsibileProvider: Ignoring error while deleting " "inventory file %s: %s", inventory_path, ex
                    )
示例#45
0
    def update(self):
        """Refresh the connection information of every node, then save.

        Public IP addresses, for example, may not be available right
        after a node starts, so calling this method again later can
        help.  An ``InstanceError`` raised while handling a node is
        logged and ignored; the cluster is saved to the repository at
        the end in any case.
        """
        for node in self.get_all_nodes():
            try:
                node.update_ips()

                # Reconnect when the node has addresses but its preferred
                # one is unset or no longer among them, so that a new
                # preferred address can be chosen.
                stale = (not node.preferred_ip
                         or node.preferred_ip not in node.ips)
                if node.ips and stale:
                    node.connect()
            except InstanceError as ex:
                log.warning("Ignoring error updating information on node %s: %s",
                          node, str(ex))
        self.repository.save_or_update(self)
示例#46
0
    def execute(self):
        templates = Configuration.Instance().list_cluster_templates()
        for pattern in self.params.clusters:
            templates = [t for t in templates if fnmatch(t, pattern)]
        print("""%d cluster templates found.""" % len(templates))
        for template in templates:
            try:
                cluster = Configurator().create_cluster(template)
                print("""
name:     %s
image id: %s
flavor:   %s
cloud:    %s""" % (template, cluster.extra['image_id'],
                   cluster.extra['flavor'],
                   cluster._cloud))
                for nodetype in cluster.nodes:
                    print("%s nodes: %d" % (
                            nodetype,
                            len(cluster.nodes[nodetype])))
            except ConfigurationError, ex:
                log.warning("unable to load cluster `%s`: %s", template, ex)
示例#47
0
    def _check_keypair(self, name, path):
        """
        Import the public key at `path` as keypair `name` if it is missing.

        Nothing is done when a keypair called `name` is already listed
        by the connection.

        :param str name: name of the keypair on the remote resource
        :param str path: path to the public key file (``~`` is expanded)

        :raises KeypairError: if importing the key fails
        """
        connection = self._connect()
        keypairs = connection.get_all_key_pairs()
        keypairs = dict((k.name, k) for k in keypairs)

        # create keys that don't exist yet
        if name not in keypairs:
            log.warning(
                "Keypair `%s` not found on resource `%s`, Creating a new one",
                name, self._url)
            with open(os.path.expanduser(path)) as f:
                key_material = f.read()
                try:
                    # TODO check if given key is a public key file
                    connection.import_key_pair(name, key_material)
                # `as` form works on Python 2.6+ and Python 3 alike
                except Exception as ex:
                    # message placeholders are: key file, keypair name, URL
                    log.error(
                        "Could not import key `%s` with name `%s` to `%s`",
                        path, name, self._url)
                    raise KeypairError(
                        "could not create keypair `%s`: %s" % (name, ex))
示例#48
0
    def __init_keystone_session_v3(self, check=False):
        """
        Build a session that authenticates through Keystone API v3.

        .. note::

          Password authentication is the only authN method supported;
          tokens and other plug-ins are not handled.

        :param bool check:
          when true, list the Nova flavors through the new session to
          make sure that it actually works.
        :return:
          the session object, or ``None`` if the Keystone v3 library
          cannot be imported or the check fails.
        """
        try:
            # may fail on Python 2.6?
            from keystoneauth1.identity import v3 as keystone_v3
        except ImportError:
            log.warning("Cannot load Keystone API v3 library.")
            return None

        credentials = keystone_v3.Password(
            auth_url=self._os_auth_url,
            username=self._os_username,
            password=self._os_password,
            user_domain_name=self._os_user_domain_name,
            project_domain_name=self._os_project_domain_name,
            project_name=self._os_tenant_name,
        )
        session = keystoneauth1.session.Session(
            auth=credentials, verify=self._os_cacert)

        if check:
            log.debug("Checking that Keystone API v3 session works...")
            try:
                # an invalid session makes the following calls raise
                compute = nova_client.Client(
                    self._compute_api_version, session=session)
                compute.flavors.list()
            except keystoneauth1.exceptions.NotFound as err:
                log.warning("Creating Keystone v3 session failed: %s", err)
                return None
            except keystoneauth1.exceptions.ClientException as err:
                log.error("OpenStack server rejected request (likely configuration error?): %s", err)
                return None  # FIXME: should we be raising an error instead?

        # reaching this point means the v3 session is usable
        log.info("Using Keystone API v3 session to authenticate to OpenStack")
        return session
示例#49
0
    def execute(self):

        configurator = Configurator.fromConfig(
            self.params.config, storage_path=self.params.storage)
        config = configurator.cluster_conf

        print("""%d cluster templates found.""" % len(config))
        templates = config.keys()
        for pattern in self.params.clusters:
            templates = [t for t in templates if fnmatch(t, pattern)]

        for template in templates:
            try:
                cluster = configurator.create_cluster(template, template)
                print("""
name:     %s
cloud:     %s""" % (template, cluster._cloud))
                for nodetype in cluster.nodes:
                    print("%s nodes: %d" %
                          (nodetype, len(cluster.nodes[nodetype])))
            except ConfigurationError, ex:
                log.warning("unable to load cluster `%s`: %s", template, ex)
示例#50
0
    def _check_keypair(self, name, public_key_path, private_key_path):
        """
        Ensure that a keypair called `name` is registered on the cloud.

        The keypair is looked up through ``self.client``; if the cloud
        reports it as not found, the contents of the public key file are
        uploaded under that name.  A keypair that already exists is
        accepted as-is: this method does not validate the local key nor
        compare fingerprints.

        :param str name: name of the ssh key
        :param str public_key_path: path to the ssh public key file
        :param str private_key_path: path to the ssh private key file
          (part of the interface, but not read by this method)

        :raises: `KeypairError` if the key could not be uploaded
        """
        # Check if a keypair `name` exists on the cloud.
        try:
            self.client.keypairs.get(name)
        except NotFound:
            log.warning(
                "Keypair `%s` not found on resource `%s`, Creating a new one",
                name, self._os_auth_url)

            # Read the key material first, so that the file is already
            # closed by the time the cloud API is contacted.
            with open(os.path.expanduser(public_key_path)) as f:
                key_material = f.read()

            # Create a new keypair
            try:
                self.client.keypairs.create(name, key_material)
            except Exception as ex:
                # argument order follows the message: key file, then name
                log.error(
                    "Could not import key `%s` with name `%s` to `%s`",
                    public_key_path, name, self._os_auth_url)
                raise KeypairError(
                    "could not create keypair `%s`: %s" % (name, ex))
示例#51
0
    def setup(self):
        """Configure the cluster nodes with the specified  This
        is delegated to the provided :py:class:`elasticluster.providers.AbstractSetupProvider`

        :return: bool - True on success, False otherwise
        """
        try:
            # setup the cluster using the setup provider
            ret = self._setup_provider.setup_cluster(self)
        except Exception as e:
            log.error(
                "the setup provider was not able to setup the cluster, "
                "but the cluster is running by now. Setup provider error "
                "message: `%s`", str(e))
            ret = False

        if not ret:
            log.warning(
                "Cluster `%s` not yet configured. Please, re-run "
                "`elasticluster setup %s` and/or check your configuration",
                self.name, self.name)

        return ret
示例#52
0
 def stop(self, force=False):
     """
     Stop all nodes of this cluster and remove the cluster from storage.

     Every node with an instance id is asked to stop and, if that
     succeeds, removed from ``self.nodes``; nodes without an instance
     id are removed without being stopped.  When no node is left, the
     setup provider is cleaned up and the cluster is deleted from
     storage.  When some nodes are left, the cluster is dumped to
     storage instead, unless `force` is true, in which case it is
     deleted anyway.

     :param bool force: delete the cluster even if some nodes remain
     """
     for node in self.get_all_nodes():
         if not node.instance_id:
             # never started: nothing to terminate, just forget it
             log.debug("Not stopping node with no instance id. It seems "
                       "like node `%s` did not start correctly." %
                       node.name)
             self.nodes[node.type].remove(node)
             continue

         try:
             node.stop()
             self.nodes[node.type].remove(node)
             log.debug("Removed node with instance id %s from %s" %
                       (node.instance_id, node.type))
         except:
             # Boto does not always raises an `Exception` class!
             log.error(
                 "could not stop instance `%s`, it might "
                 "already be down.", node.instance_id)

     leftovers = self.get_all_nodes()
     if leftovers and not force:
         # keep the cluster around so that `stop` can be retried
         log.warning(
             "Not all instances have been terminated. "
             "Please rerun the `elasticluster stop %s`", self.name)
         self._storage.dump_cluster(self)
         return

     if leftovers:
         log.warning("Not all instances have been terminated. However, "
                     "as requested, the cluster has been force-removed.")
     else:
         log.debug("Removing cluster %s.", self.name)
     self._setup_provider.cleanup()
     self._storage.delete_cluster(self.name)
示例#53
0
    def _check_keypair(self, name, public_key_path, private_key_path):
        """
        Ensure that a keypair called `name` exists on the EC2 endpoint.

        The private key file is loaded first as a DSA key and then as
        an RSA key, to validate it and to find out its type; a
        password-protected key cannot be checked and only produces a
        warning.  If no keypair called `name` is known to the endpoint,
        the contents of the public key file are imported under that
        name.

        :param str name: name of the ssh key
        :param str public_key_path: path to the ssh public key file
        :param str private_key_path: path to the ssh private key file

        :raises: `KeypairError` if the private key file is neither a
                 valid DSA nor RSA key, if a DSA key is to be imported
                 into an Amazon endpoint, or if importing the key fails
        """
        connection = self._connect()
        existing_names = set(k.name for k in connection.get_all_key_pairs())

        encrypted_key_msg = (
            "Unable to check key file `%s` because it is encrypted with a "
            "password. Please, ensure that you added it to the SSH agent "
            "with `ssh-add %s`")

        # Decide if a DSA or RSA key is provided.  Only the key type is
        # needed below, so the loaded key objects are not kept.
        is_dsa_key = False
        try:
            DSSKey.from_private_key_file(private_key_path)
            is_dsa_key = True
        except PasswordRequiredException:
            log.warning(encrypted_key_msg, private_key_path, private_key_path)
        except SSHException:
            try:
                RSAKey.from_private_key_file(private_key_path)
            except PasswordRequiredException:
                log.warning(
                    encrypted_key_msg, private_key_path, private_key_path)
            except SSHException:
                raise KeypairError('File `%s` is neither a valid DSA key '
                                   'or RSA key.' % private_key_path)

        if name in existing_names:
            return

        # create keys that don't exist yet
        log.warning(
            "Keypair `%s` not found on resource `%s`, Creating a new one",
            name, self._url)
        with open(os.path.expanduser(public_key_path)) as f:
            key_material = f.read()

        # Check for DSA on amazon.  This is done outside of the `try`
        # below, so that the error is reported once and as-is instead of
        # being caught and re-wrapped as a generic import failure.
        if "amazon" in self._ec2host and is_dsa_key:
            log.error(
                "Apparently, amazon does not support DSA keys. "
                "Please specify a valid RSA key.")
            raise KeypairError(
                "Apparently, amazon does not support DSA keys. "
                "Please specify a valid RSA key.")

        try:
            connection.import_key_pair(name, key_material)
        except Exception as ex:
            # argument order follows the message: key file, then name
            log.error(
                "Could not import key `%s` with name `%s` to `%s`",
                public_key_path, name, self._url)
            raise KeypairError(
                "could not create keypair `%s`: %s" % (name, ex))
    def get_ssh_to_node(self, ssh_to=None):
        """
        Select the node that SSH/SFTP connections should target.

        The choice is driven by `ssh_to`, which defaults to the
        cluster's own ``ssh_to`` setting.  A non-empty value is tried
        as a node name first; failing that, it is reduced to a node
        class name and the first node of that class is returned.

        If no ``ssh_to`` is set, or the chosen class has no nodes, the
        classes ``ssh``, ``login``, ``frontend`` and ``master`` are
        tried in this order, and the first node of the first non-empty
        one is returned.

        As a last resort, the first node of the first non-empty class
        (in alphabetic order) is returned.

        :return: :py:class:`Node`
        :raise: :py:class:`elasticluster.exceptions.NodeNotFound`
          if the cluster has no nodes at all
        :raise: `ConfigurationError` if the requested node class does
          not exist in this cluster
        """
        target = self.ssh_to if ssh_to is None else ssh_to

        if target:
            # `target` could be the name of an actual node ...
            try:
                return self.get_node_by_name(target)
            except NodeNotFound:
                pass

            # ... otherwise, if it parses as a node name, only its
            # class part is of further use
            try:
                name_parts = self._naming_policy.parse(target)
                log.warning(
                    "Node `%s` not found."
                    " Trying to find other node in class `%s` ...", target,
                    name_parts['kind'])
                target = name_parts['kind']
            except ValueError:
                # it's already a class name
                pass

        # `target` may have been rewritten above, hence the new check
        if target:
            try:
                candidates = self.nodes[target]
            except KeyError:
                raise ConfigurationError(
                    "Invalid configuration item `ssh_to={ssh_to}` in cluster `{name}`:"
                    " node class `{ssh_to}` does not exist in this cluster.".
                    format(ssh_to=target, name=self.name))
            try:
                return candidates[0]
            except IndexError:
                log.warning(
                    "Chosen `ssh_to` class `%s` is empty: unable to "
                    "get the choosen frontend node from that class.", target)

        # Either nothing was requested or the requested class is empty:
        # fall back to the customary names for a front-end class.
        for kind in ('ssh', 'login', 'frontend', 'master'):
            try:
                return self.nodes[kind][0]
            except (KeyError, IndexError):
                continue

        # Still nothing: take the first node of any non-empty class.
        for kind in sorted(self.nodes.keys()):
            members = self.nodes[kind]
            if members:
                return members[0]

        # Uh-oh, no nodes in this cluster!
        raise NodeNotFound("Unable to find a valid frontend:"
                           " cluster has no nodes!")
 def _save_keys_to_known_hosts_file(self, keys):
     """
     Save `keys` to the file named by ``self.known_hosts_file``.

     This is a best-effort operation: an `IOError` raised while saving
     is logged as a warning and otherwise ignored.

     :param keys: object whose ``save(path)`` method writes the keys
     """
     destination = self.known_hosts_file
     try:
         keys.save(destination)
     except IOError:
         log.warning(
             "Ignoring error saving known_hosts file: %s", destination)
    def start(self, min_nodes=None, max_concurrent_requests=0):
        """
        Start all nodes of the cluster and check that they are reachable.

        The nodes are started either concurrently or one at a time,
        depending on `max_concurrent_requests`.  Then this method waits
        (up to ``self.startup_timeout``) for them to come up, gathers
        the IP addresses of the nodes that did start, and finally
        checks the cluster size against `min_nodes`.  The cluster state
        is saved to the repository after each of the first three steps.

        :param min_nodes: minimum number of nodes of each kind that
                          must be up for the cluster to be acceptable
        :type min_nodes: dict [node_kind] = number
        :param int max_concurrent_requests:
          Issue at most this number of concurrent requests to start
          nodes; if 1 or less, start nodes one at a time (sequentially).
          The special value ``0`` means 4 times the number of available
          processors (or sequential start-up, if that number cannot be
          determined).
        """
        all_nodes = self.get_all_nodes()

        log.info("Starting cluster nodes ...")
        concurrency = max_concurrent_requests
        if concurrency == 0:
            try:
                concurrency = 4 * get_num_processors()
            except RuntimeError:
                log.warning("Cannot determine number of processors!"
                            " will start nodes sequentially...")
                concurrency = 1

        if concurrency > 1:
            started = self._start_nodes_parallel(all_nodes, concurrency)
        else:
            started = self._start_nodes_sequentially(all_nodes)

        # checkpoint cluster state
        self.repository.save_or_update(self)

        failed = self._check_starting_nodes(started, self.startup_timeout)

        # checkpoint again now that the start-up check is over
        self.repository.save_or_update(self)

        # Connect to each node that came up, to gather IP addresses and
        # SSH host keys
        log.info("Checking SSH connection to nodes ...")
        self._gather_node_ip_addresses(started - failed, self.startup_timeout)

        # Connecting to a node might have updated its `preferred_ip`
        # attribute, so the cluster is saved once more.
        self.repository.save_or_update(self)

        # Many things can go wrong while starting a cluster; make sure
        # that what is up now still satisfies the minimum number of
        # nodes required in each node group.
        self._check_cluster_size(self._compute_min_nodes(min_nodes))
示例#57
0
    def _check_keypair(self, name, public_key_path, private_key_path):
        """First checks if the keypair is valid, then checks if the keypair
        is registered with on the cloud. If not the keypair is added to the
        users ssh keys.

        If the private key is password-protected it cannot be read
        here: a warning is issued and the fingerprint check is skipped.

        :param str name: name of the ssh key
        :param str public_key_path: path to the ssh public key file
        :param str private_key_path: path to the ssh private key file

        :raises: `KeypairError` if key is not a valid RSA or DSA key,
                 the key could not be uploaded or the fingerprint does not
                 match to the one uploaded to the cloud.
        """
        self._init_os_api()

        encrypted_key_msg = (
            "Unable to check key file `{0}` because it is encrypted with a "
            "password. Please, ensure that you added it to the SSH agent "
            "with `ssh-add {1}`".format(private_key_path, private_key_path))

        # Read key. We do it as first thing because we need it either
        # way, to check the fingerprint of the remote keypair if it
        # exists already, or to create a new keypair.
        pkey = None
        try:
            pkey = DSSKey.from_private_key_file(private_key_path)
        except PasswordRequiredException:
            warn(encrypted_key_msg)
        except SSHException:
            try:
                pkey = RSAKey.from_private_key_file(private_key_path)
            except PasswordRequiredException:
                warn(encrypted_key_msg)
            except SSHException:
                raise KeypairError('File `%s` is neither a valid DSA key '
                                   'or RSA key.' % private_key_path)

        # Check if a keypair `name` exists on the cloud.  Only the
        # lookup itself is guarded, so that `NotFound` cannot be
        # mistaken for anything but "no such keypair".
        try:
            keypair = self.nova_client.keypairs.get(name)
        except NotFound:
            log.warning(
                "Keypair `%s` not found on resource `%s`, Creating a new one",
                name, self._os_auth_url)

            # Create a new keypair
            with open(os.path.expanduser(public_key_path)) as f:
                key_material = f.read()
            try:
                self.nova_client.keypairs.create(name, key_material)
            except Exception as ex:
                # argument order follows the message: key file, then name
                log.error(
                    "Could not import key `%s` with name `%s` to `%s`",
                    public_key_path, name, self._os_auth_url)
                raise KeypairError("could not create keypair `%s`: %s" %
                                   (name, ex))
            return

        # Check if it has the correct keypair, but only if we can read
        # the local key
        if not pkey:
            warn(
                "Unable to check if the keypair is using the correct key.")
            return

        # `bytearray` yields integers on both Python 2 and Python 3,
        # unlike iterating over the raw fingerprint (1-char strings on
        # Python 2, integers on Python 3, neither with `.encode('hex')`
        # available on both).
        fingerprint = ':'.join(
            '%02x' % octet for octet in bytearray(pkey.get_fingerprint()))
        if fingerprint != keypair.fingerprint:
            raise KeypairError("Keypair `%s` is present but has "
                               "different fingerprint. Aborting!" %
                               name)
示例#58
0
    def _build_inventory(self, cluster):
        """
        Write the Ansible inventory file for `cluster` and return its path.

        Nodes without an IP address, or whose kind is not listed in
        ``self.groups``, are skipped with a warning.  Every other node
        is added to each inventory group associated with its kind,
        together with its ``ansible_*`` and environment variables.

        :param cluster: cluster to build inventory for
        :type cluster: :py:class:`elasticluster.cluster.Cluster`
        :return: path to the inventory file, or ``None`` if there was
          no node to list (in which case no file is written)
        """
        hosts_by_group = defaultdict(list)

        for node in cluster.get_all_nodes():
            if node.preferred_ip is None:
                log.warning("Ignoring node `{0}`: No IP address.".format(
                    node.name))
                continue
            if node.kind not in self.groups:
                # FIXME: should this raise a `ConfigurationError` instead?
                log.warning("Ignoring node `{0}`:"
                            " Node kind `{1}` not defined in cluster!".format(
                                node.name, node.kind))
                continue

            host_vars = ['ansible_user=%s' % node.image_user]

            ip_addr, port = parse_ip_address_and_port(node.preferred_ip)
            if port != 22:
                host_vars.append('ansible_port=%s' % port)

            # `ansible_python_interpreter` is taken out of the extra
            # configuration and handled on its own, because its value
            # may need the `+eatmydata` marker appended; any other
            # `ansible_*` setting is copied to the inventory verbatim.
            conf = self.extra_conf.copy()
            interpreter = conf.pop(
                'ansible_python_interpreter', '/usr/bin/python')
            marker = '+eatmydata' if self.use_eatmydata else ''
            host_vars.append(
                'ansible_python_interpreter={python}{eatmydata}'.format(
                    python=interpreter, eatmydata=marker))
            for key, value in conf.items():
                if key.startswith('ansible_'):
                    host_vars.append('%s=%s' % (key, value))

            if node.kind in self.environment:
                host_vars += [
                    '%s=%s' % (key, value)
                    for key, value in self.environment[node.kind].items()]

            host_entry = (node.name, ip_addr, ' '.join(host_vars))
            for group in self.groups[node.kind]:
                hosts_by_group[group].append(host_entry)

        if not hosts_by_group:
            log.info("No inventory file was created.")
            return None

        # The inventory is handed over to Ansible as a file; when
        # temporary storage is requested, make sure a directory exists.
        if self._storage_path_tmp:
            if not self._storage_path:
                self._storage_path = tempfile.mkdtemp()
            elasticluster.log.warning("Writing inventory file to tmp dir `%s`",
                                      self._storage_path)

        inventory_path = os.path.join(self._storage_path,
                                      cluster.name + '.inventory')
        log.debug("Writing Ansible inventory to file `%s` ...", inventory_path)
        with open(inventory_path, 'w+') as out:
            for section, hosts in hosts_by_group.items():
                # Ansible throws an error "argument of type 'NoneType' is
                # not iterable" if a section is empty, so sections with no
                # hosts are left out altogether
                if not hosts:
                    continue
                out.write("\n[" + section + "]\n")
                out.writelines(
                    "{0} ansible_host={1} {2}\n".format(*host)
                    for host in hosts)
        return inventory_path