Example #1
class ClusterExecService(CommissaireService):
    """
    Executes operations over a cluster by way of remote shell commands.
    """
    def __init__(self, exchange_name, connection_url, config_file=None):
        """
        Creates a new ClusterExecService.  If config_file is omitted,
        it will try the default location (/etc/commissaire/clusterexec.conf).

        :param exchange_name: Name of the topic exchange
        :type exchange_name: str
        :param connection_url: Kombu connection URL
        :type connection_url: str
        :param config_file: Optional configuration file path
        :type config_file: str or None
        """
        queue_kwargs = [{'routing_key': 'jobs.clusterexec.*'}]
        super().__init__(exchange_name, connection_url, queue_kwargs)
        self.storage = StorageClient(self)

        # Apply any logging configuration for this service.
        read_config_file(config_file, '/etc/commissaire/clusterexec.conf')

    def _execute(self, message, model_instance, command_args,
                 finished_hosts_key):
        """
        Remotely executes OS-specific shell commands across a cluster.

        :param message: A message instance
        :type message: kombu.message.Message
        :param model_instance: Initial model for the async operation
        :type model_instance: commissaire.models.Model
        :param command_args: Command name + arguments as a tuple
        :type command_args: tuple
        :param finished_hosts_key: Model attribute name for finished hosts
        :type finished_hosts_key: str
        """
        # Split out the command name.
        command_name = command_args[0]
        command_args = command_args[1:]

        end_status = 'finished'

        # XXX We assume the model instance names a cluster.
        #     Note, cluster_name is used in the except clause,
        #     so it must be reliably defined.
        cluster_name = getattr(model_instance, 'name', None)

        try:
            assert cluster_name is not None
            model_json_data = model_instance.to_dict()

            # Set the initial status in the store.
            self.logger.info('Setting initial status.')
            self.logger.debug('Status={}'.format(model_json_data))
            self.storage.save(model_instance)

            # Respond to the caller with the initial status.
            if message.properties.get('reply_to'):
                # XXX Have to dig up the message ID again.
                #     CommissaireService.on_message() already
                #     does this, but doesn't pass it to us.
                body = message.body
                if isinstance(body, bytes):
                    body = json.loads(body.decode())
                self.respond(message.properties['reply_to'],
                             body.get('id', -1), model_json_data)
        except Exception as error:
            self.logger.error(
                'Unable to save initial state for "{}" clusterexec due to '
                '{}: {}'.format(cluster_name, type(error), error))
            raise error

        # Collect all host addresses in the cluster.

        cluster = self.storage.get_cluster(cluster_name)

        n_hosts = len(cluster.hostset)
        if n_hosts:
            self.logger.debug('{} hosts in cluster "{}"'.format(
                n_hosts, cluster_name))
        else:
            self.logger.warn('No hosts in cluster "{}"'.format(cluster_name))

        for address in cluster.hostset:
            host = self.storage.get_host(address)

            oscmd = get_oscmd(host.os)

            # os_command is only used for logging
            os_command = getattr(oscmd, command_name)(*command_args)
            self.logger.info('Executing {} on {}...'.format(
                os_command, host.address))

            model_instance.in_process.append(host.address)
            self.storage.save(model_instance)

            with TemporarySSHKey(host, self.logger) as key:
                try:
                    transport = ansibleapi.Transport(host.remote_user)
                    method = getattr(transport, command_name)
                    method(host.address, key.path, oscmd, command_args)
                except Exception as error:
                    # If there was a failure, set the end_status and break.
                    end_status = C.HOST_STATUS_FAILED
                    self.logger.error(
                        'Clusterexec {} for {} failed: {}: {}'.format(
                            command_name, host.address, type(error), error))
                    break

            # Set the finished hosts.
            finished_hosts = getattr(model_instance, finished_hosts_key)
            finished_hosts.append(host.address)
            try:
                index = model_instance.in_process.index(host.address)
                model_instance.in_process.pop(index)
            except ValueError:
                self.logger.warn('Host {} was not in_process for {} {}'.format(
                    host.address, command_name, cluster_name))
            self.storage.save(model_instance)

            self.logger.info('Finished executing {} for {} in {}'.format(
                command_name, host.address, cluster_name))

        # Record the final result of the command.

        model_instance.finished_at = formatted_dt()
        model_instance.status = end_status

        self.logger.info('Cluster {} final {} status: {}'.format(
            cluster_name, command_name, model_instance.to_json()))

        self.storage.save(model_instance)

    def on_upgrade(self, message, cluster_name):
        """
        Executes an upgrade command on hosts across a cluster.

        :param message: A message instance
        :type message: kombu.message.Message
        :param cluster_name: The name of a cluster
        :type cluster_name: str
        """
        self.logger.info(
            'Received message: Upgrade cluster "{}"'.format(cluster_name))
        command_args = ('upgrade', )
        model_instance = ClusterUpgrade.new(name=cluster_name,
                                            status='in_process',
                                            started_at=formatted_dt(),
                                            upgraded=[],
                                            in_process=[])
        self._execute(message, model_instance, command_args, 'upgraded')

    def on_restart(self, message, cluster_name):
        """
        Executes a restart command on hosts across a cluster.

        :param message: A message instance
        :type message: kombu.message.Message
        :param cluster_name: The name of a cluster
        :type cluster_name: str
        """
        self.logger.info(
            'Received message: Restart cluster "{}"'.format(cluster_name))
        command_args = ('restart', )
        model_instance = ClusterRestart.new(name=cluster_name,
                                            status='in_process',
                                            started_at=formatted_dt(),
                                            restarted=[],
                                            in_process=[])
        self._execute(message, model_instance, command_args, 'restarted')

    def on_deploy(self, message, cluster_name, version):
        """
        Executes a deploy command on atomic hosts across a cluster.

        :param message: A message instance
        :type message: kombu.message.Message
        :param cluster_name: The name of a cluster
        :type cluster_name: str
        :param version: The tree image version to deploy
        :type version: str
        """
        self.logger.info(
            'Received message: Deploy version "{}" on cluster "{}"'.format(
                version, cluster_name))
        command_args = ('deploy', version)
        model_instance = ClusterDeploy.new(name=cluster_name,
                                           status='in_process',
                                           started_at=formatted_dt(),
                                           version=version,
                                           deployed=[],
                                           in_process=[])
        self._execute(message, model_instance, command_args, 'deployed')
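
For context, here is a minimal launch sketch for this service. It is an assumption-laden illustration, not part of the example above: the exchange name and broker URL are placeholders, and it presumes CommissaireService exposes kombu's ConsumerMixin.run() loop.

# Hedged launch sketch. Assumptions: the 'commissaire' exchange name and the
# broker URL are illustrative; run() is expected to come from kombu's
# ConsumerMixin via the CommissaireService base class.
if __name__ == '__main__':
    service = ClusterExecService(
        exchange_name='commissaire',
        connection_url='amqp://guest:guest@localhost:5672//',
        config_file=None)  # falls back to /etc/commissaire/clusterexec.conf
    service.run()  # blocks, consuming messages routed to jobs.clusterexec.*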
Example #2
class InvestigatorService(CommissaireService):
    """
    Investigates new hosts to retrieve and store facts.
    """

    #: Default configuration file
    _default_config_file = '/etc/commissaire/investigator.conf'

    def __init__(self, exchange_name, connection_url, config_file=None):
        """
        Creates a new InvestigatorService.  If config_file is omitted,
        it will try the default location (/etc/commissaire/investigator.conf).

        :param exchange_name: Name of the topic exchange
        :type exchange_name: str
        :param connection_url: Kombu connection URL
        :type connection_url: str
        :param config_file: Optional configuration file path
        :type config_file: str or None
        """
        queue_kwargs = [{'routing_key': 'jobs.investigate'}]

        super().__init__(exchange_name,
                         connection_url,
                         queue_kwargs,
                         config_file=config_file)

        self.storage = StorageClient(self)

    def _get_etcd_config(self):
        """
        Extracts etcd configuration from a registered store handler.
        If no matching handler is found, a ConfigurationError is raised.

        :returns: A dictionary of configuration values
        :rtype: dict
        """
        response = self.request('storage.list_store_handlers')
        for handler in response.get('result', []):
            if handler['handler_type'] == 'EtcdStoreHandler':
                return handler['config']

        raise ConfigurationError(
            'Configuration is missing an EtcdStoreHandler')

    def _get_cluster_and_network_models(self, cluster_data):
        """
        Creates cluster and network models from the given cluster data.

        :param cluster_data: Data for a cluster
        :type cluster_data: dict
        :returns: a Cluster and Network model
        :rtype: tuple
        """
        try:
            cluster = Cluster.new(**cluster_data)
            network = self.storage.get_network(cluster.network)
        except TypeError:
            cluster = None
            network = Network.new(**C.DEFAULT_CLUSTER_NETWORK_JSON)

        return cluster, network

    def on_investigate(self, message, address, cluster_data={}):
        """
        Initiates an investigation of the requested host.

        :param message: A message instance
        :type message: kombu.message.Message
        :param address: Host address to investigate
        :type address: str
        :param cluster_data: Optional data for the associated cluster
        :type cluster_data: dict
        """
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses

        self.logger.info('{} is now in investigating.'.format(address))
        self.logger.debug('Investigating: {}'.format(address))
        if cluster_data:
            self.logger.debug('Related cluster: {}'.format(cluster_data))

        host = self.storage.get_host(address)
        host_creds = self.storage.get(HostCreds.new(address=host.address))
        transport = ansibleapi.Transport(host.remote_user)

        key = TemporarySSHKey(host_creds, self.logger)
        try:
            key.create()
        except Exception as error:
            self.logger.warn('Unable to continue for {} due to '
                             '{}: {}. Aborting...'.format(
                                 address, type(error), error))
            raise error

        try:
            facts = transport.get_info(address, key.path)
            # recreate the host instance with new data
            data = json.loads(host.to_json())
            data.update(facts)
            host = Host.new(**data)
            host.last_check = formatted_dt()
            host.status = C.HOST_STATUS_BOOTSTRAPPING
            self.logger.info('Facts for {} retrieved'.format(address))
            self.logger.debug('Data: {}'.format(host.to_json()))
        except Exception as error:
            self.logger.warn('Getting info failed for {}: {}'.format(
                address, str(error)))
            host.status = C.HOST_STATUS_FAILED
            key.remove()
            raise error
        finally:
            # Save the updated host model.
            self.storage.save(host)

        self.logger.info(
            'Finished and stored investigation data for {}'.format(address))
        self.logger.debug('Finished investigation update for {}: {}'.format(
            address, host.to_json()))

        self.logger.info('{} is now in bootstrapping'.format(address))
        oscmd = get_oscmd(host.os)
        try:
            etcd_config = self._get_etcd_config()
            cluster, network = self._get_cluster_and_network_models(
                cluster_data)

            container_manager = None
            if cluster:
                if cluster.container_manager:
                    container_manager = cluster.container_manager
                    self.logger.info(
                        'Using cluster "{}" managed by "{}"'.format(
                            cluster.name, container_manager))
                else:
                    self.logger.info('Using unmanaged cluster "{}"'.format(
                        cluster.name))

            self.logger.info('Using network "{}" of type "{}"'.format(
                network.name, network.type))
            transport.bootstrap(address, key.path, oscmd, etcd_config, network)
            host.status = C.HOST_STATUS_DISASSOCIATED
        except Exception as error:
            self.logger.warn('Unable to start bootstrapping for {}: {}'.format(
                address, str(error)))
            host.status = C.HOST_STATUS_FAILED
            key.remove()
            raise error
        finally:
            # Save the updated host model.
            self.storage.save(host)

        # Register with container manager (if applicable).
        try:
            if container_manager:
                self.request('container.register_node', container_manager,
                             address)
                host.status = C.HOST_STATUS_ACTIVE
        except Exception as error:
            self.logger.warn(
                'Unable to register {} to container manager "{}": {}'.format(
                    address, container_manager, error.args[0]))
            key.remove()
            raise error
        finally:
            # Save the updated host model.
            self.storage.save(host)

        self.logger.info('Finished bootstrapping for {}'.format(address))
        self.logger.debug('Finished bootstrapping for {}: {}'.format(
            address, host.to_json()))

        # XXX TEMPORARILY DISABLED
        # WATCHER_QUEUE.put_nowait((host, datetime.datetime.utcnow()))

        key.remove()

        return host.to_json()
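
The signature of on_investigate (an address plus optional cluster_data) suggests the bus carries jsonrpc-style requests routed via 'jobs.investigate'. Below is a hedged sketch of publishing such a request with kombu; the exchange name, message id, and envelope layout are assumptions, and only the routing key is taken from the example above.

# Hedged sketch: publish an 'investigate' request. The exchange name, id and
# envelope keys are assumptions; 'jobs.investigate' matches the queue declared
# in InvestigatorService.__init__.
import json
from kombu import Connection, Exchange, Producer

with Connection('amqp://guest:guest@localhost:5672//') as conn:
    exchange = Exchange('commissaire', type='topic')  # assumed exchange name
    producer = Producer(conn.channel(), exchange)
    producer.publish(
        json.dumps({
            'jsonrpc': '2.0',
            'id': 1,
            'method': 'investigate',
            'params': {'address': '192.168.1.10', 'cluster_data': {}},
        }),
        routing_key='jobs.investigate')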
Example #3
class WatcherService(CommissaireService):
    """
    Periodically connects to hosts to check their status.
    """

    def __init__(self, exchange_name, connection_url, config_file=None):
        """
        Creates a new WatcherService.  If config_file is omitted,
        it will try the default location (/etc/commissaire/watcher.conf).

        :param exchange_name: Name of the topic exchange
        :type exchange_name: str
        :param connection_url: Kombu connection URL
        :type connection_url: str
        :param config_file: Optional configuration file path
        :type config_file: str or None
        """
        queue_kwargs = [{
            'name': 'watcher',
            'exclusive': False,
            'routing_key': 'jobs.watcher',
        }]
        # Store the last address seen for backoff
        self.last_address = None
        super().__init__(exchange_name, connection_url, queue_kwargs)
        self.storage = StorageClient(self)

        # Apply any logging configuration for this service.
        read_config_file(config_file, '/etc/commissaire/watcher.conf')

    def on_message(self, body, message):
        """
        Called when a non-jsonrpc message arrives.

        :param body: Body of the message.
        :type body: str
        :param message: The message instance.
        :type message: kombu.message.Message
        """
        record = WatcherRecord(**json.loads(body))
        # Ack the message so it does not requeue on its own
        message.ack()
        self.logger.debug(
            'Checking on WatcherQueue item: {}'.format(record.to_json()))
        if datetime.strptime(record.last_check, C.DATE_FORMAT) < (
                datetime.utcnow() - timedelta(minutes=1)):
            try:
                self._check(record.address)
            except Exception as error:
                self.logger.debug('Error: {}: {}'.format(type(error), error))
            record.last_check = formatted_dt()
        else:
            if self.last_address == record.address:
                # Since we got the same address we could process twice
                # back off a little extra
                self.logger.debug(
                    'Got "{}" twice. Backing off...'.format(record.address))
                sleep(10)
            else:
                # Since the top item wasn't ready for processing sleep a bit
                sleep(2)
        self.last_address = record.address
        # Requeue the host
        self.producer.publish(record.to_json(), 'jobs.watcher')

    def _check(self, address):
        """
        Initiates a check on the requested host.

        :param address: Host address to check
        :type address: str
        """
        # Statuses follow:
        # http://commissaire.readthedocs.org/en/latest/enums.html#host-statuses

        self.logger.info('Checking host "{}".'.format(address))

        host = self.storage.get_host(address)
        transport = ansibleapi.Transport(host.remote_user)

        with TemporarySSHKey(host, self.logger) as key:
            try:
                self.logger.debug(
                    'Starting watcher run for host "{}"'.format(address))
                result = transport.check_host_availability(host, key.path)
                host.last_check = formatted_dt()
                self.logger.debug(
                    'Watcher result for host {}: {}'.format(address, result))
            except Exception as error:
                self.logger.warn(
                    'Failed to connect to host node "{}"'.format(address))
                self.logger.debug(
                    'Watcher failed for host node "{}" with {}: {}'.format(
                        address, type(error), error))
                host.status = C.HOST_STATUS_FAILED
                raise error
            finally:
                # Save the model
                self.storage.save(host)
            self.logger.info(
                'Finished watcher run for host "{}"'.format(address))
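
WatcherService.on_message expects a JSON-encoded record with at least 'address' and 'last_check' fields, which it requeues after each pass. The sketch below shows one hedged way to seed the watcher queue; the exchange name, broker URL, and the date format standing in for C.DATE_FORMAT are assumptions.

# Hedged sketch: seed the watcher queue with one host. Field names mirror what
# on_message reads (address, last_check); the exchange name, broker URL and the
# DATE_FORMAT value are assumptions, not taken from the example.
import json
from datetime import datetime
from kombu import Connection, Exchange, Producer

DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'  # stand-in for C.DATE_FORMAT

with Connection('amqp://guest:guest@localhost:5672//') as conn:
    producer = Producer(conn.channel(),
                        Exchange('commissaire', type='topic'))  # assumed name
    record = {
        'address': '192.168.1.10',
        'last_check': datetime.utcnow().strftime(DATE_FORMAT),
    }
    producer.publish(json.dumps(record), routing_key='jobs.watcher')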