Example #1
0
    def heartbeat(self, task, **kwargs):
        """Method for agent to periodically check in.

        The agent should be sending its agent_url (so Ironic can talk back)
        as a kwarg. kwargs should have the following format::

         {
             'agent_url': 'http://AGENT_HOST:AGENT_PORT'
         }

        AGENT_PORT defaults to 9999.
        """
        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
            {'node': node.uuid,
             'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
        driver_internal_info['agent_last_heartbeat'] = int(_time())
        try:
            driver_internal_info['agent_url'] = kwargs['agent_url']
        except KeyError:
            raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                    '"agent_url" must be '
                                                    'specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async callbacks don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif node.provision_state == states.DEPLOYWAIT:
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task, **kwargs)
            elif (node.provision_state == states.DEPLOYING and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task, **kwargs)
            elif (node.provision_state == states.CLEANING and
                  not node.clean_step):
                # Agent booted from prepare_cleaning
                manager.set_node_cleaning_steps(task)
                self._notify_conductor_resume_clean(task)
            elif (node.provision_state == states.CLEANING and
                  node.clean_step):
                self.continue_cleaning(task, **kwargs)

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s exception: %(e)s') % err_info
            LOG.exception(last_error)
            deploy_utils.set_failed_state(task, last_error)
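For context, the payload described in the docstring above is just a small JSON body carrying the agent's callback URL. Below is a minimal sketch of the agent side, assuming a plain requests POST; the endpoint path and the send_heartbeat() helper are illustrative assumptions, and only the kwargs shape ({'agent_url': 'http://AGENT_HOST:AGENT_PORT'}) comes from the docstring.

    import requests

    AGENT_HOST = '192.0.2.10'   # illustrative address of the machine running the agent
    AGENT_PORT = 9999           # default port mentioned in the docstring

    def send_heartbeat(api_url, node_uuid):
        # The endpoint below is a placeholder, not Ironic's actual REST path.
        payload = {'agent_url': 'http://%s:%d' % (AGENT_HOST, AGENT_PORT)}
        resp = requests.post('%s/v1/nodes/%s/heartbeat' % (api_url, node_uuid),
                             json=payload, timeout=5)
        resp.raise_for_status()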
Example #2
0
    def heartbeat(self, task, **kwargs):
        """Method for agent to periodically check in.

        The agent should be sending its agent_url (so Ironic can talk back)
        as a kwarg. kwargs should have the following format::

         {
             'agent_url': 'http://AGENT_HOST:AGENT_PORT'
         }

        AGENT_PORT defaults to 9999.
        """
        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
            {'node': node.uuid,
             'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
        driver_internal_info['agent_last_heartbeat'] = int(_time())
        try:
            driver_internal_info['agent_url'] = kwargs['agent_url']
        except KeyError:
            raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                    '"agent_url" must be '
                                                    'specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async callbacks don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _('Failed checking if deploy is done.')
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                          'not taking any action.', {'node': node.uuid})
                return
            elif node.provision_state == states.DEPLOYWAIT:
                msg = _('Node failed to get image for deploy.')
                self.continue_deploy(task, **kwargs)
            elif (node.provision_state == states.DEPLOYING and
                  self.deploy_is_done(task)):
                msg = _('Node failed to move to active state.')
                self.reboot_to_instance(task, **kwargs)
            elif (node.provision_state == states.CLEANING and
                  not node.clean_step):
                # Agent booted from prepare_cleaning
                manager.set_node_cleaning_steps(task)
                self._notify_conductor_resume_clean(task)
            elif (node.provision_state == states.CLEANING and
                  node.clean_step):
                self.continue_cleaning(task, **kwargs)

        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
                           '%(msg)s exception: %(e)s') % err_info
            LOG.exception(last_error)
            deploy_utils.set_failed_state(task, last_error)
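Both heartbeat variants above first record the check-in time as an integer timestamp in node.driver_internal_info['agent_last_heartbeat'] and persist it with node.save(). As a rough illustration of how that field could be consumed later, here is a small staleness check; the helper name and the 300-second threshold are assumptions, not an Ironic setting.

    import time

    HEARTBEAT_TIMEOUT = 300  # seconds; illustrative threshold

    def heartbeat_is_stale(node):
        # 'agent_last_heartbeat' is the integer timestamp written by heartbeat() above.
        last = node.driver_internal_info.get('agent_last_heartbeat')
        return last is None or (time.time() - last) > HEARTBEAT_TIMEOUT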
Example #3
0
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        command = self._get_completed_cleaning_command(task)
        LOG.debug(
            "Cleaning command status for node %(node)s on step %(step)s:" " %(command)s",
            {"node": task.node.uuid, "step": task.node.clean_step, "command": command},
        )

        if not command:
            # Command is not done yet
            return

        if command.get("command_status") == "FAILED":
            msg = _("Agent returned error for clean step %(step)s on node " "%(node)s : %(err)s.") % {
                "node": task.node.uuid,
                "err": command.get("command_error"),
                "step": task.node.clean_step,
            }
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
        elif command.get("command_status") == "CLEAN_VERSION_MISMATCH":
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(
                _LI("Node %s detected a clean version mismatch, " "resetting clean steps and rebooting the node."),
                task.node.uuid,
            )
            try:
                manager.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = _("Could not restart cleaning on node %(node)s: " "%(err)s.") % {
                    "node": task.node.uuid,
                    "err": command.get("command_error"),
                    "step": task.node.clean_step,
                }
                LOG.exception(msg)
                return manager.cleaning_error_handler(task, msg)
            self._notify_conductor_resume_clean(task)

        elif command.get("command_status") == "SUCCEEDED":
            LOG.info(
                _LI("Agent on node %s returned cleaning command success, " "moving to next clean step"), task.node.uuid
            )
            self._notify_conductor_resume_clean(task)
        else:
            msg = _("Agent returned unknown status for clean step %(step)s " "on node %(node)s : %(err)s.") % {
                "node": task.node.uuid,
                "err": command.get("command_status"),
                "step": task.node.clean_step,
            }
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
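The branching above assumes that _get_completed_cleaning_command() returns either None (the current step is still running) or a dict carrying at least 'command_status' and 'command_error'. The shapes below are purely illustrative examples of such results, not guaranteed agent output.

    still_running = None  # continue_cleaning() returns early in this case

    failed_step = {
        'command_status': 'FAILED',
        'command_error': 'erase_devices failed: /dev/sda is read-only',
    }

    succeeded_step = {
        'command_status': 'SUCCEEDED',
        'command_error': None,
    }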
Example #4
0
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        command = self._get_completed_cleaning_command(task)
        LOG.debug(
            'Cleaning command status for node %(node)s on step %(step)s:'
            ' %(command)s', {
                'node': task.node.uuid,
                'step': task.node.clean_step,
                'command': command
            })

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') % {
                         'node': task.node.uuid,
                         'err': command.get('command_error'),
                         'step': task.node.clean_step
                     })
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Restart cleaning, agent must have rebooted to new version
            try:
                manager.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') % {
                             'node': task.node.uuid,
                             'err': command.get('command_error'),
                             'step': task.node.clean_step
                         })
                LOG.exception(msg)
                return manager.cleaning_error_handler(task, msg)
            self._notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            self._notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') % {
                         'node': task.node.uuid,
                         'err': command.get('command_status'),
                         'step': task.node.clean_step
                     })
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
Example #5
0
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': task.node.uuid,
                                   'step': task.node.clean_step,
                                   'command': command})

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': task.node.uuid,
                    'err': command.get('command_error'),
                    'step': task.node.clean_step})
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(_LI('Node %s detected a clean version mismatch, '
                         'resetting clean steps and rebooting the node.'),
                     task.node.uuid)
            try:
                manager.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') %
                       {'node': task.node.uuid,
                        'err': command.get('command_error'),
                        'step': task.node.clean_step})
                LOG.exception(msg)
                return manager.cleaning_error_handler(task, msg)
            self._notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            LOG.info(_LI('Agent on node %s returned cleaning command success, '
                         'moving to next clean step'), task.node.uuid)
            self._notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') %
                   {'node': task.node.uuid,
                    'err': command.get('command_status'),
                    'step': task.node.clean_step})
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
Example #6
0
    def _initiate_cleaning(self, task):
        """Initiates the steps required to start cleaning for the node.

        This method polls each interface of the driver to get the clean
        steps and notifies the Ironic conductor to resume cleaning.
        On error, it sets the node to CLEANFAIL state and populates
        node.last_error with the error message.

        :param task: a TaskManager instance containing the node to act on.
        """
        LOG.warning(
            _LW("Bash deploy ramdisk doesn't support in-band cleaning. "
                "Please use the ironic-python-agent (IPA) ramdisk "
                "instead for node %s. "), task.node.uuid)
        try:
            manager.set_node_cleaning_steps(task)
            self.notify_conductor_resume_clean(task)
        except Exception as e:
            last_error = (
                _('Encountered exception for node %(node)s '
                  'while initiating cleaning. Error: %(error)s') %
                {'node': task.node.uuid, 'error': e})
            return manager.cleaning_error_handler(task, last_error)
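Only _initiate_cleaning() itself comes from the example above; the sketch below shows, as an assumption, how a deploy interface might hand off to it from its own cleaning entry point.

    def prepare_cleaning(self, task):
        # Hypothetical caller: prepare whatever ramdisk the driver uses here,
        # then let the helper build the clean steps and resume cleaning.
        self._initiate_cleaning(task)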
Example #7
0
    def heartbeat(self, task, **kwargs):
        """Method for agent to periodically check in.

        The agent should be sending its agent_url (so Ironic can talk back)
        as a kwarg. kwargs should have the following format::

         {
             'agent_url': 'http://AGENT_HOST:AGENT_PORT'
         }

        AGENT_PORT defaults to 9999.
        """
        node = task.node
        driver_internal_info = node.driver_internal_info
        LOG.debug(
            "Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.",
            {"node": node.uuid, "heartbeat": driver_internal_info.get("agent_last_heartbeat")},
        )
        driver_internal_info["agent_last_heartbeat"] = int(_time())
        try:
            driver_internal_info["agent_url"] = kwargs["agent_url"]
        except KeyError:
            raise exception.MissingParameterValue(_('For heartbeat operation, "agent_url" must be specified.'))

        node.driver_internal_info = driver_internal_info
        node.save()

        # Async callbacks don't set error state on their own
        # TODO(jimrollenhagen) improve error messages here
        msg = _("Failed checking if deploy is done.")
        try:
            if node.maintenance:
                # this shouldn't happen often, but skip the rest if it does.
                LOG.debug(
                    "Heartbeat from node %(node)s in maintenance mode; " "not taking any action.", {"node": node.uuid}
                )
                return
            elif node.provision_state == states.DEPLOYWAIT and not self.deploy_has_started(task):
                msg = _("Node failed to get image for deploy.")
                self.continue_deploy(task, **kwargs)
            elif node.provision_state == states.DEPLOYWAIT and self.deploy_is_done(task):
                msg = _("Node failed to move to active state.")
                self.reboot_to_instance(task, **kwargs)
            elif node.provision_state == states.DEPLOYWAIT and self.deploy_has_started(task):
                node.touch_provisioning()
            # TODO(lucasagomes): CLEANING here for backwards compat
            # with previous code, otherwise nodes in CLEANING when this
            # is deployed would fail. Should be removed once the Mitaka
            # release starts.
            elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
                node.touch_provisioning()
                if not node.clean_step:
                    LOG.debug("Node %s just booted to start cleaning.", node.uuid)
                    msg = _("Node failed to start the next cleaning step.")
                    manager.set_node_cleaning_steps(task)
                    self._notify_conductor_resume_clean(task)
                else:
                    msg = _("Node failed to check cleaning progress.")
                    self.continue_cleaning(task, **kwargs)

        except Exception as e:
            err_info = {"node": node.uuid, "msg": msg, "e": e}
            last_error = _("Asynchronous exception for node %(node)s: " "%(msg)s exception: %(e)s") % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager.cleaning_error_handler(task, last_error)
            else:
                deploy_utils.set_failed_state(task, last_error)
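The DEPLOYWAIT handling above reduces to a small decision table over the two driver predicates deploy_has_started() and deploy_is_done(). The standalone helper below mirrors that ordering for illustration only; the return labels are just names for the branches taken in the method.

    def deploywait_action(deploy_has_started, deploy_is_done):
        # Mirrors the DEPLOYWAIT branches of heartbeat() above.
        if not deploy_has_started:
            return 'continue_deploy'     # agent just booted; start the deploy
        if deploy_is_done:
            return 'reboot_to_instance'  # image written; reboot into the instance
        return 'touch_provisioning'      # deploy in progress; just reset the timeout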
Example #8
0
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, cleaning
        will check the version of all hardware managers during get_clean_steps
        at the beginning of cleaning and before executing each step in the
        agent. If the version has changed between steps, the agent is unable
        to tell if an ordering change will cause a cleaning issue. Therefore,
        we restart cleaning.
        """
        node = task.node
        command = self._get_completed_cleaning_command(task)
        LOG.debug(
            'Cleaning command status for node %(node)s on step %(step)s:'
            ' %(command)s', {
                'node': node.uuid,
                'step': node.clean_step,
                'command': command
            })

        if not command:
            # Command is not done yet
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for clean step %(step)s on node '
                     '%(node)s : %(err)s.') % {
                         'node': node.uuid,
                         'err': command.get('command_error'),
                         'step': node.clean_step
                     })
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
            # Restart cleaning, agent must have rebooted to new version
            LOG.info(
                _LI('Node %s detected a clean version mismatch, '
                    'resetting clean steps and rebooting the node.'),
                node.uuid)
            try:
                manager.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart cleaning on node %(node)s: '
                         '%(err)s.') % {
                             'node': node.uuid,
                             'err': command.get('command_error'),
                             'step': node.clean_step
                         })
                LOG.exception(msg)
                return manager.cleaning_error_handler(task, msg)
            self._notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
            clean_step_hook = _get_post_clean_step_hook(node)
            if clean_step_hook is not None:
                LOG.debug(
                    'For node %(node)s, executing post clean step '
                    'hook %(method)s for clean step %(step)s', {
                        'method': clean_step_hook.__name__,
                        'node': node.uuid,
                        'step': node.clean_step
                    })
                try:
                    clean_step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post clean step hook '
                             '%(method)s failed for clean step %(step)s.'
                             ' Error: %(error)s') % {
                                 'method': clean_step_hook.__name__,
                                 'node': node.uuid,
                                 'error': e,
                                 'step': node.clean_step
                             })
                    LOG.exception(msg)
                    return manager.cleaning_error_handler(task, msg)

            LOG.info(
                _LI('Agent on node %s returned cleaning command success, '
                    'moving to next clean step'), node.uuid)
            self._notify_conductor_resume_clean(task)
        else:
            msg = (_('Agent returned unknown status for clean step %(step)s '
                     'on node %(node)s : %(err)s.') % {
                         'node': node.uuid,
                         'err': command.get('command_status'),
                         'step': node.clean_step
                     })
            LOG.error(msg)
            return manager.cleaning_error_handler(task, msg)
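In the SUCCEEDED branch above, _get_post_clean_step_hook(node) may return a callable that is invoked as hook(task, command). A minimal sketch of a hook with that signature follows; the function itself, its name, and the 'command_result' field it reads are assumptions, and it relies on the module-level LOG used throughout these examples.

    def _log_clean_step_result(task, command):
        # Hypothetical post-clean-step hook matching the hook(task, command) call above.
        result = command.get('command_result') or {}
        LOG.info('Clean step result for node %(node)s: %(result)s',
                 {'node': task.node.uuid, 'result': result})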