def _resume_cleaning(self, task):
    raid_common.update_raid_info(
        task.node, self.get_logical_disks(task))
    driver_internal_info = task.node.driver_internal_info
    driver_internal_info['cleaning_reboot'] = True
    task.node.driver_internal_info = driver_internal_info
    task.node.save()
    manager_utils.notify_conductor_resume_clean(task)

def _resume_current_operation(self, task):
    """Continue cleaning/deployment of the node.

    For asynchronous operations, it is necessary to notify the
    conductor manager to continue the cleaning/deployment operation
    after a job has finished. This is done through an RPC call. The
    notify_conductor_resume_* wrapper methods provide that.

    :param task: a TaskManager instance with node to act on
    """
    if task.node.clean_step:
        manager_utils.notify_conductor_resume_clean(task)
    else:
        manager_utils.notify_conductor_resume_deploy(task)

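# A hypothetical usage sketch (names invented for illustration, not part of
# this excerpt): a periodic status check that hands control back through
# _resume_current_operation once an out-of-band job reports completion.
# The _job_completed helper is an assumption.
def _check_async_job(self, task):
    if self._job_completed(task):
        self._resume_current_operation(task)
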
def _check_node_raid_config(self, task):
    """Check the progress of running RAID config on a node."""
    node = task.node
    raid_configs = node.driver_internal_info['raid_configs']

    task.upgrade_lock()
    raid_configs[:] = [i for i in raid_configs
                       if self._raid_config_in_progress(task, i)]

    if not raid_configs:
        self._clear_raid_configs(node)
        LOG.info('RAID configuration completed for node %(node)s',
                 {'node': node.uuid})
        manager_utils.notify_conductor_resume_clean(task)

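# The helpers used above are not shown in this excerpt. A minimal sketch of
# _clear_raid_configs, assuming it simply drops the 'raid_configs' entry from
# driver_internal_info once every pending configuration has finished:
def _clear_raid_configs(self, node):
    driver_internal_info = node.driver_internal_info
    driver_internal_info.pop('raid_configs', None)
    node.driver_internal_info = driver_internal_info
    node.save()
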
def _continue_firmware_updates(self, task, update_service, firmware_updates):
    """Continues processing the firmware updates

    Continues to process the firmware updates on the node.

    Note that the caller must have an exclusive lock on the node.

    :param task: a TaskManager instance containing the node to act on.
    :param update_service: the sushy firmware update service
    :param firmware_updates: the remaining firmware updates to apply
    """
    node = task.node
    firmware_update = firmware_updates[0]
    wait_interval = firmware_update.get('wait')
    if wait_interval:
        time_now = str(timeutils.utcnow().isoformat())
        firmware_update['wait_start_time'] = time_now

        LOG.debug('Waiting at %(time)s for %(seconds)s seconds after '
                  'firmware update %(firmware_image)s on node %(node)s',
                  {'time': time_now,
                   'seconds': wait_interval,
                   'firmware_image': firmware_update['url'],
                   'node': node.uuid})

        driver_internal_info = node.driver_internal_info
        driver_internal_info['firmware_updates'] = firmware_updates
        node.driver_internal_info = driver_internal_info
        node.save()
        return

    if len(firmware_updates) == 1:
        self._clear_firmware_updates(node)

        LOG.info('Firmware updates completed for node %(node)s',
                 {'node': node.uuid})

        manager_utils.notify_conductor_resume_clean(task)
    else:
        firmware_updates.pop(0)
        self._apply_firmware_update(node,
                                    update_service,
                                    firmware_updates)
        node.save()
        manager_utils.node_power_action(task, states.REBOOT)

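# A standalone sketch (illustrative names, not from the source) of how a
# later check could decide whether the 'wait' interval recorded above has
# elapsed, by parsing the stored ISO 8601 'wait_start_time' string:
import datetime


def wait_elapsed(firmware_update):
    start = datetime.datetime.fromisoformat(
        firmware_update['wait_start_time'])
    elapsed = (datetime.datetime.utcnow() - start).total_seconds()
    return elapsed >= firmware_update.get('wait', 0)
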
def continue_cleaning(self, task, **kwargs):
    """Start the next cleaning step if the previous one is complete.

    In order to avoid errors and make agent upgrades painless, the agent
    compares the version of all hardware managers at the start of the
    cleaning (the agent's get_clean_steps() call) and before executing
    each clean step. If the version has changed between steps, the agent
    is unable to tell if an ordering change will cause a cleaning issue
    so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we
    restart the entire cleaning cycle. For manual cleaning, we don't.

    Additionally, if a clean_step includes the reboot_requested property
    set to True, this method will coordinate the reboot once the step is
    completed.
    """
    node = task.node
    # For manual clean, the target provision state is MANAGEABLE, whereas
    # for automated cleaning, it is (the default) AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE
    agent_commands = self._client.get_commands_status(task.node)

    if not agent_commands:
        if task.node.driver_internal_info.get('cleaning_reboot'):
            # Node finished a cleaning step that requested a reboot, and
            # this is the first heartbeat after booting. Continue cleaning.
            info = task.node.driver_internal_info
            info.pop('cleaning_reboot', None)
            task.node.driver_internal_info = info
            task.node.save()
            manager_utils.notify_conductor_resume_clean(task)
            return
        else:
            # Agent has no commands whatsoever
            return

    command = _get_completed_cleaning_command(task, agent_commands)
    LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
              ' %(command)s', {'node': node.uuid,
                               'step': node.clean_step,
                               'command': command})

    if not command:
        # Agent command in progress
        return

    if command.get('command_status') == 'FAILED':
        msg = (_('Agent returned error for clean step %(step)s on node '
                 '%(node)s : %(err)s.') %
               {'node': node.uuid,
                'err': command.get('command_error'),
                'step': node.clean_step})
        LOG.error(msg)
        return manager_utils.cleaning_error_handler(task, msg)
    elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
        # Cache the new clean steps (and 'hardware_manager_version')
        try:
            self.refresh_clean_steps(task)
        except exception.NodeCleaningFailure as e:
            msg = (_('Could not continue cleaning on node '
                     '%(node)s: %(err)s.') %
                   {'node': node.uuid, 'err': e})
            LOG.exception(msg)
            return manager_utils.cleaning_error_handler(task, msg)

        if manual_clean:
            # Don't restart manual cleaning if agent reboots to a new
            # version. Both are operator actions, unlike automated
            # cleaning. Manual clean steps are not necessarily idempotent
            # like automated clean steps and can be even longer running.
            LOG.info('During manual cleaning, node %(node)s detected '
                     'a clean version mismatch. Re-executing and '
                     'continuing from current step %(step)s.',
                     {'node': node.uuid, 'step': node.clean_step})

            driver_internal_info = node.driver_internal_info
            driver_internal_info['skip_current_clean_step'] = False
            node.driver_internal_info = driver_internal_info
            node.save()
        else:
            # Restart cleaning, agent must have rebooted to new version
            LOG.info('During automated cleaning, node %s detected a '
                     'clean version mismatch. Resetting clean steps '
                     'and rebooting the node.', node.uuid)
            try:
                conductor_steps.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure:
                msg = (_('Could not restart automated cleaning on node '
                         '%(node)s: %(err)s.') %
                       {'node': node.uuid,
                        'err': command.get('command_error'),
                        'step': node.clean_step})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)

        manager_utils.notify_conductor_resume_clean(task)

    elif command.get('command_status') == 'SUCCEEDED':
        clean_step_hook = _get_post_clean_step_hook(node)
        if clean_step_hook is not None:
            LOG.debug('For node %(node)s, executing post clean step '
                      'hook %(method)s for clean step %(step)s',
                      {'method': clean_step_hook.__name__,
                       'node': node.uuid,
                       'step': node.clean_step})
            try:
                clean_step_hook(task, command)
            except Exception as e:
                msg = (_('For node %(node)s, post clean step hook '
                         '%(method)s failed for clean step %(step)s.'
                         '%(cls)s: %(error)s') %
                       {'method': clean_step_hook.__name__,
                        'node': node.uuid,
                        'error': e,
                        'cls': e.__class__.__name__,
                        'step': node.clean_step})
                LOG.exception(msg)
                return manager_utils.cleaning_error_handler(task, msg)

        if task.node.clean_step.get('reboot_requested'):
            _cleaning_reboot(task)
            return

        LOG.info('Agent on node %s returned cleaning command success, '
                 'moving to next clean step', node.uuid)
        manager_utils.notify_conductor_resume_clean(task)
    else:
        msg = (_('Agent returned unknown status for clean step %(step)s '
                 'on node %(node)s : %(err)s.') %
               {'node': node.uuid,
                'err': command.get('command_status'),
                'step': node.clean_step})
        LOG.error(msg)
        return manager_utils.cleaning_error_handler(task, msg)

def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    # Record the last heartbeat event time in UTC, so we can make
    # decisions about it later. Can be decoded to datetime object with:
    # datetime.datetime.strptime(var, "%Y-%m-%dT%H:%M:%S.%f")
    driver_internal_info['agent_last_heartbeat'] = str(
        timeutils.utcnow().isoformat())
    node.driver_internal_info = driver_internal_info
    node.save()

    if node.provision_state in _HEARTBEAT_RECORD_ONLY:
        # We shouldn't take any additional action. The agent will
        # silently continue to heartbeat to ironic until user initiated
        # state change occurs causing it to match a state below.
        LOG.debug('Heartbeat from %(node)s recorded to identify the '
                  'node as on-line.', {'node': task.node.uuid})
        return

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
        # are currently in the core deploy.deploy step. Other deploy
        # steps may cause the agent to boot, but we should not trigger
        # deployment at that point.
        elif node.provision_state == states.DEPLOYWAIT:
            if self.in_core_deploy_step(task):
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy.')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state.')
                    self.reboot_to_instance(task)
                else:
                    node.touch_provisioning()
            else:
                node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we are
                # using cast here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif node.provision_state == states.RESCUEWAIT:
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(
                task, last_error, collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)

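# A small standalone sketch (not from the driver) showing how the
# 'agent_last_heartbeat' value stored above, an ISO 8601 string produced by
# timeutils.utcnow().isoformat(), can be parsed back into a datetime to
# compute its age. The helper name is illustrative only.
import datetime


def heartbeat_age(driver_internal_info):
    # e.g. '2021-03-01T12:34:56.789012' -> datetime.datetime(...)
    last = datetime.datetime.fromisoformat(
        driver_internal_info['agent_last_heartbeat'])
    return datetime.datetime.utcnow() - last
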
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if task.node.provision_state not in self.heartbeat_allowed_states:
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT
              and not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_has_started(task)):
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we are
                # using cast here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif node.provision_state == states.RESCUEWAIT:
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)

def _resume_cleaning(self, task):
    raid_common.update_raid_info(task.node, task.node.raid_config)
    manager_utils.notify_conductor_resume_clean(task)

def _resume_cleaning(self, task):
    raid_common.update_raid_info(task.node, self.get_logical_disks(task))
    manager_utils.notify_conductor_resume_clean(task)

def _set_success(self, task):
    if task.node.clean_step:
        manager_utils.notify_conductor_resume_clean(task)
    else:
        manager_utils.notify_conductor_resume_deploy(task)