def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating.
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    # Record the last heartbeat event time in UTC, so we can make
    # decisions about it later. Can be decoded to a datetime object with:
    # datetime.datetime.strptime(var, "%Y-%m-%dT%H:%M:%S.%f")
    driver_internal_info['agent_last_heartbeat'] = str(
        timeutils.utcnow().isoformat())
    node.driver_internal_info = driver_internal_info
    node.save()

    if node.provision_state in _HEARTBEAT_RECORD_ONLY:
        # We shouldn't take any additional action. The agent will
        # silently continue to heartbeat to ironic until a user-initiated
        # state change occurs, causing it to match a state below.
        LOG.debug('Heartbeat from %(node)s recorded to identify the '
                  'node as on-line.', {'node': task.node.uuid})
        return

    # Async callbacks don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
        # are currently in the core deploy.deploy step. Other deploy steps
        # may cause the agent to boot, but we should not trigger deployment
        # at that point.
        elif node.provision_state == states.DEPLOYWAIT:
            if self.in_core_deploy_step(task):
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy.')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state.')
                    self.reboot_to_instance(task)
                else:
                    node.touch_provisioning()
            else:
                node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # Exceptions from RPC are not possible here as we are
                # using a cast
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif node.provision_state == states.RESCUEWAIT:
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(
                task, last_error, collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
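
# Illustrative sketch (not part of this module): the 'agent_last_heartbeat'
# value recorded in heartbeat() above is the ISO 8601 string produced by
# timeutils.utcnow().isoformat(), so it can be decoded with the standard
# library alone. The helper name below is hypothetical.
import datetime


def _example_decode_last_heartbeat(value):
    """Decode a stored 'agent_last_heartbeat' string into a datetime.

    fromisoformat() handles the 'T'-separated form produced by
    isoformat(), e.g. '2023-05-01T12:34:56.789012'.
    """
    return datetime.datetime.fromisoformat(value)
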
def do_next_clean_step(task, step_index):
    """Do cleaning, starting from the specified clean step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['clean_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.clean_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('cleaning_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after cleaning reboot, waiting for agent to '
                             'come up to run next clean step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_clean_step'] = False
                    node.driver_internal_info = driver_internal_info
                    target_state = (states.MANAGEABLE if manual_clean
                                    else None)
                    task.process_event('wait', target_state=target_state)
                    return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.clean_step})
            LOG.exception(msg)
            driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, msg)
            return

        # Check if the step is done or not. The step should return
        # states.CLEANWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.CLEANWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_clean to continue cleaning
            LOG.info('Clean step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            target_state = states.MANAGEABLE if manual_clean else None
            task.process_event('wait', target_state=target_state)
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s') %
                   {'step': step, 'node': node.uuid, 'val': result})
            LOG.error(msg)
            return utils.cleaning_error_handler(task, msg)

        LOG.info('Node %(node)s finished clean step %(step)s',
                 {'node': node.uuid, 'step': step})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # Clear clean_step
    node.clean_step = None
    driver_internal_info = node.driver_internal_info
    driver_internal_info['clean_steps'] = None
    driver_internal_info.pop('clean_step_index', None)
    driver_internal_info.pop('cleaning_reboot', None)
    driver_internal_info.pop('cleaning_polling', None)
    driver_internal_info.pop('agent_secret_token', None)
    driver_internal_info.pop('agent_secret_token_pregenerated', None)
    # Remove agent_url
    if not utils.fast_track_able(task):
        driver_internal_info.pop('agent_url', None)
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        msg = (_('Failed to tear down from cleaning for node %(node)s, '
                 'reason: %(err)s')
               % {'node': node.uuid, 'err': e})
        LOG.exception(msg)
        return utils.cleaning_error_handler(task, msg,
                                            tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)
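
# Illustrative sketch (hypothetical interface, not part of this module):
# do_next_clean_step() relies on execute_clean_step() returning
# states.CLEANWAIT when a step continues asynchronously in the ramdisk,
# or None when the step finished synchronously; any other value is treated
# as an error. A minimal, simplified interface following that contract
# (the real driver interfaces inherit from the Ironic base classes):
from ironic.common import states


class ExampleDeployInterface(object):

    def execute_clean_step(self, task, step):
        if step['step'] == 'erase_devices':
            # Work continues in the agent; the conductor resumes via
            # continue_node_clean once the agent heartbeats again.
            return states.CLEANWAIT
        # Synchronous step: do the work here and return None when done.
        return None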