def _heartbeat_in_maintenance(self, task):
    node = task.node
    if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
            and not CONF.conductor.allow_provisioning_in_maintenance):
        LOG.error('Aborting cleaning for node %s, as it is in '
                  'maintenance mode', node.uuid)
        last_error = _('Cleaning aborted as node is in maintenance mode')
        manager_utils.cleaning_error_handler(task, last_error)
    elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
            and not CONF.conductor.allow_provisioning_in_maintenance):
        LOG.error('Aborting deployment for node %s, as it is in '
                  'maintenance mode', node.uuid)
        last_error = _('Deploy aborted as node is in maintenance mode')
        deploy_utils.set_failed_state(task, last_error, collect_logs=False)
    elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
            and not CONF.conductor.allow_provisioning_in_maintenance):
        LOG.error('Aborting rescuing for node %s, as it is in '
                  'maintenance mode', node.uuid)
        last_error = _('Rescue aborted as node is in maintenance mode')
        manager_utils.rescuing_error_handler(task, last_error)
    else:
        LOG.warning('Heartbeat from node %(node)s in maintenance mode; '
                    'not taking any action.', {'node': node.uuid})
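# The error branches above only fire when
# CONF.conductor.allow_provisioning_in_maintenance is disabled. A minimal
# ironic.conf sketch (option group and name taken from the code above; the
# value shown is an illustration, not necessarily the upstream default):
#
#     [conductor]
#     allow_provisioning_in_maintenance = False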
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    # Record the last heartbeat event time in UTC, so we can make
    # decisions about it later. The value is isoformat() output, so it
    # can be decoded to a datetime object with:
    # datetime.datetime.strptime(var, "%Y-%m-%dT%H:%M:%S.%f")
    # (see the decoding sketch after this method).
    driver_internal_info['agent_last_heartbeat'] = str(
        timeutils.utcnow().isoformat())
    node.driver_internal_info = driver_internal_info
    node.save()

    if node.provision_state in _HEARTBEAT_RECORD_ONLY:
        # We shouldn't take any additional action. The agent will
        # silently continue to heartbeat to ironic until a user-initiated
        # state change occurs, causing it to match a state below.
        LOG.debug('Heartbeat from %(node)s recorded to identify the '
                  'node as on-line.', {'node': task.node.uuid})
        return

    # Async callbacks don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # This shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
        # are currently in the core deploy.deploy step. Other deploy steps
        # may cause the agent to boot, but we should not trigger deployment
        # at that point.
        elif node.provision_state == states.DEPLOYWAIT:
            if self.in_core_deploy_step(task):
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy.')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state.')
                    self.reboot_to_instance(task)
                else:
                    node.touch_provisioning()
            else:
                node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # Exceptions from the RPC call are not possible, as we
                # are using a cast here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif node.provision_state == states.RESCUEWAIT:
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
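# A minimal decoding sketch (not part of the driver), expanding on the
# comment above where 'agent_last_heartbeat' is recorded: the stored value
# is isoformat() output, so strptime with a 'T' separator (or
# datetime.datetime.fromisoformat, which also tolerates a missing
# microseconds field) recovers the original datetime.
#
#     import datetime
#
#     last = node.driver_internal_info.get('agent_last_heartbeat')
#     if last:
#         last_dt = datetime.datetime.strptime(last, "%Y-%m-%dT%H:%M:%S.%f")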
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if task.node.provision_state not in self.heartbeat_allowed_states:
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async callbacks don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # This shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT
              and not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_has_started(task)):
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # Exceptions from the RPC call are not possible, as we
                # are using a cast here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif node.provision_state == states.RESCUEWAIT:
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
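# For context, a rough agent-side sketch of the request that ends up in the
# heartbeat() handlers above. The endpoint path, body fields and
# microversion header are assumptions inferred from the parameters above
# (callback_url, agent_version); verify them against the Ironic API
# reference for your release before relying on them.
#
#     import requests
#
#     requests.post(
#         'http://ironic.example.com:6385/v1/heartbeat/<node-uuid>',
#         json={'callback_url': 'http://agent.example.com:9999',
#               'agent_version': '6.0.0'},
#         headers={'X-OpenStack-Ironic-API-Version': '1.36'})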