def heartbeat(self, task, **kwargs):
    """Method for agent to periodically check in.

    The agent should be sending its agent_url (so Ironic can talk back)
    as a kwarg. kwargs should have the following format::

     {
         'agent_url': 'http://AGENT_HOST:AGENT_PORT'
     }

    AGENT_PORT defaults to 9999.

    Records the heartbeat time and callback URL on the node, then
    dispatches to the next deploy/clean action based on the node's
    current provision state.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: must contain 'agent_url'.
    :raises: MissingParameterValue if 'agent_url' is not supplied.
    """
    node = task.node
    driver_internal_info = node.driver_internal_info
    LOG.debug(
        'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
        {'node': node.uuid,
         'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
    # Persist heartbeat time and the agent callback URL so the
    # conductor can reach the agent later.
    driver_internal_info['agent_last_heartbeat'] = int(_time())
    try:
        driver_internal_info['agent_url'] = kwargs['agent_url']
    except KeyError:
        raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                '"agent_url" must be '
                                                'specified.'))

    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif node.provision_state == states.DEPLOYWAIT:
            msg = _('Node failed to get image for deploy.')
            self.continue_deploy(task, **kwargs)
        elif (node.provision_state == states.DEPLOYING and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task, **kwargs)
        elif (node.provision_state == states.CLEANING and
              not node.clean_step):
            # Agent booted from prepare_cleaning
            manager.set_node_cleaning_steps(task)
            self._notify_conductor_resume_clean(task)
        elif (node.provision_state == states.CLEANING and
              node.clean_step):
            self.continue_cleaning(task, **kwargs)
    except Exception as e:
        # `msg` was updated above to describe the step that was running.
        err_info = {'node': node.uuid, 'msg': msg, 'e': e}
        last_error = _('Asynchronous exception for node %(node)s: '
                       '%(msg)s exception: %(e)s') % err_info
        LOG.exception(last_error)
        deploy_utils.set_failed_state(task, last_error)
def do_agent_iscsi_deploy(task, agent_client):
    """Method invoked when deployed with the agent ramdisk.

    This method is invoked by drivers for doing iSCSI deploy using agent
    ramdisk.  This method assumes that the agent is booted up on the node
    and is heartbeating.

    :param task: a TaskManager object containing the node.
    :param agent_client: an instance of agent_client.AgentClient which will
        be used during iscsi deploy (for exposing node's target disk via
        iSCSI, for install boot loader, etc).
    :returns: a dictionary containing the following keys:
        For partition image:
            'root uuid': UUID of root partition
            'efi system partition uuid': UUID of the uefi system partition
                                         (if boot mode is uefi).
            NOTE: If key exists but value is None, it means partition
                  doesn't exist.
        For whole disk image:
            'disk identifier': ID of the disk to which image was deployed.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    node = task.node
    iscsi_options = build_deploy_ramdisk_options(node)

    iqn = iscsi_options['iscsi_target_iqn']
    # Ask the agent to expose the node's disk over iSCSI.
    result = agent_client.start_iscsi_target(node, iqn)
    if result['command_status'] == 'FAILED':
        msg = (_("Failed to start the iSCSI target to deploy the "
                 "node %(node)s. Error: %(error)s") %
               {'node': node.uuid, 'error': result['command_error']})
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(reason=msg)

    # Extract the agent host from the callback URL stored at heartbeat.
    address = parse.urlparse(node.driver_internal_info['agent_url'])
    address = address.hostname

    # TODO(lucasagomes): The 'error' and 'key' parameters in the
    # dictionary below are just being passed because it's needed for
    # the iscsi_deploy.continue_deploy() method, we are fooling it
    # for now. The agent driver doesn't use/need those. So we need to
    # refactor this bits here later.
    iscsi_params = {'error': result['command_error'],
                    'iqn': iqn,
                    'key': iscsi_options['deployment_key'],
                    'address': address}

    uuid_dict_returned = continue_deploy(task, **iscsi_params)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # TODO(lucasagomes): Move this bit saving the root_uuid to
    # iscsi_deploy.continue_deploy()
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    return uuid_dict_returned
def deploy_steps(self, task, **data):
    """Handle a callback from Bareon's agent during deployment.

    GET requests return the SSH-key injection step payload; POST-style
    callbacks are mapped to the deploy-step handler named by the
    ``action`` field. An unknown non-None action is a programming error;
    a None action means the agent reported a failure, so the node is
    moved to a failed state.

    :param task: a TaskManager instance containing the node to act on.
    :param data: callback payload; must contain 'http_method'.
    :returns: the SSH key step result for GET, otherwise
        ``{'url': None}``.
    :raises: RuntimeError if the reported step name has no mapping.
    """
    http_method = data.pop('http_method')
    driver_info = _NodeDriverInfoAdapter(task.node)
    if http_method == 'GET':
        ssh_keys_step = _InjectSSHKeyStepRequest(task, driver_info)
        return ssh_keys_step()

    steps_mapping = _DeployStepMapping()
    data = _DeployStepsAdapter(data)
    try:
        request_cls = steps_mapping.name_to_step[data.action]
    except KeyError:
        if data.action is not None:
            raise RuntimeError(
                'There is no name mapping for deployment step: '
                '{!r}'.format(data.action))
        # A missing action means the agent itself failed.
        # NOTE: message wording fixed (was "have failed with internall
        # error").
        message = (
            "Bareon's callback service has failed with an internal error")
        if data.status_details:
            message += '\nFailure details: {}'.format(
                pprint.pformat(data.status_details))
        # TODO(dbogun): add support for existing log extraction mechanism
        deploy_utils.set_failed_state(
            task, message, collect_logs=False)
    else:
        handler = request_cls.result_handler(
            task, driver_info, data)
        handler()

    return {'url': None}
def finish_deploy(task, address):
    """Notifies the ramdisk to reboot the node and makes the instance active.

    This method notifies the ramdisk to proceed to reboot and then
    makes the instance active.

    :param task: a TaskManager object.
    :param address: The IP address of the bare metal node.
    :raises: InstanceDeployFailure, if notifying ramdisk failed.
    """
    node = task.node
    try:
        deploy_utils.notify_ramdisk_to_proceed(address)
    except Exception as e:
        LOG.error(
            _LE("Deploy failed for instance %(instance)s. "
                "Error: %(error)s"),
            {"instance": node.instance_uuid, "error": e},
        )
        msg = _("Failed to notify ramdisk to reboot after bootloader "
                "installation. Error: %s") % e
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(msg)

    # TODO(lucasagomes): When deploying a node with the DIB ramdisk
    # Ironic will not power control the node at the end of the deployment,
    # it's the DIB ramdisk that reboots the node. But, for the SSH driver
    # some changes like setting the boot device only gets applied when the
    # machine is powered off and on again. So the code below is enforcing
    # it. For Liberty we need to change the DIB ramdisk so that Ironic
    # always controls the power state of the node for all drivers.
    if deploy_utils.get_boot_option(node) == "local" and "ssh" in node.driver:
        manager_utils.node_power_action(task, states.REBOOT)

    LOG.info(_LI("Deployment to node %s done"), node.uuid)
    task.process_event("done")
def continue_deploy(self, task, **kwargs):
    """Method invoked when deployed with the IPA ramdisk.

    This method is invoked during a heartbeat from an agent when
    the node is in wait-call-back state. This deploys the image on
    the node and then configures the node to boot according to the
    desired boot option (netboot or localboot).

    :param task: a TaskManager object containing the node.
    :param kwargs: the kwargs passed from the heartbeat method.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    task.process_event("resume")
    node = task.node
    LOG.debug("Continuing the deployment on node %s", node.uuid)

    uuid_dict_returned = do_agent_iscsi_deploy(task, self._client)

    if deploy_utils.get_boot_option(node) == "local":
        # Install the boot loader
        root_uuid = uuid_dict_returned.get("root uuid")
        efi_sys_uuid = uuid_dict_returned.get("efi system partition uuid")
        self.configure_local_boot(task, root_uuid=root_uuid,
                                  efi_system_part_uuid=efi_sys_uuid)
    try:
        task.driver.boot.prepare_instance(task)
    except Exception as e:
        LOG.error(
            _LE("Deploy failed for instance %(instance)s. "
                "Error: %(error)s"),
            {"instance": node.instance_uuid, "error": e},
        )
        msg = _("Failed to continue agent deployment.")
        deploy_utils.set_failed_state(task, msg)
    # NOTE(review): reboot happens even after a failed prepare_instance
    # above — presumably intentional best-effort behavior; confirm.
    self.reboot_and_finish_deploy(task)
def reboot_to_instance(self, task, **kwargs):
    """Mark deploy done and reboot the node into the deployed instance.

    Checks the agent's reported deploy status; on error the node is set
    to a failed state, otherwise boot device is set to disk and the
    node is rebooted.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: unused; accepted for the vendor-passthru interface.
    """
    task.process_event('resume')
    node = task.node
    error = self.check_deploy_success(node)
    if error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = (_('node %(node)s command status errored: %(error)s') %
               {'node': node.uuid, 'error': error})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return

    LOG.info(_LI('Image successfully written to node %s'), node.uuid)
    LOG.debug('Rebooting node %s to instance', node.uuid)

    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)

    # NOTE(TheJulia): If we deployed a whole disk image, we
    # should expect a whole disk image and clean-up the tftp files
    # on-disk incase the node is disregarding the boot preference.
    # TODO(rameshg87): Not all in-tree drivers using reboot_to_instance
    # have a boot interface. So include a check for now. Remove this
    # check once all in-tree drivers have a boot interface.
    if task.driver.boot:
        task.driver.boot.clean_up_ramdisk(task)
def reboot_to_instance(self, task, **kwargs):
    """Mark deploy done and reboot the node into the deployed instance.

    Checks the agent's reported deploy status; on error the node is set
    to a failed state, otherwise boot device is set to disk, the node is
    rebooted, and PXE artifacts are cleaned up for whole-disk images on
    PXE-based drivers.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: unused; accepted for the vendor-passthru interface.
    """
    task.process_event('resume')
    node = task.node
    error = self.check_deploy_success(node)
    if error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = (_('node %(node)s command status errored: %(error)s') %
               {'node': node.uuid, 'error': error})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return

    LOG.info(_LI('Image successfully written to node %s'), node.uuid)
    LOG.debug('Rebooting node %s to instance', node.uuid)

    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)

    # NOTE(TheJulia): If we deployed a whole disk image, we
    # should expect a whole disk image and clean-up the tftp files
    # on-disk incase the node is disregarding the boot preference.
    # TODO(rameshg87): This shouldn't get called for virtual media deploy
    # drivers (iLO and iRMC). This is just a hack, but it will be taken
    # care in boot/deploy interface separation.
    if (_driver_uses_pxe(task.driver) and
            node.driver_internal_info.get('is_whole_disk_image')):
        _clean_up_pxe(task)
def pass_deploy_info(self, task, **kwargs):
    """Continues the iSCSI deployment from where ramdisk left off.

    This method continues the iSCSI deployment from the conductor node
    and writes the deploy image to the bare metal's disk.  After that,
    it does the following depending on boot_option for deploy:

    - If the boot_option requested for this deploy is 'local', then it
      sets the node to boot from disk (ramdisk installs the boot loader
      present within the image to the bare metal's disk).
    - If the boot_option requested is 'netboot' or no boot_option is
      requested, it finds/creates the boot ISO to boot the instance
      image, attaches the boot ISO to the bare metal and then sets
      the node to boot from CDROM.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs containing parameters for iSCSI deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')

    iwdi = node.driver_internal_info.get('is_whole_disk_image')
    ilo_common.cleanup_vmedia_boot(task)
    uuid_dict = iscsi_deploy.continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict.get(
        'root uuid', uuid_dict.get('disk identifier'))

    try:
        # Set boot mode
        ilo_common.update_boot_mode(task)

        # Need to enable secure boot, if being requested
        _update_secure_boot_mode(task, True)

        # For iscsi_ilo driver, we boot from disk every time if the image
        # deployed is a whole disk image.
        if iscsi_deploy.get_boot_option(node) == "local" or iwdi:
            manager_utils.node_set_boot_device(task, boot_devices.DISK,
                                               persistent=True)

            # Ask the ramdisk to install bootloader and
            # wait for the call-back through the vendor passthru
            # 'pass_bootloader_install_info', if it's not a whole
            # disk image.
            if not iwdi:
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
        else:
            self._configure_vmedia_boot(task, root_uuid_or_disk_id)
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        iscsi_deploy.finish_deploy(task, kwargs.get('address'))
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')
    # Remove the agent token file; no longer needed once we resume.
    _destroy_token_file(node)
    is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
    uuid_dict = iscsi_deploy.continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict.get(
        'root uuid', uuid_dict.get('disk identifier'))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        if iscsi_deploy.get_boot_option(node) == "local":
            deploy_utils.try_set_boot_device(task, boot_devices.DISK)
            # If it's going to boot from the local disk, get rid of
            # the PXE configuration files used for the deployment
            pxe_utils.clean_up_pxe_config(task)

            # Ask the ramdisk to install bootloader and
            # wait for the call-back through the vendor passthru
            # 'pass_bootloader_install_info', if it's not a
            # whole disk image.
            if not is_whole_disk_image:
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
        else:
            pxe_config_path = pxe_utils.get_pxe_config_file_path(node.uuid)
            boot_mode = deploy_utils.get_boot_mode_for_deploy(node)
            deploy_utils.switch_pxe_config(pxe_config_path,
                                           root_uuid_or_disk_id,
                                           boot_mode, is_whole_disk_image)
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        iscsi_deploy.finish_deploy(task, kwargs.get('address'))
def _set_failed_state(self, task, error):
    """Move the node to a failed state, tolerating old Ironic releases.

    Newer ``deploy_utils.set_failed_state`` accepts a ``collect_logs``
    keyword; on older releases passing it raises TypeError, in which
    case we warn and retry without the keyword.

    :param task: a TaskManager instance containing the node to act on.
    :param error: the error message to record on the node.
    """
    try:
        deploy_utils.set_failed_state(task, error, collect_logs=False)
    except TypeError:
        LOG.warning(_LW("To have proper error handling please update "
                        "your Ironic installation to contain commit "
                        "bb62f256f7aa55c292ebeae73ca25a4a9f0ec8c0."))
        deploy_utils.set_failed_state(task, error)
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    LOG.warning(_LW("The node %s is using the bash deploy ramdisk for "
                    "its deployment. This deploy ramdisk has been "
                    "deprecated. Please use the ironic-python-agent "
                    "(IPA) ramdisk instead."), node.uuid)
    # TODO(rameshg87): Remove the below code once we stop supporting
    # bash ramdisk in Ironic.
    if node.provision_state == states.CLEANWAIT:
        return self._initiate_cleaning(task)

    task.process_event('resume')
    LOG.debug('Continuing the deployment on node %s', node.uuid)
    is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
    uuid_dict_returned = continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        task.driver.boot.prepare_instance(task)

        if deploy_utils.get_boot_option(node) == "local":
            if not is_whole_disk_image:
                LOG.debug('Installing the bootloader on node %s',
                          node.uuid)
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        finish_deploy(task, kwargs.get('address'))
def _fail_deploy(task, msg, raise_exception=True):
    """Fail the deploy after logging and setting error states.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: failure message, or an Exception to be wrapped in a
        standard "Deploy failed" message.
    :param raise_exception: when True (default), re-raise the failure
        as InstanceDeployFailure after cleaning up.
    :raises: InstanceDeployFailure if raise_exception is True.
    """
    if isinstance(msg, Exception):
        # BUG FIX: the original referenced an undefined name `node`,
        # raising NameError whenever msg was an Exception.
        msg = (_('Deploy failed for instance %(instance)s. '
                 'Error: %(error)s')
               % {'instance': task.node.instance_uuid, 'error': msg})
    deploy_utils.set_failed_state(task, msg)
    destroy_images(task.node.uuid)
    if raise_exception:
        raise exception.InstanceDeployFailure(msg)
def do_agent_iscsi_deploy(task, agent_client):
    """Method invoked when deployed with the agent ramdisk.

    This method is invoked by drivers for doing iSCSI deploy using agent
    ramdisk.  This method assumes that the agent is booted up on the node
    and is heartbeating.

    :param task: a TaskManager object containing the node.
    :param agent_client: an instance of agent_client.AgentClient which will
        be used during iscsi deploy (for exposing node's target disk via
        iSCSI, for install boot loader, etc).
    :returns: a dictionary containing the following keys:
        For partition image:
            'root uuid': UUID of root partition
            'efi system partition uuid': UUID of the uefi system partition
                                         (if boot mode is uefi).
            NOTE: If key exists but value is None, it means partition
                  doesn't exist.
        For whole disk image:
            'disk identifier': ID of the disk to which image was deployed.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    node = task.node
    i_info = deploy_utils.parse_instance_info(node)
    # Only wipe existing disk metadata when the ephemeral partition is
    # not being preserved.
    wipe_disk_metadata = not i_info['preserve_ephemeral']

    iqn = 'iqn.2008-10.org.openstack:%s' % node.uuid
    portal_port = CONF.iscsi.portal_port
    result = agent_client.start_iscsi_target(
        node, iqn,
        portal_port,
        wipe_disk_metadata=wipe_disk_metadata)
    if result['command_status'] == 'FAILED':
        msg = (_("Failed to start the iSCSI target to deploy the "
                 "node %(node)s. Error: %(error)s") %
               {'node': node.uuid, 'error': result['command_error']})
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(reason=msg)

    # Extract the agent host from the callback URL stored at heartbeat.
    address = parse.urlparse(node.driver_internal_info['agent_url'])
    address = address.hostname

    uuid_dict_returned = continue_deploy(task, iqn=iqn, address=address)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # TODO(lucasagomes): Move this bit saving the root_uuid to
    # continue_deploy()
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    return uuid_dict_returned
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node.

    Invoked by the Fuel Agent callback: uploads provisioning data (and
    optionally a configdrive) to the node over SFTP, runs the agent via
    SSH, then sets boot device to disk and reboots.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: callback payload; expects 'status', 'address' and
        optionally 'error_message'.
    """
    node = task.node
    task.process_event('resume')
    err_msg = _('Failed to continue deployment with Fuel Agent.')

    agent_status = kwargs.get('status')
    if agent_status != 'ready':
        LOG.error(_LE('Deploy failed for node %(node)s. Fuel Agent is not '
                      'in ready state, error: %(error)s'),
                  {'node': node.uuid,
                   'error': kwargs.get('error_message')})
        deploy_utils.set_failed_state(task, err_msg)
        return

    params = _parse_driver_info(node)
    params['host'] = kwargs.get('address')
    cmd = ('%s --data_driver ironic --config-file '
           '/etc/fuel-agent/fuel-agent.conf' % params.pop('script'))
    if CONF.debug:
        cmd += ' --debug'
    instance_info = node.instance_info

    try:
        deploy_data = _get_deploy_data(task.context,
                                       instance_info['image_source'])

        image_data = {"/": {"uri": instance_info['image_url'],
                            "format": "raw",
                            "container": "raw"}}

        deploy_data['ks_meta']['image_data'] = image_data

        ssh = utils.ssh_connect(params)
        sftp = ssh.open_sftp()
        _sftp_upload(sftp, json.dumps(deploy_data), '/tmp/provision.json')

        # swift configdrive store should be disabled
        configdrive = instance_info.get('configdrive')
        if configdrive is not None:
            _sftp_upload(sftp, configdrive, '/tmp/config-drive.img')

        _ssh_execute(ssh, cmd, params)
        LOG.info(_LI('Fuel Agent pass on node %s'), node.uuid)
        manager_utils.node_set_boot_device(task, boot_devices.DISK,
                                           persistent=True)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Deploy failed for node %(node)s. Error: %(error)s') %
               {'node': node.uuid, 'error': e})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
    else:
        task.process_event('done')
        LOG.info(_LI('Deployment to node %s done'), task.node.uuid)
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    LOG.warning(
        _LW(
            "The node %s is using the bash deploy ramdisk for "
            "its deployment. This deploy ramdisk has been "
            "deprecated. Please use the ironic-python-agent "
            "(IPA) ramdisk instead."
        ),
        node.uuid,
    )
    task.process_event("resume")
    LOG.debug("Continuing the deployment on node %s", node.uuid)
    is_whole_disk_image = node.driver_internal_info["is_whole_disk_image"]
    uuid_dict_returned = continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        "root uuid", uuid_dict_returned.get("disk identifier"))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info["root_uuid_or_disk_id"] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        task.driver.boot.prepare_instance(task)

        if deploy_utils.get_boot_option(node) == "local":
            if not is_whole_disk_image:
                LOG.debug("Installing the bootloader on node %s",
                          node.uuid)
                deploy_utils.notify_ramdisk_to_proceed(kwargs["address"])
                task.process_event("wait")
                return
    except Exception as e:
        LOG.error(
            _LE("Deploy failed for instance %(instance)s. "
                "Error: %(error)s"),
            {"instance": node.instance_uuid, "error": e},
        )
        msg = _("Failed to continue iSCSI deployment.")
        deploy_utils.set_failed_state(task, msg)
    else:
        finish_deploy(task, kwargs.get("address"))
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')
    LOG.debug('Continuing the deployment on node %s', node.uuid)
    is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
    uuid_dict_returned = continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        if deploy_utils.get_boot_option(node) == "local":
            deploy_utils.try_set_boot_device(task, boot_devices.DISK)

            if not is_whole_disk_image:
                LOG.debug('Installing the bootloader on node %s',
                          node.uuid)
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return

        task.driver.boot.prepare_instance(task)
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        finish_deploy(task, kwargs.get('address'))
def log_and_raise_deployment_error(task, msg, collect_logs=True, exc=None):
    """Log a deploy failure, fail the node, and raise.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: the message to set in last_error of the node.
    :param collect_logs: Boolean indicating whether to attempt to collect
                         logs from IPA-based ramdisk. Defaults to True.
                         Actual log collection is also affected by
                         CONF.agent.deploy_logs_collect config option.
    :param exc: Exception that caused the failure.
    :raises: InstanceDeployFailure, always.
    """
    # Attach a traceback to the log entry only for unexpected
    # (non-Ironic) exceptions.
    include_traceback = False
    if exc is not None:
        include_traceback = not isinstance(exc, exception.IronicException)
    LOG.error(msg, exc_info=include_traceback)
    deploy_utils.set_failed_state(task, msg, collect_logs=collect_logs)
    raise exception.InstanceDeployFailure(msg)
def heartbeat(self, task, **kwargs):
    """Method for agent to periodically check in.

    The agent should be sending its agent_url (so Ironic can talk back)
    as a kwarg. kwargs should have the following format::

     {
         'agent_url': 'http://AGENT_HOST:AGENT_PORT'
     }

    AGENT_PORT defaults to 9999.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: must contain 'agent_url'.
    :raises: MissingParameterValue if 'agent_url' is not supplied.
    """
    node = task.node
    # NOTE(review): this older variant stores heartbeat data in
    # driver_info (not driver_internal_info as newer code does).
    driver_info = node.driver_info
    LOG.debug(
        'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
        {'node': node.uuid,
         'heartbeat': driver_info.get('agent_last_heartbeat')})
    driver_info['agent_last_heartbeat'] = int(_time())
    try:
        driver_info['agent_url'] = kwargs['agent_url']
    except KeyError:
        raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                '"agent_url" must be '
                                                'specified.'))

    node.driver_info = driver_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.provision_state == states.DEPLOYWAIT:
            msg = _('Node failed to get image for deploy.')
            self._continue_deploy(task, **kwargs)
        elif (node.provision_state == states.DEPLOYING and
              self._deploy_is_done(node)):
            msg = _('Node failed to move to active state.')
            self._reboot_to_instance(task, **kwargs)
    except Exception:
        LOG.exception(_LE('Async exception for %(node)s: %(msg)s'),
                      {'node': node, 'msg': msg})
        deploy_utils.set_failed_state(task, msg)
def reboot_to_instance(self, task, **kwargs):
    """Verify the deploy succeeded, then reboot the node from disk.

    On an agent-reported error the node is moved to a failed state;
    otherwise the boot device is set to disk, the node is rebooted and
    the deployment is marked done.
    """
    node = task.node
    LOG.debug("Preparing to reboot to instance for node %s", node.uuid)

    deploy_error = self._check_deploy_success(node)
    if deploy_error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        failure_msg = _("node %(node)s command status errored: %(error)s")
        failure_msg = failure_msg % {"node": node.uuid,
                                     "error": deploy_error}
        LOG.error(failure_msg)
        deploy_utils.set_failed_state(task, failure_msg)
        return

    LOG.debug("Rebooting node %s to disk", node.uuid)
    manager_utils.node_set_boot_device(task, "disk", persistent=True)
    manager_utils.node_power_action(task, states.REBOOT)
    task.process_event("done")
def reboot_to_instance(self, task, **kwargs):
    """Verify the deploy succeeded, then reboot the node from disk.

    On an agent-reported error the node is moved to a failed state;
    otherwise the boot device is set to disk and the node is rebooted
    to finish the deployment.
    """
    node = task.node
    LOG.debug('Preparing to reboot to instance for node %s', node.uuid)

    deploy_error = self.check_deploy_success(node)
    if deploy_error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        failure_msg = (_('node %(node)s command status errored: %(error)s')
                       % {'node': node.uuid, 'error': deploy_error})
        LOG.error(failure_msg)
        deploy_utils.set_failed_state(task, failure_msg)
        return

    LOG.debug('Rebooting node %s to disk', node.uuid)
    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    self.reboot_and_finish_deploy(task)
def _continue_deploy(self, task, **kwargs):
    """Continues the iSCSI deployment from where ramdisk left off.

    Continues the iSCSI deployment from the conductor node, finds the
    boot ISO to boot the node, and sets the node to boot from boot ISO.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs containing parameters for iSCSI deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')

    ilo_common.cleanup_vmedia_boot(task)
    root_uuid = iscsi_deploy.continue_deploy(task, **kwargs)

    # continue_deploy() already set a failed state if no root uuid
    # came back; nothing more to do here.
    if not root_uuid:
        return

    try:
        boot_iso = _get_boot_iso(task, root_uuid)

        if not boot_iso:
            LOG.error(_LE("Cannot get boot ISO for node %s"), node.uuid)
            return

        ilo_common.setup_vmedia_for_boot(task, boot_iso)
        manager_utils.node_set_boot_device(task, boot_devices.CDROM)

        address = kwargs.get('address')
        deploy_utils.notify_deploy_complete(address)

        LOG.info(_LI('Deployment to node %s done'), node.uuid)

        # Remember the boot ISO so it can be cleaned up on teardown.
        i_info = node.instance_info
        i_info['ilo_boot_iso'] = boot_iso
        node.instance_info = i_info
        task.process_event('done')
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
def validate_bootloader_install_status(task, input_params):
    """Validate if bootloader was installed.

    This method first validates if deploy key sent in vendor passthru
    was correct one, and then validates whether bootloader installation
    was successful or not.

    :param task: A TaskManager object.
    :param input_params: A dictionary of params sent as input to passthru.
    :raises: InstanceDeployFailure, if bootloader installation was
        reported from ramdisk as failure.
    """
    # Success: nothing to do.
    if input_params['status'] == 'SUCCEEDED':
        return

    failure_msg = (_('Failed to install bootloader on node %(node)s. '
                     'Error: %(error)s.')
                   % {'node': task.node.uuid,
                      'error': input_params.get('error')})
    LOG.error(failure_msg)
    deploy_utils.set_failed_state(task, failure_msg)
    raise exception.InstanceDeployFailure(failure_msg)
def _get_uuid_from_result(self, task, type_uuid):
    """Extract a '<type_uuid>=<value>' token from the last agent command.

    Scans the result string of the most recent agent command for a word
    containing ``type_uuid`` and returns the value after '='.  If the
    token is present but empty, the node is moved to a failed state and
    None is returned.  If no matching word is found, returns None
    implicitly.

    :param task: a TaskManager instance containing the node to act on.
    :param type_uuid: the token name to search for (e.g. 'root_uuid').
    :returns: the extracted value, or None on failure / no match.
    """
    command = self._client.get_commands_status(task.node)[-1]

    if command['command_result'] is not None:
        words = command['command_result']['result'].split()
        for word in words:
            if type_uuid in word:
                result = word.split('=')[1]
                if not result:
                    msg = (_('Command result did not return %(type_uuid)s '
                             'for node %(node)s. The version of the IPA '
                             'ramdisk used in the deployment might not '
                             'have support for provisioning of '
                             'partition images.') %
                           {'type_uuid': type_uuid,
                            'node': task.node.uuid})
                    LOG.error(msg)
                    deploy_utils.set_failed_state(task, msg)
                    return
                return result
def deploy(self, task):
    """Perform a deployment to a node.

    Reboots the node; if the ramdisk-callback flow is enabled the rest
    of the deploy happens on heartbeat, otherwise the ansible deploy is
    driven synchronously over the node's DHCP address.

    :param task: a TaskManager instance containing the node to act on.
    :returns: states.DEPLOYWAIT when waiting for the ramdisk callback,
        states.DEPLOYDONE on successful synchronous deploy, or None if
        the synchronous deploy failed (node set to failed state).
    """
    manager_utils.node_power_action(task, states.REBOOT)
    if CONF.ansible.use_ramdisk_callback:
        return states.DEPLOYWAIT

    node = task.node
    ip_addr = _get_node_ip_dhcp(task)
    try:
        self._ansible_deploy(task, ip_addr)
    except Exception as e:
        error = _('Deploy failed for node %(node)s: '
                  'Error: %(exc)s') % {'node': node.uuid,
                                       'exc': six.text_type(e)}
        LOG.exception(error)
        deploy_utils.set_failed_state(task, error, collect_logs=False)
    else:
        self.reboot_to_instance(task)
        return states.DEPLOYDONE
def reboot_to_instance(self, task):
    """Finish the ansible deploy and reboot the node into the instance.

    For partition images, extracts the root (and, for UEFI, the EFI
    system partition) UUID from the agent result and prepares the
    instance boot; for whole-disk images, simply sets boot-from-disk.

    :param task: a TaskManager instance containing the node to act on.
    """
    task.process_event('resume')
    node = task.node
    iwdi = task.node.driver_internal_info.get('is_whole_disk_image')
    error = self.check_deploy_success(node)
    if error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = (_('node %(node)s command status errored: %(error)s') %
               {'node': node.uuid, 'error': error})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return
    if not iwdi:
        root_uuid = self._get_uuid_from_result(task, 'root_uuid')
        if deploy_utils.get_boot_mode_for_deploy(node) == 'uefi':
            efi_sys_uuid = (
                self._get_uuid_from_result(task,
                                           'efi_system_partition_uuid'))
        else:
            efi_sys_uuid = None
        # Persist root uuid so another conductor could rebuild boot
        # config; re-assignment makes the object notice the change.
        driver_internal_info = task.node.driver_internal_info
        driver_internal_info['root_uuid_or_disk_id'] = root_uuid
        task.node.driver_internal_info = driver_internal_info
        task.node.save()
        self.prepare_instance_to_boot(task, root_uuid, efi_sys_uuid)
    LOG.info('Image successfully written to node %s', node.uuid)
    LOG.debug('Rebooting node %s to instance', node.uuid)
    if iwdi:
        manager_utils.node_set_boot_device(task, 'disk', persistent=True)

    self.reboot_and_finish_deploy(task)

    # NOTE(TheJulia): If we deployed a whole disk image, we
    # should expect a whole disk image and clean-up the tftp files
    # on-disk incase the node is disregarding the boot preference.
    # TODO(rameshg87): Not all in-tree drivers using reboot_to_instance
    # have a boot interface. So include a check for now. Remove this
    # check once all in-tree drivers have a boot interface.
    if task.driver.boot and iwdi:
        task.driver.boot.clean_up_ramdisk(task)
def _continue_deploy(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')

    _destroy_token_file(node)

    root_uuid = iscsi_deploy.continue_deploy(task, **kwargs)

    if not root_uuid:
        # continue_deploy() has already recorded the failure state.
        return

    try:
        if iscsi_deploy.get_boot_option(node) == "local":
            try_set_boot_device(task, boot_devices.DISK)
            # If it's going to boot from the local disk, get rid of
            # the PXE configuration files used for the deployment
            pxe_utils.clean_up_pxe_config(task)
        else:
            # Netboot: point the PXE config at the new root partition.
            pxe_config_path = pxe_utils.get_pxe_config_file_path(node.uuid)
            deploy_utils.switch_pxe_config(
                pxe_config_path, root_uuid,
                driver_utils.get_node_capability(node, 'boot_mode'))

        deploy_utils.notify_deploy_complete(kwargs['address'])
        LOG.info(_LI('Deployment to node %s done'), node.uuid)
        task.process_event('done')
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
def continue_deploy(task, **kwargs):
    """Resume a deployment upon getting POST data from deploy ramdisk.

    This method raises no exceptions because it is intended to be
    invoked asynchronously as a callback from the deploy ramdisk.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: the kwargs to be passed to deploy.
    :returns: UUID of the root partition or None on error.
    """
    node = task.node

    node.provision_state = states.DEPLOYING
    node.save()

    params = get_deploy_info(node, **kwargs)
    ramdisk_error = kwargs.get('error')

    if ramdisk_error:
        LOG.error(_LE('Error returned from deploy ramdisk: %s'),
                  ramdisk_error)
        deploy_utils.set_failed_state(task, _('Failure in deploy ramdisk.'))
        destroy_images(node.uuid)
        return

    LOG.info(_LI('Continuing deployment for node %(node)s, params '
                 '%(params)s'),
             {'node': node.uuid, 'params': params})

    root_uuid = None
    try:
        root_uuid = deploy_utils.deploy(**params)
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        deploy_utils.set_failed_state(task, _('Failed to continue '
                                              'iSCSI deployment.'))

    # Images are cleaned up on both the success and failure paths;
    # root_uuid stays None on failure, signalling the error to callers.
    destroy_images(node.uuid)
    return root_uuid
def _continue_deploy(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    """
    node = task.node

    if node.provision_state != states.DEPLOYWAIT:
        # A callback arrived for a node that is not expecting one.
        LOG.error(_LE('Node %s is not waiting to be deployed.'), node.uuid)
        return

    _destroy_token_file(node)

    root_uuid = iscsi_deploy.continue_deploy(task, **kwargs)

    if not root_uuid:
        # continue_deploy() has already recorded the failure state.
        return

    try:
        # Point the PXE config at the newly written root partition.
        pxe_config_path = pxe_utils.get_pxe_config_file_path(node.uuid)
        deploy_utils.switch_pxe_config(
            pxe_config_path, root_uuid,
            driver_utils.get_node_capability(node, 'boot_mode'))

        deploy_utils.notify_deploy_complete(kwargs['address'])

        LOG.info(_LI('Deployment to node %s done'), node.uuid)

        node.provision_state = states.ACTIVE
        node.target_provision_state = states.NOSTATE
        node.save()
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
def _reboot_to_instance(self, task, **kwargs):
    """Finish the deployment by rebooting the node into its instance.

    Checks the ramdisk-reported deploy status first; on failure the
    node is put into a failed state instead of being rebooted.
    """
    node = task.node
    LOG.debug('Preparing to reboot to instance for node %s', node.uuid)

    failure = self._check_deploy_success(node)
    if failure is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = _('node %(node)s command status errored: %(error)s') % (
            {'node': node.uuid, 'error': failure})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return

    LOG.debug('Rebooting node %s to disk', node.uuid)

    # Boot from the local disk from now on, then power-cycle the node.
    manager_utils.node_set_boot_device(task, 'disk', persistent=True)
    manager_utils.node_power_action(task, states.REBOOT)

    # Record the successful deployment on the node object.
    node.provision_state = states.ACTIVE
    node.target_provision_state = states.NOSTATE
    node.save()
def validate_bootloader_install_status(task, input_params):
    """Validate whether bootloader installation succeeded.

    Inspects the ``status`` reported by the deploy ramdisk and fails the
    deployment when the bootloader installation was not successful.

    :param task: A TaskManager object.
    :param input_params: A dictionary of params sent as input to passthru;
        expected to contain a ``status`` key and, on failure, an
        ``error`` key.
    :raises: InstanceDeployFailure, if bootloader installation was
        reported from ramdisk as failure.
    """
    node = task.node
    if input_params["status"] != "SUCCEEDED":
        msg = _("Failed to install bootloader on node %(node)s. "
                "Error: %(error)s.") % {
            "node": node.uuid,
            "error": input_params.get("error"),
        }
        LOG.error(msg)
        # Mark the deploy as failed before surfacing the error to callers.
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(msg)

    LOG.info(_LI("Bootloader successfully installed on node %s"), node.uuid)
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')
    LOG.debug('Continuing the deployment on node %s', node.uuid)

    is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
    uuid_dict = iscsi_deploy.continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict.get(
        'root uuid', uuid_dict.get('disk identifier'))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        if iscsi_deploy.get_boot_option(node) == "local":
            deploy_utils.try_set_boot_device(task, boot_devices.DISK)
            # If it's going to boot from the local disk, get rid of
            # the PXE configuration files used for the deployment
            pxe_utils.clean_up_pxe_config(task)

            # Ask the ramdisk to install bootloader and
            # wait for the call-back through the vendor passthru
            # 'pass_bootloader_install_info', if it's not a
            # whole disk image.
            if not is_whole_disk_image:
                LOG.debug('Installing the bootloader on node %s',
                          node.uuid)
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
        else:
            # Netboot: rewrite the PXE config to boot the written image.
            pxe_config_path = pxe_utils.get_pxe_config_file_path(node.uuid)
            boot_mode = deploy_utils.get_boot_mode_for_deploy(node)
            deploy_utils.switch_pxe_config(
                pxe_config_path, root_uuid_or_disk_id, boot_mode,
                is_whole_disk_image,
                deploy_utils.is_trusted_boot_requested(node))
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        iscsi_deploy.finish_deploy(task, kwargs.get('address'))
def heartbeat(self, task, callback_url):
    """Process a heartbeat.

    Records the heartbeat timestamp and the agent URL on the node, then
    dispatches on the node's provision state to continue a deployment
    or a cleaning run.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    """
    # TODO(dtantsur): upgrade lock only if we actually take action other
    # than updating the last timestamp.
    task.upgrade_lock()

    node = task.node
    driver_internal_info = node.driver_internal_info
    LOG.debug(
        'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
        {'node': node.uuid,
         'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
    driver_internal_info['agent_last_heartbeat'] = int(time.time())
    # NOTE: callback_url is a required positional argument, so it is
    # always present; the previous try/except KeyError around this
    # assignment was dead code left over from the kwargs-based signature
    # (storing into a dict never raises KeyError) and has been removed.
    driver_internal_info['agent_url'] = callback_url
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to get image for deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_has_started(task)):
            # Deploy in progress; just keep the provisioning timer alive.
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            try:
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning '
                            'step.')
                    # First, cache the clean steps
                    self._refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    _notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task)
            except exception.NoFreeConductorWorker:
                # waiting for the next heartbeat, node.last_error and
                # logging message is filled already via conductor's hook
                pass
    except Exception as e:
        err_info = {'node': node.uuid, 'msg': msg, 'e': e}
        last_error = _('Asynchronous exception for node %(node)s: '
                       '%(msg)s Exception: %(e)s') % err_info
        LOG.exception(last_error)
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error)
def do_agent_iscsi_deploy(task, agent_client):
    """Method invoked when deployed with the agent ramdisk.

    This method is invoked by drivers for doing iSCSI deploy using agent
    ramdisk.  This method assumes that the agent is booted up on the node
    and is heartbeating.

    :param task: a TaskManager object containing the node.
    :param agent_client: an instance of agent_client.AgentClient which
        will be used during iscsi deploy (for exposing node's target
        disk via iSCSI, for install boot loader, etc).
    :returns: a dictionary containing the following keys:

        For partition image:

        * 'root uuid': UUID of root partition
        * 'efi system partition uuid': UUID of the uefi system partition
          (if boot mode is uefi).

        .. note:: If key exists but value is None, it means partition
                  doesn't exist.

        For whole disk image:

        * 'disk identifier': ID of the disk to which image was deployed.
    :raises: InstanceDeployFailure if it encounters some error during
        the deploy.
    """
    node = task.node
    i_info = deploy_utils.parse_instance_info(node)
    # Only wipe pre-existing disk metadata when the ephemeral partition
    # is not being preserved across this deploy.
    wipe_disk_metadata = not i_info['preserve_ephemeral']

    iqn = 'iqn.2008-10.org.openstack:%s' % node.uuid
    portal_port = CONF.iscsi.portal_port
    conv_flags = CONF.iscsi.conv_flags
    result = agent_client.start_iscsi_target(
        node, iqn, portal_port, wipe_disk_metadata=wipe_disk_metadata)
    if result['command_status'] == 'FAILED':
        msg = (_("Failed to start the iSCSI target to deploy the "
                 "node %(node)s. Error: %(error)s") %
               {'node': node.uuid, 'error': result['command_error']})
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(reason=msg)

    # The iSCSI target is exposed at the agent's address recorded during
    # heartbeat.
    address = urlparse.urlparse(node.driver_internal_info['agent_url'])
    address = address.hostname

    uuid_dict_returned = continue_deploy(task, iqn=iqn, address=address,
                                         conv_flags=conv_flags)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # TODO(lucasagomes): Move this bit saving the root_uuid to
    # continue_deploy()
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    return uuid_dict_returned
def write_image(self, task):
    """Ask the agent to write the instance image to the node's disk.

    Builds the ``image_info`` payload from the node's instance_info and
    driver_info, then either launches the in-band ``deploy.write_image``
    step (newer agents) or falls back to the deprecated synchronous
    ``prepare_image`` call.

    :param task: a TaskManager instance containing the node to act on.
    """
    if not task.driver.storage.should_write_image(task):
        # e.g. boot-from-volume: nothing to write locally.
        return
    node = task.node
    image_source = node.instance_info.get('image_source')
    LOG.debug('Continuing deploy for node %(node)s with image %(img)s',
              {'node': node.uuid, 'img': image_source})

    image_info = {
        'id': image_source.split('/')[-1],
        'urls': [node.instance_info['image_url']],
        # NOTE(comstud): Older versions of ironic do not set
        # 'disk_format' nor 'container_format', so we use .get()
        # to maintain backwards compatibility in case code was
        # upgraded in the middle of a build request.
        'disk_format': node.instance_info.get('image_disk_format'),
        'container_format': node.instance_info.get(
            'image_container_format'),
        'stream_raw_images': CONF.agent.stream_raw_images,
    }
    if node.instance_info.get('image_checksum'):
        image_info['checksum'] = node.instance_info['image_checksum']
    if (node.instance_info.get('image_os_hash_algo') and
            node.instance_info.get('image_os_hash_value')):
        image_info['os_hash_algo'] = node.instance_info[
            'image_os_hash_algo']
        image_info['os_hash_value'] = node.instance_info[
            'image_os_hash_value']

    # Optional per-node HTTP(S) proxy configuration for image download.
    proxies = {}
    for scheme in ('http', 'https'):
        proxy_param = 'image_%s_proxy' % scheme
        proxy = node.driver_info.get(proxy_param)
        if proxy:
            proxies[scheme] = proxy
    if proxies:
        image_info['proxies'] = proxies
        no_proxy = node.driver_info.get('image_no_proxy')
        if no_proxy is not None:
            image_info['no_proxy'] = no_proxy

    image_info['node_uuid'] = node.uuid
    iwdi = node.driver_internal_info.get('is_whole_disk_image')
    if not iwdi:
        # Partition images need extra layout/boot metadata.
        for label in PARTITION_IMAGE_LABELS:
            image_info[label] = node.instance_info.get(label)
        boot_option = deploy_utils.get_boot_option(node)
        image_info['deploy_boot_mode'] = (
            boot_mode_utils.get_boot_mode(node))
        image_info['boot_option'] = boot_option
        disk_label = deploy_utils.get_disk_label(node)
        if disk_label is not None:
            image_info['disk_label'] = disk_label

    has_write_image = agent_base.find_step(
        task, 'deploy', 'deploy', 'write_image') is not None
    if not has_write_image:
        LOG.warning('The agent on node %s does not have the deploy '
                    'step deploy.write_image, using the deprecated '
                    'synchronous fall-back', task.node.uuid)

    if self.has_decomposed_deploy_steps and has_write_image:
        configdrive = node.instance_info.get('configdrive')
        # Now switch into the corresponding in-band deploy step and let the
        # result be polled normally.
        new_step = {'interface': 'deploy',
                    'step': 'write_image',
                    'args': {'image_info': image_info,
                             'configdrive': configdrive}}
        return agent_base.execute_step(task, new_step, 'deploy',
                                       client=self._client)
    else:
        # TODO(dtantsur): remove in W
        command = self._client.prepare_image(node, image_info, wait=True)
        if command['command_status'] == 'FAILED':
            # TODO(jimrollenhagen) power off if using neutron dhcp to
            # align with pxe driver?
            msg = (_('node %(node)s command status errored: %(error)s') %
                   {'node': node.uuid,
                    'error': command['command_error']})
            LOG.error(msg)
            deploy_utils.set_failed_state(task, msg)
def heartbeat(self, task, **kwargs):
    """Method for agent to periodically check in.

    The agent should be sending its agent_url (so Ironic can talk back)
    as a kwarg. kwargs should have the following format::

     {
         'agent_url': 'http://AGENT_HOST:AGENT_PORT'
     }

    AGENT_PORT defaults to 9999.
    """
    node = task.node
    driver_internal_info = node.driver_internal_info
    LOG.debug(
        'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
        {'node': node.uuid,
         'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
    # Record the heartbeat time (epoch seconds) and the agent URL so the
    # conductor can reach the agent later.
    driver_internal_info['agent_last_heartbeat'] = int(time.time())
    try:
        driver_internal_info['agent_url'] = kwargs['agent_url']
    except KeyError:
        raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                '"agent_url" must be '
                                                'specified.'))
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to get image for deploy.')
            self.continue_deploy(task, **kwargs)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task, **kwargs)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_has_started(task)):
            # Deploy still in progress; keep the provisioning timer alive.
            node.touch_provisioning()
        # TODO(lucasagomes): CLEANING here for backwards compat
        # with previous code, otherwise nodes in CLEANING when this
        # is deployed would fail. Should be removed once the Mitaka
        # release starts.
        elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
            node.touch_provisioning()
            try:
                if not node.clean_step:
                    LOG.debug('Node %s just booted to start cleaning.',
                              node.uuid)
                    msg = _('Node failed to start the first cleaning '
                            'step.')
                    # First, cache the clean steps
                    self._refresh_clean_steps(task)
                    # Then set/verify node clean steps and start cleaning
                    manager_utils.set_node_cleaning_steps(task)
                    self.notify_conductor_resume_clean(task)
                else:
                    msg = _('Node failed to check cleaning progress.')
                    self.continue_cleaning(task, **kwargs)
            except exception.NoFreeConductorWorker:
                # waiting for the next heartbeat, node.last_error and
                # logging message is filled already via conductor's hook
                pass
    except Exception as e:
        err_info = {'node': node.uuid, 'msg': msg, 'e': e}
        last_error = _('Asynchronous exception for node %(node)s: '
                       '%(msg)s Exception: %(e)s') % err_info
        LOG.exception(last_error)
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error)
def heartbeat(self, task, **kwargs):
    """Method for agent to periodically check in.

    The agent should be sending its agent_url (so Ironic can talk back)
    as a kwarg. kwargs should have the following format::

     {
         'agent_url': 'http://AGENT_HOST:AGENT_PORT'
     }

    AGENT_PORT defaults to 9999.
    """
    node = task.node
    driver_internal_info = node.driver_internal_info
    LOG.debug(
        'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
        {'node': node.uuid,
         'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
    # Record the heartbeat time (epoch seconds) and the agent URL so the
    # conductor can reach the agent later.
    driver_internal_info['agent_last_heartbeat'] = int(_time())
    try:
        driver_internal_info['agent_url'] = kwargs['agent_url']
    except KeyError:
        raise exception.MissingParameterValue(_('For heartbeat operation, '
                                                '"agent_url" must be '
                                                'specified.'))
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to get image for deploy.')
            self.continue_deploy(task, **kwargs)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task, **kwargs)
        elif (node.provision_state == states.CLEANING and
              not node.clean_step):
            # Agent booted from prepare_cleaning
            LOG.debug('Node %s just booted to start cleaning.', node.uuid)
            manager.set_node_cleaning_steps(task)
            self._notify_conductor_resume_clean(task)
        elif (node.provision_state == states.CLEANING and
              node.clean_step):
            self.continue_cleaning(task, **kwargs)
    except Exception as e:
        err_info = {'node': node.uuid, 'msg': msg, 'e': e}
        last_error = _('Asynchronous exception for node %(node)s: '
                       '%(msg)s exception: %(e)s') % err_info
        LOG.exception(last_error)
        deploy_utils.set_failed_state(task, last_error)
def reboot_to_instance(self, task):
    """Switch the node from the deploy ramdisk to the deployed instance.

    Verifies the agent-reported deploy result, collects the partition
    UUIDs needed for boot configuration (root, EFI, and on ppc64* the
    PReP boot partition) and reboots the node into the instance.

    :param task: a TaskManager instance containing the node to act on.
    """
    task.process_event('resume')
    node = task.node
    iwdi = task.node.driver_internal_info.get('is_whole_disk_image')
    cpu_arch = task.node.properties.get('cpu_arch')
    error = self.check_deploy_success(node)
    if error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = (_('node %(node)s command status errored: %(error)s') %
               {'node': node.uuid, 'error': error})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return

    # If `boot_option` is set to `netboot`, PXEBoot.prepare_instance()
    # would need root_uuid of the whole disk image to add it into the
    # pxe config to perform chain boot.
    # IPA would have returned us the 'root_uuid_or_disk_id' if image
    # being provisioned is a whole disk image. IPA would also provide us
    # 'efi_system_partition_uuid' if the image being provisioned is a
    # partition image.
    # In case of local boot using partition image, we need both
    # 'root_uuid_or_disk_id' and 'efi_system_partition_uuid' to configure
    # bootloader for local boot.
    # NOTE(mjturek): In the case of local boot using a partition image on
    # ppc64* hardware we need to provide the 'PReP_Boot_partition_uuid' to
    # direct where the bootloader should be installed.
    driver_internal_info = task.node.driver_internal_info
    root_uuid = self._get_uuid_from_result(task, 'root_uuid')
    if root_uuid:
        driver_internal_info['root_uuid_or_disk_id'] = root_uuid
        task.node.driver_internal_info = driver_internal_info
        task.node.save()
    elif iwdi and CONF.agent.manage_agent_boot:
        # IPA version less than 3.1.0 will not return root_uuid for
        # whole disk image. Also IPA version introduced a requirement
        # for hexdump utility that may not be always available. Need to
        # fall back to older behavior for the same.
        LOG.warning("With the deploy ramdisk based on Ironic Python Agent "
                    "version 3.1.0 and beyond, the drivers using "
                    "`direct` deploy interface performs `netboot` or "
                    "`local` boot for whole disk image based on value "
                    "of boot option setting. When you upgrade Ironic "
                    "Python Agent in your deploy ramdisk, ensure that "
                    "boot option is set appropriately for the node %s. "
                    "The boot option can be set using configuration "
                    "`[deploy]/default_boot_option` or as a `boot_option` "
                    "capability in node's `properties['capabilities']`. "
                    "Also please note that this functionality requires "
                    "`hexdump` command in the ramdisk.", node.uuid)

    efi_sys_uuid = None
    if not iwdi:
        if boot_mode_utils.get_boot_mode_for_deploy(node) == 'uefi':
            efi_sys_uuid = (self._get_uuid_from_result(
                task, 'efi_system_partition_uuid'))

    prep_boot_part_uuid = None
    if cpu_arch is not None and cpu_arch.startswith('ppc64'):
        prep_boot_part_uuid = (self._get_uuid_from_result(
            task, 'PReP_Boot_partition_uuid'))

    LOG.info('Image successfully written to node %s', node.uuid)

    if CONF.agent.manage_agent_boot:
        # It is necessary to invoke prepare_instance() of the node's
        # boot interface, so that the any necessary configurations like
        # setting of the boot mode (e.g. UEFI secure boot) which cannot
        # be done on node during deploy stage can be performed.
        LOG.debug('Executing driver specific tasks before booting up the '
                  'instance for node %s', node.uuid)
        self.prepare_instance_to_boot(task, root_uuid, efi_sys_uuid,
                                      prep_boot_part_uuid)
    else:
        manager_utils.node_set_boot_device(task, 'disk', persistent=True)

    LOG.debug('Rebooting node %s to instance', node.uuid)
    self.reboot_and_finish_deploy(task)
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if task.node.provision_state not in self.heartbeat_allowed_states:
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    task.upgrade_lock()

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)

    # Record how to reach the agent and its version on the node.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_has_started(task)):
            # Deploy still in progress; keep the provisioning timer alive.
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                _notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif (node.provision_state == states.RESCUEWAIT):
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node.

    Invoked when the Fuel Agent ramdisk reports in: uploads the
    provisioning data over SFTP, runs the fuel-agent provisioning
    command over SSH and reboots the node to the local disk.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: callback parameters; expected to carry 'status',
        'address' and optionally 'error_message'.
    """
    node = task.node
    task.process_event('resume')
    err_msg = _('Failed to continue deployment with Fuel Agent.')

    agent_status = kwargs.get('status')
    if agent_status != 'ready':
        LOG.error(_LE('Deploy failed for node %(node)s. Fuel Agent is '
                      'not in ready state, error: %(error)s'),
                  {'node': node.uuid,
                   'error': kwargs.get('error_message')})
        deploy_utils.set_failed_state(task, err_msg)
        return

    params = _parse_driver_info(node)
    params['host'] = kwargs.get('address')
    cmd = ('%s --data_driver ironic --config-file '
           '/etc/fuel-agent/fuel-agent.conf' % params.pop('script'))
    if CONF.debug:
        cmd += ' --debug'
    instance_info = node.instance_info

    try:
        deploy_data = _get_deploy_data(task.context,
                                       instance_info['image_source'])

        image_data = {"/": {"uri": instance_info['image_url'],
                            "format": "raw",
                            "container": "raw"}}

        deploy_data['ks_meta']['image_data'] = image_data

        ssh = utils.ssh_connect(params)
        sftp = ssh.open_sftp()
        # Ship the provisioning description to the ramdisk.
        _sftp_upload(sftp, json.dumps(deploy_data), '/tmp/provision.json')

        # swift configdrive store should be disabled
        configdrive = instance_info.get('configdrive')
        if configdrive is not None:
            _sftp_upload(sftp, configdrive, '/tmp/config-drive.img')

        _ssh_execute(ssh, cmd, params)
        LOG.info(_LI('Fuel Agent pass on node %s'), node.uuid)
        manager_utils.node_set_boot_device(task, boot_devices.DISK,
                                           persistent=True)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Deploy failed for node %(node)s. Error: %(error)s') %
               {'node': node.uuid, 'error': e})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
    else:
        task.process_event('done')
        LOG.info(_LI('Deployment to node %s done'), task.node.uuid)
def pass_deploy_info(self, task, **kwargs):
    """Continues the iSCSI deployment from where ramdisk left off.

    This method continues the iSCSI deployment from the conductor node
    and writes the deploy image to the bare metal's disk. After that,
    it does the following depending on boot_option for deploy:

    - If the boot_option requested for this deploy is 'local', then it
      sets the node to boot from disk (ramdisk installs the boot loader
      present within the image to the bare metal's disk).
    - If the boot_option requested is 'netboot' or no boot_option is
      requested, it finds/creates the boot ISO to boot the instance
      image, attaches the boot ISO to the bare metal and then sets
      the node to boot from CDROM.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs containing parameters for iSCSI deployment.
    :raises: InvalidState
    """
    node = task.node
    task.process_event('resume')

    is_whole_disk_image = node.driver_internal_info.get(
        'is_whole_disk_image')
    uuid_dict = iscsi_deploy.continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict.get(
        'root uuid', uuid_dict.get('disk identifier'))

    try:
        # The deploy used virtual media; detach it before configuring
        # the instance boot device.
        _cleanup_vmedia_boot(task)

        if (iscsi_deploy.get_boot_option(node) == "local" or
                is_whole_disk_image):
            manager_utils.node_set_boot_device(task, boot_devices.DISK,
                                               persistent=True)

            # Ask the ramdisk to install bootloader and
            # wait for the call-back through the vendor passthru
            # 'pass_bootloader_install_info', if it's not a whole
            # disk image.
            if not is_whole_disk_image:
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
        else:
            # Netboot: build and attach the boot ISO, boot from CDROM.
            _prepare_boot_iso(task, root_uuid_or_disk_id)
            setup_vmedia_for_boot(
                task, node.driver_internal_info['irmc_boot_iso'])
            manager_utils.node_set_boot_device(task, boot_devices.CDROM,
                                               persistent=True)
    except Exception as e:
        LOG.exception(_LE('Deploy failed for instance %(instance)s. '
                          'Error: %(error)s'),
                      {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        iscsi_deploy.finish_deploy(task, kwargs.get('address'))
def heartbeat(self, task, callback_url, agent_version):
    """Process a heartbeat.

    Records the agent's URL, version and heartbeat time on the node,
    then dispatches on the node's provision state to drive the deploy,
    cleaning or rescue workflow forward.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    :param agent_version: The version of the agent that is heartbeating
    """
    # NOTE(pas-ha) immediately skip the rest if nothing to do
    if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
        LOG.debug('Heartbeat from node %(node)s in unsupported '
                  'provision state %(state)s, not taking any action.',
                  {'node': task.node.uuid,
                   'state': task.node.provision_state})
        return

    # A shared lock was enough to read the state above; we need an
    # exclusive lock to record the heartbeat and mutate the node.
    try:
        task.upgrade_lock()
    except exception.NodeLocked:
        LOG.warning('Node %s is currently locked, skipping heartbeat '
                    'processing (will retry on the next heartbeat)',
                    task.node.uuid)
        return

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    driver_internal_info['agent_version'] = agent_version
    # Record the last heartbeat event time in UTC, so we can make
    # decisions about it later. Can be decoded to datetime object with:
    # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f")
    driver_internal_info['agent_last_heartbeat'] = str(
        timeutils.utcnow().isoformat())
    node.driver_internal_info = driver_internal_info
    node.save()

    if node.provision_state in _HEARTBEAT_RECORD_ONLY:
        # We shouldn't take any additional action. The agent will
        # silently continue to heartbeat to ironic until user initiated
        # state change occurs causing it to match a state below.
        LOG.debug('Heartbeat from %(node)s recorded to identify the '
                  'node as on-line.', {'node': task.node.uuid})
        return

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    # ``msg`` is reassigned before each async action below so that the
    # except handler can report which step was in flight when it failed.
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT
              and not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_has_started(task)):
            # Deploy in progress: just refresh the provisioning
            # timestamp so the deploy doesn't time out.
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
        elif (node.provision_state == states.RESCUEWAIT):
            msg = _('Node failed to perform rescue operation.')
            self._finalize_rescue(task)
    except Exception as e:
        err_info = {'msg': msg, 'e': e}
        last_error = _('Asynchronous exception: %(msg)s '
                       'Exception: %(e)s for node') % err_info
        errmsg = last_error + ' %(node)s'
        LOG.exception(errmsg, {'node': node.uuid})
        # Route the failure to the handler matching the current phase.
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(task, last_error,
                                          collect_logs=bool(self._client))
        elif node.provision_state in (states.RESCUING, states.RESCUEWAIT):
            manager_utils.rescuing_error_handler(task, last_error)
def log_and_raise_deployment_error(task, msg):
    """Record a deployment failure and abort the current flow.

    Logs *msg*, moves the node into the deploy-failed state and then
    raises to stop the caller's deployment sequence.

    :param task: a TaskManager instance containing the failed node.
    :param msg: human-readable description of the failure.
    :raises: InstanceDeployFailure unconditionally.
    """
    failure = exception.InstanceDeployFailure(msg)
    LOG.error(msg)
    deploy_utils.set_failed_state(task, msg)
    raise failure
def pass_deploy_info(self, task, **kwargs):
    """Continues the deployment of baremetal node over iSCSI.

    This method continues the deployment of the baremetal node over
    iSCSI from where the deployment ramdisk has left off.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs for performing iscsi deployment.
    :raises: InvalidState
    """
    node = task.node
    LOG.warning(_LW("The node %s is using the bash deploy ramdisk for "
                    "its deployment. This deploy ramdisk has been "
                    "deprecated. Please use the ironic-python-agent "
                    "(IPA) ramdisk instead."), node.uuid)
    # TODO(rameshg87): Remove the below code once we stop supporting
    # bash ramdisk in Ironic.
    if node.provision_state == states.CLEANWAIT:
        # The ramdisk called back while cleaning, not deploying; hand
        # off to the cleaning path instead.
        return self._initiate_cleaning(task)

    task.process_event('resume')
    LOG.debug('Continuing the deployment on node %s', node.uuid)

    is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
    # Write the image; result carries the root partition UUID or, for a
    # whole disk image, the disk identifier.
    uuid_dict_returned = continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # save the node's root disk UUID so that another conductor could
    # rebuild the PXE config file. Due to a shortcoming in Nova objects,
    # we have to assign to node.driver_internal_info so the node knows it
    # has changed.
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    try:
        task.driver.boot.prepare_instance(task)

        if deploy_utils.get_boot_option(node) == "local":
            if not is_whole_disk_image:
                LOG.debug('Installing the bootloader on node %s',
                          node.uuid)
                # Wait for the ramdisk's bootloader-install call-back;
                # finish_deploy runs there, not here.
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                return
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        # No call-back wait needed and no error: finalize the deploy.
        finish_deploy(task, kwargs.get('address'))
def do_agent_iscsi_deploy(task, agent_client):
    """Method invoked when deployed with the agent ramdisk.

    This method is invoked by drivers for doing iSCSI deploy using agent
    ramdisk.  This method assumes that the agent is booted up on the node
    and is heartbeating.

    :param task: a TaskManager object containing the node.
    :param agent_client: an instance of agent_client.AgentClient which
        will be used during iscsi deploy (for exposing node's target disk
        via iSCSI, for install boot loader, etc).
    :returns: a dictionary containing the following keys:
        For partition image:
            'root uuid': UUID of root partition
            'efi system partition uuid': UUID of the uefi system partition
                (if boot mode is uefi).
            NOTE: If key exists but value is None, it means partition
                doesn't exist.
        For whole disk image:
            'disk identifier': ID of the disk to which image was deployed.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    node = task.node
    iscsi_options = build_deploy_ramdisk_options(node)

    iqn = iscsi_options['iscsi_target_iqn']
    portal_port = iscsi_options['iscsi_portal_port']
    # Ask the agent to expose the node's disk over iSCSI so the
    # conductor can write the image to it.
    result = agent_client.start_iscsi_target(node, iqn, portal_port)
    if result['command_status'] == 'FAILED':
        msg = (_("Failed to start the iSCSI target to deploy the "
                 "node %(node)s. Error: %(error)s") %
               {'node': node.uuid, 'error': result['command_error']})
        deploy_utils.set_failed_state(task, msg)
        raise exception.InstanceDeployFailure(reason=msg)

    # The iSCSI portal is the agent's host, extracted from the URL the
    # agent reported on its last heartbeat.
    address = parse.urlparse(node.driver_internal_info['agent_url'])
    address = address.hostname

    # TODO(lucasagomes): The 'error' and 'key' parameters in the
    # dictionary below are just being passed because it's needed for
    # the continue_deploy() method, we are fooling it
    # for now. The agent driver doesn't use/need those. So we need to
    # refactor this bits here later.
    iscsi_params = {'error': result['command_error'],
                    'iqn': iqn,
                    'key': iscsi_options['deployment_key'],
                    'address': address}

    uuid_dict_returned = continue_deploy(task, **iscsi_params)
    root_uuid_or_disk_id = uuid_dict_returned.get(
        'root uuid', uuid_dict_returned.get('disk identifier'))

    # TODO(lucasagomes): Move this bit saving the root_uuid to
    # continue_deploy()
    driver_internal_info = node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    node.driver_internal_info = driver_internal_info
    node.save()

    return uuid_dict_returned
def pass_deploy_info(self, task, **kwargs):
    """Continues the iSCSI deployment from where ramdisk left off.

    This method continues the iSCSI deployment from the conductor node
    and writes the deploy image to the bare metal's disk. After that,
    it does the following depending on boot_option for deploy:

    - If the boot_option requested for this deploy is 'local', then it
      sets the node to boot from disk (ramdisk installs the boot loader
      present within the image to the bare metal's disk).
    - If the boot_option requested is 'netboot' or no boot_option is
      requested, it finds/creates the boot ISO to boot the instance
      image, attaches the boot ISO to the bare metal and then sets
      the node to boot from CDROM.

    :param task: a TaskManager instance containing the node to act on.
    :param kwargs: kwargs containing parameters for iSCSI deployment.
    :raises: InvalidState
    """
    node = task.node
    LOG.warning(_LW("The node %s is using the bash deploy ramdisk for "
                    "its deployment. This deploy ramdisk has been "
                    "deprecated. Please use the ironic-python-agent "
                    "(IPA) ramdisk instead."), node.uuid)
    task.process_event('resume')

    iwdi = node.driver_internal_info.get('is_whole_disk_image')
    # The deploy ramdisk booted from virtual media; detach it first.
    ilo_common.cleanup_vmedia_boot(task)
    uuid_dict = iscsi_deploy.continue_deploy(task, **kwargs)
    root_uuid_or_disk_id = uuid_dict.get('root uuid',
                                         uuid_dict.get('disk identifier'))

    # Persist the root UUID / disk id so another conductor can
    # reconstruct boot configuration later.
    driver_internal_info = task.node.driver_internal_info
    driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
    task.node.driver_internal_info = driver_internal_info
    task.node.save()

    try:
        # Set boot mode
        ilo_common.update_boot_mode(task)

        # Need to enable secure boot, if being requested
        _update_secure_boot_mode(task, True)

        # For iscsi_ilo driver, we boot from disk every time if the image
        # deployed is a whole disk image.
        if deploy_utils.get_boot_option(node) == "local" or iwdi:
            manager_utils.node_set_boot_device(task, boot_devices.DISK,
                                               persistent=True)

            # Ask the ramdisk to install bootloader and
            # wait for the call-back through the vendor passthru
            # 'pass_bootloader_install_info', if it's not a whole
            # disk image.
            if not iwdi:
                deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
                task.process_event('wait')
                # finish_deploy happens in the bootloader call-back.
                return
        else:
            self._configure_vmedia_boot(task, root_uuid_or_disk_id)
    except Exception as e:
        LOG.error(_LE('Deploy failed for instance %(instance)s. '
                      'Error: %(error)s'),
                  {'instance': node.instance_uuid, 'error': e})
        msg = _('Failed to continue iSCSI deployment.')
        deploy_utils.set_failed_state(task, msg)
    else:
        iscsi_deploy.finish_deploy(task, kwargs.get('address'))
def _fail_deploy(task, msg):
    """Abort a deploy: log, set the error state, drop images, raise.

    :param task: a TaskManager instance with the node that failed.
    :param msg: description of what went wrong.
    :raises: InstanceDeployFailure unconditionally.
    """
    LOG.error(msg)
    deploy_utils.set_failed_state(task, msg)
    # Cached deploy images for this node are no longer useful.
    node_uuid = task.node.uuid
    destroy_images(node_uuid)
    raise exception.InstanceDeployFailure(msg)
def heartbeat(self, task, callback_url):
    """Process a heartbeat.

    Records the agent's URL on the node, then dispatches on the node's
    provision state to drive the deploy or cleaning workflow forward.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    """
    # TODO(dtantsur): upgrade lock only if we actually take action other
    # than updating the last timestamp.
    task.upgrade_lock()

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)

    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url

    # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
    #             being used so remove that entry if it exists.
    #             Hopefully all nodes will have been updated by Pike, so
    #             we can delete this code then.
    driver_internal_info.pop('agent_last_heartbeat', None)

    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    # ``msg`` is reassigned before each async action below so the except
    # handler can report which step was in flight when it failed.
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT
              and not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT
              and self.deploy_has_started(task)):
            # Deploy in progress: refresh the provisioning timestamp so
            # the deploy doesn't time out.
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                _notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
    except Exception as e:
        err_info = {'node': node.uuid, 'msg': msg, 'e': e}
        last_error = _('Asynchronous exception for node %(node)s: '
                       '%(msg)s Exception: %(e)s') % err_info
        LOG.exception(last_error)
        # Route the failure to the handler matching the current phase.
        if node.provision_state in (states.CLEANING, states.CLEANWAIT):
            manager_utils.cleaning_error_handler(task, last_error)
        elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
            deploy_utils.set_failed_state(
                task, last_error, collect_logs=bool(self._client))
def reboot_to_instance(self, task):
    """Finish the deploy: collect partition UUIDs, set up boot, reboot.

    Checks that the agent reported a successful image write, records the
    root/EFI/PReP partition UUIDs needed to configure the bootloader,
    prepares the instance boot (or just sets the boot device when agent
    boot management is disabled) and reboots the node into the instance.

    :param task: a TaskManager instance containing the node to act on.
    """
    task.process_event('resume')
    node = task.node
    iwdi = task.node.driver_internal_info.get('is_whole_disk_image')
    cpu_arch = task.node.properties.get('cpu_arch')
    error = self.check_deploy_success(node)
    if error is not None:
        # TODO(jimrollenhagen) power off if using neutron dhcp to
        # align with pxe driver?
        msg = (_('node %(node)s command status errored: %(error)s') %
               {'node': node.uuid, 'error': error})
        LOG.error(msg)
        deploy_utils.set_failed_state(task, msg)
        return

    # If `boot_option` is set to `netboot`, PXEBoot.prepare_instance()
    # would need root_uuid of the whole disk image to add it into the
    # pxe config to perform chain boot.
    # IPA would have returned us the 'root_uuid_or_disk_id' if image
    # being provisioned is a whole disk image. IPA would also provide us
    # 'efi_system_partition_uuid' if the image being provisioned is a
    # partition image.
    # In case of local boot using partition image, we need both
    # 'root_uuid_or_disk_id' and 'efi_system_partition_uuid' to configure
    # bootloader for local boot.
    # NOTE(mjturek): In the case of local boot using a partition image on
    # ppc64* hardware we need to provide the 'PReP_Boot_partition_uuid' to
    # direct where the bootloader should be installed.
    driver_internal_info = task.node.driver_internal_info
    try:
        partition_uuids = self._client.get_partition_uuids(node).get(
            'command_result') or {}
        root_uuid = partition_uuids.get('root uuid')
    except exception.AgentAPIError:
        # TODO(dtantsur): remove in W
        LOG.warning('Old ironic-python-agent detected, please update '
                    'to Victoria or newer')
        partition_uuids = None
        root_uuid = self._get_uuid_from_result(task, 'root_uuid')

    if root_uuid:
        driver_internal_info['root_uuid_or_disk_id'] = root_uuid
        task.node.driver_internal_info = driver_internal_info
        task.node.save()
    elif not iwdi:
        # FIX: the mapping key must be 'uuids' to match the %(uuids)s
        # placeholder in the format string; the original passed 'uuid',
        # which made this log call fail to format.
        LOG.error('No root UUID returned from the ramdisk for node '
                  '%(node)s, the deploy will likely fail. Partition '
                  'UUIDs are %(uuids)s',
                  {'node': node.uuid, 'uuids': partition_uuids})

    efi_sys_uuid = None
    if not iwdi:
        if boot_mode_utils.get_boot_mode(node) == 'uefi':
            # TODO(dtantsur): remove in W
            if partition_uuids is None:
                efi_sys_uuid = (self._get_uuid_from_result(
                    task, 'efi_system_partition_uuid'))
            else:
                efi_sys_uuid = partition_uuids.get(
                    'efi system partition uuid')

    prep_boot_part_uuid = None
    if cpu_arch is not None and cpu_arch.startswith('ppc64'):
        # TODO(dtantsur): remove in W
        if partition_uuids is None:
            prep_boot_part_uuid = (self._get_uuid_from_result(
                task, 'PReP_Boot_partition_uuid'))
        else:
            prep_boot_part_uuid = partition_uuids.get(
                'PReP Boot partition uuid')

    LOG.info('Image successfully written to node %s', node.uuid)

    if CONF.agent.manage_agent_boot:
        # It is necessary to invoke prepare_instance() of the node's
        # boot interface, so that the any necessary configurations like
        # setting of the boot mode (e.g. UEFI secure boot) which cannot
        # be done on node during deploy stage can be performed.
        LOG.debug('Executing driver specific tasks before booting up the '
                  'instance for node %s', node.uuid)
        self.prepare_instance_to_boot(task, root_uuid,
                                      efi_sys_uuid, prep_boot_part_uuid)
    else:
        manager_utils.node_set_boot_device(task, 'disk', persistent=True)

    # Remove symbolic link when deploy is done.
    if CONF.agent.image_download_source == 'http':
        deploy_utils.remove_http_instance_symlink(task.node.uuid)
    LOG.debug('Rebooting node %s to instance', node.uuid)
    self.reboot_and_finish_deploy(task)