def _power_off(driver_info):
    """Turn the power to this node OFF."""

    # use mutable objects so the looped method can change them
    state = [None]
    retries = [0]

    def _wait_for_power_off(state, retries):
        """Called at an interval until the node's power is off."""
        state[0] = _power_status(driver_info)
        if state[0] == states.POWER_OFF:
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.ipmi_power_retry:
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()
        try:
            retries[0] += 1
            _exec_ipmitool(driver_info, "power off")
        except Exception:
            # Log failures but keep trying
            LOG.warning(_("IPMI power off failed for node %s.")
                        % driver_info['uuid'])

    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_off,
                                                 state=state,
                                                 retries=retries)
    timer.start(interval=1).wait()
    return state[0]

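# All of the snippets in this section share one shape: a nested callback
# polls until it raises LoopingCallDone, and mutable containers (lists or
# dicts) smuggle results out of the closure, since Python 2 closures cannot
# rebind outer locals. Below is a minimal, self-contained sketch of that
# pattern. It assumes the oslo.service packaging of loopingcall;
# check_power_status and MAX_RETRIES are illustrative stand-ins, not part
# of the original code.
from oslo_service import loopingcall

MAX_RETRIES = 5  # illustrative cap, not a real config option


def wait_for_state(check_power_status, target_state):
    # Mutable containers let the nested callback report back to this scope.
    state = [None]
    retries = [0]

    def _poll(state, retries):
        state[0] = check_power_status()
        if state[0] == target_state:
            raise loopingcall.LoopingCallDone()
        if retries[0] > MAX_RETRIES:
            state[0] = 'error'
            raise loopingcall.LoopingCallDone()
        retries[0] += 1

    timer = loopingcall.FixedIntervalLoopingCall(_poll, state, retries)
    timer.start(interval=1).wait()
    return state[0]
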
def _wait_for_state_change(node, target_state):
    """Wait for the power state change to get reflected."""
    state = [None]
    retries = [0]

    def _wait(state):
        state[0] = _get_power_state(node)

        # NOTE(rameshg87): For reboot operations, initially the state will
        # be the same as the final state, so defer the check by one retry.
        if retries[0] != 0 and state[0] == target_state:
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.ilo.power_retry:
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()

        retries[0] += 1

    # Start a timer and wait for the operation to complete.
    timer = loopingcall.FixedIntervalLoopingCall(_wait, state)
    timer.start(interval=CONF.ilo.power_wait).wait()
    return state[0]

def _snmp_wait_for_state(self, goal_state):
    """Wait for the power state of the PDU outlet to change.

    :param goal_state: The power state to wait for, one of
        :class:`ironic.common.states`.
    :raises: SNMPFailure if an SNMP request fails.
    :returns: power state. One of :class:`ironic.common.states`.
    """

    def _poll_for_state(mutable):
        """Called at an interval until the node's power is consistent.

        :param mutable: dict object containing "state" and "next_time"
        :raises: SNMPFailure if an SNMP request fails.
        """
        mutable["state"] = self._snmp_power_state()
        if mutable["state"] == goal_state:
            raise loopingcall.LoopingCallDone()

        mutable["next_time"] += self.retry_interval
        if mutable["next_time"] >= CONF.snmp.power_timeout:
            mutable["state"] = states.ERROR
            raise loopingcall.LoopingCallDone()

    # Pass state to the looped function call in a mutable form.
    state = {"state": None, "next_time": 0}
    timer = loopingcall.FixedIntervalLoopingCall(_poll_for_state, state)
    timer.start(interval=self.retry_interval).wait()
    LOG.debug("power state '%s'", state["state"])
    return state["state"]

def add_timer(self, interval, callback, initial_delay=None,
              *args, **kwargs):
    pulse = loopingcall.FixedIntervalLoopingCall(callback, *args, **kwargs)
    pulse.start(interval=interval, initial_delay=initial_delay)
    self.timers.append(pulse)

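# A hypothetical call site for add_timer, assuming a report_state method on
# the same service object: the callback fires every 10 seconds after an
# initial 5-second delay. Keeping the pulse in self.timers lets the service
# stop it later via FixedIntervalLoopingCall.stop().
self.add_timer(10, self.report_state, initial_delay=5)

# e.g. in the service's stop() method:
for timer in self.timers:
    timer.stop()
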
def _ssh_execute(ssh, cmd, ssh_params):
    # NOTE(yuriyz): this ugly code is a work-around for paramiko-with-
    # eventlet issues
    LOG.debug('Running cmd (SSH): %s', cmd)
    stdin_stream, stdout_stream, stderr_stream = ssh.exec_command(cmd)
    paramiko_channel = stdout_stream.channel
    paramiko_channel.setblocking(0)
    stdout_io = six.moves.StringIO()
    stderr_io = six.moves.StringIO()

    def _wait_execution(mutable, channel):
        try:
            stdout_data = channel.recv(1048576)
        except Exception:
            LOG.debug('No data from SSH stdout.')
        else:
            LOG.debug('Got %d bytes from SSH stdout.', len(stdout_data))
            stdout_io.write(stdout_data)
        try:
            stderr_data = channel.recv_stderr(1048576)
        except Exception:
            LOG.debug('No data from SSH stderr.')
        else:
            LOG.debug('Got %d bytes from SSH stderr.', len(stderr_data))
            stderr_io.write(stderr_data)

        if channel.exit_status_ready():
            raise loopingcall.LoopingCallDone()

        try:
            ssh = utils.ssh_connect(ssh_params)
        except exception.SSHConnectFailed:
            mutable['error'] = True
            raise loopingcall.LoopingCallDone()
        else:
            ssh.close()

    error = {'error': False}
    timer = loopingcall.FixedIntervalLoopingCall(_wait_execution, error,
                                                 paramiko_channel)
    timer.start(interval=60).wait()
    stdout = stdout_io.getvalue()
    stderr = stderr_io.getvalue()
    LOG.debug('SSH stdout is: "%s"', stdout)
    LOG.debug('SSH stderr is: "%s"', stderr)

    if error['error']:
        message = _('connection to the node lost')
        raise exception.SSHCommandFailed(cmd=message)

    exit_status = paramiko_channel.recv_exit_status()
    if exit_status != 0:
        message = _('wrong exit status %d') % exit_status
        raise exception.SSHCommandFailed(cmd=message)

    return stdout, stderr

def commit(self):
    """Write to the disk."""
    LOG.debug("Committing partitions to disk.")
    cmd_args = ['mklabel', self._disk_label]
    # NOTE(lucasagomes): Lead in with 1MiB to allow room for the
    #                    partition table itself.
    start = 1
    for num, part in self.get_partitions():
        end = start + part['size']
        cmd_args.extend(['mkpart', part['type'], part['fs_type'],
                         str(start), str(end)])
        if part['bootable']:
            cmd_args.extend(['set', str(num), 'boot', 'on'])
        start = end

    self._exec(*cmd_args)

    retries = [0]
    pids = ['']
    fuser_err = ['']
    interval = CONF.disk_partitioner.check_device_interval
    max_retries = CONF.disk_partitioner.check_device_max_retries

    timer = loopingcall.FixedIntervalLoopingCall(
        self._wait_for_disk_to_become_available,
        retries, max_retries, pids, fuser_err)
    timer.start(interval=interval).wait()

    if retries[0] > max_retries:
        if pids[0]:
            raise exception.InstanceDeployFailure(
                _('Disk partitioning failed on device %(device)s. '
                  'Processes with the following PIDs are holding it: '
                  '%(pids)s. Timed out waiting for completion.')
                % {'device': self._device, 'pids': pids[0]})
        else:
            raise exception.InstanceDeployFailure(
                _('Disk partitioning failed on device %(device)s. Fuser '
                  'exited with "%(fuser_err)s". Timed out waiting for '
                  'completion.')
                % {'device': self._device, 'fuser_err': fuser_err[0]})

def _wait_for_node_deploy(self, task):
    """Wait for xCAT node deployment to complete."""
    locals = {'errstr': ''}
    driver_info = _parse_deploy_info(task.node)
    node_mac_addresses = driver_utils.get_node_mac_addresses(task)
    i_info = task.node.instance_info

    def _wait_for_deploy():
        out, err = xcat_util.exec_xcatcmd(driver_info, 'nodels',
                                          'nodelist.status')
        if err:
            locals['errstr'] = _("Error returned when querying node status"
                                 " for node %s: %s") % (
                                     driver_info['xcat_node'], err)
            LOG.warning(locals['errstr'])
            raise loopingcall.LoopingCallDone()

        if out:
            node, status = out.split(": ")
            status = status.strip()
            if status == "booted":
                LOG.info(_("Deployment for node %s completed.")
                         % driver_info['xcat_node'])
                raise loopingcall.LoopingCallDone()

        if (CONF.xcat.deploy_timeout and
                timeutils.utcnow() > expiration):
            locals['errstr'] = _("Timeout while waiting for"
                                 " deployment of node %s.") % (
                                     driver_info['xcat_node'])
            LOG.warning(locals['errstr'])
            raise loopingcall.LoopingCallDone()

    expiration = timeutils.utcnow() + datetime.timedelta(
        seconds=CONF.xcat.deploy_timeout)
    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_deploy)
    # default check every 10 seconds
    timer.start(interval=CONF.xcat.deploy_checking_interval).wait()

    if locals['errstr']:
        raise xcat_exception.xCATDeploymentFailure(locals['errstr'])

    # deploy is done; delete the DHCP rule for xCAT
    self._ssh_delete_dhcp_rule(CONF.xcat.network_node_ip,
                               CONF.xcat.ssh_port,
                               CONF.xcat.ssh_user,
                               CONF.xcat.ssh_password,
                               i_info['network_id'],
                               node_mac_addresses[0])

def activate_node(self, context, node, instance):
    """Wait for PXE deployment to complete."""
    locals = {'error': '', 'started': False}

    def _wait_for_deploy():
        """Called at an interval until the deployment completes."""
        try:
            row = db.bm_node_get(context, node['id'])
            if instance['uuid'] != row.get('instance_uuid'):
                locals['error'] = _("Node associated with another instance"
                                    " while waiting for deploy of %s")
                raise loopingcall.LoopingCallDone()

            status = row.get('task_state')
            if (status == states.DEPLOYING and
                    locals['started'] is False):
                LOG.info(_("PXE deploy started for instance %s")
                         % instance['uuid'])
                locals['started'] = True
            elif status in (states.DEPLOYDONE, states.ACTIVE):
                LOG.info(_("PXE deploy completed for instance %s")
                         % instance['uuid'])
                raise loopingcall.LoopingCallDone()
            elif status == states.DEPLOYFAIL:
                locals['error'] = _("PXE deploy failed for instance %s")
        except exception.NodeNotFound:
            locals['error'] = _("Baremetal node deleted while waiting "
                                "for deployment of instance %s")

        if (CONF.pxe_deploy_timeout and
                timeutils.utcnow() > expiration):
            locals['error'] = _("Timeout reached while waiting for "
                                "PXE deploy of instance %s")
        if locals['error']:
            raise loopingcall.LoopingCallDone()

    expiration = timeutils.utcnow() + datetime.timedelta(
        seconds=CONF.pxe_deploy_timeout)
    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_deploy)
    timer.start(interval=1).wait()

    if locals['error']:
        raise exception.InstanceDeployFailure(
            locals['error'] % instance['uuid'])

def _wait_for_state_change(target_state, ucs_power_handle):
    """Wait and check for the power state change."""
    state = [None]
    retries = [0]

    def _wait(state, retries):
        state[0] = ucs_power_handle.get_power_state()
        if (retries[0] != 0 and
                UCS_TO_IRONIC_POWER_STATE.get(state[0]) == target_state):
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.cisco_ucs.max_retry:
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()
        retries[0] += 1

    timer = loopingcall.FixedIntervalLoopingCall(_wait, state, retries)
    timer.start(interval=CONF.cisco_ucs.action_interval).wait()
    return UCS_TO_IRONIC_POWER_STATE.get(state[0], states.ERROR)

def _get_image(ctx, path, uuid, master_path=None, image_service=None):
    # TODO(ghe): Revise this logic and document the process. Bug #1199665
    # When master_path is defined, we save the images in that dir, using the
    # image uuid as the file name. Deployments that use these images create
    # a hard link to keep track of them. When the link count of a master
    # image drops to 1, it can be deleted.
    # TODO(ghe): have hard links and count links the same behaviour in all fs
    # TODO(ghe): timeout and retry for downloads

    def _wait_for_download():
        if not os.path.exists(lock_file):
            raise loopingcall.LoopingCallDone()

    # If the download of the image needed is in progress (lock file present)
    # we wait until the lock disappears and create the link.
    if master_path is None:
        # NOTE(ghe): We don't share images between instances/hosts
        images.fetch_to_raw(ctx, uuid, path, image_service)
    else:
        master_uuid = os.path.join(master_path,
                                   service_utils.parse_image_ref(uuid)[0])
        lock_file = os.path.join(master_path, master_uuid + '.lock')
        _link_master_image(master_uuid, path)
        if not os.path.exists(path):
            fileutils.ensure_tree(master_path)
            if not _download_in_progress(lock_file):
                with fileutils.remove_path_on_error(lock_file):
                    # TODO(ghe): logging when image cannot be created
                    fd, tmp_path = tempfile.mkstemp(dir=master_path)
                    os.close(fd)
                    images.fetch_to_raw(ctx, uuid, tmp_path, image_service)
                    _create_master_image(tmp_path, master_uuid, path)
                    _remove_download_in_progress_lock(lock_file)
            else:
                # TODO(ghe): expiration time
                timer = loopingcall.FixedIntervalLoopingCall(
                    _wait_for_download)
                timer.start(interval=1).wait()
                _link_master_image(master_uuid, path)

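# The lock-file wait above reduces to a small reusable helper. A sketch
# under the same assumption (oslo.service available); like the original,
# it has no expiration, per the TODO above.
import os

from oslo_service import loopingcall


def wait_for_lock_release(lock_file, interval=1):
    """Block until lock_file disappears from the filesystem."""

    def _check():
        if not os.path.exists(lock_file):
            raise loopingcall.LoopingCallDone()

    timer = loopingcall.FixedIntervalLoopingCall(_check)
    timer.start(interval=interval).wait()
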
def _power_off(self):
    """Turn the power to this node OFF."""

    def _wait_for_power_off():
        """Called at an interval until the node's power is off."""
        self._update_state()
        if self.state == states.POWER_OFF:
            raise loopingcall.LoopingCallDone()

        if self.retries > CONF.ipmi_power_retry:
            self.state = states.ERROR
            raise loopingcall.LoopingCallDone()

        try:
            self.retries += 1
            self._exec_ipmitool("power off")
        except Exception:
            LOG.exception(_("IPMI power off failed"))

    self.retries = 0
    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_off)
    timer.start(interval=1).wait()

def _reboot(node, timeout=None):
    """Reboot this node.

    :param node: Ironic node, one of :class:`ironic.db.models.Node`.
    :param timeout: Time in seconds to wait till reboot is complete.
    :raises: InvalidParameterValue if a seamicro parameter is invalid.
    :raises: MissingParameterValue if required seamicro parameters are
        missing.
    :returns: Power state of the given node.
    """
    if timeout is None:
        timeout = CONF.seamicro.action_timeout
    state = [None]
    retries = [0]
    seamicro_info = _parse_driver_info(node)
    server = _get_server(seamicro_info)

    def _wait_for_reboot(state, retries):
        """Called at an interval until the node is rebooted successfully."""
        state[0] = _get_power_status(node)
        if state[0] == states.POWER_ON:
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.seamicro.max_retry:
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()

        try:
            retries[0] += 1
            # the reset is issued (and re-issued) from inside the poll loop
            server.reset()
        except seamicro_client_exception.ClientException:
            LOG.warning(_LW("Reboot failed for node %s."), node.uuid)

    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_reboot,
                                                 state, retries)
    timer.start(interval=timeout).wait()
    return state[0]

def _power_off(driver_info):
    """Turn the power OFF for this node.

    :param driver_info: the ipmitool parameters for accessing a node.
    :returns: one of ironic.common.states POWER_OFF or ERROR.
    :raises: IPMIFailure on an error from ipmitool (from _power_status
        call).
    """
    # use mutable objects so the looped method can change them
    state = [None]
    retries = [0]

    def _wait_for_power_off(state, retries):
        """Called at an interval until the node's power is off."""
        state[0] = _power_status(driver_info)
        if state[0] == states.POWER_OFF:
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.ipmi.retry_timeout:
            LOG.error(_('IPMI power off timed out after %(tries)s retries.'),
                      {'tries': retries[0]})
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()
        try:
            # only issue "power off" once
            if retries[0] == 0:
                _exec_ipmitool(driver_info, "power off")
            retries[0] += 1
        except Exception:
            # Log failures but keep trying
            LOG.warning(_("IPMI power off failed for node %s.")
                        % driver_info['uuid'])

    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_off,
                                                 state=state,
                                                 retries=retries)
    timer.start(interval=1.0).wait()
    return state[0]

def _power_on(node, timeout=None):
    """Power ON this node.

    :param node: An Ironic node object.
    :param timeout: Time in seconds to wait till power on is complete.
    :raises: InvalidParameterValue if a seamicro parameter is invalid.
    :raises: MissingParameterValue if required seamicro parameters are
        missing.
    :returns: Power state of the given node.
    """
    if timeout is None:
        timeout = CONF.seamicro.action_timeout
    state = [None]
    retries = [0]
    seamicro_info = _parse_driver_info(node)
    server = _get_server(seamicro_info)

    def _wait_for_power_on(state, retries):
        """Called at an interval until the node is powered on."""
        state[0] = _get_power_status(node)
        if state[0] == states.POWER_ON:
            raise loopingcall.LoopingCallDone()

        if retries[0] > CONF.seamicro.max_retry:
            state[0] = states.ERROR
            raise loopingcall.LoopingCallDone()

        try:
            retries[0] += 1
            server.power_on()
        except seamicro_client_exception.ClientException:
            LOG.warning(_LW("Power-on failed for node %s."), node.uuid)

    timer = loopingcall.FixedIntervalLoopingCall(_wait_for_power_on,
                                                 state, retries)
    timer.start(interval=timeout).wait()
    return state[0]

def start_shellinabox_console(node_uuid, port, console_cmd):
    """Open the serial console for a node.

    :param node_uuid: the uuid for the node.
    :param port: the terminal port for the node.
    :param console_cmd: the shell command that gets the console.
    :raises: ConsoleError if the directory for the PID file cannot be
        created.
    :raises: ConsoleSubprocessFailed when invoking the subprocess failed.
    """
    # make sure that the old console for this node is stopped
    # and the files are cleared
    try:
        _stop_console(node_uuid)
    except exception.NoConsolePid:
        pass
    except processutils.ProcessExecutionError as exc:
        LOG.warning(_LW("Failed to kill the old console process "
                        "before starting a new shellinabox console "
                        "for node %(node)s. Reason: %(err)s"),
                    {'node': node_uuid, 'err': exc})

    _ensure_console_pid_dir_exists()
    pid_file = _get_console_pid_file(node_uuid)

    # put together the command and arguments for invoking the console
    args = []
    args.append(CONF.console.terminal)
    if CONF.console.terminal_cert_dir:
        args.append("-c")
        args.append(CONF.console.terminal_cert_dir)
    else:
        args.append("-t")
    args.append("-p")
    args.append(str(port))
    args.append("--background=%s" % pid_file)
    args.append("-s")
    args.append(console_cmd)

    # run the command as a subprocess
    try:
        LOG.debug('Running subprocess: %s', ' '.join(args))
        # use pipe here to catch the error in case shellinaboxd
        # failed to start.
        obj = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    except (OSError, ValueError) as e:
        error = _("%(exec_error)s\n"
                  "Command: %(command)s") % {'exec_error': str(e),
                                             'command': ' '.join(args)}
        LOG.warning(error)
        raise exception.ConsoleSubprocessFailed(error=error)

    def _wait(node_uuid, popen_obj):
        locals['returncode'] = popen_obj.poll()

        # Check if the console pid file was created. If so, shellinaboxd
        # was invoked successfully as a daemon; otherwise check the error.
        if locals['returncode'] is not None:
            if locals['returncode'] == 0 and os.path.exists(pid_file):
                raise loopingcall.LoopingCallDone()
            else:
                (stdout, stderr) = popen_obj.communicate()
                locals['errstr'] = _(
                    "Command: %(command)s.\n"
                    "Exit code: %(return_code)s.\n"
                    "Stdout: %(stdout)r\n"
                    "Stderr: %(stderr)r") % {
                        'command': ' '.join(args),
                        'return_code': locals['returncode'],
                        'stdout': stdout,
                        'stderr': stderr}
                LOG.warning(locals['errstr'])
                raise loopingcall.LoopingCallDone()

        if time.time() > expiration:
            locals['errstr'] = _("Timeout while waiting for console"
                                 " subprocess to start for node %s.") % (
                                     node_uuid)
            LOG.warning(locals['errstr'])
            raise loopingcall.LoopingCallDone()

    locals = {'returncode': None, 'errstr': ''}
    expiration = time.time() + CONF.console.subprocess_timeout
    timer = loopingcall.FixedIntervalLoopingCall(_wait, node_uuid, obj)
    timer.start(interval=CONF.console.subprocess_checking_interval).wait()

    if locals['errstr']:
        raise exception.ConsoleSubprocessFailed(error=locals['errstr'])

def _set_and_wait(task, target_state):
    """Helper function for FixedIntervalLoopingCall.

    This method changes the power state and polls AMT until the desired
    power state is reached.

    :param task: a TaskManager instance containing the target node.
    :param target_state: desired power state.
    :returns: one of ironic.common.states.
    :raises: PowerStateFailure if the node cannot be set to target_state.
    :raises: AMTFailure.
    :raises: AMTConnectFailure.
    :raises: InvalidParameterValue.
    """
    node = task.node
    driver = task.driver
    if target_state not in (states.POWER_ON, states.POWER_OFF):
        raise exception.InvalidParameterValue(
            _('Unsupported target_state: %s') % target_state)
    elif target_state == states.POWER_ON:
        boot_device = node.driver_internal_info.get('amt_boot_device')
        if boot_device and boot_device != amt_common.DEFAULT_BOOT_DEVICE:
            driver.management.ensure_next_boot_device(node, boot_device)

    def _wait(status):
        status['power'] = _power_status(node)
        if status['power'] == target_state:
            raise loopingcall.LoopingCallDone()

        if status['iter'] >= CONF.amt.max_attempts:
            status['power'] = states.ERROR
            LOG.warning(_LW("AMT failed to set power state %(state)s after "
                            "%(tries)s retries on node %(node_id)s."),
                        {'state': target_state, 'tries': status['iter'],
                         'node_id': node.uuid})
            raise loopingcall.LoopingCallDone()

        try:
            _set_power_state(node, target_state)
        except Exception:
            # Log failures but keep trying
            LOG.warning(_LW("AMT set power state %(state)s for node "
                            "%(node)s failed - attempt %(attempt)s of "
                            "%(max_attempt)s."),
                        {'state': target_state, 'node': node.uuid,
                         'attempt': status['iter'] + 1,
                         'max_attempt': CONF.amt.max_attempts})
        status['iter'] += 1

    status = {'power': None, 'iter': 0}
    timer = loopingcall.FixedIntervalLoopingCall(_wait, status)
    timer.start(interval=CONF.amt.action_wait).wait()

    if status['power'] != target_state:
        raise exception.PowerStateFailure(pstate=target_state)

    return status['power']

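# Most snippets above shuttle the final state out through a mutable
# container, but LoopingCallDone also accepts a retvalue, which
# start().wait() returns directly. A sketch of that alternative, assuming
# oslo.service; probe is a hypothetical zero-argument status callable.
from oslo_service import loopingcall


def wait_for_target(probe, target_state, max_retries=10):
    retries = [0]

    def _poll():
        current = probe()
        if current == target_state:
            raise loopingcall.LoopingCallDone(retvalue=current)
        if retries[0] >= max_retries:
            raise loopingcall.LoopingCallDone(retvalue='error')
        retries[0] += 1

    timer = loopingcall.FixedIntervalLoopingCall(_poll)
    # wait() returns the retvalue passed to LoopingCallDone.
    return timer.start(interval=1.0).wait()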