def KillProcess(pid, signal_=signal.SIGTERM, timeout=30, waitpid=False):
  """Terminate the process identified by its pid.

  The requested signal is sent first. When a positive timeout is given and
  the process is still alive after waiting, a SIGKILL is sent as well.

  @type pid: int
  @param pid: The PID to terminate; must be strictly positive
  @type signal_: int
  @param signal_: The signal to send first, by default SIGTERM
  @type timeout: int
  @param timeout: The timeout after which, if the process is still alive,
      a SIGKILL will be sent. If not positive, no such checking will be done
  @type waitpid: boolean
  @param waitpid: If true, we should waitpid on this process after
      sending signals, since it's our own child and otherwise it
      would remain as zombie
  @raise errors.ProgrammerError: if the given pid is zero or negative

  """
  if pid <= 0:
    # kill with pid=0 == suicide
    raise errors.ProgrammerError("Invalid pid given '%s'" % pid)

  def _SignalAndReap(target, signum, reap):
    """Sends one signal and optionally collects the child's exit status"""
    delivered = utils_wrapper.IgnoreProcessNotFound(os.kill, target, signum)
    if not (delivered and reap):
      return
    try:
      # Non-blocking: only collects the status if it is already available
      os.waitpid(target, os.WNOHANG)
    except OSError:
      pass

  def _PollUntilGone():
    """Retry callback: returns once the process is gone or was reaped"""
    if IsProcessAlive(pid):
      try:
        reaped_pid = os.waitpid(pid, os.WNOHANG)[0]
      except OSError:
        raise utils_retry.RetryAgain()

      if reaped_pid <= 0:
        # Nothing was reaped, check again later
        raise utils_retry.RetryAgain()

  # Nothing to do for a process that is already gone
  if not IsProcessAlive(pid):
    return

  _SignalAndReap(pid, signal_, waitpid)

  if timeout <= 0:
    return

  try:
    # Wait up to $timeout seconds
    utils_retry.Retry(_PollUntilGone, (0.01, 1.5, 0.1), timeout)
  except utils_retry.RetryTimeout:
    # Still there after the timeout; escalation is handled below
    pass

  if IsProcessAlive(pid):
    # Kill process if it's still alive
    _SignalAndReap(pid, signal.SIGKILL, waitpid)
def IsProcessAlive(pid):
  """Tell whether a process with the given pid exists on the system.

  The check is done by calling os.stat on the path returned by
  L{_GetProcStatusPath} for this pid.

  @note: zombie status is not handled, so zombie processes
      will be returned as alive
  @type pid: int
  @param pid: the process ID to check; must be an integer (asserted)
  @rtype: boolean
  @return: True if the process exists; False if it does not or if the pid
      is zero or negative

  """
  def _StatSucceeds(path):
    """Returns whether os.stat works on path, asking for a retry on EINVAL"""
    try:
      os.stat(path)
    except EnvironmentError as err:
      if err.errno in (errno.ENOENT, errno.ENOTDIR):
        return False
      if err.errno == errno.EINVAL:
        raise utils_retry.RetryAgain(err)
      raise
    return True

  assert isinstance(pid, int), "pid must be an integer"

  if pid <= 0:
    return False

  status_path = _GetProcStatusPath(pid)

  # /proc in a multiprocessor environment can have strange behaviors.
  # Retry the os.stat a few times until we get a good result.
  try:
    return utils_retry.Retry(_StatSucceeds, (0.01, 1.5, 0.1), 0.5,
                             args=[status_path])
  except utils_retry.RetryTimeout as err:
    # Hand over to the timeout object to surface the last error
    err.RaiseInner()
def _flock(self, flag, blocking, timeout, errmsg):
  """Wrapper for fcntl.flock.

  Without a timeout the lock operation is attempted exactly once; with a
  timeout it is retried in non-blocking mode until the timeout expires.

  @type flag: int
  @param flag: operation flag; must not contain LOCK_NB
  @type blocking: bool
  @param blocking: whether the operation should be done in blocking mode.
  @type timeout: None or float
  @param timeout: for how long the operation should be retried (implies
                  non-blocking mode).
  @type errmsg: string
  @param errmsg: error message in case operation fails.
  @raise errors.LockError: if the retries run into the timeout

  """
  assert self.fd, "Lock was closed"
  assert timeout is None or timeout >= 0, \
    "If specified, timeout must be positive"
  assert not (flag & fcntl.LOCK_NB), "LOCK_NB must not be set"

  # When a timeout is used, LOCK_NB must always be set
  if timeout is not None or not blocking:
    flag |= fcntl.LOCK_NB

  if timeout is not None:
    lock_args = (self.fd, flag, timeout)
    try:
      retry.Retry(self._Lock, (0.1, 1.2, 1.0), timeout, args=lock_args)
    except retry.RetryTimeout:
      raise errors.LockError(errmsg)
  else:
    # Single attempt, blocking or not depending on the flag computed above
    self._Lock(self.fd, flag, timeout)
def _WaitForProcess(child, timeout):
  """Waits for the child to terminate or until we reach timeout.

  @param child: passed unchanged to L{_CheckIfAlive} on every attempt
  @param timeout: maximum time to wait; negative values are treated as zero

  """
  wait_limit = max(0, timeout)

  try:
    utils_retry.Retry(_CheckIfAlive, (1.0, 1.2, 5.0), wait_limit,
                      args=[child])
  except utils_retry.RetryTimeout:
    # Reaching the timeout is not reported to the caller
    pass
def _AssertInstanceMove(inst, move_type):
  """Polls "gnt-job list" until a matching move job reports success.

  Jobs are filtered by a summary containing C{move_type(instance name)}.

  @param inst: instance object; only its C{name} attribute is used
  @param move_type: string placed in front of the instance name in the
      summary filter

  """
  def _PollJobList():
    summary_filter = '"%s(%s)" in summary' % (move_type, inst.name)
    cmd = [
      "gnt-job", "list",
      "--output=status",
      "--no-headers",
      "--filter", summary_filter,
      ]
    if 'success' not in stdout_of(cmd):
      raise retry.RetryAgain()

  retry.Retry(_PollJobList, 5.0, 500.0)
def _AssertNodeDrained(node):
  """Polls "gnt-node list" until the node shows up among the drained nodes.

  @param node: node object; only its C{primary} attribute is used

  """
  def _PollDrainedNodes():
    cmd = [
      "gnt-node", "list",
      "--output=name",
      "--no-headers",
      "--filter", "drained",
      ]
    drained_names = stdout_of(cmd)
    if node.primary not in drained_names:
      raise retry.RetryAgain()

  retry.Retry(_PollDrainedNodes, 5.0, 500.0)
def _AssertRepairCommand():
  """Polls "gnt-job list" until a REPAIR_COMMAND job reports success.

  """
  def _PollRepairJobs():
    cmd = [
      "gnt-job", "list",
      "--output=status",
      "--no-headers",
      "--filter", '"REPAIR_COMMAND" in summary',
      ]
    statuses = stdout_of(cmd)
    if 'success' not in statuses:
      raise retry.RetryAgain()

  retry.Retry(_PollRepairJobs, 5.0, 500.0)
def _AssertInstanceRunning(inst):
  """Polls "gnt-instance list" until the instance status contains "running".

  @param inst: instance object; only its C{name} attribute is used

  """
  def _PollInstanceStatus():
    name_filter = "name == \"%s\"" % inst.name
    cmd = [
      "gnt-instance", "list",
      "--output=status",
      "--no-headers",
      "--filter", name_filter,
      ]
    if "running" not in stdout_of(cmd):
      raise retry.RetryAgain()

  retry.Retry(_PollInstanceStatus, 5.0, 500.0)
def _AssertRepairTagAddition(node):
  """Polls the node's maintenance tags until exactly one is present.

  @param node: passed unchanged to L{_GetMaintTags}
  @return: the single tag found
  @raise qa_error.Error: if more than one tag is found

  """
  def _PollTags():
    tags = _GetMaintTags(node)
    tag_count = len(tags)

    if tag_count == 0:
      # Nothing there yet, ask for another attempt
      raise retry.RetryAgain()
    if tag_count > 1:
      raise qa_error.Error("Only one tag should be added")

    return tags[0]

  return retry.Retry(_PollTags, 5.0, 500.0)
def TestJobCancellation():
  """gnt-job cancel

  Submits two delay jobs, cancels the second one and checks that its status
  ends up as canceled.

  @raise qa_error.Error: if the job ID cannot be parsed from the command
      output or the job does not reach the canceled status

  """
  # The delay used for the first command should be large enough for the next
  # command and the cancellation command to complete before the first job is
  # done. The second delay should be small enough that not too much time is
  # spent waiting in the case of a failed cancel and a running command.
  FIRST_COMMAND_DELAY = 10.0
  SECOND_COMMAND_DELAY = 3.0

  # The multiplier to use is arbitrary, setting it higher could prevent
  # flakiness
  WAIT_MULTIPLIER = 4.0

  AssertCommand(["gnt-debug", "delay", "--submit", str(FIRST_COMMAND_DELAY)])

  master = qa_config.GetMasterNode()

  # Forcing tty usage does not work on buildbot, so force all output of this
  # command to be redirected to stdout
  submit_cmd = "gnt-debug delay --submit %s 2>&1" % SECOND_COMMAND_DELAY
  job_id_output = GetCommandOutput(master.primary, submit_cmd)

  possible_job_ids = re.findall("JobID: ([0-9]+)", job_id_output)
  if len(possible_job_ids) != 1:
    raise qa_error.Error(
      "Cannot parse gnt-debug delay output to find job id")

  job_id = possible_job_ids[0]
  AssertCommand(["gnt-job", "cancel", job_id])

  # Now wait until the second job finishes, and expect the watch to fail due to
  # job cancellation
  AssertCommand(["gnt-job", "watch", job_id], fail=True)

  # Then check for job cancellation
  job_status = _GetJobStatus(job_id)
  if job_status == constants.JOB_STATUS_CANCELED:
    return

  # Try and see if the job is being cancelled, and wait until the status
  # changes or we hit a timeout
  if job_status == constants.JOB_STATUS_CANCELING:
    retry_fn = functools.partial(_RetryingFetchJobStatus,
                                 constants.JOB_STATUS_CANCELING, job_id)
    try:
      job_status = retry.Retry(retry_fn, 2.0,
                               WAIT_MULTIPLIER * FIRST_COMMAND_DELAY)
    except retry.RetryTimeout:
      # The job status remains the same
      pass

  if job_status != constants.JOB_STATUS_CANCELED:
    raise qa_error.Error("Job was not successfully cancelled, status "
                         "found: %s" % job_status)
def AssertStatusRetry(jid, status, interval=1.0, timeout=20.0):
  """Keeps polling the given job until a given status is reached.

  @type jid: int
  @param jid: job ID of the job to poll; converted to a string for the query
  @type status: string
  @param status: status to wait for
  @type interval: float
  @param interval: polling interval in seconds
  @type timeout: float
  @param timeout: polling timeout in seconds
  @raise retry.RetryTimeout: If the status was not reached within the timeout

  """
  def _PollJobStatus():
    return qa_job_utils.RetryingUntilJobStatus(status, str(jid))

  retry.Retry(_PollJobStatus, interval, timeout)
def CheckSsconfInstanceList(instance):
  """Checks if a certain instance is in the ssconf instance list.

  Because ssconf is updated in an asynchronous manner, this function will
  retry reading the ssconf instance list until it either contains the
  desired instance, or a timeout is reached.

  @type instance: string
  @param instance: Instance name; resolved once through
      qa_utils.ResolveInstanceName before polling starts

  """
  resolved_name = qa_utils.ResolveInstanceName(instance)

  def _PollInstanceList():
    known_instances = _ReadSsconfInstanceList()
    if resolved_name not in known_instances:
      raise retry.RetryAgain()

  retry.Retry(_PollInstanceList, 1, 5)
return True except EnvironmentError, err: if err.errno in (errno.ENOENT, errno.ENOTDIR): return False elif err.errno == errno.EINVAL: raise utils_retry.RetryAgain(err) raise assert isinstance(pid, int), "pid must be an integer" if pid <= 0: return False # /proc in a multiprocessor environment can have strange behaviors. # Retry the os.stat a few times until we get a good result. try: return utils_retry.Retry(_TryStat, (0.01, 1.5, 0.1), 0.5, args=[_GetProcStatusPath(pid)]) except utils_retry.RetryTimeout, err: err.RaiseInner() def IsDaemonAlive(name): """Determines whether a daemon is alive @type name: string @param name: daemon name @rtype: boolean @return: True if daemon is running, False otherwise """ return IsProcessAlive(utils_io.ReadPidFile(utils_io.DaemonPidFileName(name)))