def stop_all(self):
    """Shut down every block process managed by this node.

    Each process first receives a SIGTERM. We then poll once per second
    for up to five seconds; on the third poll any survivors are sent a
    second SIGTERM, and anything still alive after the final poll is
    killed outright with SIGKILL.
    """
    logger.info("[caretaker] stopping all blocks")
    remaining = [(proc, details[0]) for (proc, details) in self.processes.items()]
    for (proc, name) in remaining:
        proc.terminate()
        logger.info("Sent sigterm to process %d (%s)" % (proc.pid, name))
    logger.info("Waiting for processes to exit")
    for check in range(1, 6):
        time.sleep(1)
        survivors = []
        for (proc, name) in remaining:
            if not utils.is_process_alive(proc.pid):
                continue
            if check == 3:
                # Halfway through the grace period: nudge stragglers again.
                proc.terminate()
                logger.info("Check %d: %d (%s) still alive - resent SIGTERM" %
                            (check, proc.pid, name))
            else:
                logger.info("Check %d: %d (%s) still alive" %
                            (check, proc.pid, name))
            survivors.append((proc, name))
        remaining = survivors
        if len(remaining) == 0:
            break
    # Grace period exhausted: escalate to SIGKILL for anything left.
    for (proc, name) in remaining:
        if utils.is_process_alive(proc.pid):
            logger.info("%d (%s) still alive after %d seconds, sending SIGKILL" %
                        (proc.pid, name, check))
            proc.kill()
    logger.info("[caretaker] done")
def acquire(self):
    """Block until this process holds the lock.

    The lock is a file containing the holder's pid; flock guards each
    read/modify cycle so the check-then-write is atomic. If the recorded
    holder is dead we steal the lock; if another live process holds it
    we sleep TIMEOUT seconds and retry.

    Raises:
        LockError: if this process already holds the lock.
    """
    logger.debug("Acquiring lock %s" % self.lock_id)
    if not os.path.exists(self.path):
        self._make_lockfile()
    while True:
        with open(self.path, "r+") as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            pid = _readpid(f)
            if pid is None:  # idiom fix: identity comparison with None, not ==
                # Lock is free - record ourselves as the holder.
                _writepid(f)
                fcntl.flock(f, fcntl.LOCK_UN)
                logger.debug("%d got lock %s" % (os.getpid(), self.lock_id))
                return
            elif pid == os.getpid():
                # Re-acquiring our own lock is a programming error, not a wait.
                fcntl.flock(f, fcntl.LOCK_UN)
                raise LockError("Process %d attempting to acquire lock %s which it already holds" %
                                (pid, self.lock_id))
            elif not is_process_alive(pid):
                # Holder died without releasing - safe to take over.
                logger.debug("%d: Lock holder %d of lock %s is dead, taking lock for myself" %
                             (os.getpid(), pid, self.lock_id))
                _writepid(f)
                fcntl.flock(f, fcntl.LOCK_UN)
                return
            else:
                # Another live process holds the lock; back off and retry.
                fcntl.flock(f, fcntl.LOCK_UN)
                time.sleep(TIMEOUT)
def stop(pid_file):
    """Terminate the worker daemon recorded in pid_file.

    Sends SIGTERM and waits up to ten seconds (polling once per second)
    for the process to exit. Returns 0 when the daemon is gone or was
    not running, 1 when it survived the grace period.
    """
    pid = get_daemon_pid(pid_file)
    if not pid:
        return 0
    logger.info("Terminating process %d" % pid)
    os.kill(pid, signal.SIGTERM)
    attempts = 10
    while attempts > 0 and utils.is_process_alive(pid):
        time.sleep(1)
        attempts -= 1
    if utils.is_process_alive(pid):
        logger.error("Unable to stop worker daemon %d" % pid)
        return 1
    logger.info("Worker daemon stopped")
    return 0
def is_locked(self):
    """Return True if the lock file records a live holder.

    A pid equal to our own counts as locked, as does any pid belonging
    to a process that still exists. A missing file or empty pid slot
    means unlocked.
    """
    if not os.path.exists(self.path):
        return False
    with open(self.path, "r") as f:
        # flock pairs with the writers in acquire() so we never read a
        # half-written pid.
        fcntl.flock(f, fcntl.LOCK_EX)
        pid = _readpid(f)
        fcntl.flock(f, fcntl.LOCK_UN)
    # Idiom fixes: 'is not None' instead of '!= None', and return the
    # boolean expression directly instead of an if/else returning True/False.
    return pid is not None and (pid == os.getpid() or is_process_alive(pid))
def get_daemon_pid(pid_file):
    """Return the pid recorded in pid_file if that process is alive, else None.

    Returns None (after logging) when the pid file is missing, does not
    contain a parseable integer, or names a process that is no longer
    running.
    """
    if not os.path.exists(pid_file):
        logger.info("Not running: pid file %s does not exist" % pid_file)
        return None
    with open(pid_file, "r") as pf:
        try:
            pid = int(pf.read().rstrip())
        except ValueError:
            # Robustness fix: an empty or corrupt pid file used to raise
            # ValueError out of this function, crashing callers (e.g. stop())
            # whose contract is "pid or None". Treat it as "not running".
            logger.error("Pid file %s does not contain a valid pid" % pid_file)
            return None
    if not utils.is_process_alive(pid):
        logger.info("Process %d is no longer running" % pid)
        return None
    else:
        return pid
def collect_poll_data(self, get_stats):
    # Gather the latest load tuple for every block process on this node.
    # For each (block_id, block_file) in self.processes we read the JSON
    # load file the block writes, flag blocks whose process has died
    # (collecting any user-error file to forward to the master), and keep
    # the stats fresh for blocks stuck in long-running (BLOCKED) work.
    #
    # NOTE(review): get_stats is unused in the visible portion of this
    # method, and loads/errors are accumulated but not returned here --
    # presumably the method continues past this chunk. TODO confirm.
    loads = {}
    errors = []
    for block_id, block_file in self.processes.values():
        try:
            block_load = None
            with open(block_file, 'r') as f:
                s = f.read()
            block_load = json.loads(s)
            # Sanity check: the on-disk tuple must have the expected arity.
            assert len(block_load)==LoadTuple.TUPLE_LEN, \
                "block load data for %s wrong len: %s" % (block_id, block_load)
            pid = block_load[LoadTuple.BLOCK_PID]
            if (block_load[LoadTuple.STATUS] in BlockStatus.alive_status_values) and\
               (not utils.is_process_alive(pid)):
                # The block claims to be alive but its process is gone:
                # mark it DEAD and forward any error file to the master.
                logger.error("Block %s, Process %d has died" % (block_id, pid))
                errfile = get_error_file_path(block_id, pid, self.log_dir)
                if os.path.exists(errfile):
                    ue = get_user_error_from_file(errfile)
                    ue.append_to_context_top("Datablox node %s" % HOSTNAME)
                    errors.append(ue.json_repr())
                    logger.error("Found error file for %s, will forward to master" % block_id)
                else:
                    logger.error("Did not find an error file for %s, perhaps the process crashed or was killed" % block_id)
                block_load[LoadTuple.STATUS] = BlockStatus.DEAD
            elif block_load[LoadTuple.STATUS]==BlockStatus.BLOCKED:
                # The block is blocked in a long-running operation and cannot
                # update load statistics. We do the updates for the block so
                # the master won't time it out. We know the associated process
                # is still alive since the above check succeeded.
                last_poll_time = block_load[LoadTuple.LAST_POLL_TIME]
                current_time = time.time()
                block_load[LoadTuple.TOTAL_PROCESSING_TIME] += current_time - last_poll_time
                block_load[LoadTuple.LAST_POLL_TIME] = current_time
                logger.info("Block %s reported as BLOCKED, updating stats: total processing_time = %s" % (block_id, block_load[LoadTuple.TOTAL_PROCESSING_TIME]))
            else:
                logger.debug("Block %s reported as ALIVE" % block_id)
            loads[block_id] = block_load
            #TODO: try to re-read the file as the block could have been writing to it at this time
        except Exception, e:
            # Best-effort: the block may have been mid-write; log and move on
            # rather than failing the whole poll.
            logger.error("got error when attempting to read block file %s: %s" % (block_file, e))
            continue