예제 #1
0
 def stop_all(self):
   """Shut down every process tracked in self.processes.

   Each process is first asked to terminate. The survivors are then checked
   once a second, up to five times; on the third check, anything still running
   is asked to terminate a second time. Whatever is still alive once the checks
   are over is killed.
   """
   logger.info("[caretaker] stopping all blocks")
   # Snapshot (process, name) pairs up front; the name is the first element of
   # each value stored in self.processes.
   remaining = []
   for proc, details in self.processes.items():
     remaining.append((proc, details[0]))
   for proc, name in remaining:
     proc.terminate()
     logger.info("Sent sigterm to process %d (%s)" % (proc.pid, name))
   logger.info("Waiting for processes to exit")
   for check in range(1, 6):
     time.sleep(1)
     survivors = []
     for proc, name in remaining:
       if not utils.is_process_alive(proc.pid):
         continue
       if check == 3:
         # Halfway through the grace period: nudge the process once more.
         proc.terminate()
         logger.info("Check %d: %d (%s) still alive - resent SIGTERM" % (check, proc.pid, name))
       else:
         logger.info("Check %d: %d (%s) still alive" % (check, proc.pid, name))
       survivors.append((proc, name))
     # Only keep polling the processes that were alive on this check.
     remaining = survivors
     if not remaining:
       break
   for proc, name in remaining:
     # Re-check right before killing; the process may have exited since the last poll.
     if utils.is_process_alive(proc.pid):
       logger.info("%d (%s) still alive after %d seconds, sending SIGKILL" % (proc.pid, name, check))
       proc.kill()
   logger.info("[caretaker] done")
예제 #2
0
 def acquire(self):
     """Block until this process owns the lock, then return.

     The lock file at self.path is created through self._make_lockfile() if
     it does not exist yet. Each attempt opens the file, takes an exclusive
     flock on it and reads the recorded pid:

     * no pid recorded: write our pid and return;
     * our own pid recorded: raise LockError (the lock is not re-entrant);
     * pid of a process that is no longer alive: overwrite it with our pid
       and return;
     * pid of a live process: sleep TIMEOUT and try again.

     Raises:
         LockError: if the calling process already holds the lock.
     """
     logger.debug("Acquiring lock %s" % self.lock_id)
     if not os.path.exists(self.path):
         self._make_lockfile()
     my_pid = os.getpid()
     while True:
         with open(self.path, "r+") as f:
             # The flock only guards the read-check-write of the pid stored in
             # the file; ownership of the lock itself is the recorded pid.
             fcntl.flock(f, fcntl.LOCK_EX)
             try:
                 holder = _readpid(f)
                 if holder is None:
                     _writepid(f)
                     logger.debug("%d got lock %s" % (my_pid, self.lock_id))
                     return
                 if holder == my_pid:
                     raise LockError("Process %d attempting to acquire lock %s which it already holds" %
                                     (holder, self.lock_id))
                 if not is_process_alive(holder):
                     logger.debug("%d: Lock holder %d of lock %s is dead, taking lock for myself" %
                                  (my_pid, holder, self.lock_id))
                     _writepid(f)
                     return
             finally:
                 # Single release point for every outcome, including errors
                 # raised by the helpers above.
                 fcntl.flock(f, fcntl.LOCK_UN)
         # Held by another live process: wait before the next attempt.
         time.sleep(TIMEOUT)
예제 #3
0
def stop(pid_file):
    """Send SIGTERM to the daemon recorded in pid_file and wait for it to exit.

    The process is checked up to ten times, sleeping one second after each
    check that finds it still alive.

    Returns:
        0 if get_daemon_pid() reports no pid or the process has exited,
        1 if the process is still alive once the wait is over.
    """
    pid = get_daemon_pid(pid_file)
    if not pid:
        return 0

    logger.info("Terminating process %d" % pid)
    os.kill(pid, signal.SIGTERM)

    checks_left = 10
    while checks_left > 0 and utils.is_process_alive(pid):
        time.sleep(1)
        checks_left -= 1

    # Final verdict after the grace period.
    if not utils.is_process_alive(pid):
        logger.info("Worker daemon stopped")
        return 0
    logger.error("Unable to stop worker daemon %d" % pid)
    return 1
예제 #4
0
 def is_locked(self):
     """Report whether the lock is currently held.

     Returns False when the lock file at self.path does not exist or has no
     pid recorded in it. Otherwise returns True if the recorded pid is this
     process or another process that is still alive, and False if the
     recorded process is gone.
     """
     if not os.path.exists(self.path):
         return False
     with open(self.path, "r") as f:
         # Take the flock so the pid is not read while another process is
         # in the middle of updating the file.
         fcntl.flock(f, fcntl.LOCK_EX)
         try:
             pid = _readpid(f)
         finally:
             fcntl.flock(f, fcntl.LOCK_UN)
     if pid is None:
         return False
     return pid == os.getpid() or bool(is_process_alive(pid))
예제 #5
0
def get_daemon_pid(pid_file):
    """Return the pid stored in pid_file if that process is alive, else None.

    None is returned (and the reason logged) when the pid file does not
    exist or when the recorded process is no longer running. The file's
    content, minus trailing whitespace, is parsed with int().
    """
    if os.path.exists(pid_file):
        with open(pid_file, "r") as handle:
            pid = int(handle.read().rstrip())
        if utils.is_process_alive(pid):
            return pid
        logger.info("Process %d is no longer running" % pid)
    else:
        logger.info("Not running: pid file %s does not exist" % pid_file)
    return None
예제 #6
0
 def collect_poll_data(self, get_stats):
   """Read the load file of every tracked block and record its current state.

   For each (block_id, block_file) pair in self.processes.values() the file is
   parsed as JSON and stored in `loads` under block_id, after one of:

   * status is an "alive" value but the process is gone: the status is set to
     BlockStatus.DEAD and, if an error file exists for the block, its user
     error is appended to `errors`;
   * status is BLOCKED: the processing time and last poll time are advanced
     on the block's behalf;
   * anything else: the data is kept as read.

   A block whose file cannot be read, parsed or validated is logged and left
   out of `loads`.
   """
   # NOTE(review): within the lines visible here `get_stats` is never read and
   # `loads`/`errors` are not returned - confirm against the rest of the method.
   loads = {}
   errors = []
   for block_id, block_file in self.processes.values():
     try:
       with open(block_file, 'r') as f:
         block_load = json.loads(f.read())
       # Explicit check rather than `assert`, so it still runs under `python -O`.
       if len(block_load) != LoadTuple.TUPLE_LEN:
         raise ValueError("block load data for %s wrong len: %s" % (block_id, block_load))
       pid = block_load[LoadTuple.BLOCK_PID]
       status = block_load[LoadTuple.STATUS]
       if (status in BlockStatus.alive_status_values) and \
          (not utils.is_process_alive(pid)):
         logger.error("Block %s, Process %d has died" % (block_id, pid))
         errfile = get_error_file_path(block_id, pid, self.log_dir)
         if os.path.exists(errfile):
           ue = get_user_error_from_file(errfile)
           ue.append_to_context_top("Datablox node %s" % HOSTNAME)
           errors.append(ue.json_repr())
           logger.error("Found error file for %s, will forward to master" % block_id)
         else:
           logger.error("Did not find an error file for %s, perhaps the process crashed or was killed" %
                        block_id)
         block_load[LoadTuple.STATUS] = BlockStatus.DEAD
       elif status == BlockStatus.BLOCKED:
         # The block is blocked in a long-running operation and cannot update load statistics.
         # We do the updates for the block so the master won't time it out. The process is
         # taken to be alive because the dead-process branch above was not entered
         # (this presumes BLOCKED is one of BlockStatus.alive_status_values - TODO confirm).
         last_poll_time = block_load[LoadTuple.LAST_POLL_TIME]
         current_time = time.time()
         block_load[LoadTuple.TOTAL_PROCESSING_TIME] += current_time - last_poll_time
         block_load[LoadTuple.LAST_POLL_TIME] = current_time
         logger.info("Block %s reported as BLOCKED, updating stats: total processing_time = %s" % (block_id, block_load[LoadTuple.TOTAL_PROCESSING_TIME]))
       else:
         logger.debug("Block %s reported as ALIVE" % block_id)
       loads[block_id] = block_load
     #TODO: try to re-read the file as the block could have been writing to it at this time
     except Exception as e:
       # Best effort: one unreadable block must not stop the others from being polled.
       logger.error("got error when attempting to read block file %s: %s" % (block_file, e))
       continue