def investigate_worker_failure(self, worker):
    """Probe a suspect worker and report it as failed if the probe fails.

    Fetches ``http://<worker.netloc>/control``, decodes the body as JSON and
    compares the decoded value with ``worker.id``. If the request or the
    decoding raises, or the decoded value differs from ``worker.id``,
    ``self.worker_failed(worker)`` is called. Returns ``None``.
    """
    ciel.log.error(
        'Investigating possible failure of worker %s (%s)' % (worker.id, worker.netloc),
        'WORKER_POOL', logging.WARNING)
    try:
        content = get_string('http://%s/control' % worker.netloc)
        # Explicit comparison instead of ``assert``: asserts are removed under
        # ``python -O``, which would silently disable the identity check.
        healthy = simplejson.loads(content) == worker.id
    except Exception:
        # Any fetch/decode error counts as a failed probe. ``Exception`` (not a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        healthy = False
    if not healthy:
        self.worker_failed(worker)
def investigate_worker_failure(self, worker):
    """Probe a suspect worker and report it as failed if the probe fails.

    Fetches ``http://<worker.netloc>/control``, decodes the body as JSON and
    compares the decoded value with ``worker.id``. If the request or the
    decoding raises, or the decoded value differs from ``worker.id``,
    ``self.worker_failed(worker)`` is called. Returns ``None``.
    """
    ciel.log.error(
        'Investigating possible failure of worker %s (%s)' % (worker.id, worker.netloc),
        'WORKER_POOL', logging.WARNING)
    try:
        content = get_string('http://%s/control' % worker.netloc)
        # Explicit comparison instead of ``assert``: asserts are removed under
        # ``python -O``, which would silently disable the identity check.
        healthy = simplejson.loads(content) == worker.id
    except Exception:
        # Any fetch/decode error counts as a failed probe. ``Exception`` (not a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        healthy = False
    if not healthy:
        self.worker_failed(worker)
def backoff_request(self, url, method, payload=None, num_attempts=1, initial_wait=0, need_result=True):
    """Send a GET or POST to ``url``, retrying with a randomised, growing delay.

    Parameters:
        url: target URL, passed unchanged to the HTTP helpers.
        method: ``"GET"`` or ``"POST"``. Any other value raises inside the
            retry loop and is therefore handled like a failed attempt.
        payload: request body for POST; ignored for GET.
        num_attempts: maximum number of attempts.
        initial_wait: seconds to wait after the first failed attempt. A falsy
            value (including the default ``0``) selects 5 seconds.
        need_result: when false and ``num_attempts`` is at most 1, a POST is
            sent through ``post_string_noreturn`` with
            ``self.master_post_result_callback`` as the result callback and
            this method returns ``None`` immediately.

    Returns:
        ``(200, content)`` on the first successful attempt, or ``None`` for
        the no-result POST path described above.

    Raises:
        WorkerShutdownException: no attempt succeeded and ``self.stop_event``
            is set when the loop ends.
        MasterNotRespondingException: no attempt succeeded and
            ``self.stop_event`` is not set.
    """
    # Previously the argument was unconditionally overwritten with 5, so a
    # caller's value was ignored. Honour it, keeping 5 as the fallback so
    # callers relying on the default see the same delay as before.
    wait = initial_wait if initial_wait else 5
    for _ in range(num_attempts):
        if self.stop_event.is_set():
            break
        try:
            if method == "POST":
                if need_result or num_attempts > 1:
                    content = post_string(url, payload)
                else:
                    post_string_noreturn(url, payload, result_callback=self.master_post_result_callback)
                    return
            elif method == "GET":
                content = get_string(url)
            else:
                raise Exception("Invalid method %s" % method)
            return 200, content
        except Exception as e:
            # One handler replaces the old nested try, which raised
            # MasterNotRespondingException only for an outer bare ``except`` to
            # swallow it. ``Exception`` lets KeyboardInterrupt/SystemExit
            # propagate. Both log messages are kept verbatim.
            ciel.log("Backoff-request failed with exception %s; re-raising MasterNotResponding" % e, "MASTER_PROXY", logging.ERROR)
            # NOTE(review): the trailing True presumably asks the logger to
            # include the current traceback -- confirm against ciel.log.
            ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
        # Wait on the stop event rather than sleeping, so the delay is tied to
        # the same event that the loop and the final raise check.
        self.stop_event.wait(wait)
        # Grow the delay by a random factor between 1.5x and 2.5x.
        wait += wait * random.uniform(0.5, 1.5)
    ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
    if self.stop_event.is_set():
        raise WorkerShutdownException()
    raise MasterNotRespondingException()
def shutdown(self):
    """Send a kill request to every worker in ``self.workers`` (best effort).

    Issues a GET to ``http://<worker.netloc>/control/kill/`` for each worker.
    A failure for one worker is ignored so the remaining workers are still
    contacted. Returns ``None``.
    """
    for worker in self.workers.values():
        try:
            get_string('http://%s/control/kill/' % worker.netloc)
        except Exception:
            # Deliberately best-effort: an error for one worker must not stop
            # the loop. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt/SystemExit abort a hanging shutdown.
            continue