def shutdown(self): """ Tell all of the workers we know about to kill themselves. Only sends the RPC; doesn't update any master-side state. """ for worker in self.workers.values(): try: get_string('http://%s/control/kill/' % worker.netloc) except: pass
def _backoff_request(self, url, method, payload=None, num_attempts=1,
                     initial_wait=0, need_result=True, callback=None):
    """
    Send a request to the master, retrying with a growing, randomised delay.

    Up to num_attempts attempts are made; the loop ends early once
    self.stop_event is set.

    * method "GET" fetches url with get_string().
    * method "POST" sends payload with post_string() when a result is needed
      or when more than one attempt is allowed. Otherwise the payload is
      handed to post_string_noreturn() with callback (defaulting to
      self.master_post_result_callback) and the method returns None
      straight away.
    * Any other method raises inside the attempt and is therefore handled
      like any other failed attempt.

    After a failed attempt the method waits on self.stop_event with the
    current delay as timeout, then increases the delay by a random factor
    of 0.5 to 1.5 times itself.

    Returns the tuple (200, content) on success, or None for the
    fire-and-forget POST case.

    Raises WorkerShutdownException if the attempts run out and
    self.stop_event is set, MasterNotRespondingException otherwise.
    """
    # NOTE(review): the initial_wait argument is overwritten here, so the
    # first delay is always 5 regardless of what the caller passed.
    initial_wait = 5
    for _ in range(0, num_attempts):
        if self.stop_event.is_set():
            break
        try:
            try:
                if method == "POST":
                    if need_result or num_attempts > 1:
                        content = post_string(url, payload)
                    else:
                        if callback is None:
                            callback = self.master_post_result_callback
                        post_string_noreturn(url, payload, result_callback=callback)
                        return
                elif method == "GET":
                    content = get_string(url)
                else:
                    raise Exception("Invalid method %s" % method)
                return 200, content
            except Exception as e:
                ciel.log("Backoff-request failed with exception %s; re-raising MasterNotResponding" % e, "MASTER_PROXY", logging.ERROR)
                raise MasterNotRespondingException()
        except Exception:
            # Catch Exception rather than everything: KeyboardInterrupt and
            # SystemExit must abort the request instead of being logged,
            # slept on and retried.
            ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
            self.stop_event.wait(initial_wait)
            initial_wait += initial_wait * random.uniform(0.5, 1.5)
    ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
    if self.stop_event.is_set():
        raise WorkerShutdownException()
    else:
        raise MasterNotRespondingException()
def backoff_request(self, url, method, payload=None, need_result=True, callback=None):
    """
    Make a single request to the master; no retry is attempted.

    Does nothing and returns None when self.stop_event is already set.

    * method "GET" fetches url with get_string().
    * method "POST" sends payload with post_string() when need_result is
      true. Otherwise the payload is handed to post_string_noreturn() with
      callback (defaulting to self.master_post_result_callback) and the
      method returns None.
    * Any other method raises Exception.

    Returns the tuple (200, content) when a result was fetched. Any error
    raised along the way is logged and then re-raised unchanged.
    """
    if self.stop_event.is_set():
        return
    try:
        if method == "GET":
            content = get_string(url)
        elif method == "POST":
            if not need_result:
                # Fire-and-forget: the outcome is delivered to the callback.
                result_handler = callback if callback is not None else self.master_post_result_callback
                post_string_noreturn(url, payload, result_callback=result_handler)
                return
            content = post_string(url, payload)
        else:
            raise Exception("Invalid method %s" % method)
        return 200, content
    except:
        # Log and hand the original error back to the caller untouched.
        ciel.log("Error attempting to contact master, aborting", "MSTRPRXY", logging.WARNING, True)
        raise
def investigate_worker_failure(self, worker):
    """
    Probe a suspect worker and report it as failed unless it answers as itself.

    Fetches http://<worker.netloc>/control/master/, decodes the reply as
    JSON and compares its 'id' field with worker.id. If the fetch fails,
    the reply cannot be decoded, the field is missing or the ids differ,
    self.worker_failed(worker) is called.
    """
    ciel.log.error('Investigating possible failure of worker %s (%s)' % (worker.id, worker.netloc), 'WORKER_POOL', logging.WARNING)
    try:
        content = get_string('http://%s/control/master/' % worker.netloc)
        worker_fetch = simplejson.loads(content)
        # Explicit comparison rather than assert: assert statements are
        # removed under "python -O", which would disable this check.
        worker_confirmed = worker_fetch['id'] == worker.id
    except Exception:
        # Any ordinary error counts as "worker did not confirm itself";
        # KeyboardInterrupt and SystemExit are left to propagate.
        worker_confirmed = False
    if not worker_confirmed:
        self.worker_failed(worker)
def _investigate_worker_failure(self, worker):
    """
    Called by _reap_dead_workers() if the worker has gone too long without
    giving us a ping.

    Fetches http://<worker.netloc>/control/master/, decodes the reply as
    JSON and compares its 'id' field with worker.id. If the fetch fails,
    the reply cannot be decoded, the field is missing or the ids differ,
    self.worker_failed(worker) is called.
    """
    ciel.log.error('Investigating possible failure of worker %s (%s)' % (worker.id, worker.netloc), 'WORKER_POOL', logging.WARNING)
    try:
        content = get_string('http://%s/control/master/' % worker.netloc)
        worker_fetch = simplejson.loads(content)
        # Explicit comparison rather than assert: assert statements are
        # removed under "python -O", which would disable this check.
        id_matches = worker_fetch['id'] == worker.id
    except Exception:
        # Any ordinary error counts as "worker did not confirm itself";
        # KeyboardInterrupt and SystemExit are left to propagate.
        id_matches = False
    if not id_matches:
        self.worker_failed(worker)
def _backoff_request(self, url, method, payload=None, num_attempts=1,
                     initial_wait=0, need_result=True, callback=None):
    """
    Contact the master, retrying failed attempts after a randomised delay.

    At most num_attempts attempts are made, and no further attempt is
    started once self.stop_event is set.

    * "GET": url is fetched with get_string().
    * "POST": payload is sent with post_string() if need_result is true or
      num_attempts is greater than one. Otherwise post_string_noreturn()
      is used with callback (self.master_post_result_callback when callback
      is None) and None is returned immediately.
    * Anything else raises inside the attempt and so counts as a failed
      attempt like any other error.

    Each failed attempt is logged, followed by a wait on self.stop_event
    using the current delay as timeout; the delay then grows by 0.5 to 1.5
    times its own value, chosen at random.

    Returns (200, content) on success, or None for the fire-and-forget
    POST case.

    Raises WorkerShutdownException when all attempts are used up and
    self.stop_event is set, MasterNotRespondingException otherwise.
    """
    # NOTE(review): this assignment discards the initial_wait argument, so
    # the starting delay is always 5 whatever the caller supplied.
    initial_wait = 5
    for _ in range(0, num_attempts):
        if self.stop_event.is_set():
            break
        try:
            try:
                if method == "POST":
                    if need_result or num_attempts > 1:
                        content = post_string(url, payload)
                    else:
                        if callback is None:
                            callback = self.master_post_result_callback
                        post_string_noreturn(url, payload, result_callback=callback)
                        return
                elif method == "GET":
                    content = get_string(url)
                else:
                    raise Exception("Invalid method %s" % method)
                return 200, content
            except Exception as e:
                ciel.log(
                    "Backoff-request failed with exception %s; re-raising MasterNotResponding" % e,
                    "MASTER_PROXY", logging.ERROR)
                raise MasterNotRespondingException()
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit have to
            # stop the request rather than trigger another sleep-and-retry
            # round.
            ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
            self.stop_event.wait(initial_wait)
            initial_wait += initial_wait * random.uniform(0.5, 1.5)
    ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
    if self.stop_event.is_set():
        raise WorkerShutdownException()
    else:
        raise MasterNotRespondingException()
def shutdown(self): for worker in self.workers.values(): try: get_string('http://%s/control/kill/' % worker.netloc) except: pass