def _start_worker(self, death_timeout=60, **kwargs):
    if self.status and self.status.startswith('clos'):
        warnings.warn("Tried to start a worker while status=='%s'" % self.status)
        return

    if self.processes:
        W = Nanny
        kwargs['quiet'] = True
    else:
        W = Worker

    w = W(self.scheduler.address, loop=self.loop,
          death_timeout=death_timeout,
          silence_logs=self.silence_logs, **kwargs)
    yield w._start()

    self.workers.append(w)

    while w.status != 'closed' and w.worker_address not in self.scheduler.workers:
        yield gen.sleep(0.01)

    if w.status == 'closed' and self.scheduler.status == 'running':
        self.workers.remove(w)
        raise gen.TimeoutError("Worker failed to start")

    raise gen.Return(w)
def on_timeout():
    self.log.warning("Timeout waiting for kernel_info_reply: %s", kernel_id)
    finish()
    if not future.done():
        future.set_exception(
            gen.TimeoutError("Timeout waiting for restart"))
def test_handle_layer_error(self):
    context = LayerContext(mode="socks",
                           src_stream=self.src_stream,
                           port=443,
                           scheme="h2")

    layer_manager._handle_layer_error(gen.TimeoutError("timeout"), context)
    context.src_stream.close.assert_called_once_with()
    context.src_stream.reset_mock()

    layer_manager._handle_layer_error(
        DestNotConnectedError("stream closed"), context)
    context.src_stream.close.assert_not_called()
    context.src_stream.reset_mock()

    layer_manager._handle_layer_error(
        DestStreamClosedError("stream closed"), context)
    context.src_stream.close.assert_called_once_with()
    context.src_stream.reset_mock()

    layer_manager._handle_layer_error(
        SrcStreamClosedError("stream closed"), context)
    context.src_stream.close.assert_not_called()
    context.src_stream.reset_mock()

    layer_manager._handle_layer_error(
        iostream.StreamClosedError("stream closed"), context)
    context.src_stream.close.assert_called_once_with()
def _start_worker(self, port=0, processes=None, death_timeout=60, **kwargs):
    if processes is not None:
        raise ValueError("overriding `processes` for individual workers "
                         "in a LocalCluster is not supported anymore")
    if port:
        raise ValueError("overriding `port` for individual workers "
                         "in a LocalCluster is not supported anymore")

    if self.processes:
        W = Nanny
        kwargs['quiet'] = True
    else:
        W = Worker

    w = W(self.scheduler.address, loop=self.loop,
          death_timeout=death_timeout,
          silence_logs=self.silence_logs, **kwargs)
    yield w._start()

    self.workers.append(w)

    while w.status != 'closed' and w.worker_address not in self.scheduler.worker_info:
        yield gen.sleep(0.01)

    if w.status == 'closed':
        self.workers.remove(w)
        raise gen.TimeoutError("Worker failed to start")

    raise gen.Return(w)
def _start_worker(self, death_timeout=60, **kwargs):
    if self.status and self.status.startswith("clos"):
        warnings.warn("Tried to start a worker while status=='%s'" % self.status)
        return

    if self.processes:
        kwargs["quiet"] = True

    w = yield self.worker_class(self.scheduler.address, loop=self.loop,
                                death_timeout=death_timeout,
                                silence_logs=self.silence_logs, **kwargs)

    self.workers.append(w)

    while w.status != "closed" and w.worker_address not in self.scheduler.workers:
        yield gen.sleep(0.01)

    if w.status == "closed" and self.scheduler.status == "running":
        self.workers.remove(w)
        raise gen.TimeoutError("Worker failed to start")

    raise gen.Return(w)
def get(self, stream=None, name=None, client=None, timeout=None):
    start = time()
    while name not in self.variables:
        if timeout is not None:
            left = timeout - (time() - start)
        else:
            left = None
        if left and left < 0:
            raise gen.TimeoutError()
        yield self.started.wait(timeout=left)

    record = self.variables[name]
    if record['type'] == 'Future':
        key = record['value']
        token = uuid.uuid4().hex
        try:
            state = self.scheduler.task_state[key]
        except KeyError:
            state = 'lost'
        msg = {'token': token, 'state': state}
        if state == 'erred':
            msg['exception'] = self.scheduler.exceptions[
                self.scheduler.exceptions_blame[key]]
            msg['traceback'] = self.scheduler.tracebacks[
                self.scheduler.exceptions_blame[key]]
        record = merge(record, msg)
        self.waiting[key, name].add(token)

    raise gen.Return(record)
async def wait_for(future, timeout=None):
    try:
        await asyncio.wait_for(future, timeout=timeout)
    except Exception:
        await self.close(timeout=1)
        raise gen.TimeoutError(
            "{} failed to start in {} seconds".format(
                type(self).__name__, timeout))
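# A standalone sketch of the pattern above: translate asyncio's timeout into
# tornado's gen.TimeoutError so callers written against the tornado API keep
# seeing the exception type they expect. The function name and message here
# are illustrative, not taken from the original source.
import asyncio
from tornado import gen

async def wait_with_gen_timeout(future, timeout=None):
    try:
        return await asyncio.wait_for(future, timeout=timeout)
    except asyncio.TimeoutError:
        raise gen.TimeoutError("operation did not finish in %s seconds" % timeout)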
def _start_worker(self, port=0, processes=None, death_timeout=60, **kwargs):
    """dask-worker --help

    Usage: dask-worker [OPTIONS] SCHEDULER

    Options:
      X--worker-port INTEGER  Serving worker port, defaults to randomly assigned
      --http-port INTEGER     Serving http port, defaults to randomly assigned
      X--nanny-port INTEGER   Serving nanny port, defaults to randomly assigned
      X--port INTEGER         Deprecated, see --nanny-port
      --host TEXT             Serving host. Defaults to an ip address that can
                              hopefully be visible from the scheduler network.
      --nthreads INTEGER      Number of threads per process. Defaults to number
                              of cores
      X--nprocs INTEGER       Number of worker processes. Defaults to one.
      --name TEXT             Alias
      --memory-limit TEXT     Number of bytes before spilling data to disk
      --no-nanny
      X--help                 Show this message and exit.
    """
    # TODO: change this to bsub a job and then grab its configuration?
    # While the job name is not in the jobs list as running, and the worker
    # name is not in the scheduler's worker names, block until the job is
    # running.

    # hash clock time for name
    # name = ...  (left unassigned in the original sketch)
    jobRunning = False
    while jobRunning == False:
        # bjobs: is the worker running?
        # grab the IP address of the process; note that `w` (the worker
        # handle) is never created in this unfinished sketch
        while w.worker_address not in self.scheduler.worker_info:
            yield gen.sleep(0.01)

        # store job, worker address to dictionary
        self.workers.append(w)

        if w.status == 'closed':
            self.workers.remove(w)
            raise gen.TimeoutError("Worker failed to start")

        raise gen.Return(w)
async def _wait_for_workers(self):
    while {
        str(d["name"])
        for d in (await self.scheduler_comm.identity())["workers"].values()
    } != set(map(str, self.workers)):
        if (
            any(w.status == "closed" for w in self.workers.values())
            and self.scheduler.status == "running"
        ):
            raise gen.TimeoutError("Worker unexpectedly closed")
        await asyncio.sleep(0.1)
async def _wait_for_workers(self):
    # TODO: this function needs to query scheduler and worker state
    # remotely without assuming that they are local
    while {
        d["name"] for d in self.scheduler.identity()["workers"].values()
    } != set(self.workers):
        if (
            any(w.status == "closed" for w in self.workers.values())
            and self.scheduler.status == "running"
        ):
            raise gen.TimeoutError("Worker unexpectedly closed")
        await asyncio.sleep(0.1)
def sync(loop, func, *args, callback_timeout=None, **kwargs):
    """
    Run coroutine in loop running in separate thread.
    """
    # Tornado's PollIOLoop doesn't raise when using closed, do it ourselves
    if PollIOLoop and (
        (isinstance(loop, PollIOLoop) and getattr(loop, "_closing", False))
        or (hasattr(loop, "asyncio_loop") and loop.asyncio_loop._closed)
    ):
        raise RuntimeError("IOLoop is closed")
    try:
        if loop.asyncio_loop.is_closed():  # tornado 6
            raise RuntimeError("IOLoop is closed")
    except AttributeError:
        pass

    e = threading.Event()
    main_tid = threading.get_ident()
    result = [None]
    error = [False]

    @gen.coroutine
    def f():
        try:
            if main_tid == threading.get_ident():
                raise RuntimeError("sync() called from thread of running loop")
            yield gen.moment
            thread_state.asynchronous = True
            future = func(*args, **kwargs)
            if callback_timeout is not None:
                future = gen.with_timeout(timedelta(seconds=callback_timeout), future)
            result[0] = yield future
        except Exception as exc:
            error[0] = sys.exc_info()
        finally:
            thread_state.asynchronous = False
            e.set()

    loop.add_callback(f)
    if callback_timeout is not None:
        if not e.wait(callback_timeout):
            raise gen.TimeoutError("timed out after %s s." % (callback_timeout,))
    else:
        while not e.is_set():
            e.wait(10)
    if error[0]:
        typ, exc, tb = error[0]
        raise exc.with_traceback(tb)
    else:
        return result[0]
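# A minimal usage sketch for sync() above, assuming its dask-internal
# dependencies (thread_state, the PollIOLoop check) are importable: the
# IOLoop runs in a daemon thread and the caller blocks until the coroutine
# finishes. The loop/thread setup and the add() coroutine are illustrative.
import threading
from tornado import gen
from tornado.ioloop import IOLoop

loop = IOLoop()
threading.Thread(target=loop.start, daemon=True).start()

@gen.coroutine
def add(a, b):
    yield gen.moment             # stand-in for real asynchronous work
    raise gen.Return(a + b)

assert sync(loop, add, 1, 2) == 3            # blocks the calling thread
sync(loop, add, 1, 2, callback_timeout=5)    # or gen.TimeoutError after 5 s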
def _get(self, timeout=None):
    if timeout is not None:
        timeout = datetime.timedelta(seconds=timeout)
    start = datetime.datetime.now()
    while not self.buffer:
        if timeout is not None:
            timeout2 = timeout - (datetime.datetime.now() - start)
            if timeout2.total_seconds() < 0:
                raise gen.TimeoutError()
        else:
            timeout2 = None
        yield self.condition.wait(timeout=timeout2)
    raise gen.Return(self.buffer.popleft())
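# A self-contained sketch of the same condition-plus-deadline pattern used by
# _get() above; TinyBuffer and its method names are illustrative. Each pass
# through the loop shrinks the budget handed to Condition.wait so the overall
# call still honors the caller's deadline.
import datetime
from collections import deque
from tornado import gen, locks

class TinyBuffer(object):
    def __init__(self):
        self.buffer = deque()
        self.condition = locks.Condition()

    def put(self, item):
        self.buffer.append(item)
        self.condition.notify()

    @gen.coroutine
    def get(self, timeout=None):
        budget = datetime.timedelta(seconds=timeout) if timeout is not None else None
        start = datetime.datetime.now()
        while not self.buffer:
            remaining = None
            if budget is not None:
                remaining = budget - (datetime.datetime.now() - start)
                if remaining.total_seconds() < 0:
                    raise gen.TimeoutError()
            # Condition.wait returns False on timeout; the loop re-checks
            # the deadline and raises on the next iteration.
            yield self.condition.wait(timeout=remaining)
        raise gen.Return(self.buffer.popleft())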
def sync(loop, func, *args, **kwargs):
    """
    Run coroutine in loop running in separate thread.
    """
    # Tornado's PollIOLoop doesn't raise when using closed, do it ourselves
    if ((isinstance(loop, PollIOLoop) and getattr(loop, '_closing', False))
            or (hasattr(loop, 'asyncio_loop') and loop.asyncio_loop._closed)):
        raise RuntimeError("IOLoop is closed")

    timeout = kwargs.pop('callback_timeout', None)

    def make_coro():
        coro = gen.maybe_future(func(*args, **kwargs))
        if timeout is None:
            return coro
        else:
            return gen.with_timeout(timedelta(seconds=timeout), coro)

    e = threading.Event()
    main_tid = get_thread_identity()
    result = [None]
    error = [False]

    @gen.coroutine
    def f():
        try:
            if main_tid == get_thread_identity():
                raise RuntimeError("sync() called from thread of running loop")
            yield gen.moment
            thread_state.asynchronous = True
            result[0] = yield make_coro()
        except Exception as exc:
            logger.exception(exc)
            error[0] = sys.exc_info()
        finally:
            thread_state.asynchronous = False
            e.set()

    loop.add_callback(f)
    if timeout is not None:
        if not e.wait(timeout):
            raise gen.TimeoutError("timed out after %s s." % (timeout,))
    else:
        while not e.is_set():
            e.wait(10)
    if error[0]:
        six.reraise(*error[0])
    else:
        return result[0]
def _wait_until_started(self):
    delay = 0.05
    while True:
        if self.status != 'starting':
            return
        try:
            msg = self.init_result_q.get_nowait()
            if msg != 'started':
                logger.warn("Nanny got unexpected message %s. "
                            "Starting worker again", msg)
                raise gen.TimeoutError()
            return
        except Empty:
            yield gen.sleep(delay)
            continue
def test_app_poll(disp, mocker):
    stop_side_effect = [True for _ in running_apps]
    stop_side_effect.append(True)   # reset state message
    stop_side_effect.append(False)

    mocker.patch.object(
        burlak.LoopSentry, 'should_run', side_effect=stop_side_effect)

    disp.input_queue.get = mocker.Mock(
        side_effect=[make_future(gen.TimeoutError()) for _ in running_apps])
    disp.node_service.list = mocker.Mock(
        side_effect=[make_mock_channel_with(d.keys()) for d in running_apps])

    def slaves_count(app):
        for apps in running_apps:
            for a in apps:
                if a == app:
                    return apps[a]

    def info_mock(app, flags=None):
        count = slaves_count(app)
        ans = dict(
            pool=dict(slaves={app: 'dummy_info' for _ in xrange(count)}))
        return make_mock_channel_with(ans)

    def check_workers_mismatch(state, workers_count):
        for d in running_apps:
            if d == workers_count:
                return True
        return False

    disp.node_service.info = mocker.Mock(side_effect=info_mock)
    disp.workers_diff = mocker.Mock(side_effect=check_workers_mismatch)

    control_filter = dict(apply_control=True, white_list=[])
    control_filter = ControlFilter.from_dict(control_filter)

    yield disp.filter_queue.put(burlak.ControlFilterMessage(control_filter))
    yield disp.process_loop()

    assert disp.workers_diff.call_count == len(running_apps)
    for d in running_apps:
        # use assert_any_call: `assert mock.called_with(...)` is always truthy
        disp.workers_diff.assert_any_call(dict(), d)
def get(self, stream=None, name=None, client=None, timeout=None):
    start = time()
    while name not in self.variables:
        if timeout is not None:
            left = timeout - (time() - start)
        else:
            left = None
        if left and left < 0:
            raise gen.TimeoutError()
        yield self.started.wait(timeout=left)

    record = self.variables[name]
    if record['type'] == 'Future':
        token = uuid.uuid4().hex
        record['token'] = token
        self.waiting[record['value'], name].add(token)

    raise gen.Return(record)
async def get(self, stream=None, name=None, client=None, timeout=None):
    start = time()
    while name not in self.variables:
        if timeout is not None:
            left = timeout - (time() - start)
        else:
            left = None
        if left and left < 0:
            raise gen.TimeoutError()
        await self.started.wait(timeout=left)

    record = self.variables[name]
    if record["type"] == "Future":
        key = record["value"]
        token = uuid.uuid4().hex
        ts = self.scheduler.tasks.get(key)
        state = ts.state if ts is not None else "lost"
        msg = {"token": token, "state": state}
        if state == "erred":
            msg["exception"] = ts.exception_blame.exception
            msg["traceback"] = ts.exception_blame.traceback
        record = merge(record, msg)
        self.waiting[key, name].add(token)

    return record
def cmd_timeout(self, ft, cmd):
    ft.set_exception(
        gen.TimeoutError(
            'Timeout in waiting response of command: {}'.format(str(cmd))))
def _timeout_callback(self, fut):
    self._ioloop.remove_handler(self._fileno)
    fut.set_exception(gen.TimeoutError())
def on_timeout():
    future.set_exception(gen.TimeoutError())
def on_timeout():
    waiter.set_exception(gen.TimeoutError())
    self._garbage_collect()
def on_timeout() -> None:
    if not waiter.done():
        waiter.set_exception(gen.TimeoutError())
    self._garbage_collect()
def __error_callback(self, future):
    future.set_exception(gen.TimeoutError("Timeout"))
def on_timeout():
    if not future.done():
        future.set_exception(gen.TimeoutError())
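# The on_timeout callbacks above are typically armed with the IOLoop timer
# API and disarmed when the future resolves first. A hedged sketch of that
# wiring (fail_after is illustrative, not taken from any source above):
from tornado import gen
from tornado.ioloop import IOLoop

def fail_after(future, seconds):
    io_loop = IOLoop.current()

    def on_timeout():
        if not future.done():
            future.set_exception(gen.TimeoutError())

    # Cancel the pending timer once the future completes, whichever way.
    handle = io_loop.call_later(seconds, on_timeout)
    future.add_done_callback(lambda _: io_loop.remove_timeout(handle))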
def _kill(self, comm=None, timeout=10):
    """ Kill the local worker process

    Blocks until both the process is down and the scheduler is properly
    informed
    """
    timeout_time = time() + timeout
    while not self.worker_address:
        yield gen.sleep(0.1)
        if time() > timeout_time:
            raise gen.TimeoutError()

    if self.process is None:
        raise gen.Return('OK')

    should_watch, self.should_watch = self.should_watch, False

    if isalive(self.process):
        try:
            # Ask worker to close
            with rpc(self.worker_address) as worker:
                result = yield gen.with_timeout(
                    timedelta(seconds=min(1, timeout)),
                    worker.terminate(report=False),
                )
        except gen.TimeoutError:
            logger.info("Worker non-responsive. Terminating.")
        except CommClosedError:
            pass
        except BaseException as e:
            if (not self.loop._closing
                    and self.loop._running
                    and not self.loop._stopped):
                logger.exception(e)

    allowed_errors = (gen.TimeoutError, CommClosedError,
                      EnvironmentError, RPCClosed)
    try:
        # Tell scheduler that worker is gone
        result = yield gen.with_timeout(
            timedelta(seconds=timeout),
            self.scheduler.unregister(address=self.worker_address),
            quiet_exceptions=allowed_errors)
        if result not in ('OK', 'already-removed'):
            logger.critical(
                "Unable to unregister with scheduler %s. "
                "Nanny: %s, Worker: %s",
                result, self.address, self.worker_address)
        else:
            logger.info("Unregister worker %r from scheduler",
                        self.worker_address)
    except allowed_errors as e:
        # Maybe the scheduler is gone, or it is unresponsive
        logger.warn("Nanny %r failed to unregister worker %r: %s",
                    self.address, self.worker_address, e)
    except Exception as e:
        logger.exception(e)

    if self.process:
        with ignoring(OSError):
            self.process.terminate()
        join(self.process, timeout)
        processes_to_close.discard(self.process)

        start = time()
        while isalive(self.process) and time() < start + timeout:
            sleep(0.01)

        self.process = None
        self.cleanup()
        logger.info("Nanny %r kills worker process %r",
                    self.address, self.worker_address)

    self.should_watch = should_watch
    return
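# Condensed sketch of the shutdown protocol _kill() implements: ask the
# worker to close itself under a short gen.with_timeout budget, then fall
# back to terminating the process. worker_rpc and process are stand-ins
# here, not names from the original source.
from datetime import timedelta
from tornado import gen

@gen.coroutine
def polite_stop(worker_rpc, process, timeout=10):
    try:
        # Step 1: ask nicely, but wait at most one second for a reply.
        yield gen.with_timeout(timedelta(seconds=min(1, timeout)),
                               worker_rpc.terminate(report=False))
    except gen.TimeoutError:
        pass  # the worker is unresponsive; fall through to a hard stop
    # Step 2: force the process down if it is still alive.
    if process.is_alive():
        process.terminate()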