def stop(self):
    """Push a `timeout_ms=-1` watchdog config, then halt and join this object.

    The steps run in a fixed order: the request is handed to the worker
    manager first, the `_running` flag is cleared next, and `join()` is
    called last.
    """
    logging.info('Stopping worker watchdog.')
    manager = self._worker_manager
    # NOTE(review): -1 presumably switches the worker-side watchdog off --
    # confirm against the WatchdogConfig definition.
    watchdog_off = event_pb2.WatchdogConfig(timeout_ms=-1)
    request = event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog_off)
    manager.configure(request)
    self._running = False
    self.join()
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the context: push a `timeout_ms=-1` watchdog config and join.

    Args:
      exc_type: Exception class raised inside the `with` body, if any. Unused.
      exc_val: Exception instance raised inside the `with` body. Unused.
      exc_tb: Traceback of that exception. Unused.

    Returns:
      None, so an exception raised inside the `with` body is not suppressed.
    """
    logging.info('Disabling worker watchdog.')
    manager = self._worker_manager
    # NOTE(review): -1 presumably switches the worker-side watchdog off --
    # confirm against the WatchdogConfig definition.
    watchdog_off = event_pb2.WatchdogConfig(timeout_ms=-1)
    request = event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog_off)
    manager.configure(request)
    self._running = False
    self.join()
def _reset_manager(self, stopping=False):
    """Rebuild the graph, session and worker manager, then configure workers.

    Args:
      stopping: When truthy, the workers are configured with `timeout_ms=-1`
        and `NOT_CONFIGURED`; otherwise with `self.shutdown_timeout * 1000`
        and `WAIT_FOR_COORDINATOR`.
    """
    fresh_graph = ops.Graph()
    self._graph = fresh_graph
    self._session = session_lib.Session(
        target=self._target,
        graph=fresh_graph,
        config=self._config,
    )

    # The device list is discovered once and reused on later resets.
    if self._devices is None:
        self._devices = all_worker_devices(self._session)

    with self._graph.as_default():
        self._worker_manager = WorkerHeartbeatManager.from_devices(
            self._session, self._devices)

    # Only the selected branch is evaluated, exactly like the if/else form.
    timeout_ms, shutdown_mode = (
        (-1, event_pb2.NOT_CONFIGURED)
        if stopping
        else (self.shutdown_timeout * 1000, event_pb2.WAIT_FOR_COORDINATOR)
    )

    manager = self._worker_manager
    watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
    manager.configure(
        event_pb2.WorkerHeartbeatRequest(
            watchdog_config=watchdog,
            shutdown_mode=shutdown_mode,
        ))
def configure_and_run(self):
    """Flag the watchdog as running, configure the workers and call `start()`.

    The watchdog timeout sent to the workers is `self.shutdown_timeout * 1000`.
    """
    logging.info('Enabling worker watchdog.')
    self._running = True

    manager = self._worker_manager
    # NOTE(review): the multiplication suggests `shutdown_timeout` is kept in
    # seconds -- confirm where the attribute is assigned.
    watchdog = event_pb2.WatchdogConfig(
        timeout_ms=self.shutdown_timeout * 1000)
    manager.configure(
        event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))

    self.start()
def shutdown(self, timeout_ms=10000):
    """Send a watchdog config with `timeout_ms`, then sleep for that long.

    Args:
      timeout_ms: Value placed in `WatchdogConfig.timeout_ms`; the call also
        blocks for `timeout_ms / 1000` seconds afterwards.
    """
    logging.info('Shutting down %s.', self)
    watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
    self.configure(event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))

    # Pausing here is optional, but it keeps the same lame worker from
    # triggering several checkpoints.
    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
    pause_sec = timeout_ms / 1000
    time.sleep(pause_sec)
def shutdown(self, wait_time_in_ms=60000):
    """Request `SHUTDOWN_AFTER_TIMEOUT` on the workers, then block.

    Args:
      wait_time_in_ms: Value placed in `WatchdogConfig.timeout_ms`. The call
        then sleeps for that duration plus an extra 10 seconds.
    """
    logging.info('Shutting down %s.', self)
    watchdog = event_pb2.WatchdogConfig(timeout_ms=wait_time_in_ms)
    request = event_pb2.WorkerHeartbeatRequest(
        watchdog_config=watchdog,
        shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT,
    )
    self.configure(request)

    # Give the workers time to shut down before returning.
    extra_sec = 10.0
    sleep_sec = extra_sec + wait_time_in_ms / 1000
    logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec)
    time.sleep(sleep_sec)
def shutdown(self, timeout_ms=10000):
    """Configure workers with `WAIT_FOR_COORDINATOR` and a timeout, then block.

    Args:
      timeout_ms: Value placed in `WatchdogConfig.timeout_ms`. The call then
        sleeps for that duration plus an extra 10 seconds.
    """
    logging.info('Shutting down %s.', self)
    watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
    request = event_pb2.WorkerHeartbeatRequest(
        watchdog_config=watchdog,
        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR,
    )
    self.configure(request)

    # Returning straight away would let a new heartbeat manager be created
    # before the workers have shut down; that keeps them alive and leads to
    # confusing behavior, so wait here instead.
    extra_sec = 10.0
    sleep_sec = extra_sec + timeout_ms / 1000
    logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec)
    time.sleep(sleep_sec)
def _reset_manager(self):
    """Rebuild the graph, session and worker manager, then configure workers.

    The watchdog timeout sent to the workers is `self.shutdown_timeout * 1000`.
    """
    fresh_graph = ops.Graph()
    self._graph = fresh_graph
    self._session = session_lib.Session(
        target=self._target,
        graph=fresh_graph,
    )

    # The device list is discovered once and reused on later resets.
    if self._devices is None:
        self._devices = all_worker_devices(self._session)

    with self._graph.as_default():
        self._worker_manager = WorkerHeartbeatManager.from_devices(
            self._session, self._devices)

    manager = self._worker_manager
    watchdog = event_pb2.WatchdogConfig(
        timeout_ms=self.shutdown_timeout * 1000)
    manager.configure(
        event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))
def shutdown(self, timeout_ms=10000):
    """Pass a watchdog config carrying `timeout_ms` to `self.configure`.

    Args:
      timeout_ms: Value placed in `WatchdogConfig.timeout_ms`.
    """
    watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
    request = event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog)
    self.configure(request)