def shutdown(self, timeout_ms=10000):
  """Shut down all workers after `timeout_ms` milliseconds."""
  logging.info('Shutting down %s.', self)
  req = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
      shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
  self.configure(req)

  # Wait for workers to shut down.  This isn't strictly required, but it
  # avoids triggering multiple checkpoints with the same lame worker.
  logging.info('Waiting %dms for worker shutdown.', timeout_ms)
  time.sleep(timeout_ms / 1000)

def shutdown(self, timeout_ms=10000):
  """Shut down all workers after `timeout_ms` milliseconds."""
  logging.info('Shutting down %s.', self)
  req = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
      shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT)
  self.configure(req)

  # Wait for workers to shut down.
  sleep_sec = 10.0 + timeout_ms / 1000
  logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec)
  time.sleep(sleep_sec)

def shutdown(self, timeout_ms=10000):
  """Shut down all workers after `timeout_ms` milliseconds."""
  logging.info('Shutting down %s.', self)
  req = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
      shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
  self.configure(req)

  # Wait for workers to shut down.  If we continue immediately, we can create
  # a new heartbeat manager before the workers shut down: this keeps the
  # workers alive and can introduce confusing behavior.
  sleep_sec = 10.0 + timeout_ms / 1000
  logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec)
  time.sleep(sleep_sec)

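# Usage sketch (not from the original source): driving a coordinated shutdown
# from monitoring code.  Assumptions: an open `session` against the cluster
# master, and the TF 1.x module path below (the same manager also shipped
# earlier under tensorflow.contrib.tpu).
from tensorflow.python.tpu import session_support

def request_coordinated_shutdown(session):
  # Build a heartbeat manager over every worker visible to this session,
  # then ask the workers to hold on until the coordinator checkpoints.
  workers = session_support.WorkerHeartbeatManager.from_devices(
      session, session_support.all_worker_devices(session))
  workers.shutdown(timeout_ms=10000)
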
def shutdown(self, wait_time_in_ms=60000, exit_code=None):
  """Shut down all workers after `wait_time_in_ms` milliseconds."""
  logging.info('Shutting down %s.', self)
  req = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=event_pb2.WatchdogConfig(timeout_ms=wait_time_in_ms),
      shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT,
      exit_code=event_pb2.RequestedExitCode(
          exit_code=exit_code) if exit_code is not None else None)
  self.configure(req)

  # Wait for workers to shut down.
  sleep_sec = 10.0 + wait_time_in_ms / 1000
  logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec)
  time.sleep(sleep_sec)

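# Usage sketch (not from the original source): asking workers to exit with a
# specific process exit code, e.g. so a cluster scheduler can tell a
# deliberate restart from a crash.  The exit code value, function name, and
# `session` argument are illustrative assumptions.
def restart_workers(session, code=42):
  workers = session_support.WorkerHeartbeatManager.from_devices(
      session, session_support.all_worker_devices(session))
  # Workers exit on their own once the timeout elapses, using `code`.
  workers.shutdown(wait_time_in_ms=60000, exit_code=code)
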
def ping(self, request=None, timeout_in_ms=5000):
  """Ping all workers, returning the parsed status results."""
  if request is None:
    request = event_pb2.WorkerHeartbeatRequest()

  options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
  results = self._session.run(
      self._ops,
      feed_dict={self._request_placeholder: request.SerializeToString()},
      options=options)
  parsed_results = [
      event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
      for res_pb in results
  ]
  logging.debug('Ping results: %s', parsed_results)
  return parsed_results

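# Usage sketch (not from the original source): polling worker health.  The
# `health_status` and `hostname` fields come from WorkerHeartbeatResponse in
# tensorflow/core/util/event.proto; `workers` is assumed to be a
# WorkerHeartbeatManager built via `from_devices` as above.
def log_worker_health(workers):
  for response in workers.ping(timeout_in_ms=5000):
    logging.info('Worker %s health: %s', response.hostname,
                 response.health_status)
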
def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
  # N.B. We have to pull the global step here to avoid it being unavailable
  # at checkpoint time; the graph has been frozen at that point.
  if training_util.get_global_step() is None and self.saver() is not None:
    raise ValueError(
        'Saver defined but no global step. Run `get_or_create_global_step()`'
        ' in your model definition to allow checkpointing.')

  with self._graph.as_default():
    self._session = session_lib.Session(
        target=training_session.sess_str, graph=self._graph)
    self._workers = WorkerHeartbeatManager.from_devices(
        self._session, all_worker_devices(self._session))

  self._workers.configure(
      event_pb2.WorkerHeartbeatRequest(
          shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))

def _reset_manager(self):
  """Reset the graph, session and worker manager."""
  self._graph = ops.Graph()
  self._session = session_lib.Session(
      target=self._target,
      graph=self._graph,
  )

  if self._devices is None:
    self._devices = all_worker_devices(self._session)

  with self._graph.as_default():
    self._worker_manager = WorkerHeartbeatManager.from_devices(
        self._session, self._devices)

  self._worker_manager.configure(
      event_pb2.WorkerHeartbeatRequest(
          watchdog_config=event_pb2.WatchdogConfig(
              timeout_ms=self.shutdown_timeout * 1000,)))

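# Recovery sketch (assumption, not from the original source): a watchdog loop
# might rebuild its session and manager when the old session appears wedged.
# `errors` refers to tensorflow.python.framework.errors; the method name
# `_heal` is hypothetical.
def _heal(self):
  try:
    self._worker_manager.ping()
  except errors.OpError:
    # The old session may be unusable; rebuild graph, session and manager.
    self._reset_manager()
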
def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
  # N.B. We have to pull the global step here to avoid it being unavailable
  # at checkpoint time; the graph has been frozen at that point.
  if training_util.get_global_step() is None and self.saver() is not None:
    raise ValueError(
        'Saver defined but no global step. Run `get_or_create_global_step()`'
        ' in your model definition to allow checkpointing.')

  with self._graph.as_default():
    logging.info('Installing graceful shutdown hook.')
    self._session = _clone_session(training_session, self._graph)
    self._workers = WorkerHeartbeatManager.from_devices(
        self._session, all_worker_devices(self._session))
    self._heartbeat_supported = self._workers.num_workers() > 0
    if self._heartbeat_supported:
      self._workers.configure(
          event_pb2.WorkerHeartbeatRequest(
              shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
    else:
      logging.warn(
          'No workers support heartbeats. Failure handling will be disabled.')

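# Usage sketch (not from the original source): installing the shutdown hook
# on a TF 1.x MonitoredTrainingSession.  The `GracefulShutdownHook`
# constructor arguments, the checkpoint paths, and `train_op` are
# illustrative assumptions.
import tensorflow as tf

hooks = [session_support.GracefulShutdownHook(
    checkpoint_prefix='/tmp/model/model.ckpt')]
with tf.train.MonitoredTrainingSession(
    checkpoint_dir='/tmp/model', hooks=hooks) as sess:
  while not sess.should_stop():
    sess.run(train_op)
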
def shutdown(self, timeout_ms=10000):
  """Shut down all workers after `timeout_ms` milliseconds."""
  req = event_pb2.WorkerHeartbeatRequest(
      watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms))
  self.configure(req)

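# Sketch (assumption, not from the original source): unlike the variants
# above, this one sets no shutdown_mode and only arms the per-worker
# watchdog, so workers exit once `timeout_ms` passes without a heartbeat.
# A coordinator that wants to keep them alive must ping on a shorter
# interval; `workers` is an assumed WorkerHeartbeatManager.
import time

def keep_alive(workers, interval_sec=5):
  while True:
    workers.ping()  # each heartbeat resets the workers' watchdog timers
    time.sleep(interval_sec)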