async def daemon_killer(
        *,
        settings: configuration.OperatorSettings,
        memories: containers.ResourceMemories,
) -> None:
    """
    An operator's root task to kill the daemons on the operator's shutdown.
    """

    # Sleep forever, or until cancelled, which happens when the operator begins its shutdown.
    try:
        await asyncio.Event().wait()

    # Terminate all running daemons when the operator exits (and this task is cancelled).
    finally:
        coros = [
            stop_daemon(daemon=daemon, settings=settings)
            for memory in memories.iter_all_memories()
            for daemon in memory.running_daemons.values()
        ]
        if coros:
            await asyncio.wait(coros)
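
# A minimal, self-contained sketch (not part of the codebase above) of the pattern used by
# the first daemon_killer(): a root task sleeps on a never-set asyncio.Event and performs
# its cleanup in the `finally` clause once the task is cancelled at shutdown. The names
# `_sketch_root_task` and `_sketch_main` are hypothetical and exist only for illustration.
import asyncio


async def _sketch_root_task() -> None:
    try:
        # Sleeps forever: the event is never set, so only cancellation can break the wait.
        await asyncio.Event().wait()
    finally:
        # Runs on cancellation; in the real task, this is where the stop_daemon() coroutines
        # would be awaited.
        print("cancelled: running shutdown cleanup")


async def _sketch_main() -> None:
    task = asyncio.create_task(_sketch_root_task())
    await asyncio.sleep(0.1)    # The "operator" runs for a while...
    task.cancel()               # ...and then begins its shutdown.
    try:
        await task
    except asyncio.CancelledError:
        pass
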
async def daemon_killer(
        *,
        settings: configuration.OperatorSettings,
        memories: containers.ResourceMemories,
        operator_paused: primitives.ToggleSet,
) -> None:
    """
    An operator's root task to kill the daemons on the operator's demand.

    The "demand" comes in two cases: when the operator is exiting (gracefully
    or not), and when the operator is pausing because of peering. In that case,
    all watch-streams are disconnected, and all daemons/timers should stop.

    When pausing, the daemons/timers are stopped via their regular stopping
    procedure: with graceful or forced termination, backoffs, timeouts.

    .. warning::

        Each daemon will be respawned on the next K8s watch-event strictly
        after the previous daemon is fully stopped.
        There are never 2 instances of the same daemon running in parallel.

        In normal cases (enough time is given to stop), this is usually done
        by the post-pause re-listing event. In rare cases when the re-pausing
        happens faster than the daemon is stopped (highly unlikely to happen),
        that event can be missed because the daemon is still being stopped,
        so the respawn can happen with a significant delay.

        This issue is considered low-priority & auxiliary, as is the peering
        itself. It can be fixed later. Workaround: make the daemons exit fast.
    """
    # Unlimited job pool size -- the same as if we were managing the tasks directly.
    # Unlimited timeout in `close()` -- since we have our own per-daemon timeout management.
    scheduler: aiojobs.Scheduler = await aiojobs.create_scheduler(limit=None, close_timeout=99999)
    try:
        while True:

            # Stay here while the operator is running normally, until it is paused.
            await operator_paused.wait_for(True)

            # The stopping tasks are "fire-and-forget" -- we do not get (or care about) the result.
            # The daemons remain resumable, since they do not exit of their own accord.
            for memory in memories.iter_all_memories():
                for daemon in memory.running_daemons.values():
                    await scheduler.spawn(stop_daemon(
                        settings=settings,
                        daemon=daemon,
                        reason=primitives.DaemonStoppingReason.OPERATOR_PAUSING))

            # Stay here while the operator is paused, until it is resumed.
            # The fresh stream of watch-events will spawn new daemons naturally.
            await operator_paused.wait_for(False)

    # Terminate all running daemons when the operator exits (and this task is cancelled).
    finally:
        for memory in memories.iter_all_memories():
            for daemon in memory.running_daemons.values():
                await scheduler.spawn(stop_daemon(
                    settings=settings,
                    daemon=daemon,
                    reason=primitives.DaemonStoppingReason.OPERATOR_EXITING))
        await scheduler.close()
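
# A minimal, self-contained sketch (not the framework's code) of the pause/resume waiting
# pattern used above. `_SketchToggle` is a hypothetical stand-in for the `primitives.ToggleSet`
# interface assumed here: `wait_for(True)` returns once the toggle is on, `wait_for(False)`
# once it is off. The loop mirrors daemon_killer(): stop daemons on pausing, idle until resumed.
import asyncio


class _SketchToggle:
    def __init__(self) -> None:
        self._on = asyncio.Event()      # Set while "paused".
        self._off = asyncio.Event()     # Set while "running".
        self._off.set()

    def turn_to(self, value: bool) -> None:
        (self._on if value else self._off).set()
        (self._off if value else self._on).clear()

    async def wait_for(self, value: bool) -> None:
        await (self._on if value else self._off).wait()


async def _sketch_pause_resume_loop(paused: _SketchToggle) -> None:
    while True:
        await paused.wait_for(True)     # Paused: fire-and-forget stop jobs would be spawned here.
        print("paused: stopping daemons")
        await paused.wait_for(False)    # Resumed: fresh watch-events respawn the daemons.
        print("resumed: daemons will respawn on new watch-events")

# In use, a separate task (e.g., the peering logic) would flip the hypothetical toggle:
#   paused.turn_to(True)   # pause: the daemons get stopped
#   paused.turn_to(False)  # resume: the daemons respawn on the next watch-events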