async def inner():
    """Expect WorkflowAlreadyLocked when a second locker wants a held workflow.

    ``self`` comes from the enclosing scope (it provides ``assertRaises``).
    """
    async with PgRenderLocker() as holder, PgRenderLocker() as contender:
        async with holder.render_lock(1) as held_lock:
            # Workflow 1 is locked through `holder`, so the attempt made
            # through `contender` is expected to raise.
            with self.assertRaises(WorkflowAlreadyLocked):
                async with contender.render_lock(1) as refused_lock:
                    await refused_lock.stall_others()
            await held_lock.stall_others()
async def inner():
    """Lock workflow 1 and workflow 2 through two lockers at the same time.

    Nothing is asserted explicitly: the scenario passes when no exception
    is raised.
    """
    async with PgRenderLocker() as first_locker, PgRenderLocker() as second_locker:
        async with first_locker.render_lock(1) as first_lock:
            # do not raise WorkflowAlreadyLocked here: it's a
            # different workflow
            async with second_locker.render_lock(2) as second_lock:
                await second_lock.stall_others()
            await first_lock.stall_others()
async def inner():
    """Lock workflow 1, release it, then lock it again from another locker.

    Nothing is asserted explicitly: the scenario passes when no exception
    is raised.
    """
    async with PgRenderLocker() as first_locker, PgRenderLocker() as second_locker:
        async with first_locker.render_lock(1) as first_lock:
            await first_lock.stall_others()

        # The first lock has been released by now, so this must
        # not raise WorkflowAlreadyLocked.
        async with second_locker.render_lock(1) as second_lock:
            await second_lock.stall_others()
async def inner():
    """Check that a locker is usable again after raising WorkflowAlreadyLocked.

    ``self`` comes from the enclosing scope (it provides ``assertRaises``).
    """
    async with PgRenderLocker() as holder, PgRenderLocker() as contender:
        async with holder.render_lock(1) as held_lock:
            # "break" contender: make it raise an exception
            with self.assertRaises(WorkflowAlreadyLocked):
                async with contender.render_lock(1) as refused_lock:
                    await refused_lock.stall_others()
            await held_lock.stall_others()

        # now contender should be reset to its original state --
        # meaning it can acquire a lock just fine
        async with contender.render_lock(1) as retry_lock:
            await retry_lock.stall_others()
async def inner():
    """Run five ``use_lock()`` calls concurrently on one locker.

    ``use_lock`` comes from the enclosing scope. All five calls are awaited
    to completion; afterwards each result is inspected so that an exception
    raised inside any of them is re-raised here.
    """
    async with PgRenderLocker() as locker:
        # asyncio.wait() needs Tasks/Futures: handing it bare coroutines was
        # deprecated in Python 3.8 and raises TypeError since Python 3.11.
        tasks = {asyncio.create_task(use_lock(locker, i)) for i in range(5)}
        done, _ = await asyncio.wait(tasks)
        for task in done:
            task.result()  # throw error, if any
async def queue_fetches(pg_render_locker: PgRenderLocker):
    """Send every pending fetch to RabbitMQ, skipping locked workflows.

    Each step is flagged is_busy=True as it is queued, so we don't send
    double-fetches.
    """
    for workflow_id, step_id in await load_pending_steps():
        # Skip the fetch while the workflow is being rendered.
        #
        # A fetch can still be scheduled while a render is merely _queued_,
        # so this solves no races. It should, however, reduce the number of
        # fetches of resource-intensive workflows.
        #
        # Going through pg_render_locker means a fetch can only be queued
        # _between_ renders. The fetch/render queues may be non-empty (we
        # aren't checking); but renderers get a chance to work through some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as render_lock:
                # Right now the workflow isn't rendering: go ahead and
                # queue the fetch.
                await render_lock.stall_others()  # required by the PgRenderLocker API
                logger.info("Queue fetch of step(%d, %d)", workflow_id, step_id)
                await set_step_busy(step_id)
                busy_update = clientside.Update(
                    steps={step_id: clientside.StepUpdate(is_busy=True)}
                )
                await rabbitmq.send_update_to_workflow_clients(
                    workflow_id, busy_update
                )
                await rabbitmq.queue_fetch(workflow_id, step_id)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. This Step comes up again the next time
            # we query for pending fetches.
            continue
async def main():
    """Periodically queue fetches for users' "automatic updates".

    Meant to run forever, as a singleton daemon. The function never returns
    normally: once the RabbitMQ connection's ``closed`` future is done, it
    either re-raises that future's error or raises RuntimeError.
    """
    # These imports must run AFTER django.setup()
    from .autoupdate import queue_fetches
    from cjwstate import rabbitmq
    from cjwstate.rabbitmq.connection import open_global_connection

    async with PgRenderLocker() as pg_render_locker:
        async with open_global_connection() as rabbitmq_connection:
            await rabbitmq_connection.exchange_declare(rabbitmq.GroupsExchange)
            await rabbitmq_connection.queue_declare(rabbitmq.Fetch, durable=True)

            while not rabbitmq_connection.closed.done():
                tick_started = time.time()

                await benchmark(
                    logger, queue_fetches(pg_render_locker), "queue_fetches()"
                )

                # Aim for the start of the next interval. Canonical example
                # is FetchInterval=60: queue all our fetches as soon as the
                # minute hand of the clock moves.
                intervals_elapsed = math.floor(tick_started / FetchInterval)
                next_tick = (intervals_elapsed + 1) * FetchInterval
                pause = max(0, next_tick - time.time())

                # Sleep ... or wake up early if RabbitMQ dies. The wait
                # itself only returns; the `await` after the loop is what
                # surfaces a connection error.
                await asyncio.wait({rabbitmq_connection.closed}, timeout=pause)

            await rabbitmq_connection.closed  # raise on failure
            # Now, raise on _success_! We should never get here
            raise RuntimeError(
                "RabbitMQ closed successfully. That's strange because cron never closes it."
            )
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
) -> None:
    """Render under an advisory lock; queue a follow-up render if needed.

    When another render already holds the lock on this Workflow, nothing is
    rendered here: starting from scratch would only waste CPU cycles. The
    render in progress is expected to exit at its next stale database-write
    and then re-schedule a render itself.

    A Workflow that no longer exists is skipped, with a log message.
    """
    # Look the workflow up before locking. No lock is needed for this, and
    # skipping the lock lets us dismiss spurious renders sooner, so they
    # don't fill the render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as render_lock:
            # any error leads to undefined behavior
            outcome = await render_workflow_once(workflow, delta_id)

            # From here on, decide whether a follow-up render is needed.
            await render_lock.stall_others()

            if outcome == RenderResult.MUST_REQUEUE:
                should_requeue = True
            elif outcome == RenderResult.MUST_NOT_REQUEUE:
                should_requeue = False
            else:
                # Undecided: requeue only if the workflow has moved past
                # the delta we just rendered.
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    should_requeue = workflow.last_delta_id != delta_id
                    if should_requeue:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                except Workflow.DoesNotExist:
                    logger.info("Skipping requeue of deleted Workflow %d", workflow_id)
                    should_requeue = False

            if should_requeue:
                await rabbitmq.queue_render(workflow_id, workflow.last_delta_id)
                # This is why we used `render_lock.stall_others()`: after
                # requeue, another renderer may try to lock this workflow
                # and we want that lock to _succeed_ -- not raise
                # WorkflowAlreadyLocked.

            # The message must only be acked _after_ the requeue. That
            # preserves our invariant: if we schedule a render, there is
            # always an un-acked render for that workflow queued in RabbitMQ
            # until the workflow is up-to-date. (At this exact moment, there
            # are briefly two un-acked renders.)
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring", workflow_id)
async def inner():
    """Lock workflow 2 twice in a row while the same locker holds workflow 1.

    Nothing is asserted explicitly: the scenario passes when no exception
    is raised.
    """
    async with PgRenderLocker() as locker:
        async with locker.render_lock(1) as outer_lock:
            # First lock/release cycle on the other workflow...
            async with locker.render_lock(2) as inner_lock:
                await inner_lock.stall_others()

            # ...and a second one, still inside the lock on workflow 1.
            async with locker.render_lock(2) as inner_lock_again:
                await inner_lock_again.stall_others()

            await outer_lock.stall_others()
async def inner():
    """Check that, after ``stall_others()``, a second locker waits for the lock.

    ``progress`` records how far the competing coroutine got. It must still
    hold its initial value while the first lock is held, and must read
    'exited stalling_op' once the competing task has been awaited.
    ``self`` comes from the enclosing scope (it provides ``assertEqual``).
    """
    async with PgRenderLocker() as holder, PgRenderLocker() as waiter:
        progress = 'the initial value'

        async with holder.render_lock(1) as held_lock:
            await held_lock.stall_others()

            async def stalling_op():
                nonlocal progress
                async with waiter.render_lock(1) as late_lock:
                    progress = 'entered stalling_op'
                    await late_lock.stall_others()
                progress = 'exited stalling_op'

            task = asyncio.create_task(stalling_op())
            await asyncio.sleep(0)  # give stalling_op() a chance to start
            # Even though we started stalling_op(), it will stall
            # rather than acquire a lock.
            self.assertEqual(progress, 'the initial value')

        # The first lock is released now, so the task can finish.
        await task
        self.assertEqual(progress, 'exited stalling_op')
async def main_loop():
    """Consume the render queue until the RabbitMQ connection closes.

    Run fetchers and renderers, forever.
    """
    async with PgRenderLocker() as pg_render_locker:

        @rabbitmq.manual_acking_callback
        async def on_render_message(message, ack):
            # Hand every message to handle_render(), sharing one locker.
            return await handle_render(message, ack, pg_render_locker)

        conn = rabbitmq.get_connection()
        conn.declare_queue_consume(rabbitmq.Render, on_render_message)

        # Run forever: block until the connection's closed-event is set.
        await conn._closed_event.wait()
async def queue_fetches_forever():
    """Call ``queue_fetches()`` once per FetchInterval, without ever returning.

    A single PgRenderLocker is opened up front and shared by all iterations.
    """
    async with PgRenderLocker() as pg_render_locker:
        while True:
            tick_started = time.time()

            await benchmark(
                logger, queue_fetches(pg_render_locker), 'queue_fetches()'
            )

            # Aim for the start of the next interval. Canonical example is
            # FetchInterval=60: queue all our fetches as soon as the minute
            # hand of the clock moves.
            intervals_elapsed = math.floor(tick_started / FetchInterval)
            next_tick = (intervals_elapsed + 1) * FetchInterval
            pause = max(0, next_tick - time.time())
            await asyncio.sleep(pause)
async def queue_fetches(pg_render_locker: PgRenderLocker):
    """
    Send every pending fetch to RabbitMQ, skipping locked workflows.

    Each module is flagged is_busy=True as it is queued, so we don't send
    double-fetches.
    """
    for workflow_id, wf_module in await load_pending_wf_modules():
        # Skip the fetch while the workflow is being rendered.
        #
        # A fetch can still be scheduled while a render is merely _queued_,
        # so this solves no races. It should, however, reduce the number of
        # fetches of resource-intensive workflows.
        #
        # Going through pg_render_locker means a fetch can only be queued
        # _between_ renders. The fetch/render queues may be non-empty (we
        # aren't checking); but renderers get a chance to work through some
        # backlog.
        try:
            async with pg_render_locker.render_lock(workflow_id) as render_lock:
                # Right now the workflow isn't rendering: go ahead and
                # queue the fetch.
                await render_lock.stall_others()  # required by the PgRenderLocker API
                logger.info(
                    "Queue fetch of wf_module(%d, %d)", workflow_id, wf_module.id
                )
                await set_wf_module_busy(wf_module)
                busy_delta = {
                    "updateWfModules": {
                        str(wf_module.id): {"is_busy": True, "fetch_error": ""}
                    }
                }
                await websockets.ws_client_send_delta_async(workflow_id, busy_delta)
                await rabbitmq.queue_fetch(wf_module)
        except WorkflowAlreadyLocked:
            # Don't queue a fetch. This WfModule comes up again the next
            # time we query for pending fetches.
            continue
async def main():
    """Consume render requests from RabbitMQ and handle them, forever.

    Run fetchers and renderers, forever. An error raised while handling a
    message propagates out of this function.
    """
    # import AFTER django.setup()
    import cjwstate.modules
    from cjworkbench.pg_render_locker import PgRenderLocker
    from cjwstate import rabbitmq
    from cjwstate.rabbitmq.connection import open_global_connection
    from .render import handle_render

    cjwstate.modules.init_module_system()

    async with PgRenderLocker() as pg_render_locker:
        async with open_global_connection() as rabbitmq_connection:
            await rabbitmq_connection.queue_declare(rabbitmq.Render, durable=True)
            await rabbitmq_connection.exchange_declare(rabbitmq.GroupsExchange)

            # Render; ack; render; ack ... forever.
            render_queue = rabbitmq_connection.acking_consumer(rabbitmq.Render)
            async with render_queue as consumer:
                async for raw_message in consumer:
                    render_request = msgpack.unpackb(raw_message)
                    # Crash on error, and don't ack.
                    await handle_render(render_request, pg_render_locker)
async def render_workflow_and_maybe_requeue(
    pg_render_locker: PgRenderLocker,
    workflow_id: int,
    delta_id: int,
    ack: Callable[[], Awaitable[None]],
    requeue: Callable[[int, int], Awaitable[None]],
) -> None:
    """Render under an advisory lock, requeue if needed, then ack.

    When another render already holds the lock on this Workflow, nothing is
    rendered here: starting from scratch would only waste CPU cycles. The
    render in progress is expected to exit at its next stale database-write
    and then re-schedule a render itself.

    ``ack`` is awaited on every path that returns normally (deleted
    workflow, finished render, workflow locked elsewhere). ``requeue`` is
    awaited with ``(workflow_id, last_delta_id)`` when a follow-up render is
    wanted. A DatabaseError or InterfaceError is logged and then terminates
    the process via ``os._exit(1)``.
    """
    # Look the workflow up before locking. No lock is needed for this, and
    # skipping the lock lets us dismiss spurious renders sooner, so they
    # don't fill the render queue.
    try:
        workflow = await _lookup_workflow(workflow_id)
    except Workflow.DoesNotExist:
        logger.info("Skipping render of deleted Workflow %d", workflow_id)
        await ack()
        return

    try:
        async with pg_render_locker.render_lock(workflow_id) as render_lock:
            # Any error raised here (cancellation, database trouble, ...) is
            # undefined behavior: it simply propagates.
            outcome = await render_workflow_once(workflow, delta_id)

            # From here on, decide whether a follow-up render is needed.
            await render_lock.stall_others()

            if outcome == RenderResult.MUST_REQUEUE:
                should_requeue = True
            elif outcome == RenderResult.MUST_NOT_REQUEUE:
                should_requeue = False
            else:
                # Undecided: requeue only if the workflow has moved past
                # the delta we just rendered.
                try:
                    workflow = await _lookup_workflow(workflow_id)
                    should_requeue = workflow.last_delta_id != delta_id
                    if should_requeue:
                        logger.info(
                            "Requeueing render(workflow=%d, delta=%d)",
                            workflow_id,
                            workflow.last_delta_id,
                        )
                except Workflow.DoesNotExist:
                    logger.info("Skipping requeue of deleted Workflow %d", workflow_id)
                    should_requeue = False

            if should_requeue:
                await requeue(workflow_id, workflow.last_delta_id)
                # This is why we used `render_lock.stall_others()`: after
                # requeue, another renderer may try to lock this workflow
                # and we want that lock to _succeed_ -- not raise
                # WorkflowAlreadyLocked.

            # Only ack() _after_ requeue. That preserves our invariant: if
            # we schedule a render, there is always an un-acked render for
            # that workflow queued in RabbitMQ until the workflow is
            # up-to-date. (At this exact moment, there are briefly two
            # un-acked renders.)
            await ack()
    except WorkflowAlreadyLocked:
        logger.info("Workflow %d is being rendered elsewhere; ignoring", workflow_id)
        await ack()
    except (DatabaseError, InterfaceError):
        # Possibilities:
        #
        # 1. There's a bug in renderer.execute. This may leave the event
        #    loop's executor thread's database connection in an inconsistent
        #    state. [2018-11-06 saw this on production.] The best way to
        #    clear up the leaked, broken connection is to die. (Our parent
        #    process should restart us, and RabbitMQ will give the job to
        #    someone else.)
        #
        # 2. The database connection died (e.g., Postgres went away). The
        #    best way to clear up the leaked, broken connection is to die.
        #    (Our parent process should restart us, and RabbitMQ will give
        #    the job to someone else.)
        #
        # 3. PgRenderLocker's database connection died (e.g., Postgres went
        #    away). We haven't seen this much in practice; so let's die and
        #    let the parent process restart us.
        #
        # 4. There's some design flaw we haven't thought of, and we
        #    shouldn't ever render this workflow. If this is the case, we're
        #    doomed.
        #
        # If you're seeing this error that means there's a bug somewhere
        # _else_. If you're staring at a case-3 situation, please remember
        # that cases 1 and 2 are important, too.
        logger.exception("Fatal database error; exiting")
        os._exit(1)