def test_handlers_do_not_block(SlackClient, container_factory, config, tracker):
    """Check that one blocked RTM handler does not hold up the other.

    Both handlers wait on their own gate event before reporting to the
    tracker; the gates are opened in reverse order and the tracker calls
    are inspected after each step.
    """
    work_1 = Event()
    work_2 = Event()

    class Service:
        name = 'sample'

        @rtm.handle_event
        def handle_1(self, event):
            work_1.wait()
            tracker.handle_1(event)

        @rtm.handle_event
        def handle_2(self, event):
            work_2.wait()
            tracker.handle_2(event)

    events = [{'spam': 'ham'}]

    def fake_rtm_read():
        # hand out the single queued event once, then report nothing
        return [events.pop(0)] if events else []

    SlackClient.return_value.rtm_read.side_effect = fake_rtm_read

    container = container_factory(Service, config)
    container.start()

    delivered = [call({'spam': 'ham'})]

    try:
        # both handlers are still working
        assert (tracker.handle_1.call_args_list == [])
        assert (tracker.handle_2.call_args_list == [])

        # finish work of the second handler
        work_2.send()
        sleep(0.1)

        # second handler is done
        assert (tracker.handle_1.call_args_list == [])
        assert (tracker.handle_2.call_args_list == delivered)

        # finish work of the first handler
        work_1.send()
        sleep(0.1)

        # first handler is done
        assert (tracker.handle_1.call_args_list == delivered)
        assert (tracker.handle_2.call_args_list == delivered)
    finally:
        # release any handler that is still blocked
        for gate in (work_1, work_2):
            if not gate.ready():
                gate.send()
class Client(object):
    """Scripted stand-in for an etcd client.

    Results are queued with the ``add_*`` methods and consumed in order by
    :meth:`read` and :meth:`write`.  When the queue runs dry the
    ``no_more_results`` event is fired and the caller is parked until
    ``stop`` fires (or 5 seconds pass).
    """

    def __init__(self):
        self.results = []
        self.stop = Event()
        self.no_more_results = Event()
        self.failure = None

    def _next_result(self, expected_op, method_name):
        """Pop the next scripted result and check that it is for *expected_op*.

        Raises ``NoMoreResults`` when the script is exhausted and
        ``UnexpectedResultType`` (after recording ``self.failure``) when the
        next result belongs to the other operation.
        """
        try:
            result = self.results.pop(0)
        except IndexError:
            # Tell the test that the script has been fully consumed, then
            # wait for it to signal stop before bailing out.
            if not self.no_more_results.ready():
                self.no_more_results.send()
            eventlet.with_timeout(5, self.stop.wait)
            raise NoMoreResults()
        if result.op != expected_op:
            self.failure = "Unexpected result type for %s(): %s" % (
                method_name, result.op)
            raise UnexpectedResultType()
        return result

    def read(self, path, **kwargs):
        """Return the next scripted read result, or raise its exception."""
        result = self._next_result(READ, "read")
        if result.exception is not None:
            log.debug("Raise read exception %s",
                      type(result.exception).__name__)
            raise result.exception
        log.debug("Return read result %s", result)
        return result

    def write(self, path, value, **kwargs):
        """Return the next scripted write result, or raise its exception."""
        log.debug("Write of %s to %s", value, path)
        result = self._next_result(WRITE, "write")
        if result.exception is not None:
            log.debug("Raise write exception %s", result.exception)
            raise result.exception
        log.debug("Return write result")
        return result

    def add_read_exception(self, exception):
        """Queue *exception* to be raised by a later :meth:`read`."""
        assert (isinstance(exception, Exception))
        self.results.append(EtcdResult(exception=exception))

    def add_read_result(self, **kwargs):
        """Queue a read result built from *kwargs*."""
        self.results.append(EtcdResult(**kwargs))

    def add_write_result(self):
        """Queue a successful write result."""
        # Write results have no useful content.
        self.results.append(EtcdResult(op=WRITE))

    def add_write_exception(self, exception):
        """Queue *exception* to be raised by a later :meth:`write`."""
        self.results.append(EtcdResult(op=WRITE, exception=exception))
class Client(object):
    """Fake etcd client that replays a pre-loaded list of results."""

    def __init__(self):
        self.results = []
        self.stop = Event()
        self.no_more_results = Event()
        self.failure = None

    def read(self, path, **kwargs):
        """Consume the next queued result, which must be a read."""
        try:
            outcome = self.results.pop(0)
        except IndexError:
            # script exhausted: announce it once, then wait to be stopped
            if not self.no_more_results.ready():
                self.no_more_results.send()
            eventlet.with_timeout(5, self.stop.wait)
            raise NoMoreResults()

        if result_is_not(outcome, READ):
            self.failure = "Unexpected result type for read(): %s" % outcome.op
            raise UnexpectedResultType()

        error = outcome.exception
        if error is None:
            log.debug("Return read result %s", outcome)
            return outcome
        log.debug("Raise read exception %s", type(error).__name__)
        raise error

    def write(self, path, value, **kwargs):
        """Consume the next queued result, which must be a write."""
        log.debug("Write of %s to %s", value, path)
        try:
            outcome = self.results.pop(0)
        except IndexError:
            # script exhausted: announce it once, then wait to be stopped
            if not self.no_more_results.ready():
                self.no_more_results.send()
            eventlet.with_timeout(5, self.stop.wait)
            raise NoMoreResults()

        if result_is_not(outcome, WRITE):
            self.failure = "Unexpected result type for write(): %s" % outcome.op
            raise UnexpectedResultType()

        error = outcome.exception
        if error is None:
            log.debug("Return write result")
            return outcome
        log.debug("Raise write exception %s", error)
        raise error

    def add_read_exception(self, exception):
        """Script a read that raises *exception*."""
        assert(isinstance(exception, Exception))
        queued = EtcdResult(exception=exception)
        self.results.append(queued)

    def add_read_result(self, **kwargs):
        """Script a read that returns an ``EtcdResult`` built from *kwargs*."""
        queued = EtcdResult(**kwargs)
        self.results.append(queued)

    def add_write_result(self):
        """Script a successful write."""
        # Write results have no useful content.
        queued = EtcdResult(op=WRITE)
        self.results.append(queued)

    def add_write_exception(self, exception):
        """Script a write that raises *exception*."""
        queued = EtcdResult(op=WRITE, exception=exception)
        self.results.append(queued)


def result_is_not(outcome, expected_op):
    """Return True when *outcome* was scripted for a different operation."""
    return outcome.op != expected_op
def test_defer_event(ctx):
    """A deferred event reaches an event-bus listener after its due time."""
    from datetime import datetime, timedelta
    from eventlet import sleep, spawn, with_timeout
    from eventlet.event import Event
    from melkman.messaging import EventBus
    from melkman.scheduler import defer_event
    from melkman.scheduler.worker import ScheduledMessageService

    CHAN = 'test_chan'

    service = ScheduledMessageService(ctx)
    scheduler_thread = spawn(service.run)

    got_message = Event()

    def on_message(*args, **kw):
        got_message.send(True)

    bus = EventBus(ctx)
    bus.add_listener(CHAN, on_message)

    # schedule the event two seconds from now
    due = datetime.utcnow() + timedelta(seconds=2)
    defer_event(due, CHAN, {'foo': 'bar'}, ctx)

    sleep(3)

    try:
        with_timeout(10, got_message.wait)
        assert got_message.ready()
    finally:
        bus.kill()
        scheduler_thread.kill()
        scheduler_thread.wait()
def test_fail_fast_imap():
    """fail_fast_imap re-raises the first failure and kills the other calls."""
    # One call that fails immediately...
    failing_exception = Exception()

    def failing_call():
        raise failing_exception

    # ...and one that would succeed if it were allowed to finish.
    slow_call_returned = Event()

    def slow_call():
        sleep(5)
        slow_call_returned.send()  # pragma: no cover

    def invoke(fn):
        return fn()

    pool = GreenPool(2)
    calls = [slow_call, failing_call]

    # fail_fast_imap fails as soon as the exception is raised
    with pytest.raises(Exception) as raised_exc:
        list(fail_fast_imap(pool, invoke, calls))
    assert raised_exc.value == failing_exception

    # The slow call won't go past the sleep as it was killed
    assert not slow_call_returned.ready()
    assert pool.free() == 2
class Queue(LightQueue):
    '''Create a queue object with a given maximum size.

    If *maxsize* is less than zero or ``None``, the queue size is infinite.

    ``Queue(0)`` is a channel, that is, its :meth:`put` method always blocks
    until the item is delivered. (This is unlike the standard :class:`Queue`,
    where 0 means infinite size).

    In all other respects, this Queue class resembles the standard library,
    :class:`Queue`.
    '''

    def __init__(self, maxsize=None):
        LightQueue.__init__(self, maxsize)
        # number of items put but not yet acknowledged via task_done()
        self.unfinished_tasks = 0
        # fired when unfinished_tasks drops to zero; reset on the next put
        self._cond = Event()

    def _format(self):
        result = LightQueue._format(self)
        if self.unfinished_tasks:
            result += ' tasks=%s _cond=%s' % (
                self.unfinished_tasks, self._cond)
        return result

    def _put(self, item):
        LightQueue._put(self, item)
        self._put_bookkeeping()

    def _put_bookkeeping(self):
        self.unfinished_tasks += 1
        # a previous "all done" signal is stale once new work arrives
        if self._cond.ready():
            self._cond.reset()

    def task_done(self):
        '''Indicate that a formerly enqueued task is complete. Used by queue
        consumer threads. For each :meth:`get <Queue.get>` used to fetch a
        task, a subsequent call to :meth:`task_done` tells the queue that the
        processing on the task is complete.

        If a :meth:`join` is currently blocking, it will resume when all items
        have been processed (meaning that a :meth:`task_done` call was
        received for every item that had been :meth:`put <Queue.put>` into the
        queue).

        Raises a :exc:`ValueError` if called more times than there were items
        placed in the queue.
        '''
        if self.unfinished_tasks <= 0:
            raise ValueError('task_done() called too many times')
        self.unfinished_tasks -= 1
        if self.unfinished_tasks == 0:
            self._cond.send(None)

    def join(self):
        '''Block until all items in the queue have been gotten and processed.

        The count of unfinished tasks goes up whenever an item is added to the
        queue. The count goes down whenever a consumer thread calls
        :meth:`task_done` to indicate that the item was retrieved and all work
        on it is complete. When the count of unfinished tasks drops to zero,
        :meth:`join` unblocks.

        Returns immediately when there are no unfinished tasks.
        '''
        # Only wait when there is outstanding work: on a queue that never had
        # an item put, _cond has never been sent and waiting on it would
        # block forever.
        if self.unfinished_tasks > 0:
            self._cond.wait()
class Queue(LightQueue):
    '''A :class:`LightQueue` that also tracks unfinished tasks.

    *maxsize* below zero or ``None`` means the queue is unbounded.

    ``Queue(0)`` behaves as a channel: :meth:`put` blocks until the item has
    been handed to a consumer (unlike the standard library :class:`Queue`,
    where 0 means unbounded).

    Apart from that the interface follows the standard library
    :class:`Queue`, including :meth:`task_done` and :meth:`join`.
    '''

    def __init__(self, maxsize=None):
        LightQueue.__init__(self, maxsize)
        self._cond = Event()
        self.unfinished_tasks = 0

    def _format(self):
        description = LightQueue._format(self)
        if not self.unfinished_tasks:
            return description
        extra = ' tasks=%s _cond=%s' % (self.unfinished_tasks, self._cond)
        return description + extra

    def _put(self, item):
        LightQueue._put(self, item)
        self._put_bookkeeping()

    def _put_bookkeeping(self):
        self.unfinished_tasks = self.unfinished_tasks + 1
        # new work invalidates an earlier "everything done" signal
        already_signalled = self._cond.ready()
        if already_signalled:
            self._cond.reset()

    def task_done(self):
        '''Mark one previously fetched item as fully processed.

        Consumers call this once per item obtained with
        :meth:`get <Queue.get>`.  When the number of unfinished tasks reaches
        zero, any greenthread blocked in :meth:`join` is released.

        Raises a :exc:`ValueError` if called more times than there were items
        placed in the queue.
        '''
        remaining = self.unfinished_tasks
        if remaining <= 0:
            raise ValueError('task_done() called too many times')
        remaining -= 1
        self.unfinished_tasks = remaining
        if remaining == 0:
            self._cond.send(None)

    def join(self):
        '''Wait until every item put into the queue has been processed.

        Each :meth:`put <Queue.put>` increments the unfinished-task count and
        each :meth:`task_done` decrements it.  Returns straight away when the
        count is already zero; otherwise blocks until it gets there.
        '''
        if self.unfinished_tasks <= 0:
            return
        self._cond.wait()
class TimerProvider(EntrypointProvider):
    """Entrypoint provider that asks the container for a worker on every tick.

    The tick interval defaults to ``interval`` and may be overridden from
    the container config under ``config_key``.
    """

    def __init__(self, interval, config_key):
        self._default_interval = interval
        self.config_key = config_key
        self.should_stop = Event()
        self.gt = None

    def prepare(self):
        """Resolve the effective interval, preferring the configured value."""
        if self.config_key:
            self.interval = self.container.config.get(
                self.config_key, self._default_interval)
        else:
            self.interval = self._default_interval

    def start(self):
        _log.debug('starting %s', self)
        self.gt = self.container.spawn_managed_thread(self._run)

    def stop(self):
        _log.debug('stopping %s', self)
        self.should_stop.send(True)
        self.gt.wait()

    def kill(self, exc):
        _log.debug('killing %s', self)
        self.gt.kill()

    def _run(self):
        ''' Runs the interval loop.

        This should not be called directly, rather the `start()` method
        should be used.
        '''
        while True:
            if self.should_stop.ready():
                break

            tick_started = time.time()
            self.handle_timer_tick()
            took = time.time() - tick_started

            # never sleep for a negative duration if the tick overran
            remaining = max(self.interval - took, 0)
            self._sleep_or_stop(remaining)

    def _sleep_or_stop(self, sleep_time):
        ''' Sleeps for `sleep_time` seconds or until a `should_stop` event
        has been fired, whichever comes first.
        '''
        try:
            with Timeout(sleep_time):
                self.should_stop.wait()
        except Timeout:
            # we use the timeout as a cancellable sleep
            pass

    def handle_timer_tick(self):
        """Spawn a worker for this tick with no arguments."""
        self.container.spawn_worker(self, (), {})
def save_to(self, data):
    """Run ``save_to_hbase(data)`` in a managed thread and return its result.

    The call blocks the current greenthread (cooperatively) until the
    managed thread has finished.
    """
    event = Event()
    gt = self.container.spawn_managed_thread(lambda: save_to_hbase(data))
    # forward the thread's return value to the event once it exits
    gt.link(lambda res: event.send(res.wait()))
    # Event.wait() parks this greenthread until send() is called; this
    # replaces the previous ready()/sleep() polling loop, which kept waking
    # up just to re-check the event.
    return event.wait()
class Timer(object):
    ''' Calls a given function repeatedly at a fixed interval, from its own
    green thread.
    '''

    def __init__(self, interval, func):
        self.interval = interval
        self.func = func
        self.gt = None
        self.should_stop = Event()

    def start(self):
        ''' Starts the timer in a separate green thread.

        Once started it may be stopped using its `stop()` method.
        '''
        self.gt = eventlet.spawn(self._run)
        _log.debug(
            'started timer for %s with %ss interval',
            self.func, self.interval)

    def _run(self):
        ''' Runs the interval loop.

        This should not be called directly, rather the `start()` method
        should be used.
        '''
        while True:
            if self.should_stop.ready():
                return

            began = time.time()
            try:
                self.func()
            except Exception as e:
                # a failing handler must not terminate the timer loop
                _log.exception('error in timer handler: %s', e)

            spent = time.time() - began
            self._sleep_or_stop(max(self.interval - spent, 0))

    def _sleep_or_stop(self, sleep_time):
        ''' Sleeps for `sleep_time` seconds or until a `should_stop` event
        has been fired, whichever comes first.
        '''
        try:
            with Timeout(sleep_time):
                self.should_stop.wait()
        except Timeout:
            # we use the timeout as a cancellable sleep
            pass

    def stop(self):
        ''' Gracefully stops the timer, waiting for its handler to complete
        if it is running.
        '''
        self.should_stop.send(True)
        self.gt.wait()
def test_deferred_send_receive(ctx):
    """A deferred AMQP message is delivered to a consumer of the target queue."""
    import logging
    import sys
    from datetime import datetime, timedelta

    from carrot.messaging import Consumer
    from eventlet import sleep, spawn, with_timeout
    from eventlet.event import Event
    from eventlet.support.greenlets import GreenletExit
    from melk.util.nonce import nonce_str
    from melkman.context import Context
    from melkman.scheduler import defer_amqp_message, cancel_deferred
    from melkman.scheduler.worker import ScheduledMessageService

    got_message = Event()

    def on_message(*args, **kw):
        got_message.send(True)

    def consume_one():
        consumer = Consumer(ctx.broker, exchange='testx', queue='testq',
                            routing_key='testq', exclusive=True,
                            durable=False)
        consumer.register_callback(on_message)
        try:
            consumer.wait(limit=1)
        except (StopIteration, GreenletExit):
            # normal ways for the consume loop to end
            pass
        finally:
            consumer.close()

    consumer_thread = spawn(consume_one)

    service = ScheduledMessageService(ctx)
    scheduler_thread = spawn(service.run)

    payload = {'hello': 'world'}
    # schedule delivery two seconds from now
    due = datetime.utcnow() + timedelta(seconds=2)
    defer_amqp_message(due, payload, 'testq', 'testx', ctx)

    try:
        with_timeout(10, got_message.wait)
        assert got_message.ready()
    finally:
        scheduler_thread.kill()
        scheduler_thread.wait()
        consumer_thread.kill()
        consumer_thread.wait()
def send_it(self, data):
    """Process *data*, send it with ``send_to_kafka`` and return the result.

    The send runs in a managed container thread; this call blocks the
    current greenthread (cooperatively) until that thread has finished.
    """
    pr_data = process_data(data)
    event = Event()
    gt = self.container.spawn_managed_thread(
        lambda: send_to_kafka(pr_data))
    # forward the thread's return value to the event once it exits
    gt.link(lambda res: event.send(res.wait()))
    # Event.wait() yields to other greenthreads until send() is called; this
    # replaces the previous ready()/sleep() polling loop.
    return event.wait()
def all_data(self, table_name):
    """Fetch *table_name* via ``all_data_hbase`` and return it as JSON.

    Returns a JSON string of the form ``{"results": <fetched data>}``.  The
    fetch runs in a managed container thread; this call blocks the current
    greenthread (cooperatively) until that thread has finished.
    """
    event = Event()
    gt = self.container.spawn_managed_thread(
        lambda: all_data_hbase(table_name))
    # forward the thread's return value to the event once it exits
    gt.link(lambda res: event.send(res.wait()))
    # Event.wait() yields to other greenthreads until send() is called; this
    # replaces the previous ready()/sleep() polling loop.
    retFile = event.wait()
    retJson = {"results": retFile}
    return json.dumps(retJson)
class GreenBody(GreenPool):
    """
    GreenPool variant whose wait() returns as soon as the first greenthread
    in the pool exits, yielding that greenthread's result.
    """

    def __init__(self, *args, **kwargs):
        super(GreenBody, self).__init__(*args, **kwargs)
        self.one_exited = Event()

    def wait(self):
        first_result = self.one_exited.wait()
        return first_result

    def _spawn_done(self, coro):
        super(GreenBody, self)._spawn_done(coro)
        # only the first exit is reported; later ones are ignored
        if self.one_exited.ready():
            return
        self.one_exited.send(coro.wait())
def test_zero_max_size(self):
    """A zero-size queue is a channel: send() blocks until a receiver waits."""
    q = coros.queue(0)

    def sender(evt, q):
        q.send('hi')
        evt.send('done')

    def receiver(evt, q):
        x = q.wait()
        evt.send(x)

    e1 = Event()
    e2 = Event()

    spawn(sender, e1, q)
    sleep(0)
    # the sender must still be blocked in q.send() as nobody is receiving;
    # assertFalse replaces the deprecated assert_ alias
    self.assertFalse(e1.ready())
    spawn(receiver, e2, q)
    self.assertEqual(e2.wait(), 'hi')
    self.assertEqual(e1.wait(), 'done')
def test_zero_max_size(self):
    """With max size 0 the sender stays blocked until a receiver picks up."""
    q = coros.queue(0)

    def sender(evt, q):
        q.send('hi')
        evt.send('done')

    def receiver(evt, q):
        x = q.wait()
        evt.send(x)

    e1 = Event()
    e2 = Event()

    spawn(sender, e1, q)
    sleep(0)
    # no receiver yet, so the sender cannot have completed; assertFalse is
    # used instead of the deprecated assert_ alias
    self.assertFalse(e1.ready())

    spawn(receiver, e2, q)
    self.assertEqual(e2.wait(), 'hi')
    self.assertEqual(e1.wait(), 'done')
def predict(self, data):
    """Run ``randomforest(data)`` in a managed thread and return JSON.

    Returns a JSON string with the input ``data``, the first element of the
    model output under ``life_cycle`` and a freshly generated ``task_id``.
    The call blocks the current greenthread (cooperatively) until the
    managed thread has finished.
    """
    # generate unique id
    task_id = uuid.uuid4().hex

    # execute it in a container thread and send the result to an Event
    event = Event()
    gt = self.container.spawn_managed_thread(lambda: randomforest(data))
    gt.link(lambda res: event.send(res.wait()))

    # Event.wait() yields to other greenthreads until send() is called; this
    # replaces the previous ready()/sleep() polling loop.
    life_cycle = event.wait()
    retJson = {
        "data": data,
        "life_cycle": life_cycle[0],
        "task_id": task_id,
    }
    return json.dumps(retJson)
class QueueConsumer(SharedExtension, ProviderCollector, ConsumerMixin):
    """Shared extension that runs a kombu ``ConsumerMixin`` loop in a managed
    thread and dispatches received messages to the registered providers.

    Acks, requeues and consumer cancellations requested from other threads
    are stashed and carried out from inside the consumer thread on each
    ``on_iteration`` callback.
    """

    def __init__(self):
        self._consumers = {}

        self._pending_messages = set()
        self._pending_ack_messages = []
        self._pending_requeue_messages = []
        self._pending_remove_providers = {}

        self._gt = None
        self._starting = False

        # fired (or failed) once the consumers are ready to accept messages
        self._consumers_ready = Event()
        super(QueueConsumer, self).__init__()

    @property
    def amqp_uri(self):
        return self.container.config[AMQP_URI_CONFIG_KEY]

    @property
    def prefetch_count(self):
        # The prefetch_count should be larger than max_workers.
        # If the prefetch_count <= max_workers,
        # then there will be a dead lock between
        # drain_events and on_iteration (since msg.ack is called in it)
        # which slows down the throughput capacity.
        return self.container.max_workers + 1

    @property
    def accept(self):
        return self.container.accept

    def _handle_thread_exited(self, gt):
        """Unblock ``start()`` if the consumer thread exits before ready."""
        exc = None
        try:
            gt.wait()
        except Exception as e:
            exc = e

        if not self._consumers_ready.ready():
            self._consumers_ready.send_exception(exc)

    def setup(self):
        verify_amqp_uri(self.amqp_uri)

    def start(self):
        """Spawn the consumer thread (once) and wait until it is ready."""
        if not self._starting:
            self._starting = True

            _log.debug('starting %s', self)
            self._gt = self.container.spawn_managed_thread(self.run)
            self._gt.link(self._handle_thread_exited)
        try:
            _log.debug('waiting for consumer ready %s', self)
            self._consumers_ready.wait()
        except QueueConsumerStopped:
            _log.debug('consumer was stopped before it started %s', self)
        except Exception as exc:
            _log.debug('consumer failed to start %s (%s)', self, exc)
        else:
            _log.debug('started %s', self)

    def stop(self):
        """ Stop the queue-consumer gracefully.

        Wait until the last provider has been unregistered and for
        the ConsumerMixin's greenthread to exit (i.e. until all pending
        messages have been acked or requeued and all consumers stopped).
        """
        if not self._consumers_ready.ready():
            _log.debug('stopping while consumer is starting %s', self)

            stop_exc = QueueConsumerStopped()

            # stopping before we have started successfully by brutally
            # killing the consumer thread as we don't have a way to hook
            # into the pre-consumption startup process
            self._gt.kill(stop_exc)

        self.wait_for_providers()

        try:
            _log.debug('waiting for consumer death %s', self)
            self._gt.wait()
        except QueueConsumerStopped:
            pass

        super(QueueConsumer, self).stop()
        _log.debug('stopped %s', self)

    def kill(self):
        """ Kill the queue-consumer.

        Unlike `stop()` any pending message ack or requeue-requests,
        requests to remove providers, etc are lost and the consume thread is
        asked to terminate as soon as possible.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self._gt is not None and not self._gt.dead:
            # we can't just kill the thread because we have to give
            # ConsumerMixin a chance to close the sockets properly.
            self._providers = set()
            self._pending_messages = set()
            self._pending_ack_messages = []
            self._pending_requeue_messages = []
            self._pending_remove_providers = {}
            self.should_stop = True
            try:
                self._gt.wait()
            except Exception as exc:
                # discard the exception since we're already being killed
                _log.warning(
                    'QueueConsumer %s raised `%s` during kill', self, exc)

            super(QueueConsumer, self).kill()
            _log.debug('killed %s', self)

    def unregister_provider(self, provider):
        if not self._consumers_ready.ready():
            # we cannot handle the situation where we are starting up and
            # want to remove a consumer at the same time
            # TODO: With the upcoming error handling mechanism, this needs
            # TODO: to be thought through again.
            self._last_provider_unregistered.send()
            return

        removed_event = Event()
        # we can only cancel a consumer from within the consumer thread
        self._pending_remove_providers[provider] = removed_event
        # so we will just register the consumer to be canceled
        removed_event.wait()

        super(QueueConsumer, self).unregister_provider(provider)

    def ack_message(self, message):
        _log.debug("stashing message-ack: %s", message)
        self._pending_messages.remove(message)
        self._pending_ack_messages.append(message)

    def requeue_message(self, message):
        _log.debug("stashing message-requeue: %s", message)
        self._pending_messages.remove(message)
        self._pending_requeue_messages.append(message)

    def _on_message(self, body, message):
        _log.debug("received message: %s", message)
        self._pending_messages.add(message)

    def _cancel_consumers_if_requested(self):
        """Cancel consumers queued by ``unregister_provider`` and wake the
        callers waiting on them."""
        provider_remove_events = self._pending_remove_providers.items()
        self._pending_remove_providers = {}

        for provider, removed_event in provider_remove_events:
            consumer = self._consumers.pop(provider)

            _log.debug('cancelling consumer [%s]: %s', provider, consumer)
            consumer.cancel()
            removed_event.send()

    def _process_pending_message_acks(self):
        """Ack and requeue every stashed message, yielding between each."""
        messages = self._pending_ack_messages
        if messages:
            _log.debug('ack() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.ack()
                eventlet.sleep()

        messages = self._pending_requeue_messages
        if messages:
            _log.debug('requeue() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.requeue()
                eventlet.sleep()

    @property
    def connection(self):
        """ Provide the connection parameters for kombu's ConsumerMixin.

        The `Connection` object is a declaration of connection parameters
        that is lazily evaluated. It doesn't represent an established
        connection to the broker at this point.
        """
        heartbeat = self.container.config.get(HEARTBEAT_CONFIG_KEY,
                                              DEFAULT_HEARTBEAT)
        return Connection(self.amqp_uri, heartbeat=heartbeat)

    def get_consumers(self, consumer_cls, channel):
        """ Kombu callback to set up consumers.

        Called after any (re)connection to the broker.
        """
        _log.debug('setting up consumers %s', self)

        for provider in self._providers:
            callbacks = [self._on_message, provider.handle_message]

            consumer = consumer_cls(queues=[provider.queue],
                                    callbacks=callbacks,
                                    accept=self.accept)
            consumer.qos(prefetch_count=self.prefetch_count)

            self._consumers[provider] = consumer

        return self._consumers.values()

    def on_iteration(self):
        """ Kombu callback for each `drain_events` loop iteration."""
        self._cancel_consumers_if_requested()

        self._process_pending_message_acks()

        num_consumers = len(self._consumers)
        num_pending_messages = len(self._pending_messages)

        # nothing left to consume or acknowledge: let the loop exit
        if num_consumers + num_pending_messages == 0:
            _log.debug('requesting stop after iteration')
            self.should_stop = True

    def on_connection_error(self, exc, interval):
        _log.warning("Error connecting to broker at {} ({}).\n"
                     "Retrying in {} seconds.".format(
                         self.amqp_uri, exc, interval))

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Kombu callback when consumers are ready to accept messages.

        Called after any (re)connection to the broker.
        """
        if not self._consumers_ready.ready():
            _log.debug('consumer started %s', self)
            self._consumers_ready.send(None)

        # providers may optionally define an on_consume_ready hook
        for provider in self._providers:
            try:
                callback = provider.on_consume_ready
            except AttributeError:
                pass
            else:
                callback()
def generateSpritesheets(self, pc_tsid, actuals, base_hash):
    """Drive a headless browser to generate spritesheets for *pc_tsid*.

    The request data is published in ``self.shared`` together with an
    event, an Xvfb server is started on the first free display, and a
    browser is pointed at the local ``/generate/<signed tsid>`` URL.  The
    call then waits, up to ``self.task_timeout`` seconds, for the event to
    become ready.

    Raises ``Error`` when no display could be used and ``Exception`` when
    the timeout elapses first.  Both subprocesses are terminated and the
    shared entry removed on every exit path.
    """
    event = Event()

    # store the data in the shared dict for eleven.http to use
    self.shared[pc_tsid] = {
        'tsid': pc_tsid,
        'actuals': actuals,
        'avatar_hash': base_hash,
        'event': event,
    }

    self.log.info('generateSpritesheets started for %r', pc_tsid)
    self.log.debug('generateSpritesheets data %r %r %r',
                   pc_tsid, actuals, base_hash)

    # We need to loop here in case the displays are already being used. We
    # loop until we find a free display.
    xvfb = xvfb_threads = browser = browser_threads = None
    try:
        self.log.info('Starting Xvfb for %s' % (pc_tsid,))
        for DISPLAY_NUM in xrange(0, 100):
            xvfb = subprocess.Popen(
                ['Xvfb', ':%i' % (DISPLAY_NUM,)],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            xvfb.stdin.close()
            (_, _, xvfb_threads) = multiproc.run_subproc(
                xvfb, '[xvfb] ', wait=False)

            # sleep for 1s to see if it's going to die
            time.sleep(1)
            # poll() refreshes the exit status; the returncode attribute on
            # its own is only updated by poll()/wait() and could still be
            # None for a process that has already exited.
            if xvfb.poll() is None:
                # Xvfb is running
                break

            # we got a returncode, so Xvfb exited before it should have,
            # terminate and try the next display number
            multiproc.terminate_subproc(xvfb, xvfb_threads)
            self.log.debug(
                'Display number %i appears to be in use, trying the next one',
                DISPLAY_NUM)

        if xvfb.poll() is not None:
            raise Error('No free display found for use with Xvfb')

        tsid_signed = URLSafeSerializer(self.secret_key).dumps(pc_tsid)

        # grab the existing environment variables and add/change DISPLAY to
        # the one Xvfb is running on.
        env = os.environ.copy()
        env['DISPLAY'] = ':%i' % (DISPLAY_NUM,)

        url = 'http://127.0.0.1:%i/generate/%s' % (self.http_port, tsid_signed)
        self.log.info('Loading URL %s for %s' % (url, pc_tsid))
        browser = subprocess.Popen(
            ['arora', url],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env,
        )
        browser.stdin.close()
        (_, _, browser_threads) = multiproc.run_subproc(
            browser, '[browser] ', wait=False)

        start = time.time()
        while time.time() - start < self.task_timeout:
            # wait for the http worker to send us an event saying that the
            # browser hit the done API
            if event.ready():
                break
            time.sleep(0.05)

        if not event.ready():
            raise Exception('Spritesheet generation exceeded task timeout %is'
                            % (self.task_timeout,))

        self.log.info('generateSpritesheets done for %r', pc_tsid)
    finally:
        if browser is not None:
            self.log.info('terminating browser proc')
            browser.terminate()
            multiproc.terminate_subproc(browser, browser_threads)
        if xvfb is not None:
            self.log.info('terminating xvfb proc')
            xvfb.terminate()
            multiproc.terminate_subproc(xvfb, xvfb_threads)
        del self.shared[pc_tsid]
        self.log.info('finished')
def test_handlers_do_not_block(config, container_factory, make_cometd_server,
                               message_maker, run_services, tracker, waiter):
    """ Test that entrypoints do not block each other """
    work_a = Event()
    work_b = Event()

    class Service:
        name = 'example-service'

        @subscribe('/topic/example-a')
        def handle_event_a(self, channel, payload):
            work_a.wait()
            tracker.handle_event_a(channel, payload)

        @subscribe('/topic/example-b')
        def handle_event_b(self, channel, payload):
            work_b.wait()
            tracker.handle_event_b(channel, payload)

    # respond to handshake
    handshake = [message_maker.make_handshake_response()]
    # respond to subscribe
    subscriptions = [
        message_maker.make_subscribe_response(
            subscription='/topic/example-a'),
        message_maker.make_subscribe_response(
            subscription='/topic/example-b'),
    ]
    # respond to initial connect
    connect = [
        message_maker.make_connect_response(
            advice={'reconnect': Reconnection.retry.value}),
    ]
    # two events to deliver
    deliveries = [
        message_maker.make_event_delivery_message(
            channel='/topic/example-a', data={'spam': 'one'}),
        message_maker.make_event_delivery_message(
            channel='/topic/example-b', data={'spam': 'two'}),
    ]
    responses = [handshake, subscriptions, connect, deliveries]

    cometd_server = make_cometd_server(responses)
    container = container_factory(Service, config)

    cometd_server.start()
    container.start()

    try:
        # both handlers are still working
        assert (tracker.handle_event_a.call_args_list == [])
        assert (tracker.handle_event_b.call_args_list == [])

        # finish work of the second handler
        work_b.send()
        sleep(0.1)

        # second handler is done
        assert (tracker.handle_event_a.call_args_list == [])
        assert (tracker.handle_event_b.call_args_list == [
            call('/topic/example-b', {'spam': 'two'})
        ])

        # finish work of the first handler
        work_a.send()
        sleep(0.1)

        # first handler is done
        assert (tracker.handle_event_a.call_args_list == [
            call('/topic/example-a', {'spam': 'one'})
        ])
        assert (tracker.handle_event_b.call_args_list == [
            call('/topic/example-b', {'spam': 'two'})
        ])
    finally:
        # release any handler that is still blocked before shutting down
        for gate in (work_a, work_b):
            if not gate.ready():
                gate.send()
        waiter.wait()

        container.kill()
        cometd_server.stop()
class QueueConsumer(SharedExtension, ProviderCollector, ConsumerMixin):
    """Shared extension that runs a kombu ``ConsumerMixin`` loop in a managed
    thread and hands each received message to its provider in a new managed
    thread.

    Consumer cancellations requested from other threads are queued and
    carried out from inside the consumer thread on each ``on_iteration``
    callback.
    """

    def __init__(self):
        self._consumers = {}
        self._pending_remove_providers = {}

        self._gt = None
        self._starting = False

        # fired (or failed) once the consumers are ready to accept messages
        self._consumers_ready = Event()
        super(QueueConsumer, self).__init__()

    @property
    def amqp_uri(self):
        return self.container.config[AMQP_URI_CONFIG_KEY]

    @property
    def prefetch_count(self):
        return self.container.max_workers

    @property
    def accept(self):
        return self.container.accept

    def _handle_thread_exited(self, gt):
        """Unblock ``start()`` if the consumer thread exits before ready."""
        exc = None
        try:
            gt.wait()
        except Exception as e:
            exc = e

        if not self._consumers_ready.ready():
            self._consumers_ready.send_exception(exc)

    def start(self):
        """Spawn the consumer thread (once) and wait until it is ready."""
        if not self._starting:
            self._starting = True

            _log.debug('starting %s', self)
            self._gt = self.container.spawn_managed_thread(self.run)
            self._gt.link(self._handle_thread_exited)
        try:
            _log.debug('waiting for consumer ready %s', self)
            self._consumers_ready.wait()
        except QueueConsumerStopped:
            _log.debug('consumer was stopped before it started %s', self)
        except Exception as exc:
            _log.debug('consumer failed to start %s (%s)', self, exc)
        else:
            _log.debug('started %s', self)

    def stop(self):
        """ Stop the queue-consumer gracefully.

        Wait until the last provider has been unregistered and for
        the ConsumerMixin's greenthread to exit (i.e. until all pending
        messages have been acked or requeued and all consumers stopped).
        """
        if not self._consumers_ready.ready():
            _log.debug('stopping while consumer is starting %s', self)

            stop_exc = QueueConsumerStopped()

            # stopping before we have started successfully by brutally
            # killing the consumer thread as we don't have a way to hook
            # into the pre-consumption startup process
            self._gt.kill(stop_exc)

        self.wait_for_providers()

        try:
            _log.debug('waiting for consumer death %s', self)
            self._gt.wait()
        except QueueConsumerStopped:
            pass

        super(QueueConsumer, self).stop()
        _log.debug('stopped %s', self)

    def kill(self):
        """ Kill the queue-consumer.

        Unlike `stop()` any pending message ack or requeue-requests,
        requests to remove providers, etc are lost and the consume thread is
        asked to terminate as soon as possible.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self._gt is not None and not self._gt.dead:
            # we can't just kill the thread because we have to give
            # ConsumerMixin a chance to close the sockets properly.
            self._providers = set()
            self._pending_remove_providers = {}
            self.should_stop = True
            try:
                self._gt.wait()
            except Exception as exc:
                # discard the exception since we're already being killed
                _log.warning(
                    'QueueConsumer %s raised `%s` during kill', self, exc)

            super(QueueConsumer, self).kill()
            _log.debug('killed %s', self)

    def unregister_provider(self, provider):
        if not self._consumers_ready.ready():
            # we cannot handle the situation where we are starting up and
            # want to remove a consumer at the same time
            # TODO: With the upcoming error handling mechanism, this needs
            # TODO: to be thought through again.
            self._last_provider_unregistered.send()
            return

        removed_event = Event()
        # we can only cancel a consumer from within the consumer thread
        self._pending_remove_providers[provider] = removed_event
        # so we will just register the consumer to be canceled
        removed_event.wait()

        super(QueueConsumer, self).unregister_provider(provider)

    def ack_message(self, message):
        # only attempt to ack if the message connection is alive;
        # otherwise the message will already have been reclaimed by the broker
        if message.channel.connection:
            try:
                message.ack()
            except ConnectionError:  # pragma: no cover
                pass  # ignore connection closing inside conditional

    def requeue_message(self, message):
        # only attempt to requeue if the message connection is alive;
        # otherwise the message will already have been reclaimed by the broker
        if message.channel.connection:
            try:
                message.requeue()
            except ConnectionError:  # pragma: no cover
                pass  # ignore connection closing inside conditional

    def _cancel_consumers_if_requested(self):
        """Cancel consumers queued by ``unregister_provider`` and wake the
        callers waiting on them."""
        provider_remove_events = self._pending_remove_providers.items()
        self._pending_remove_providers = {}

        for provider, removed_event in provider_remove_events:
            consumer = self._consumers.pop(provider)

            _log.debug('cancelling consumer [%s]: %s', provider, consumer)
            consumer.cancel()
            removed_event.send()

    @property
    def connection(self):
        """ Provide the connection parameters for kombu's ConsumerMixin.

        The `Connection` object is a declaration of connection parameters
        that is lazily evaluated. It doesn't represent an established
        connection to the broker at this point.
        """
        heartbeat = self.container.config.get(HEARTBEAT_CONFIG_KEY,
                                              DEFAULT_HEARTBEAT)
        transport_options = self.container.config.get(
            TRANSPORT_OPTIONS_CONFIG_KEY, DEFAULT_TRANSPORT_OPTIONS)
        ssl = self.container.config.get(AMQP_SSL_CONFIG_KEY)
        login_method = self.container.config.get(LOGIN_METHOD_CONFIG_KEY)
        conn = Connection(self.amqp_uri,
                          transport_options=transport_options,
                          heartbeat=heartbeat,
                          ssl=ssl,
                          login_method=login_method)

        return conn

    def handle_message(self, provider, body, message):
        """Process *message* for *provider* in its own managed thread."""
        ident = u"{}.handle_message[{}]".format(
            type(provider).__name__, message.delivery_info['routing_key'])
        self.container.spawn_managed_thread(
            partial(provider.handle_message, body, message),
            identifier=ident)

    def get_consumers(self, consumer_cls, channel):
        """ Kombu callback to set up consumers.

        Called after any (re)connection to the broker.
        """
        _log.debug('setting up consumers %s', self)

        for provider in self._providers:
            callbacks = [partial(self.handle_message, provider)]

            consumer = consumer_cls(queues=[provider.queue],
                                    callbacks=callbacks,
                                    accept=self.accept)
            consumer.qos(prefetch_count=self.prefetch_count)

            self._consumers[provider] = consumer

        return self._consumers.values()

    def on_iteration(self):
        """ Kombu callback for each `drain_events` loop iteration."""
        self._cancel_consumers_if_requested()

        # no consumers left: let the loop exit
        if len(self._consumers) == 0:
            _log.debug('requesting stop after iteration')
            self.should_stop = True

    def on_connection_error(self, exc, interval):
        _log.warning("Error connecting to broker at {} ({}).\n"
                     "Retrying in {} seconds.".format(
                         sanitize_url(self.amqp_uri), exc, interval))

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Kombu callback when consumers are ready to accept messages.

        Called after any (re)connection to the broker.
        """
        if not self._consumers_ready.ready():
            _log.debug('consumer started %s', self)
            self._consumers_ready.send(None)
class ServiceContainer(object):
    """ Hosts a single service class and manages its life-cycle.

    The container owns the service's dependency providers, a bounded pool of
    worker greenthreads, and any additional "managed" threads spawned by the
    providers. It is responsible for starting, stopping and killing all of
    them in a well-defined order.
    """

    def __init__(self, service_cls, worker_ctx_cls, config):

        self.service_cls = service_cls
        self.worker_ctx_cls = worker_ctx_cls

        self.service_name = get_service_name(service_cls)

        self.config = config
        self.max_workers = (config.get(MAX_WORKERS_CONFIG_KEY) or
                            DEFAULT_MAX_WORKERS)

        self.dependencies = DependencySet()
        for dep in prepare_dependencies(self):
            self.dependencies.add(dep)

        self.started = False
        self._worker_pool = GreenPool(size=self.max_workers)

        # maps greenthread -> provider (or the MANAGED_THREAD marker)
        self._active_threads = {}
        self._protected_threads = set()
        self._being_killed = False
        # fires exactly once, when the container has stopped or been killed
        self._died = Event()

    @property
    def entrypoints(self):
        """ The dependencies that are entrypoint providers. """
        return filter(is_entrypoint_provider, self.dependencies)

    @property
    def injections(self):
        """ The dependencies that are injection providers. """
        return filter(is_injection_provider, self.dependencies)

    def start(self):
        """ Start a container by starting all the dependency providers.
        """
        _log.debug('starting %s', self)
        self.started = True

        with _log_time('started %s', self):
            self.dependencies.all.prepare()
            self.dependencies.all.start()

    def stop(self):
        """ Stop the container gracefully.

        First all entrypoints are asked to ``stop()``.
        This ensures that no new worker threads are started.

        It is the providers' responsibility to gracefully shut down when
        ``stop()`` is called on them and only return when they have stopped.

        After all entrypoints have stopped the container waits for any
        active workers to complete.

        After all active workers have stopped the container stops all
        injections.

        At this point there should be no more managed threads. In case there
        are any managed threads, they are killed by the container.
        """
        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if self._being_killed:
            # this race condition can happen when a container is hosted by a
            # runner and yields during its kill method; if it's unlucky in
            # scheduling the runner will try to stop() it before self._died
            # has a result
            _log.debug('already being killed %s', self)
            try:
                self._died.wait()
            except Exception:
                pass  # don't re-raise if we died with an exception
            return

        _log.debug('stopping %s', self)

        with _log_time('stopped %s', self):
            dependencies = self.dependencies

            # entrypoint deps have to be stopped before injection deps
            # to ensure that running workers can successfully complete
            dependencies.entrypoints.all.stop()

            # there might still be some running workers, which we have to
            # wait for to complete before we can stop injection dependencies
            self._worker_pool.waitall()

            # it should be safe now to stop any injection as there is no
            # active worker which could be using it
            dependencies.injections.all.stop()

            # finally, stop nested dependencies
            dependencies.nested.all.stop()

            # just in case there was a provider not taking care of its workers,
            # or a dependency not taking care of its protected threads
            self._kill_active_threads()
            self._kill_protected_threads()

            self.started = False

            # the calls above may yield; if `kill` ran to completion in the
            # meantime the event has already fired and must not be sent twice
            if not self._died.ready():
                self._died.send(None)

    def kill(self, exc_info=None):
        """ Kill the container in a semi-graceful way.

        All non-protected managed threads are killed first. This includes
        all active workers generated by :meth:`ServiceContainer.spawn_worker`.
        Next, dependencies are killed. Finally, any remaining protected threads
        are killed.

        If ``exc_info`` is provided, the exception will be raised by
        :meth:`~wait``.
        """
        if self._being_killed:
            # this happens if a managed thread exits with an exception
            # while the container is being killed or if multiple errors
            # happen simultaneously
            _log.debug('already killing %s ... waiting for death', self)
            try:
                self._died.wait()
            except Exception:
                pass  # don't re-raise if we died with an exception
            return

        self._being_killed = True

        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if exc_info is not None:
            _log.info('killing %s due to %s', self, exc_info[1])
        else:
            _log.info('killing %s', self)

        # protect against dependencies that throw during kill; the container
        # is already dying with an exception, so ignore anything else
        def safely_kill_dependencies(dep_set):
            try:
                dep_set.kill()
            except Exception as exc:
                _log.warning('Dependency raised `%s` during kill', exc)

        safely_kill_dependencies(self.dependencies.entrypoints.all)
        self._kill_active_threads()
        safely_kill_dependencies(self.dependencies.all)
        self._kill_protected_threads()

        self.started = False

        # a concurrent `stop` may have completed while we were killing
        # dependencies; the event can only be sent once
        if not self._died.ready():
            self._died.send(None, exc_info)

    def wait(self):
        """ Block until the container has been stopped.

        If the container was stopped due to an exception, ``wait()`` will
        raise it.

        Any unhandled exception raised in a managed thread or in the
        life-cycle management code also causes the container to be
        ``kill()``ed, which causes an exception to be raised from ``wait()``.
        """
        return self._died.wait()

    def spawn_worker(self, provider, args, kwargs,
                     context_data=None, handle_result=None):
        """ Spawn a worker thread for running the service method decorated
        with an entrypoint ``provider``.

        ``args`` and ``kwargs`` are used as arguments for the service
        method.

        ``context_data`` is used to initialize a ``WorkerContext``.

        ``handle_result`` is an optional function which may be passed
        in by the entrypoint provider. It is called with the result returned
        or error raised by the service method. If provided it must return a
        value for ``result`` and ``exc_info`` to propagate to dependencies;
        these may be different to those returned by the service method.

        Raises ``ContainerBeingKilled`` if the container is being killed.
        """
        if self._being_killed:
            _log.info("Worker spawn prevented due to being killed")
            raise ContainerBeingKilled()

        service = self.service_cls()
        worker_ctx = self.worker_ctx_cls(
            self, service, provider, args, kwargs, data=context_data)

        _log.debug('spawning %s', worker_ctx)
        gt = self._worker_pool.spawn(self._run_worker, worker_ctx,
                                     handle_result)
        self._active_threads[gt] = provider
        gt.link(self._handle_thread_exited)
        return worker_ctx

    def spawn_managed_thread(self, run_method, protected=False,
                             identifier=None):
        """ Spawn a managed thread to run ``run_method``.

        Threads can be marked as ``protected``, which means the container will
        not forcibly kill them until after all dependencies have been killed.
        Dependencies that require a managed thread to complete their kill
        procedure should ensure to mark them as ``protected``.

        ``identifier`` is an optional human-readable label for the thread. It
        is only used for logging.

        Any uncaught errors inside ``run_method`` cause the container to be
        killed.

        It is the caller's responsibility to terminate their spawned threads.
        Threads are killed automatically if they are still running after
        all dependencies are stopped during :meth:`ServiceContainer.stop`.

        Entrypoints may only create separate threads using this
        method, to ensure they are life-cycle managed.
        """
        if identifier is not None:
            _log.debug('spawning managed thread %s', identifier)

        gt = eventlet.spawn(run_method)
        if not protected:
            self._active_threads[gt] = MANAGED_THREAD
        else:
            self._protected_threads.add(gt)
        gt.link(self._handle_thread_exited)
        return gt

    def _run_worker(self, worker_ctx, handle_result):
        """ Run the service method for ``worker_ctx`` inside a worker thread.

        Injects dependencies, calls the service method, passes the outcome
        through ``handle_result`` (if given) and then to the injections,
        and finally tears the worker down.
        """
        _log.debug('setting up %s', worker_ctx)

        if not worker_ctx.parent_call_stack:
            _log.debug('starting call chain')
        _log.debug('call stack for %s: %s',
                   worker_ctx, '->'.join(worker_ctx.call_id_stack))

        with _log_time('ran worker %s', worker_ctx):

            self.dependencies.injections.all.inject(worker_ctx)
            self.dependencies.all.worker_setup(worker_ctx)

            result = exc_info = None
            method = getattr(worker_ctx.service, worker_ctx.provider.name)
            try:
                _log.debug('calling handler for %s', worker_ctx)

                with _log_time('ran handler for %s', worker_ctx):
                    result = method(*worker_ctx.args, **worker_ctx.kwargs)
            except Exception as exc:
                _log.debug('error handling worker %s: %s', worker_ctx, exc,
                           exc_info=True)
                exc_info = sys.exc_info()

            if handle_result is not None:
                _log.debug('handling result for %s', worker_ctx)

                with _log_time('handled result for %s', worker_ctx):
                    result, exc_info = handle_result(
                        worker_ctx, result, exc_info)

            with _log_time('tore down worker %s', worker_ctx):

                _log.debug('signalling result for %s', worker_ctx)
                self.dependencies.injections.all.worker_result(
                    worker_ctx, result, exc_info)

                # we don't need this any more, and breaking the cycle means
                # this can be reclaimed immediately, rather than waiting for a
                # gc sweep
                del exc_info

                self.dependencies.all.worker_teardown(worker_ctx)
                self.dependencies.injections.all.release(worker_ctx)

    def _kill_active_threads(self):
        """ Kill all managed threads that were not marked as "protected" when
        they were spawned.

        This set will include all worker threads generated by
        :meth:`ServiceContainer.spawn_worker`.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_active_threads = len(self._active_threads)

        if num_active_threads:
            _log.warning('killing %s active thread(s)', num_active_threads)
            # iterate over a copy: killing a thread triggers
            # _handle_thread_exited, which mutates the dict
            for gt, provider in list(self._active_threads.items()):
                if provider is not MANAGED_THREAD:
                    description = '{}.{}'.format(
                        self.service_name, provider.name)
                    _log.warning('killing active thread for %s', description)
                gt.kill()

    def _kill_protected_threads(self):
        """ Kill any managed threads marked as protected when they were
        spawned.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_protected_threads = len(self._protected_threads)

        if num_protected_threads:
            _log.warning('killing %s protected thread(s)',
                         num_protected_threads)
            # iterate over a copy: the exit handler mutates the set
            for gt in list(self._protected_threads):
                gt.kill()

    def _handle_thread_exited(self, gt):
        """ Link callback run when a worker or managed thread exits.

        Forgets the thread and kills the container if the thread died
        with an unexpected exception.
        """
        self._active_threads.pop(gt, None)
        self._protected_threads.discard(gt)

        try:
            gt.wait()

        except GreenletExit:
            # we don't care much about threads killed by the container
            # this can happen in stop() and kill() if providers
            # don't properly take care of their threads
            _log.warning('%s thread killed by container', self)

        except Exception:
            _log.error('%s thread exited with error', self,
                       exc_info=True)
            # any error raised inside an active thread is unexpected behavior
            # and probably a bug in the providers or container.
            # to be safe we call self.kill() to kill our dependencies and
            # provide the exception info to be raised in self.wait().
            self.kill(sys.exc_info())

    def __repr__(self):
        service_name = repr_safe_str(self.service_name)
        return '<ServiceContainer [{}] at 0x{:x}>'.format(
            service_name, id(self))
class QueueConsumer(SharedExtension, ProviderCollector, ConsumerMixin):
    """ Shared extension that runs a kombu ``ConsumerMixin`` loop in a
    managed thread and dispatches incoming messages to registered providers.
    """

    def __init__(self):
        # maps provider -> kombu consumer
        self._consumers = {}
        # maps provider -> Event fired once its consumer has been cancelled
        self._pending_remove_providers = {}

        self._gt = None
        self._starting = False

        self._consumers_ready = Event()
        super(QueueConsumer, self).__init__()

    @property
    def amqp_uri(self):
        """ The broker URI, read from the container config. """
        return self.container.config[AMQP_URI_CONFIG_KEY]

    @property
    def prefetch_count(self):
        """ Prefetch limit for each consumer: the container's max_workers. """
        return self.container.max_workers

    @property
    def accept(self):
        """ The ``accept`` value passed to each consumer, taken from the
        container. """
        return self.container.accept

    def _handle_thread_exited(self, gt):
        """ Link callback for the consumer thread.

        If the thread exits before the consumers became ready, unblock
        anything waiting in :meth:`start` by failing ``_consumers_ready``.
        """
        exc = None
        try:
            gt.wait()
        except Exception as e:
            exc = e

        if not self._consumers_ready.ready():
            self._consumers_ready.send_exception(exc)

    def setup(self):
        """ Verify the configured AMQP URI (and SSL options). """
        ssl = self.container.config.get(AMQP_SSL_CONFIG_KEY)
        verify_amqp_uri(self.amqp_uri, ssl=ssl)

    def start(self):
        """ Start the consumer thread (once) and block until the consumers
        are ready, were stopped, or failed to start.
        """
        if not self._starting:
            self._starting = True

            _log.debug('starting %s', self)
            self._gt = self.container.spawn_managed_thread(self.run)
            self._gt.link(self._handle_thread_exited)
        try:
            _log.debug('waiting for consumer ready %s', self)
            self._consumers_ready.wait()
        except QueueConsumerStopped:
            _log.debug('consumer was stopped before it started %s', self)
        except Exception as exc:
            _log.debug('consumer failed to start %s (%s)', self, exc)
        else:
            _log.debug('started %s', self)

    def stop(self):
        """ Stop the queue-consumer gracefully.

        Wait until the last provider has been unregistered and for
        the ConsumerMixin's greenthread to exit (i.e. until all pending
        messages have been acked or requeued and all consumers stopped).
        """
        if not self._consumers_ready.ready():
            _log.debug('stopping while consumer is starting %s', self)

            # the thread only exists once start() has been called
            if self._gt is not None:
                stop_exc = QueueConsumerStopped()

                # stopping before we have started successfully by brutally
                # killing the consumer thread as we don't have a way to hook
                # into the pre-consumption startup process
                self._gt.kill(stop_exc)

        self.wait_for_providers()

        if self._gt is not None:
            try:
                _log.debug('waiting for consumer death %s', self)
                self._gt.wait()
            except QueueConsumerStopped:
                pass

        super(QueueConsumer, self).stop()
        _log.debug('stopped %s', self)

    def kill(self):
        """ Kill the queue-consumer.

        Unlike `stop()` any pending message ack or requeue-requests,
        requests to remove providers, etc are lost and the consume thread is
        asked to terminate as soon as possible.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self._gt is not None and not self._gt.dead:
            # we can't just kill the thread because we have to give
            # ConsumerMixin a chance to close the sockets properly.
            self._providers = set()
            self._pending_remove_providers = {}
            self.should_stop = True
            try:
                self._gt.wait()
            except Exception as exc:
                # discard the exception since we're already being killed
                _log.warning(
                    'QueueConsumer %s raised `%s` during kill', self, exc)

            super(QueueConsumer, self).kill()
            _log.debug('killed %s', self)

    def unregister_provider(self, provider):
        """ Unregister ``provider``, blocking until its consumer has been
        cancelled by the consumer thread.
        """
        if not self._consumers_ready.ready():
            # we cannot handle the situation where we are starting up and
            # want to remove a consumer at the same time
            # TODO: With the upcoming error handling mechanism, this needs
            # TODO: to be thought through again.
            self._last_provider_unregistered.send()
            return

        removed_event = Event()
        # we can only cancel a consumer from within the consumer thread
        self._pending_remove_providers[provider] = removed_event
        # so we will just register the consumer to be canceled
        removed_event.wait()

        super(QueueConsumer, self).unregister_provider(provider)

    def ack_message(self, message):
        """ Acknowledge ``message`` if its connection is still alive. """
        # only attempt to ack if the message connection is alive;
        # otherwise the message will already have been reclaimed by the broker
        if message.channel.connection:
            try:
                message.ack()
            except ConnectionError:  # pragma: no cover
                pass  # ignore connection closing inside conditional

    def requeue_message(self, message):
        """ Requeue ``message`` if its connection is still alive. """
        # only attempt to requeue if the message connection is alive;
        # otherwise the message will already have been reclaimed by the broker
        if message.channel.connection:
            try:
                message.requeue()
            except ConnectionError:  # pragma: no cover
                pass  # ignore connection closing inside conditional

    def _cancel_consumers_if_requested(self):
        """ Cancel the consumers of all providers pending removal and
        notify whoever is waiting in :meth:`unregister_provider`.
        """
        provider_remove_events = self._pending_remove_providers.items()
        # rebind to a fresh dict so requests arriving while we are busy
        # cancelling are picked up on the next iteration
        self._pending_remove_providers = {}

        for provider, removed_event in provider_remove_events:
            consumer = self._consumers.pop(provider)

            _log.debug('cancelling consumer [%s]: %s', provider, consumer)
            consumer.cancel()
            removed_event.send()

    @property
    def connection(self):
        """ Provide the connection parameters for kombu's ConsumerMixin.

        The `Connection` object is a declaration of connection parameters
        that is lazily evaluated. It doesn't represent an established
        connection to the broker at this point.
        """
        heartbeat = self.container.config.get(
            HEARTBEAT_CONFIG_KEY, DEFAULT_HEARTBEAT
        )
        ssl = self.container.config.get(AMQP_SSL_CONFIG_KEY)
        return Connection(self.amqp_uri, heartbeat=heartbeat, ssl=ssl)

    def handle_message(self, provider, body, message):
        """ Hand ``message`` to ``provider`` in a new managed thread. """
        ident = u"{}.handle_message[{}]".format(
            type(provider).__name__, message.delivery_info['routing_key']
        )
        self.container.spawn_managed_thread(
            partial(provider.handle_message, body, message), identifier=ident
        )

    def get_consumers(self, consumer_cls, channel):
        """ Kombu callback to set up consumers.

        Called after any (re)connection to the broker.
        """
        _log.debug('setting up consumers %s', self)

        for provider in self._providers:
            callbacks = [partial(self.handle_message, provider)]

            consumer = consumer_cls(
                queues=[provider.queue],
                callbacks=callbacks,
                accept=self.accept
            )
            consumer.qos(prefetch_count=self.prefetch_count)

            self._consumers[provider] = consumer

        return self._consumers.values()

    def on_iteration(self):
        """ Kombu callback for each `drain_events` loop iteration."""
        self._cancel_consumers_if_requested()

        if len(self._consumers) == 0:
            _log.debug('requesting stop after iteration')
            self.should_stop = True

    def on_connection_error(self, exc, interval):
        """ Log a failed connection attempt and the retry interval. """
        # NOTE(review): the URI is logged verbatim and may contain
        # credentials -- consider redacting it before logging.
        _log.warning(
            "Error connecting to broker at {} ({}).\n"
            "Retrying in {} seconds.".format(self.amqp_uri, exc, interval))

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Kombu callback when consumers are ready to accept messages.

        Called after any (re)connection to the broker.
        """
        if not self._consumers_ready.ready():
            _log.debug('consumer started %s', self)
            self._consumers_ready.send(None)
class TimerProvider(EntrypointProvider):
    """ Entrypoint provider that spawns a worker at a fixed interval. """

    def __init__(self, interval, config_key):
        """
        :param interval: default number of seconds between ticks
        :param config_key: optional container config key whose value, if
            present, overrides ``interval``
        """
        self._default_interval = interval
        self.config_key = config_key
        self.should_stop = Event()
        self.gt = None

    def prepare(self):
        """ Resolve the effective interval from the container config. """
        interval = self._default_interval

        if self.config_key:
            config = self.container.config
            interval = config.get(self.config_key, interval)

        self.interval = interval

    def start(self):
        """ Start the interval loop in a managed thread. """
        _log.debug('starting %s', self)
        self.gt = self.container.spawn_managed_thread(self._run)

    def stop(self):
        """ Ask the interval loop to finish and wait for it to exit.

        Safe to call more than once, and before :meth:`start`.
        """
        _log.debug('stopping %s', self)
        # an eventlet Event may only be sent once
        if not self.should_stop.ready():
            self.should_stop.send(True)
        # no thread exists if the provider was never started
        if self.gt is not None:
            self.gt.wait()

    def kill(self):
        """ Kill the interval loop thread, if it was ever started. """
        _log.debug('killing %s', self)
        if self.gt is not None:
            self.gt.kill()

    def _run(self):
        ''' Runs the interval loop.

        This should not be called directly, rather the `start()` method
        should be used.
        '''
        while not self.should_stop.ready():
            start = time.time()

            self.handle_timer_tick()

            # subtract the time spent handling the tick so that ticks stay
            # `interval` seconds apart; never sleep a negative amount
            elapsed_time = (time.time() - start)
            sleep_time = max(self.interval - elapsed_time, 0)
            self._sleep_or_stop(sleep_time)

    def _sleep_or_stop(self, sleep_time):
        ''' Sleeps for `sleep_time` seconds or until a `should_stop` event
        has been fired, whichever comes first.
        '''
        try:
            with Timeout(sleep_time):
                self.should_stop.wait()
        except Timeout:
            # we use the timeout as a cancellable sleep
            pass

    def handle_timer_tick(self):
        """ Spawn a worker, with no arguments, for this tick. """
        args = ()
        kwargs = {}

        # Note that we don't catch ContainerBeingKilled here. If that's raised,
        # there is nothing for us to do anyway. The exception bubbles, and is
        # caught by :meth:`Container._handle_thread_exited`, though the
        # triggered `kill` is a no-op, since the container is already
        # `_being_killed`.
        self.container.spawn_worker(self, args, kwargs)
class Producer(object):
    """The producer object, a server which takes requests from a TCP socket
    and forwards them to a zmq.PUSH socket that is PULLed from by workers
    that the producer starts. The port is the TCP port to listen on, but
    the host is used by all sockets. The consumer should be a Consumer object
    that will run in the worker processes and actually handle requests."""

    def __init__(self, consumer, port, processes=num_cpus, host='127.0.0.1'):
        self.outstanding = {}
        self.port = port
        self.host = host
        self.consumer = consumer
        self.consumer.initialize(self)
        self.init_events()
        self.pool = TokenPool(max_size=processes)
        self.pushpool = TokenPool(max_size=1)
        self.forker = Forker(self, consumer, processes)

    def init_events(self):
        """Create the events that signal server and worker life-cycle."""
        # these events correspond to the server socket
        self.server_start = Event()
        self.server_stop = Event()
        # these events more or less correspond to the completion of the
        # startup process, including forking
        self.running = Event()
        self.stopped = Event()

    def setup_zmq(self):
        """Set up a PUSH and a PULL socket.  The PUSH socket will push out
        requests to the workers.  The PULL socket will receive responses from
        the workers and reply through the server socket."""
        self.context = zmq.Context()
        self.push = self.context.socket(zmq.PUSH)
        self.push_port = self.push.bind_to_random_port("tcp://%s" % self.host)
        # start a listener for the pull socket
        eventlet.spawn(self.zmq_pull)
        # yield so that zmq_pull gets to bind its socket right away
        eventlet.sleep(0)

    def zmq_pull(self):
        """Receive worker responses and hand each to ``response_handler``
        in its own greenthread.  Returns on any unexpected error."""
        # bind to the port and wait for the workers to start
        self.pull = self.context.socket(zmq.PULL)
        self.pull_port = self.pull.bind_to_random_port("tcp://%s" % self.host)
        self.running.wait()
        while True:
            try:
                packed = self.pull.recv()
                self.pool.put(None)
                eventlet.spawn(self.response_handler, packed)
            except zmq.ZMQError:
                eventlet.sleep(0.05)
            except Exception:
                import traceback
                traceback.print_exc()
                return

    def serve(self):
        """Listen on the TCP socket and hand every accepted connection to
        ``request_handler`` in its own greenthread, until ``server_stop``
        fires."""
        self.server = eventlet.listen((self.host, self.port))
        self.server_addr = self.server.getsockname()
        # finish server listening, fire off event which fires workers and wait
        self.server_start.send()
        self.running.wait()

        while not self.server_stop.ready():
            try:
                conn, addr = self.server.accept()
            except error as exc:
                if self.server_stop.ready():
                    return
                logger.error("error accepting connection: %r" % (exc,))
                # nothing was accepted: there is no connection to handle
                continue
            eventlet.spawn(self.request_handler, conn, addr)

    def start(self, blocking=True):
        """Start the producer.  This will eventually fire the ``server_start``
        and ``running`` events in sequence, which signify that the incoming
        TCP request socket is running and the workers have been forked,
        respectively.  If ``blocking`` is False, the server loop runs in a
        separate greenthread and control returns to the caller."""
        self.setup_zmq()
        if blocking:
            self.serve()
        else:
            eventlet.spawn(self.serve)
            # ensure that self.serve runs now as calling code will
            # expect start() to have started the server even non-blk
            eventlet.sleep(0)

    def stop(self):
        """Close the zmq sockets and the server socket, then fire
        ``server_stop``."""
        import errno

        self.push.close(linger=0)
        self.pull.close(linger=0)
        try:
            self.server.shutdown(SHUT_RDWR)
        except error as e:
            # a listening socket that is not connected cannot be shut down;
            # the numeric value of ENOTCONN differs between platforms
            if e.errno != errno.ENOTCONN:
                raise
        self.server.close()
        self.server_stop.send()
        # let event listeners listening to this event run
        eventlet.sleep(0)
class _Vxrd(service.Vxfld):
    """ Main Class that provides methods used by the Vxlan Registration
    Daemon.
    """
    __IP_ADDR_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    __BRPORT_MAC_ADDR = '00:00:00:00:00:00'
    # Matches one line of `ip -d -o link show` output for a vxlan device.
    __VXLAN_REGEX = re.compile(
        r'^\d+: (?P<dev_name>\S+):'
        r'(?=.*state\s+(?P<state>\S+))'
        r'(?=.*vxlan\s+id\s+(?P<vni>\d+))'
        r'(?=.*dstport\s+(?P<dstport>\d+))?'
        r'(?=.*local\s+(?P<local_addr>{0}))?'
        r'(?=.*(?:svcnode|remote)\s+(?P<sn_addr>{0}))?'.format(__IP_ADDR_REGEX)
    )
    # Matches one all-zero-MAC line of `bridge fdb show` output.
    __BRPORT_REGEX = re.compile(
        r'^{0}'
        r'(?=.*dst\s+(?P<dst_addr>{1}))'
        r'(?=.*dev\s+(?P<dev_name>\S+))'.format(__BRPORT_MAC_ADDR,
                                                __IP_ADDR_REGEX)
    )

    def __init__(self, conf):
        super(_Vxrd, self).__init__(conf)
        self.__vni_config = {}  # vni_config[vni] = DeviceConfig
        self.__peerdb = {}  # peerdb[vni] = {ip, ...}
        self.__sock = None
        if self._conf.head_rep:
            # Cumulus Linux: When operating in HER mode, the RD should limit
            # the pool size to 1 inorder to avoid the situation where multiple
            # threads update the kernel state in parallel.
            self.__herpool = eventlet.GreenPool(size=1)
        # Used to keep track of when the last response was received from
        # the SND.
        self.__last_response = None
        # To ensure deletes are sent before adds
        self.__removed = self.__update_event = None

    def _run(self):
        """ Periodically sends the registration method to the svcnode address.
        Usually at regular time intervals but may be accelerated if
        membership has changed.

        :raises RuntimeError: if the transmit socket cannot be opened
        """
        # Open a socket for sending the refresh msgs
        try:
            self.__sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            self.__sock.bind((self._conf.src_ip, self._conf.vxfld_port))
        except socket.error as ex:
            raise RuntimeError('opening transmit socket : %s' % ex)
        if self._conf.head_rep:
            self._pool.spawn_n(self._serve, self.__sock,
                               self.__handle_vxfld_msg, pool=self.__herpool)
        next_refresh = 0
        next_config_check = 0
        while True:
            now = int(time.time())
            if now >= next_config_check:
                next_config_check = now + self._conf.config_check_rate
                current = self.__get_vxlan_config()
                if current is not None and self.__vni_config != current:
                    old, self.__vni_config = self.__vni_config, current
                    added = {vni: vni_config
                             for vni, vni_config in current.iteritems()
                             if vni not in old}
                    # Updated VNIs are those for which the VTEP's source
                    # address has changed. This check is needed because the
                    # VNI can move to a new source address when CLAGd detects
                    # a failure and replaces the anycast source address with a
                    # unicast one.
                    updated = {
                        vni: vni_config
                        for vni, vni_config in old.iteritems()
                        if vni in current and (
                            current[vni].localip != vni_config.localip or
                            current[vni].svcnodeip != vni_config.svcnodeip
                        )
                    }
                    if self.__remove_vnis(old, current, updated=updated):
                        # Send the updated configuration to the SND once
                        # the delete message has been acknowledged.
                        if added or updated:
                            # VXLAN config has changed. Send refresh
                            # immediately. Merge updated and added VNIs and
                            # send an update to the SND. Schedule another
                            # refresh in 1 sec just in case the UDP msg is
                            # lost.
                            self.__send_refresh(self.__vni_config,
                                                self._conf.holdtime)
                            next_refresh = now + 1
                    else:
                        # Delete message has not been acknowledged. Revert
                        # back to the old config.
                        self.__vni_config = old
                    if self._conf.head_rep:
                        # HER: Keep the peerdb and bridge fdb entries in sync.
                        hrep_updated = {}
                        for vni, vni_config in current.iteritems():
                            peerips = self.__peerdb.get(vni, set())
                            myaddr = {vni_config.localip}
                            if vni in old:
                                myaddr.add(old[vni].localip)
                            # `-` binds tighter than `^`: the kernel entries
                            # are compared with the peers minus our own
                            # addresses
                            if vni_config.hrep_addrs ^ (peerips - myaddr):
                                hrep_updated[vni] = peerips
                        if hrep_updated:
                            self.__herpool.spawn(self.__update_peerdb,
                                                 hrep_updated,
                                                 current).wait()
            if (self._conf.head_rep and
                    self.__last_response is not None and
                    (now - self.__last_response) > self._conf.holdtime):
                # HER: Flush the peerdb and HREP MAC entries if the RD
                # hasn't heard from the SND in more than holdtime seconds.
                self._logger.warning('Lost contact with SND. Cleaning up...')
                self.__last_response = None
                self.__remove_vnis(self.__vni_config, {})
            if now >= next_refresh:
                self.__send_refresh(self.__vni_config, self._conf.holdtime)
                next_refresh = (
                    now + self._conf.holdtime / self._conf.refresh_rate
                )
            eventlet.sleep(max(0, min(next_refresh - now,
                                      next_config_check - now)))

    def __get_vxlan_config(self, vnis=None):
        """ Parses the output of 'bridge fdb show' and 'ip link show' to map
        a VNI to a DeviceConfig object.
        :param vnis: used to filter the output of this method
        :returns: a dictionary mapping VNIs to DeviceConfig objects if
                  successful, otherwise None.
        """
        vni_config = {}
        dev_map = collections.defaultdict(set)
        if self._conf.head_rep:
            # Map the device to bridge fdb entries.
            bridgecmd = (
                '/sbin/bridge fdb show | /bin/grep %s' % self.__BRPORT_MAC_ADDR
            )
            try:
                # shell=True is being passed to subprocess because eventlet's
                # version doesn't support piping of data between processes.
                # This is because eventlet assumes that the stdout file
                # descriptor will be used from within the Python process, and
                # it helpfully marks it as non-blocking so methods like
                # communicate won't block. This results in the process on the
                # other end of the pipe getting an -EAGAIN when it tries to
                # read from what it thinks is a blocking socket.
                # See http://russ.garrett.co.uk/2011/12/16/
                # green-threads-and-pipes-in-python/ for details.
                bridge = subprocess.Popen(bridgecmd,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            except Exception as ex:  # pylint: disable=broad-except
                self._logger.warning('Command failed. out:%s', ex)
                return None
            for line in iter(bridge.stdout.readline, b''):
                pat_match = self.__BRPORT_REGEX.match(line.rstrip())
                if pat_match is not None:
                    pat_dict = pat_match.groupdict()
                    dev_map[pat_dict['dev_name']].add(pat_dict['dst_addr'])
            bridge.wait()
        cmd = '/bin/ip -d -o link show'
        try:
            iplinkshow = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
        except Exception as ex:  # pylint: disable=broad-except
            self._logger.warning('Command failed. out:%s', ex)
            return None
        for line in iter(iplinkshow.stdout.readline, b''):
            pat_match = self.__VXLAN_REGEX.match(line.rstrip())
            if pat_match is not None:
                match_dict = pat_match.groupdict()
                vni = int(match_dict['vni'])
                if (  # match_dict['dstport'] is None or
                        match_dict['local_addr'] is None or
                        (not self._conf.head_rep and
                         match_dict['sn_addr'] is None)
                ):
                    self._logger.warning('Invalid configuration %s detected '
                                         'for device. Skipping', match_dict)
                    continue
                if match_dict['state'] in ['UP', 'UNKNOWN']:
                    svcnode_ip = self._conf.svcnode_ip
                    # If the svcnode_ip is not set in the configuration,
                    # then use the VTEP's svcnode IP in the kernel.
                    if (svcnode_ip ==
                            config.Config.CommonConfig.svcnode_ip.default and
                            match_dict['sn_addr'] is not None):
                        svcnode_ip = match_dict['sn_addr']
                    # got vni, local and sn. Add to dict
                    if vnis is None or vni in vnis:
                        vni_config[vni] = _DeviceConfig(
                            dev_name=match_dict['dev_name'],
                            localip=match_dict['local_addr'],
                            svcnodeip=svcnode_ip,
                            hrep_addrs=dev_map[match_dict['dev_name']]
                        )
        if iplinkshow.wait() == 0:
            return vni_config
        else:
            self._logger.error('%s returned non-zero exit code', cmd)
            return None

    def _process(self, msg):
        """ Returns result object and Exception.
        """
        # pylint: disable=too-many-return-statements
        try:
            if msg['vxlans']:
                if msg['hrep'] and not self._conf.head_rep:
                    return None, None
                return self.__vni_config, None
            elif msg['peers']:
                if self._conf.head_rep:
                    return self.__peerdb, None
                else:
                    return None, None
            elif msg['get'] and msg['config']:
                if msg['<parameter>'] is not None:
                    parameter = msg['<parameter>']
                    params = self._conf.get_params()
                    if parameter in params:
                        value = params.get(parameter, None)
                        return {parameter: value}, None
                    else:
                        return None, RuntimeError('Unknown parameter')
                else:
                    return self._conf.get_params(), None
            else:
                return None, RuntimeError('Unknown request')
        except Exception:  # pylint: disable=broad-except
            return None, RuntimeError('Bad message')

    def __send_refresh(self, vni_data, hold):
        """ Sends a refresh message to the SND
        :param vni_data: dictionary mapping VNIs to DeviceConfig objects
        :param hold: holdtimer
        """
        # pylint: disable=missing-docstring
        # Build the right datastructure for the message
        # need msg_data as {svcnode: {vni: [local]}}
        def send_vxfld_pkt(pkt_in, svcnode_in):
            self._logger.debug('Sending to %s: %s. Holdtime: %s',
                               svcnode_in,
                               pkt_in.data.vni_vteps.items(),
                               hold)
            try:
                self.__sock.sendto(str(pkt_in),
                                   (svcnode_in, self._conf.vxfld_port))
            except Exception as ex:  # pylint: disable=broad-except
                self._logger.error('Error sending refresh packet: %s',
                                   type(ex))

        # Group the VNIs by service node explicitly. Grouping only adjacent
        # dictionary keys would scatter the VNIs of one service node over
        # several packets whenever the keys are not already ordered by
        # service node.
        vnis_by_svcnode = collections.defaultdict(list)
        for vni, dev_config in vni_data.items():
            vnis_by_svcnode[dev_config.svcnodeip].append(vni)

        for svcnode, svcnode_vnis in vnis_by_svcnode.items():
            vxfld_pkt = refresh_pkt = None
            for vni in svcnode_vnis:
                addrs = [vni_data.get(vni).localip]
                # Limit the refresh message to max_packet_size.
                if (vxfld_pkt is None or
                        VXFLD.BASE_PKT_SIZE + len(vxfld_pkt) +
                        len(VXFLD.Refresh.vtep_to_str(vni, addrs)) >=
                        self._conf.max_packet_size):
                    if vxfld_pkt is not None:
                        send_vxfld_pkt(vxfld_pkt, svcnode)
                    vxfld_pkt = VXFLD.Packet()
                    vxfld_pkt.version = VXFLD.VERSION
                    refresh_pkt = VXFLD.Refresh(version=VXFLD.VERSION,
                                                holdtime=hold,
                                                originator=True)
                    vxfld_pkt.type = refresh_pkt.type
                    vxfld_pkt.data = refresh_pkt
                    if self._conf.head_rep:
                        refresh_pkt.response_type = (
                            VXFLD.ResponseType.REQUESTED
                        )
                refresh_pkt.add_vni_vteps({vni: addrs})
            if refresh_pkt is not None and refresh_pkt.vni_vteps:
                send_vxfld_pkt(vxfld_pkt, svcnode)

    def __remove_vnis(self, old, current, updated=None):
        """ This function handles the cleanup of VNIs that have either been
        updated or removed from the config by sending a refresh message with
        holdtime of 0 so that vxsnd can quickly age these out. It also removes
        the peer list for VNIs that have been removed from the config.
        :param old: Old configuration of the following format
                    {vni: DeviceConfig}
        :param current: New configuration of the following format
                        {vni: DeviceConfig}
        :param updated: dictionary of the following format
                        {vni: DeviceConfig}
        :returns True if message was either not sent or sent and
                 acknowledged, False if it wasn't acknowledged.
        """
        status = True
        # Removed VNIs are those that have been unconfigured from the device.
        removed = {vni: vni_config
                   for vni, vni_config in old.iteritems()
                   if vni not in current}
        if removed or updated:
            # Merge updated and removed VNIs and send an update to the SND.
            refresh = removed.copy()
            refresh.update(updated or {})
            if self._conf.head_rep:
                self.__update_event = Event()
                self.__removed = refresh
                no_of_attempts = 3
                for _ in range(no_of_attempts):
                    self.__send_refresh(refresh, 0)
                    eventlet.sleep(1)
                    if self.__update_event.ready():
                        break
                else:
                    # the loop ran out of attempts without an ack
                    self._logger.warning('Did not receive an acknowledgement '
                                         'from the SND after %d attempts',
                                         no_of_attempts)
                    status = False
                self.__removed = self.__update_event = None
            else:
                self.__send_refresh(refresh, 0)
        if self._conf.head_rep and removed:
            # HER: Remove VNIs that are no longer configured on the device.
            # Using the herpool guarantees that updates to the bridge fdb are
            # sequential.
            self.__herpool.spawn(self.__update_peerdb,
                                 {vni: set() for vni in removed
                                  if self.__peerdb.get(vni, set())},
                                 old).wait()
        return status

    def __handle_vxfld_msg(self, buf, addr):
        """ HER: Handles the VXFLD message.
        :param buf: socket buffer
        :param addr: source address
        This is specific to Cumulus Linux.
        """
        srcip, _ = addr
        self.__last_response = int(time.time())
        try:
            vxfld_pkt = VXFLD.Packet(buf)
        except Exception as ex:  # pylint: disable=broad-except
            # log the exception itself: not every exception has `.message`
            self._logger.error('Unknown VXFLD packet received from %s: %s',
                               srcip, ex)
            return
        refresh_pkt = vxfld_pkt.data
        if vxfld_pkt.type > VXFLD.MsgType.REFRESH:
            self._logger.warning('Unexpected vxfld pkt of type %d',
                                 vxfld_pkt.type)
            return
        self._logger.debug('Refresh msg from %s: %s',
                           srcip, refresh_pkt.vni_vteps)
        if self.__removed is not None and self.__update_event is not None:
            if (set(refresh_pkt.vni_vteps) ^ set(self.__removed) or
                    any(self.__removed[vni].localip in iplist
                        for vni, iplist in
                        refresh_pkt.vni_vteps.iteritems())):
                self._logger.debug('Refresh msg ignored because VNI deletion '
                                   'is in progress')
                return
            else:
                self._logger.debug('Received ack for deleted vnis %s',
                                   set(self.__removed))
                # a second ack may arrive before the waiter has cleared the
                # event; an event can only be sent once
                if not self.__update_event.ready():
                    self.__update_event.send(True)
        # Compute the list of updated VNIs
        updated_vnis = {
            vni: set(iplist)
            for vni, iplist in refresh_pkt.vni_vteps.iteritems()
            if set(iplist) != self.__peerdb.get(vni, set())
        }
        if updated_vnis:
            self.__update_peerdb(updated_vnis, self.__vni_config)

    def __update_peerdb(self, updated_vnis, vni_config):
        """ HER: Update the RD's peerdb.
        :param updated_vnis: a dictionary mapping VNIs to a set of
                             peer IP addresses
        :param vni_config: a dictionary mapping VNIs to DeviceConfig objects
                           {vni: DeviceConfig}
        :returns: None if successful, otherwise raises an Exception.
        This is specific to Cumulus Linux.
        """
        def update_bridgefdb(filepath):
            """ Updates the bridge fdb by calling the bridge command with the
            temporary batch file as input.
            :param filepath: location of the temporary batch file
            :return: None. We ignore the return value because the periodic
            config scan will synchronize the kernel state with the peerdb.
            """
            # pass the path as its own argument so that it is never split
            cmd = ['/sbin/bridge', '-force', '-batch', filepath]
            try:
                subprocess.check_output(cmd, stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as ex:
                self._logger.info('Failed to update bridge fdb. out: %s',
                                  ex.output)

        max_cmds_per_batch = 2000
        with tempfile.NamedTemporaryFile('w', prefix='vxrd_tmp') as tmpf:
            no_of_commands = 0
            for vni, ipset in updated_vnis.iteritems():
                if vni not in vni_config:
                    self._logger.debug('Unexpected VNI %d', vni)
                    continue
                dev_name = vni_config[vni].dev_name
                my_addr = vni_config[vni].localip
                cur_addrs = vni_config[vni].hrep_addrs
                self._logger.debug('Updating peer list for VTEP %s. new: %s. '
                                   'peerdb: %s, hrep: %s, myaddr: %s',
                                   dev_name,
                                   ipset,
                                   self.__peerdb.get(vni, set()),
                                   cur_addrs,
                                   my_addr)
                if ipset:
                    self.__peerdb[vni] = ipset
                else:
                    self.__peerdb.pop(vni, None)
                for operation, peerips in \
                        (('del', cur_addrs - (ipset - {my_addr})),
                         ('append', ipset - cur_addrs - {my_addr})):
                    for peerip in peerips:
                        tmpf.write('fdb %s %s dev %s dst %s\n' % (
                            operation,
                            self.__BRPORT_MAC_ADDR,
                            dev_name,
                            peerip))
                        if no_of_commands == max_cmds_per_batch - 1:
                            # batch is full: apply it and reuse the file
                            tmpf.flush()
                            update_bridgefdb(tmpf.name)
                            no_of_commands = 0
                            tmpf.truncate(0)
                            tmpf.seek(0)
                        else:
                            no_of_commands += 1
            if no_of_commands:
                tmpf.flush()
                update_bridgefdb(tmpf.name)
class ServiceContainer(object):
    """ Hosts a single service class and manages its life-cycle.

    The container owns the dependency providers prepared for the service,
    a size-limited pool of worker greenthreads and the sets of managed
    threads spawned on behalf of providers. It drives ``start()``,
    ``stop()`` and ``kill()`` across all of them and exposes ``wait()``
    to block until the container has died.
    """

    def __init__(self, service_cls, worker_ctx_cls, config):
        self.service_cls = service_cls
        self.worker_ctx_cls = worker_ctx_cls

        self.service_name = get_service_name(service_cls)

        self.config = config
        # a missing (or falsy) configured value falls back to the default
        self.max_workers = config.get(MAX_WORKERS_KEY) or DEFAULT_MAX_WORKERS

        self.dependencies = DependencySet()
        for dep in prepare_dependencies(self):
            self.dependencies.add(dep)

        self.started = False
        self._worker_pool = GreenPool(size=self.max_workers)

        # threads killed before the dependencies are killed
        self._active_threads = set()
        # threads killed only after the dependencies have been killed
        self._protected_threads = set()
        self._being_killed = False
        # triggered exactly once, by ``stop()`` or ``kill()``
        self._died = Event()

    @property
    def entrypoints(self):
        """ The dependencies accepted by ``is_entrypoint_provider``. """
        return filter(is_entrypoint_provider, self.dependencies)

    @property
    def injections(self):
        """ The dependencies accepted by ``is_injection_provider``. """
        return filter(is_injection_provider, self.dependencies)

    def start(self):
        """ Start a container by starting all the dependency providers.

        Every dependency is asked to ``prepare()`` before any of them is
        asked to ``start()``.
        """
        _log.debug('starting %s', self)
        self.started = True

        with log_time(_log.debug, 'started %s in %0.3f sec', self):
            self.dependencies.all.prepare()
            self.dependencies.all.start()

    def stop(self):
        """ Stop the container gracefully.

        First all entrypoints are asked to ``stop()``.
        This ensures that no new worker threads are started.

        It is the providers' responsibility to gracefully shut down when
        ``stop()`` is called on them and only return when they have stopped.

        After all entrypoints have stopped the container waits for any
        active workers to complete.

        After all active workers have stopped the container stops all
        injections.

        At this point there should be no more managed threads. In case there
        are any managed threads, they are killed by the container.

        Calling ``stop()`` on a container that has already died is a no-op.
        """
        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        _log.debug('stopping %s', self)

        with log_time(_log.debug, 'stopped %s in %0.3f sec', self):
            dependencies = self.dependencies

            # entrypoint deps have to be stopped before injection deps
            # to ensure that running workers can successfully complete
            dependencies.entrypoints.all.stop()

            # there might still be some running workers, which we have to
            # wait for to complete before we can stop injection dependencies
            self._worker_pool.waitall()

            # it should be safe now to stop any injection as there is no
            # active worker which could be using it
            dependencies.injections.all.stop()

            # finally, stop nested dependencies
            dependencies.nested.all.stop()

            # just in case there was a provider not taking care of its
            # workers, or a dependency not taking care of its protected
            # threads
            self._kill_active_threads()
            self._kill_protected_threads()

            self.started = False
            self._died.send(None)

    def kill(self, exc):
        """ Kill the container in a semi-graceful way.

        All non-protected managed threads are killed first. This includes
        all active workers generated by :meth:`ServiceContainer.spawn_worker`.
        Next, dependencies are killed. Finally, any remaining protected
        threads are killed.

        The container dies with the given ``exc``, which is subsequently
        raised from :meth:`ServiceContainer.wait`.

        If a kill is already in progress this call only waits for it to
        finish and then returns; the exception the container died with is
        not re-raised here.
        """
        if self._being_killed:
            # this happens if a managed thread exits with an exception
            # while the container is being killed or another caller
            # behaves in a similar manner
            _log.debug('already killing %s ... waiting for death', self)
            try:
                self._died.wait()
            except Exception:
                # the container died with an exception; that is reported
                # through ``wait()``, not to every concurrent ``kill()``
                pass
            return

        self._being_killed = True

        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        _log.info('killing %s due to "%s"', self, exc)

        self.dependencies.entrypoints.all.kill(exc)
        self._kill_active_threads()
        self.dependencies.all.kill(exc)
        self._kill_protected_threads()

        self.started = False
        self._died.send_exception(exc)

    def wait(self):
        """ Block until the container has been stopped.

        If the container was stopped using ``kill(exc)``,
        ``wait()`` raises ``exc``.
        Any unhandled exception raised in a managed thread or in the
        life-cycle management code also causes the container to be
        ``kill()``ed, which causes an exception to be raised from ``wait()``.
        """
        return self._died.wait()

    def spawn_worker(self, provider, args, kwargs,
                     context_data=None, handle_result=None):
        """ Spawn a worker thread for running the service method decorated
        with an entrypoint ``provider``.

        ``args`` and ``kwargs`` are used as arguments for the service
        method.

        ``context_data`` is used to initialize a ``WorkerContext``.

        ``handle_result`` is an optional callback which may be passed
        in by the calling entrypoint provider. It is called with the
        result returned or error raised by the service method.

        Returns the worker context created for the call.
        """
        service = self.service_cls()
        worker_ctx = self.worker_ctx_cls(
            self, service, provider.name, args, kwargs, data=context_data)

        _log.debug('spawning %s', worker_ctx,
                   extra=worker_ctx.extra_for_logging)
        gt = self._worker_pool.spawn(
            self._run_worker, worker_ctx, handle_result)
        self._active_threads.add(gt)
        gt.link(self._handle_thread_exited)
        return worker_ctx

    def spawn_managed_thread(self, run_method, protected=False):
        """ Spawn a managed thread to run ``run_method``.

        Threads can be marked as ``protected``, which means the container
        will not forcibly kill them until after all dependencies have been
        killed. Dependencies that require a managed thread to complete their
        kill procedure should ensure to mark them as ``protected``.

        Any uncaught errors inside ``run_method`` cause the container to be
        killed.

        It is the caller's responsibility to terminate their spawned threads.
        Threads are killed automatically if they are still running after
        all dependencies are stopped during :meth:`ServiceContainer.stop`.

        Entrypoints may only create separate threads using this method,
        to ensure they are life-cycle managed.

        Returns the spawned greenthread.
        """
        gt = eventlet.spawn(run_method)
        if not protected:
            self._active_threads.add(gt)
        else:
            self._protected_threads.add(gt)
        gt.link(self._handle_thread_exited)
        return gt

    def _run_worker(self, worker_ctx, handle_result):
        """ Run the service method described by ``worker_ctx``.

        Injections are injected and all dependencies set up before the
        method is called. The result (or the exception it raised) is then
        signalled to the injections, the worker is torn down and finally
        ``handle_result``, if given, is called with the outcome.
        """
        _log.debug('setting up %s', worker_ctx,
                   extra=worker_ctx.extra_for_logging)

        if not worker_ctx.parent_call_stack:
            _log.debug('starting call chain',
                       extra=worker_ctx.extra_for_logging)
        _log.debug('call stack for %s: %s',
                   worker_ctx, '->'.join(worker_ctx.call_id_stack),
                   extra=worker_ctx.extra_for_logging)

        with log_time(_log.debug, 'ran worker %s in %0.3fsec', worker_ctx):

            self.dependencies.injections.all.inject(worker_ctx)
            self.dependencies.all.worker_setup(worker_ctx)

            result = exc = None
            try:
                _log.debug('calling handler for %s', worker_ctx,
                           extra=worker_ctx.extra_for_logging)

                method = getattr(worker_ctx.service, worker_ctx.method_name)

                with log_time(_log.debug, 'ran handler for %s in %0.3fsec',
                              worker_ctx):
                    result = method(*worker_ctx.args, **worker_ctx.kwargs)
            except Exception as e:
                # errors from the service method are an outcome of the call,
                # not a failure of the container
                log_worker_exception(worker_ctx, e)
                exc = e

            with log_time(_log.debug, 'tore down worker %s in %0.3fsec',
                          worker_ctx):

                _log.debug('signalling result for %s', worker_ctx,
                           extra=worker_ctx.extra_for_logging)
                self.dependencies.injections.all.worker_result(
                    worker_ctx, result, exc)

                _log.debug('tearing down %s', worker_ctx,
                           extra=worker_ctx.extra_for_logging)
                self.dependencies.all.worker_teardown(worker_ctx)
                self.dependencies.injections.all.release(worker_ctx)

            if handle_result is not None:
                _log.debug('handling result for %s', worker_ctx,
                           extra=worker_ctx.extra_for_logging)

                with log_time(_log.debug, 'handled result for %s in %0.3fsec',
                              worker_ctx):
                    handle_result(worker_ctx, result, exc)

    def _kill_active_threads(self):
        """ Kill all managed threads that were not marked as "protected" when
        they were spawned.

        This set will include all worker threads generated by
        :meth:`ServiceContainer.spawn_worker`.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_active_threads = len(self._active_threads)

        if num_active_threads:
            _log.warning('killing %s active thread(s)', num_active_threads)
            # iterate over a copy: the exit handler linked to each thread
            # removes it from the set
            for gt in list(self._active_threads):
                gt.kill()

    def _kill_protected_threads(self):
        """ Kill any managed threads marked as protected when they were
        spawned.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_protected_threads = len(self._protected_threads)

        if num_protected_threads:
            _log.warning('killing %s protected thread(s)',
                         num_protected_threads)
            # iterate over a copy: the exit handler linked to each thread
            # removes it from the set
            for gt in list(self._protected_threads):
                gt.kill()

    def _handle_thread_exited(self, gt):
        """ Link callback run when a worker or managed thread exits.

        Forgets the thread and kills the container if the thread exited
        with an error.
        """
        self._active_threads.discard(gt)
        self._protected_threads.discard(gt)

        try:
            gt.wait()

        except greenlet.GreenletExit:
            # we don't care much about threads killed by the container
            # this can happen in stop() and kill() if providers
            # don't properly take care of their threads
            _log.warning('%s thread killed by container', self)

        except Exception as exc:
            _log.error('%s thread exited with error', self,
                       exc_info=True)
            # any error raised inside an active thread is unexpected behavior
            # and probably a bug in the providers or container
            # to be safe we kill the container
            self.kill(exc)

    def __str__(self):
        return '<ServiceContainer [{}] at 0x{:x}>'.format(
            self.service_name, id(self))
class ServiceContainer(object):
    """ Hosts a single service class and manages its life-cycle.

    The container owns the dependency providers prepared for the service,
    a size-limited pool of worker greenthreads and the managed threads
    spawned on behalf of providers. It drives ``start()``, ``stop()`` and
    ``kill()`` across all of them and exposes ``wait()`` to block until the
    container has died.
    """

    def __init__(self, service_cls, worker_ctx_cls, config):
        self.service_cls = service_cls
        self.worker_ctx_cls = worker_ctx_cls

        self.service_name = get_service_name(service_cls)

        self.config = config
        # a missing (or falsy) configured value falls back to the default
        self.max_workers = (
            config.get(MAX_WORKERS_CONFIG_KEY) or DEFAULT_MAX_WORKERS)

        self.dependencies = DependencySet()
        for dep in prepare_dependencies(self):
            self.dependencies.add(dep)

        self.started = False
        self._worker_pool = GreenPool(size=self.max_workers)

        # maps each non-protected thread to the provider it was spawned
        # for, or to ``MANAGED_THREAD`` for plain managed threads
        self._active_threads = {}
        # threads killed only after the dependencies have been killed
        self._protected_threads = set()
        self._being_killed = False
        # triggered exactly once, by whichever of ``stop()`` / ``kill()``
        # finishes first
        self._died = Event()

    @property
    def entrypoints(self):
        """ The dependencies accepted by ``is_entrypoint_provider``. """
        return filter(is_entrypoint_provider, self.dependencies)

    @property
    def injections(self):
        """ The dependencies accepted by ``is_injection_provider``. """
        return filter(is_injection_provider, self.dependencies)

    def start(self):
        """ Start a container by starting all the dependency providers.

        Every dependency is asked to ``prepare()`` before any of them is
        asked to ``start()``.
        """
        _log.debug('starting %s', self)
        self.started = True

        with _log_time('started %s', self):
            self.dependencies.all.prepare()
            self.dependencies.all.start()

    def stop(self):
        """ Stop the container gracefully.

        First all entrypoints are asked to ``stop()``.
        This ensures that no new worker threads are started.

        It is the providers' responsibility to gracefully shut down when
        ``stop()`` is called on them and only return when they have stopped.

        After all entrypoints have stopped the container waits for any
        active workers to complete.

        After all active workers have stopped the container stops all
        injections.

        At this point there should be no more managed threads. In case there
        are any managed threads, they are killed by the container.

        Calling ``stop()`` on a container that has already died is a no-op;
        calling it while the container is being killed waits for the kill
        to finish without re-raising the error the container died with.
        """
        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if self._being_killed:
            # this race condition can happen when a container is hosted by a
            # runner and yields during its kill method; if it's unlucky in
            # scheduling the runner will try to stop() it before self._died
            # has a result
            _log.debug('already being killed %s', self)
            try:
                self._died.wait()
            except Exception:
                pass  # don't re-raise if we died with an exception
            return

        _log.debug('stopping %s', self)

        with _log_time('stopped %s', self):
            dependencies = self.dependencies

            # entrypoint deps have to be stopped before injection deps
            # to ensure that running workers can successfully complete
            dependencies.entrypoints.all.stop()

            # there might still be some running workers, which we have to
            # wait for to complete before we can stop injection dependencies
            self._worker_pool.waitall()

            # it should be safe now to stop any injection as there is no
            # active worker which could be using it
            dependencies.injections.all.stop()

            # finally, stop nested dependencies
            dependencies.nested.all.stop()

            # just in case there was a provider not taking care of its
            # workers, or a dependency not taking care of its protected
            # threads
            self._kill_active_threads()
            self._kill_protected_threads()

            self.started = False

            # a kill() that started while we yielded above may already have
            # triggered the event; an Event must not be sent twice
            if not self._died.ready():
                self._died.send(None)

    def kill(self, exc_info=None):
        """ Kill the container in a semi-graceful way.

        All non-protected managed threads are killed first. This includes
        all active workers generated by :meth:`ServiceContainer.spawn_worker`.
        Next, dependencies are killed. Finally, any remaining protected
        threads are killed.

        If ``exc_info`` is provided, the exception will be raised by
        :meth:`ServiceContainer.wait`.

        If a kill is already in progress this call only waits for it to
        finish and then returns.
        """
        if self._being_killed:
            # this happens if a managed thread exits with an exception
            # while the container is being killed or if multiple errors
            # happen simultaneously
            _log.debug('already killing %s ... waiting for death', self)
            try:
                self._died.wait()
            except Exception:
                pass  # don't re-raise if we died with an exception
            return

        self._being_killed = True

        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if exc_info is not None:
            _log.info('killing %s due to %s', self, exc_info[1])
        else:
            _log.info('killing %s', self)

        # protect against dependencies that throw during kill; the container
        # is already dying with an exception, so ignore anything else
        def safely_kill_dependencies(dep_set):
            try:
                dep_set.kill()
            except Exception as exc:
                _log.warning('Dependency raised `%s` during kill', exc)

        safely_kill_dependencies(self.dependencies.entrypoints.all)
        self._kill_active_threads()
        safely_kill_dependencies(self.dependencies.all)
        self._kill_protected_threads()

        self.started = False

        # a stop() that was already under way may have finished (and
        # triggered the event) while we yielded above
        if not self._died.ready():
            self._died.send(None, exc_info)

    def wait(self):
        """ Block until the container has been stopped.

        If the container was stopped due to an exception, ``wait()`` will
        raise it.

        Any unhandled exception raised in a managed thread or in the
        life-cycle management code also causes the container to be
        ``kill()``ed, which causes an exception to be raised from ``wait()``.
        """
        return self._died.wait()

    def spawn_worker(self, provider, args, kwargs,
                     context_data=None, handle_result=None):
        """ Spawn a worker thread for running the service method decorated
        with an entrypoint ``provider``.

        ``args`` and ``kwargs`` are used as arguments for the service
        method.

        ``context_data`` is used to initialize a ``WorkerContext``.

        ``handle_result`` is an optional function which may be passed
        in by the entrypoint provider. It is called with the result returned
        or error raised by the service method. If provided it must return a
        value for ``result`` and ``exc_info`` to propagate to dependencies;
        these may be different to those returned by the service method.

        Returns the worker context created for the call. Raises
        ``ContainerBeingKilled`` if the container is being killed.
        """
        if self._being_killed:
            _log.info("Worker spawn prevented due to being killed")
            raise ContainerBeingKilled()

        service = self.service_cls()
        worker_ctx = self.worker_ctx_cls(
            self, service, provider, args, kwargs, data=context_data)

        _log.debug('spawning %s', worker_ctx)
        gt = self._worker_pool.spawn(
            self._run_worker, worker_ctx, handle_result)
        self._active_threads[gt] = provider
        gt.link(self._handle_thread_exited)
        return worker_ctx

    def spawn_managed_thread(self, run_method, protected=False):
        """ Spawn a managed thread to run ``run_method``.

        Threads can be marked as ``protected``, which means the container
        will not forcibly kill them until after all dependencies have been
        killed. Dependencies that require a managed thread to complete their
        kill procedure should ensure to mark them as ``protected``.

        Any uncaught errors inside ``run_method`` cause the container to be
        killed.

        It is the caller's responsibility to terminate their spawned threads.
        Threads are killed automatically if they are still running after
        all dependencies are stopped during :meth:`ServiceContainer.stop`.

        Entrypoints may only create separate threads using this method,
        to ensure they are life-cycle managed.

        Returns the spawned greenthread.
        """
        gt = eventlet.spawn(run_method)
        if not protected:
            self._active_threads[gt] = MANAGED_THREAD
        else:
            self._protected_threads.add(gt)
        gt.link(self._handle_thread_exited)
        return gt

    def _run_worker(self, worker_ctx, handle_result):
        """ Run the service method described by ``worker_ctx``.

        Injections are injected and all dependencies set up before the
        method is called. ``handle_result``, if given, may replace the
        outcome; the (possibly replaced) outcome is then signalled to the
        injections and the worker is torn down.
        """
        _log.debug('setting up %s', worker_ctx)

        if not worker_ctx.parent_call_stack:
            _log.debug('starting call chain')
        _log.debug('call stack for %s: %s',
                   worker_ctx, '->'.join(worker_ctx.call_id_stack))

        with _log_time('ran worker %s', worker_ctx):

            self.dependencies.injections.all.inject(worker_ctx)
            self.dependencies.all.worker_setup(worker_ctx)

            result = exc_info = None
            method = getattr(worker_ctx.service, worker_ctx.provider.name)
            try:

                _log.debug('calling handler for %s', worker_ctx)

                with _log_time('ran handler for %s', worker_ctx):
                    result = method(*worker_ctx.args, **worker_ctx.kwargs)
            except Exception as exc:
                # errors from the service method are an outcome of the call,
                # not a failure of the container
                _log.debug('error handling worker %s: %s', worker_ctx, exc,
                           exc_info=True)
                exc_info = sys.exc_info()

            if handle_result is not None:
                _log.debug('handling result for %s', worker_ctx)

                with _log_time('handled result for %s', worker_ctx):
                    result, exc_info = handle_result(
                        worker_ctx, result, exc_info)

            with _log_time('tore down worker %s', worker_ctx):

                _log.debug('signalling result for %s', worker_ctx)
                self.dependencies.injections.all.worker_result(
                    worker_ctx, result, exc_info)

                # we don't need this any more, and breaking the cycle means
                # this can be reclaimed immediately, rather than waiting for a
                # gc sweep
                del exc_info

                self.dependencies.all.worker_teardown(worker_ctx)
                self.dependencies.injections.all.release(worker_ctx)

    def _kill_active_threads(self):
        """ Kill all managed threads that were not marked as "protected" when
        they were spawned.

        This set will include all worker threads generated by
        :meth:`ServiceContainer.spawn_worker`.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_active_threads = len(self._active_threads)

        if num_active_threads:
            _log.warning('killing %s active thread(s)', num_active_threads)
            # iterate over a copy: the exit handler linked to each thread
            # removes it from the mapping
            for gt, provider in list(self._active_threads.items()):
                if provider is not MANAGED_THREAD:
                    # worker threads are additionally reported by the
                    # entrypoint they were spawned for
                    description = '{}.{}'.format(
                        self.service_name, provider.name)
                    _log.warning('killing active thread for %s', description)
                gt.kill()

    def _kill_protected_threads(self):
        """ Kill any managed threads marked as protected when they were
        spawned.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_protected_threads = len(self._protected_threads)

        if num_protected_threads:
            _log.warning('killing %s protected thread(s)',
                         num_protected_threads)
            # iterate over a copy: the exit handler linked to each thread
            # removes it from the set
            for gt in list(self._protected_threads):
                gt.kill()

    def _handle_thread_exited(self, gt):
        """ Link callback run when a worker or managed thread exits.

        Forgets the thread and kills the container if the thread exited
        with an error.
        """
        self._active_threads.pop(gt, None)
        self._protected_threads.discard(gt)

        try:
            gt.wait()

        except GreenletExit:
            # we don't care much about threads killed by the container
            # this can happen in stop() and kill() if providers
            # don't properly take care of their threads
            _log.warning('%s thread killed by container', self)

        except Exception:
            _log.error('%s thread exited with error', self, exc_info=True)
            # any error raised inside an active thread is unexpected behavior
            # and probably a bug in the providers or container.
            # to be safe we call self.kill() to kill our dependencies and
            # provide the exception info to be raised in self.wait().
            self.kill(sys.exc_info())

    def __str__(self):
        return '<ServiceContainer [{}] at 0x{:x}>'.format(
            self.service_name, id(self))
class BaseAMQPConsumer(ConsumerMixin, ControlExtension):
    """ Base extension running a kombu ``ConsumerMixin`` loop in a thread
    spawned through the container.

    Subclasses must implement :meth:`get_consumers`. ``start()`` blocks
    until the consumers report ready or the consumer thread fails.
    """

    def __init__(self, *args, **kwargs):
        self.gt = None
        self.started = False
        self.connection = None
        # triggered once the consumers are ready, or with the error that
        # prevented them from becoming ready
        self.consumers_ready = Event()
        self.consumers_channels = set()
        super(BaseAMQPConsumer, self).__init__(*args, **kwargs)

    # Extension
    def _link_manage_results(self, gt):
        """ Link callback for the consumer thread.

        Forwards an error raised by the thread to whoever is waiting on
        ``consumers_ready``.
        """
        def exc_func(exc_info):
            # NOTE(review): presumably ``ignore_exception`` calls this with
            # the ``sys.exc_info()`` triple when ``gt.wait`` raises - confirm
            exc_value = exc_info[1]
            # an Event can only be triggered once; if the consumers already
            # became ready there is nobody left to notify
            if not self.consumers_ready.ready():
                self.consumers_ready.send_exception(exc_value)
        ignore_exception(gt.wait, exc_func=exc_func)()

    def setup(self):
        """ Obtain the AMQP connection from the container config. """
        self.connection = AMQPConnect(self.container.config).instance

    def start(self):
        """ Spawn the consumer thread and wait for the consumers.

        Does nothing if already started. A failure to become ready is
        logged, not raised.
        """
        if self.started:
            return
        self.started = True
        self.gt = self.container.spawn_manage_thread(self.run)
        self.gt.link(self._link_manage_results)
        try:
            self.consumers_ready.wait()
        except Exception as e:
            # format the exception itself: ``e.message`` does not exist on
            # Python 3 exceptions
            msg = 'amqp consumers failed to start, {}'.format(e)
            logger.error(msg)
        else:
            msg = 'amqp consumers ready.'
            logger.debug(msg)
        return

    def _shutdown(self):
        """ Ask the consume loop to stop and kill the consumer thread. """
        self.should_stop = True
        self.started = False
        # ``gt`` is still None when the extension was never started
        if self.gt is not None:
            self.gt.kill()

    def stop(self):
        """ Stop consuming by killing the consumer thread. """
        self._shutdown()

    def kill(self):
        """ Kill the consumer thread; identical to :meth:`stop`. """
        self._shutdown()

    # ConsumerMixin
    def get_consumers(self, consumer_cls, channel):
        """ Must be overridden by subclasses. """
        raise NotImplementedError

    def create_connection(self):
        return super(BaseAMQPConsumer, self).create_connection()

    def on_message(self, extension, body, message):
        """ Handle a message in a new container-managed thread. """
        self.container.spawn_manage_thread(extension.handle_message,
                                           args=(body, message))

    def on_connection_revived(self):
        return super(BaseAMQPConsumer, self).on_connection_revived()

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Signal readiness the first time the consumers are ready. """
        if not self.consumers_ready.ready():
            self.consumers_ready.send(None)
        return super(BaseAMQPConsumer, self).on_consume_ready(
            connection, channel, consumers, **kwargs)

    def on_consume_end(self, connection, channel):
        return super(BaseAMQPConsumer, self).on_consume_end(
            connection, channel)

    def on_iteration(self):
        return super(BaseAMQPConsumer, self).on_iteration()

    def on_decode_error(self, message, exc):
        return super(BaseAMQPConsumer, self).on_decode_error(message, exc)

    def on_connection_error(self, exc, interval):
        return super(BaseAMQPConsumer, self).on_connection_error(
            exc, interval)
class QueueConsumer(DependencyProvider, ProviderCollector, ConsumerMixin):
    """ Shared dependency that consumes AMQP queues on behalf of the
    providers registered with it.

    The kombu consume loop runs in a protected managed thread. Requests
    that must happen inside that thread (acks, requeues, cancelling
    consumers) are stashed here and processed in :meth:`on_iteration`.
    """

    def __init__(self):
        super(QueueConsumer, self).__init__()
        self._connection = None

        # provider -> kombu consumer
        self._consumers = {}

        # messages received but not yet acked or requeued
        self._pending_messages = set()
        self._pending_ack_messages = []
        self._pending_requeue_messages = []
        # provider -> event sent once its consumer has been cancelled
        self._pending_remove_providers = {}

        self._gt = None
        self._starting = False

        self._consumers_ready = Event()

    @property
    def _amqp_uri(self):
        return self.container.config[AMQP_URI_CONFIG_KEY]

    @property
    def _prefetch_count(self):
        return self.container.max_workers

    def _handle_thread_exited(self, gt):
        """ Link callback for the consumer thread.

        If the thread exits before the consumers became ready, wake up
        anyone blocked in :meth:`start` with the outcome of the thread.
        """
        exc = None
        try:
            gt.wait()
        except Exception as e:
            exc = e

        if not self._consumers_ready.ready():
            self._consumers_ready.send_exception(exc)

    def start(self):
        """ Spawn the consumer thread (once) and wait until the consumers
        are ready.

        Failures to become ready are logged, not raised.
        """
        if not self._starting:
            self._starting = True

            _log.debug('starting %s', self)
            self._gt = self.container.spawn_managed_thread(
                self.run, protected=True)
            self._gt.link(self._handle_thread_exited)
        try:
            _log.debug('waiting for consumer ready %s', self)
            self._consumers_ready.wait()
        except QueueConsumerStopped:
            _log.debug('consumer was stopped before it started %s', self)
        except Exception as exc:
            _log.debug('consumer failed to start %s (%s)', self, exc)
        else:
            _log.debug('started %s', self)

    def stop(self):
        """ Stop the queue-consumer gracefully.

        Wait until the last provider has been unregistered and for
        the ConsumerMixin's greenthread to exit (i.e. until all pending
        messages have been acked or requeued and all consumers stopped).
        """
        if not self._consumers_ready.ready():
            _log.debug('stopping while consumer is starting %s', self)

            stop_exc = QueueConsumerStopped()

            # stopping before we have started successfully by brutally
            # killing the consumer thread as we don't have a way to hook
            # into the pre-consumption startup process
            self._gt.kill(stop_exc)

        self.wait_for_providers()

        try:
            _log.debug('waiting for consumer death %s', self)
            self._gt.wait()
        except QueueConsumerStopped:
            pass

        super(QueueConsumer, self).stop()
        _log.debug('stopped %s', self)

    def kill(self):
        """ Kill the queue-consumer.

        Unlike `stop()` any pending message ack or requeue-requests,
        requests to remove providers, etc are lost and the consume thread is
        asked to terminate as soon as possible.

        An error raised by the consumer thread while it terminates is
        logged and discarded so that the kill can complete.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self._gt and not self._gt.dead:
            # we can't just kill the thread because we have to give
            # ConsumerMixin a chance to close the sockets properly.
            self._providers = set()
            self._pending_messages = set()
            self._pending_ack_messages = []
            self._pending_requeue_messages = []
            self._pending_remove_providers = {}
            self.should_stop = True
            try:
                self._gt.wait()
            except Exception as exc:
                # we are already being killed; don't let the thread's own
                # failure abort the rest of the kill sequence
                _log.warning('%s raised `%s` during kill', self, exc)

            super(QueueConsumer, self).kill()
            _log.debug('killed %s', self)

    def unregister_provider(self, provider):
        """ Cancel the consumer of ``provider`` and unregister it.

        Blocks until the consumer thread has cancelled the consumer.
        """
        if not self._consumers_ready.ready():
            # we cannot handle the situation where we are starting up and
            # want to remove a consumer at the same time
            # TODO: With the upcoming error handling mechanism, this needs
            # TODO: to be thought through again.
            self._last_provider_unregistered.send()
            return

        removed_event = Event()
        # we can only cancel a consumer from within the consumer thread
        self._pending_remove_providers[provider] = removed_event
        # so we will just register the consumer to be canceled
        removed_event.wait()

        super(QueueConsumer, self).unregister_provider(provider)

    def ack_message(self, message):
        """ Stash ``message`` to be acked from the consumer thread. """
        _log.debug("stashing message-ack: %s", message)
        self._pending_messages.remove(message)
        self._pending_ack_messages.append(message)

    def requeue_message(self, message):
        """ Stash ``message`` to be requeued from the consumer thread. """
        _log.debug("stashing message-requeue: %s", message)
        self._pending_messages.remove(message)
        self._pending_requeue_messages.append(message)

    def _on_message(self, body, message):
        """ Track a received message until it is acked or requeued. """
        _log.debug("received message: %s", message)
        self._pending_messages.add(message)

    def _cancel_consumers_if_requested(self):
        """ Cancel the consumers of providers waiting to be removed. """
        # swap in a fresh dict so requests made while we iterate are kept
        # for the next iteration
        provider_remove_events = self._pending_remove_providers.items()
        self._pending_remove_providers = {}

        for provider, removed_event in provider_remove_events:
            consumer = self._consumers.pop(provider)

            _log.debug('cancelling consumer [%s]: %s', provider, consumer)
            consumer.cancel()
            removed_event.send()

    def _process_pending_message_acks(self):
        """ Ack and requeue all stashed messages, yielding to other
        greenthreads after each one.
        """
        messages = self._pending_ack_messages
        if messages:
            _log.debug('ack() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.ack()
                eventlet.sleep()

        messages = self._pending_requeue_messages
        if messages:
            _log.debug('requeue() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.requeue()
                eventlet.sleep()

    @property
    def connection(self):
        """ Kombu requirement; the connection is created on first use. """
        if self._connection is None:
            self._connection = Connection(self._amqp_uri)

        return self._connection

    def get_consumers(self, Consumer, channel):
        """ Kombu callback to set up consumers.

        Called after any (re)connection to the broker.
        """
        _log.debug('setting up consumers %s', self)

        for provider in self._providers:
            # track the message first, then hand it to the provider
            callbacks = [self._on_message, provider.handle_message]

            consumer = Consumer(queues=[provider.queue], callbacks=callbacks)
            consumer.qos(prefetch_count=self._prefetch_count)

            self._consumers[provider] = consumer

        return self._consumers.values()

    def on_iteration(self):
        """ Kombu callback for each `drain_events` loop iteration."""
        self._cancel_consumers_if_requested()

        self._process_pending_message_acks()

        num_consumers = len(self._consumers)
        num_pending_messages = len(self._pending_messages)

        # nothing left to consume from and nothing left to ack: we're done
        if num_consumers + num_pending_messages == 0:
            _log.debug('requesting stop after iteration')
            self.should_stop = True

    def on_connection_error(self, exc, interval):
        """ Kombu callback when the broker connection fails. """
        _log.warning(
            'broker connection error: %s. Retrying in %s seconds.',
            exc, interval)

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Kombu callback when consumers are ready to accept messages.

        Called after any (re)connection to the broker.
        """
        if not self._consumers_ready.ready():
            _log.debug('consumer started %s', self)
            self._consumers_ready.send(None)

    def consume(self, limit=None, timeout=None, safety_interval=0.1,
                **kwargs):
        """ Lifted from Kombu.

        We switch the order of the `break` and `self.on_iteration()` to
        avoid waiting on a drain_events timeout before breaking the loop.
        """
        elapsed = 0

        with self.consumer_context(**kwargs) as (conn, channel, consumers):
            for _ in limit and range(limit) or count():
                self.on_iteration()
                if self.should_stop:
                    break
                try:
                    conn.drain_events(timeout=safety_interval)
                except socket.timeout:
                    elapsed += safety_interval
                    # Excluding the following clause from coverage,
                    # as timeout never appears to be set - This method
                    # is a lift from kombu so will leave in place for now.
                    if timeout and elapsed >= timeout:  # pragma: no cover
                        raise
                except socket.error:
                    # socket errors are expected while shutting down
                    if not self.should_stop:
                        raise
                else:
                    yield
                    elapsed = 0
class TimerProvider(EntrypointProvider):
    """ Entrypoint provider that spawns a worker once per interval.

    The interval is taken from the container config under ``config_key``
    when a key is given, and otherwise (or when the key is absent from the
    config) from the ``interval`` passed to the constructor.
    """

    def __init__(self, interval, config_key):
        self.gt = None
        self.should_stop = Event()
        self.config_key = config_key
        self._default_interval = interval

    def prepare(self):
        """ Resolve the effective interval and store it on ``interval``. """
        fallback = self._default_interval
        if not self.config_key:
            self.interval = fallback
            return

        self.interval = self.container.config.get(self.config_key, fallback)

    def start(self):
        """ Run the interval loop in a container-managed thread. """
        _log.debug('starting %s', self)
        self.gt = self.container.spawn_managed_thread(self._run)

    def stop(self):
        """ Signal the interval loop to finish and wait for its thread. """
        _log.debug('stopping %s', self)
        self.should_stop.send(True)
        self.gt.wait()

    def kill(self):
        """ Kill the interval loop's thread without waiting for it. """
        _log.debug('killing %s', self)
        self.gt.kill()

    def _run(self):
        """ Runs the interval loop.

        This should not be called directly, rather the `start()` method
        should be used.
        """
        while not self.should_stop.ready():
            tick_began = time.time()

            self.handle_timer_tick()

            # the time spent handling the tick counts towards the interval
            remaining = self.interval - (time.time() - tick_began)
            self._sleep_or_stop(max(remaining, 0))

    def _sleep_or_stop(self, sleep_time):
        """ Sleeps for `sleep_time` seconds or until a `should_stop` event
        has been fired, whichever comes first.
        """
        try:
            with Timeout(sleep_time):
                self.should_stop.wait()
        except Timeout:
            # the timeout firing is the normal end of a cancellable sleep
            return

    def handle_timer_tick(self):
        """ Spawn a worker for this entrypoint, without any arguments. """
        # Note that we don't catch ContainerBeingKilled here. If that's
        # raised, there is nothing for us to do anyway. The exception
        # bubbles, and is caught by
        # :meth:`Container._handle_thread_exited`, though the triggered
        # `kill` is a no-op, since the container is already `_being_killed`.
        self.container.spawn_worker(self, (), {})
class ServiceContainer(object):
    """ Hosts one service class and manages its extensions and threads.

    On construction, every dependency and entrypoint declared on
    ``service_cls`` is bound to this container. The container then drives
    the lifecycle of those extensions (:meth:`start`, :meth:`stop`,
    :meth:`kill`), runs service methods in a bounded pool of worker threads
    and tracks the managed threads it spawns on behalf of extensions.
    """

    def __init__(self, service_cls, config):
        """ Bind the extensions declared on ``service_cls``.

        :param service_cls: the service class to host; a fresh instance is
            created for every worker in :meth:`spawn_worker`.
        :param config: mapping read with ``config.get`` for the worker limit
            and the serializer; also exposed as ``self.config``.
        """
        self.service_cls = service_cls
        self.config = config

        self.service_name = get_service_name(service_cls)
        self.shared_extensions = {}

        # a missing or falsy setting falls back to the default
        self.max_workers = (config.get(MAX_WORKERS_CONFIG_KEY) or DEFAULT_MAX_WORKERS)

        self.serializer = config.get(SERIALIZER_CONFIG_KEY, DEFAULT_SERIALIZER)

        # only the configured serializer is accepted
        self.accept = [self.serializer]

        self.entrypoints = SpawningSet()
        self.dependencies = SpawningSet()
        self.subextensions = SpawningSet()

        # bind every dependency declared on the service class, and collect
        # whatever `iter_extensions` yields for it as subextensions
        for attr_name, dependency in inspect.getmembers(
                service_cls, is_dependency):
            bound = dependency.bind(self.interface, attr_name)
            self.dependencies.add(bound)
            self.subextensions.update(iter_extensions(bound))

        # same for the entrypoints attached to each service method; methods
        # without the entrypoint attribute contribute nothing
        for method_name, method in inspect.getmembers(service_cls, is_method):
            entrypoints = getattr(method, ENTRYPOINT_EXTENSIONS_ATTR, [])
            for entrypoint in entrypoints:
                bound = entrypoint.bind(self.interface, method_name)
                self.entrypoints.add(bound)
                self.subextensions.update(iter_extensions(bound))

        self.started = False
        self._worker_pool = GreenPool(size=self.max_workers)

        # worker_ctx -> thread, and thread -> identifier respectively;
        # entries are removed by the thread-exit callbacks below
        self._worker_threads = {}
        self._managed_threads = {}
        self._being_killed = False

        # receives a result once the container has stopped or been killed
        self._died = Event()

    @property
    def extensions(self):
        """ A new set holding all entrypoints, dependencies and
        subextensions bound to this container.
        """
        return SpawningSet(self.entrypoints | self.dependencies | self.subextensions)

    @property
    def interface(self):
        """ An interface to this container for use by extensions.
        """
        return self

    def start(self):
        """ Start a container by starting all of its extensions.
        """
        _log.debug('starting %s', self)
        self.started = True

        with _log_time('started %s', self):
            # every extension is set up before any of them is started
            self.extensions.all.setup()
            self.extensions.all.start()

    def stop(self):
        """ Stop the container gracefully.

        First all entrypoints are asked to ``stop()``.
        This ensures that no new worker threads are started.

        It is the extensions' responsibility to gracefully shut down when
        ``stop()`` is called on them and only return when they have stopped.

        After all entrypoints have stopped the container waits for any
        active workers to complete.

        After all active workers have stopped the container stops all
        dependency providers.

        At this point there should be no more managed threads. In case there
        are any managed threads, they are killed by the container.
        """
        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if self._being_killed:
            # this race condition can happen when a container is hosted by a
            # runner and yields during its kill method; if it's unlucky in
            # scheduling the runner will try to stop() it before self._died
            # has a result
            _log.debug('already being killed %s', self)
            try:
                self._died.wait()
            except:
                pass  # don't re-raise if we died with an exception
            return

        _log.debug('stopping %s', self)

        with _log_time('stopped %s', self):

            # entrypoints have to be stopped before dependencies to ensure
            # that running workers can successfully complete
            self.entrypoints.all.stop()

            # there might still be some running workers, which we have to
            # wait for to complete before we can stop dependencies
            self._worker_pool.waitall()

            # it should be safe now to stop any dependency as there is no
            # active worker which could be using it
            self.dependencies.all.stop()

            # finally, stop remaining extensions
            self.subextensions.all.stop()

            # and any managed threads they spawned
            self._kill_managed_threads()

            self.started = False

            # if `kill` is called after `stop`, they race to send this
            if not self._died.ready():
                self._died.send(None)

    def kill(self, exc_info=None):
        """ Kill the container in a semi-graceful way.

        Entrypoints are killed, followed by any active worker threads.
        Next, dependencies are killed. Finally, any remaining managed threads
        are killed.

        If ``exc_info`` is provided, the exception will be raised by
        :meth:`wait`.
        """
        if self._being_killed:
            # this happens if a managed thread exits with an exception
            # while the container is being killed or if multiple errors
            # happen simultaneously
            _log.debug('already killing %s ... waiting for death', self)
            try:
                self._died.wait()
            except:
                pass  # don't re-raise if we died with an exception
            return

        # set before the `_died` check so that `spawn_worker` refuses new
        # work from this point on
        self._being_killed = True

        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if exc_info is not None:
            _log.info('killing %s due to %s', self, exc_info[1])
        else:
            _log.info('killing %s', self)

        # protect against extensions that throw during kill; the container
        # is already dying with an exception, so ignore anything else
        def safely_kill_extensions(ext_set):
            try:
                ext_set.kill()
            except Exception as exc:
                _log.warning('Extension raised `%s` during kill', exc)

        safely_kill_extensions(self.entrypoints.all)
        self._kill_worker_threads()
        # `extensions` includes the entrypoints killed above
        safely_kill_extensions(self.extensions.all)
        self._kill_managed_threads()

        self.started = False

        # if `kill` is called after `stop`, they race to send this
        if not self._died.ready():
            self._died.send(None, exc_info)

    def wait(self):
        """ Block until the container has been stopped.

        If the container was stopped due to an exception, ``wait()`` will
        raise it.

        Any unhandled exception raised in a managed thread or in the
        worker lifecycle (e.g. inside :meth:`DependencyProvider.worker_setup`)
        results in the container being ``kill()``ed, and the exception
        raised from ``wait()``.
        """
        return self._died.wait()

    def spawn_worker(self, entrypoint, args, kwargs,
                     context_data=None, handle_result=None):
        """ Spawn a worker thread for running the service method decorated
        by `entrypoint`.

        ``args`` and ``kwargs`` are used as parameters for the service
        method.

        ``context_data`` is used to initialize a ``WorkerContext``.

        ``handle_result`` is an optional function which may be passed
        in by the entrypoint. It is called with the result returned
        or error raised by the service method. If provided it must return a
        value for ``result`` and ``exc_info`` to propagate to dependencies;
        these may be different to those returned by the service method.

        Returns the ``WorkerContext`` created for the worker. Raises
        ``ContainerBeingKilled`` if the container is being killed.
        """

        if self._being_killed:
            _log.info("Worker spawn prevented due to being killed")
            raise ContainerBeingKilled()

        # every worker gets its own service instance
        service = self.service_cls()
        worker_ctx = WorkerContext(self, service, entrypoint, args, kwargs, data=context_data)

        _log.debug('spawning %s', worker_ctx)
        gt = self._worker_pool.spawn(self._run_worker, worker_ctx, handle_result)
        gt.link(self._handle_worker_thread_exited, worker_ctx)

        self._worker_threads[worker_ctx] = gt
        return worker_ctx

    def spawn_managed_thread(self, fn, identifier=None):
        """ Spawn a managed thread to run ``fn`` on behalf of an extension.
        The passed `identifier` will be included in logs related to this
        thread, and otherwise defaults to `fn.__name__`, if it is set.

        Any uncaught errors inside ``fn`` cause the container to be killed.

        It is the caller's responsibility to terminate their spawned threads.
        Threads are killed automatically if they are still running after
        all extensions are stopped during :meth:`ServiceContainer.stop`.

        Extensions should delegate all thread spawning to the container.

        Returns the spawned thread.
        """
        if identifier is None:
            identifier = getattr(fn, '__name__', "<unknown>")

        gt = eventlet.spawn(fn)
        self._managed_threads[gt] = identifier
        gt.link(self._handle_managed_thread_exited, identifier)
        return gt

    def _run_worker(self, worker_ctx, handle_result):
        """ Run the full lifecycle of one worker.

        Dependencies are injected and set up, the service method is called,
        ``handle_result`` (if given) may replace the result and exception
        info, and finally the dependencies are told the outcome and torn
        down. Exceptions raised by the service method are logged and passed
        on as ``exc_info`` rather than propagated.
        """
        _log.debug('setting up %s', worker_ctx)

        _log.debug('call stack for %s: %s',
                   worker_ctx, '->'.join(worker_ctx.call_id_stack))

        with _log_time('ran worker %s', worker_ctx):

            self._inject_dependencies(worker_ctx)
            self._worker_setup(worker_ctx)

            result = exc_info = None
            method_name = worker_ctx.entrypoint.method_name
            method = getattr(worker_ctx.service, method_name)
            try:

                _log.debug('calling handler for %s', worker_ctx)

                with _log_time('ran handler for %s', worker_ctx):
                    result = method(*worker_ctx.args, **worker_ctx.kwargs)
            except Exception as exc:
                _log.info('error handling worker %s: %s', worker_ctx, exc,
                          exc_info=True)
                exc_info = sys.exc_info()

            if handle_result is not None:
                _log.debug('handling result for %s', worker_ctx)

                with _log_time('handled result for %s', worker_ctx):
                    result, exc_info = handle_result(worker_ctx, result, exc_info)

            with _log_time('tore down worker %s', worker_ctx):

                self._worker_result(worker_ctx, result, exc_info)

                # we don't need this any more, and breaking the cycle means
                # this can be reclaimed immediately, rather than waiting for a
                # gc sweep
                del exc_info

                self._worker_teardown(worker_ctx)

    def _inject_dependencies(self, worker_ctx):
        """ Set each provider's dependency on the worker's service instance,
        under the provider's ``attr_name``.
        """
        for provider in self.dependencies:
            dependency = provider.get_dependency(worker_ctx)
            setattr(worker_ctx.service, provider.attr_name, dependency)

    def _worker_setup(self, worker_ctx):
        """ Call ``worker_setup`` on every dependency provider. """
        for provider in self.dependencies:
            provider.worker_setup(worker_ctx)

    def _worker_result(self, worker_ctx, result, exc_info):
        """ Pass the worker's result and exception info to every dependency
        provider.
        """
        _log.debug('signalling result for %s', worker_ctx)
        for provider in self.dependencies:
            provider.worker_result(worker_ctx, result, exc_info)

    def _worker_teardown(self, worker_ctx):
        """ Call ``worker_teardown`` on every dependency provider. """
        for provider in self.dependencies:
            provider.worker_teardown(worker_ctx)

    def _kill_worker_threads(self):
        """ Kill any currently executing worker threads.

        See :meth:`ServiceContainer.spawn_worker`
        """
        num_workers = len(self._worker_threads)

        if num_workers:
            _log.warning('killing %s active workers(s)', num_workers)
            # iterate over a copy: the exit callback removes entries from
            # the dict
            for worker_ctx, gt in list(self._worker_threads.items()):
                _log.warning('killing active worker for %s', worker_ctx)
                gt.kill()

    def _kill_managed_threads(self):
        """ Kill any currently executing managed threads.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_threads = len(self._managed_threads)

        if num_threads:
            _log.warning('killing %s managed thread(s)', num_threads)
            # iterate over a copy: the exit callback removes entries from
            # the dict
            for gt, identifier in list(self._managed_threads.items()):
                _log.warning('killing managed thread `%s`', identifier)
                gt.kill()

    def _handle_worker_thread_exited(self, gt, worker_ctx):
        """ Link callback for worker threads: forget the worker, then check
        how its thread exited.
        """
        self._worker_threads.pop(worker_ctx, None)
        self._handle_thread_exited(gt)

    def _handle_managed_thread_exited(self, gt, extension):
        """ Link callback for managed threads: forget the thread, then check
        how it exited.

        ``extension`` receives the identifier that was passed to ``link`` in
        :meth:`spawn_managed_thread`; it is not used here.
        """
        self._managed_threads.pop(gt, None)
        self._handle_thread_exited(gt)

    def _handle_thread_exited(self, gt):
        """ Kill the container if the thread exited with an error.

        A thread that was killed (``GreenletExit``) is only logged.
        """
        try:
            gt.wait()

        except GreenletExit:
            # we don't care much about threads killed by the container
            # this can happen in stop() and kill() if extensions
            # don't properly take care of their threads
            _log.debug('%s thread killed by container', self)

        except Exception:
            _log.error('%s thread exited with error', self, exc_info=True)
            # any uncaught error in a thread is unexpected behavior
            # and probably a bug in the extension or container.
            # to be safe we call self.kill() to kill our dependencies and
            # provide the exception info to be raised in self.wait().
            self.kill(sys.exc_info())

    def __repr__(self):
        """ Show the service name and the container's ``id`` in hex. """
        service_name = self.service_name
        return '<ServiceContainer [{}] at 0x{:x}>'.format(
            service_name, id(self))
class ServiceContainer(object):
    """ Hosts one service class and manages its extensions and threads.

    On construction, every dependency and entrypoint declared on
    ``service_cls`` is bound to this container. The container then drives
    the lifecycle of those extensions (:meth:`start`, :meth:`stop`,
    :meth:`kill`), runs service methods in a bounded pool of worker threads
    and tracks the managed threads it spawns on behalf of extensions.
    """

    def __init__(self, service_cls, config):
        """ Bind the extensions declared on ``service_cls``.

        :param service_cls: the service class to host; a fresh instance is
            created for every worker in :meth:`spawn_worker`.
        :param config: mapping read with ``config.get`` for the worker
            limit and handed to ``serialization.setup``; also exposed as
            ``self.config``.
        """
        self.service_cls = service_cls
        self.config = config

        self.service_name = get_service_name(service_cls)
        self.shared_extensions = {}

        # a missing or falsy setting falls back to the default
        self.max_workers = (
            config.get(MAX_WORKERS_CONFIG_KEY) or DEFAULT_MAX_WORKERS)

        self.serializer, self.accept = serialization.setup(self.config)

        self.entrypoints = SpawningSet()
        self.dependencies = SpawningSet()
        self.subextensions = SpawningSet()

        # bind every dependency declared on the service class, and collect
        # whatever `iter_extensions` yields for it as subextensions
        for attr_name, dependency in inspect.getmembers(service_cls,
                                                        is_dependency):
            bound = dependency.bind(self.interface, attr_name)
            self.dependencies.add(bound)
            self.subextensions.update(iter_extensions(bound))

        # same for the entrypoints attached to each service method; methods
        # without the entrypoint attribute contribute nothing
        for method_name, method in inspect.getmembers(service_cls, is_method):
            entrypoints = getattr(method, ENTRYPOINT_EXTENSIONS_ATTR, [])
            for entrypoint in entrypoints:
                bound = entrypoint.bind(self.interface, method_name)
                self.entrypoints.add(bound)
                self.subextensions.update(iter_extensions(bound))

        self.started = False
        self._worker_pool = GreenPool(size=self.max_workers)

        # worker_ctx -> thread, and thread -> identifier respectively;
        # entries are removed by the thread-exit callbacks below
        self._worker_threads = {}
        self._managed_threads = {}
        self._being_killed = False

        # receives a result once the container has stopped or been killed
        self._died = Event()

    @property
    def extensions(self):
        """ A new set holding all entrypoints, dependencies and
        subextensions bound to this container.
        """
        return SpawningSet(
            self.entrypoints | self.dependencies | self.subextensions
        )

    @property
    def interface(self):
        """ An interface to this container for use by extensions.
        """
        return self

    def start(self):
        """ Start a container by starting all of its extensions.
        """
        _log.debug('starting %s', self)
        self.started = True

        with _log_time('started %s', self):
            # every extension is set up before any of them is started
            self.extensions.all.setup()
            self.extensions.all.start()

    def stop(self):
        """ Stop the container gracefully.

        First all entrypoints are asked to ``stop()``.
        This ensures that no new worker threads are started.

        It is the extensions' responsibility to gracefully shut down when
        ``stop()`` is called on them and only return when they have stopped.

        After all entrypoints have stopped the container waits for any
        active workers to complete.

        After all active workers have stopped the container stops all
        dependency providers.

        At this point there should be no more managed threads. In case there
        are any managed threads, they are killed by the container.
        """
        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if self._being_killed:
            # this race condition can happen when a container is hosted by a
            # runner and yields during its kill method; if it's unlucky in
            # scheduling the runner will try to stop() it before self._died
            # has a result
            _log.debug('already being killed %s', self)
            try:
                self._died.wait()
            except:
                pass  # don't re-raise if we died with an exception
            return

        _log.debug('stopping %s', self)

        with _log_time('stopped %s', self):

            # entrypoints have to be stopped before dependencies to ensure
            # that running workers can successfully complete
            self.entrypoints.all.stop()

            # there might still be some running workers, which we have to
            # wait for to complete before we can stop dependencies
            self._worker_pool.waitall()

            # it should be safe now to stop any dependency as there is no
            # active worker which could be using it
            self.dependencies.all.stop()

            # finally, stop remaining extensions
            self.subextensions.all.stop()

            # and any managed threads they spawned
            self._kill_managed_threads()

            self.started = False

            # if `kill` is called after `stop`, they race to send this
            if not self._died.ready():
                self._died.send(None)

    def kill(self, exc_info=None):
        """ Kill the container in a semi-graceful way.

        Entrypoints are killed, followed by any active worker threads.
        Next, dependencies are killed. Finally, any remaining managed threads
        are killed.

        If ``exc_info`` is provided, the exception will be raised by
        :meth:`wait`.
        """
        if self._being_killed:
            # this happens if a managed thread exits with an exception
            # while the container is being killed or if multiple errors
            # happen simultaneously
            _log.debug('already killing %s ... waiting for death', self)
            try:
                self._died.wait()
            except:
                pass  # don't re-raise if we died with an exception
            return

        # set before the `_died` check so that `spawn_worker` refuses new
        # work from this point on
        self._being_killed = True

        if self._died.ready():
            _log.debug('already stopped %s', self)
            return

        if exc_info is not None:
            _log.info('killing %s due to %s', self, exc_info[1])
        else:
            _log.info('killing %s', self)

        # protect against extensions that throw during kill; the container
        # is already dying with an exception, so ignore anything else
        def safely_kill_extensions(ext_set):
            try:
                ext_set.kill()
            except Exception as exc:
                _log.warning('Extension raised `%s` during kill', exc)

        safely_kill_extensions(self.entrypoints.all)
        self._kill_worker_threads()
        # `extensions` includes the entrypoints killed above
        safely_kill_extensions(self.extensions.all)
        self._kill_managed_threads()

        self.started = False

        # if `kill` is called after `stop`, they race to send this
        if not self._died.ready():
            self._died.send(None, exc_info)

    def wait(self):
        """ Block until the container has been stopped.

        If the container was stopped due to an exception, ``wait()`` will
        raise it.

        Any unhandled exception raised in a managed thread or in the
        worker lifecycle (e.g. inside :meth:`DependencyProvider.worker_setup`)
        results in the container being ``kill()``ed, and the exception
        raised from ``wait()``.
        """
        return self._died.wait()

    def spawn_worker(self, entrypoint, args, kwargs,
                     context_data=None, handle_result=None):
        """ Spawn a worker thread for running the service method decorated
        by `entrypoint`.

        ``args`` and ``kwargs`` are used as parameters for the service
        method.

        ``context_data`` is used to initialize a ``WorkerContext``.

        ``handle_result`` is an optional function which may be passed
        in by the entrypoint. It is called with the result returned
        or error raised by the service method. If provided it must return a
        value for ``result`` and ``exc_info`` to propagate to dependencies;
        these may be different to those returned by the service method.

        Returns the ``WorkerContext`` created for the worker. Raises
        ``ContainerBeingKilled`` if the container is being killed.
        """

        if self._being_killed:
            _log.info("Worker spawn prevented due to being killed")
            raise ContainerBeingKilled()

        # every worker gets its own service instance
        service = self.service_cls()
        worker_ctx = WorkerContext(
            self, service, entrypoint, args, kwargs, data=context_data
        )

        _log.debug('spawning %s', worker_ctx)
        gt = self._worker_pool.spawn(
            self._run_worker, worker_ctx, handle_result
        )
        gt.link(self._handle_worker_thread_exited, worker_ctx)

        self._worker_threads[worker_ctx] = gt
        return worker_ctx

    def spawn_managed_thread(self, fn, identifier=None):
        """ Spawn a managed thread to run ``fn`` on behalf of an extension.
        The passed `identifier` will be included in logs related to this
        thread, and otherwise defaults to `fn.__name__`, if it is set.

        Any uncaught errors inside ``fn`` cause the container to be killed.

        It is the caller's responsibility to terminate their spawned threads.
        Threads are killed automatically if they are still running after
        all extensions are stopped during :meth:`ServiceContainer.stop`.

        Extensions should delegate all thread spawning to the container.

        Returns the spawned thread.
        """
        if identifier is None:
            identifier = getattr(fn, '__name__', "<unknown>")

        gt = eventlet.spawn(fn)
        self._managed_threads[gt] = identifier
        gt.link(self._handle_managed_thread_exited, identifier)
        return gt

    def _run_worker(self, worker_ctx, handle_result):
        """ Run the full lifecycle of one worker.

        Dependencies are injected and set up, the service method is called,
        ``handle_result`` (if given) may replace the result and exception
        info, and finally the dependencies are told the outcome and torn
        down. Exceptions raised by the service method are logged and passed
        on as ``exc_info`` rather than propagated; those listed in the
        entrypoint's ``expected_exceptions`` are logged as warnings only.
        """
        _log.debug('setting up %s', worker_ctx)

        _log.debug('call stack for %s: %s',
                   worker_ctx, '->'.join(worker_ctx.call_id_stack))

        with _log_time('ran worker %s', worker_ctx):

            self._inject_dependencies(worker_ctx)
            self._worker_setup(worker_ctx)

            result = exc_info = None
            method_name = worker_ctx.entrypoint.method_name
            method = getattr(worker_ctx.service, method_name)
            try:

                _log.debug('calling handler for %s', worker_ctx)

                with _log_time('ran handler for %s', worker_ctx):
                    result = method(*worker_ctx.args, **worker_ctx.kwargs)
            except Exception as exc:
                if isinstance(exc, worker_ctx.entrypoint.expected_exceptions):
                    _log.warning(
                        '(expected) error handling worker %s: %s',
                        worker_ctx, exc, exc_info=True)
                else:
                    _log.exception(
                        'error handling worker %s: %s', worker_ctx, exc)
                exc_info = sys.exc_info()

            if handle_result is not None:
                _log.debug('handling result for %s', worker_ctx)

                with _log_time('handled result for %s', worker_ctx):
                    result, exc_info = handle_result(
                        worker_ctx, result, exc_info)

            with _log_time('tore down worker %s', worker_ctx):

                self._worker_result(worker_ctx, result, exc_info)

                # we don't need this any more, and breaking the cycle means
                # this can be reclaimed immediately, rather than waiting for a
                # gc sweep
                del exc_info

                self._worker_teardown(worker_ctx)

    def _inject_dependencies(self, worker_ctx):
        """ Set each provider's dependency on the worker's service instance,
        under the provider's ``attr_name``.
        """
        for provider in self.dependencies:
            dependency = provider.get_dependency(worker_ctx)
            setattr(worker_ctx.service, provider.attr_name, dependency)

    def _worker_setup(self, worker_ctx):
        """ Call ``worker_setup`` on every dependency provider. """
        for provider in self.dependencies:
            provider.worker_setup(worker_ctx)

    def _worker_result(self, worker_ctx, result, exc_info):
        """ Pass the worker's result and exception info to every dependency
        provider.
        """
        _log.debug('signalling result for %s', worker_ctx)
        for provider in self.dependencies:
            provider.worker_result(worker_ctx, result, exc_info)

    def _worker_teardown(self, worker_ctx):
        """ Call ``worker_teardown`` on every dependency provider. """
        for provider in self.dependencies:
            provider.worker_teardown(worker_ctx)

    def _kill_worker_threads(self):
        """ Kill any currently executing worker threads.

        See :meth:`ServiceContainer.spawn_worker`
        """
        num_workers = len(self._worker_threads)

        if num_workers:
            _log.warning('killing %s active workers(s)', num_workers)
            # iterate over a copy: the exit callback removes entries from
            # the dict
            for worker_ctx, gt in list(self._worker_threads.items()):
                _log.warning('killing active worker for %s', worker_ctx)
                gt.kill()

    def _kill_managed_threads(self):
        """ Kill any currently executing managed threads.

        See :meth:`ServiceContainer.spawn_managed_thread`
        """
        num_threads = len(self._managed_threads)

        if num_threads:
            _log.warning('killing %s managed thread(s)', num_threads)
            # iterate over a copy: the exit callback removes entries from
            # the dict
            for gt, identifier in list(self._managed_threads.items()):
                _log.warning('killing managed thread `%s`', identifier)
                gt.kill()

    def _handle_worker_thread_exited(self, gt, worker_ctx):
        """ Link callback for worker threads: forget the worker, then check
        how its thread exited.
        """
        self._worker_threads.pop(worker_ctx, None)
        self._handle_thread_exited(gt)

    def _handle_managed_thread_exited(self, gt, extension):
        """ Link callback for managed threads: forget the thread, then check
        how it exited.

        ``extension`` receives the identifier that was passed to ``link`` in
        :meth:`spawn_managed_thread`; it is not used here.
        """
        self._managed_threads.pop(gt, None)
        self._handle_thread_exited(gt)

    def _handle_thread_exited(self, gt):
        """ Kill the container if the thread exited with an error.

        A thread that was killed (``GreenletExit``) is only logged.
        """
        try:
            gt.wait()

        except GreenletExit:
            # we don't care much about threads killed by the container
            # this can happen in stop() and kill() if extensions
            # don't properly take care of their threads
            _log.debug('%s thread killed by container', self)

        except Exception:
            _log.critical('%s thread exited with error', self,
                          exc_info=True)
            # any uncaught error in a thread is unexpected behavior
            # and probably a bug in the extension or container.
            # to be safe we call self.kill() to kill our dependencies and
            # provide the exception info to be raised in self.wait().
            self.kill(sys.exc_info())

    def __repr__(self):
        """ Show the service name and the container's ``id`` in hex. """
        service_name = self.service_name
        return '<ServiceContainer [{}] at 0x{:x}>'.format(
            service_name, id(self))
class QueueConsumer(SharedExtension, ProviderCollector, ConsumerMixin):
    """ Shared extension that consumes from the queues of all registered
    providers in a single managed thread.

    Acks, requeues and consumer cancellations requested from other threads
    are only stashed; they are carried out from :meth:`on_iteration`, which
    runs inside the consume loop.
    """

    amqp_uri = None
    prefetch_count = None

    def __init__(self):
        self._connection = None

        # provider -> kombu consumer
        self._consumers = {}

        # messages received but not yet acked or requeued
        self._pending_messages = set()
        # work stashed for the next loop iteration
        self._pending_ack_messages = []
        self._pending_requeue_messages = []
        # provider -> event sent once its consumer has been cancelled
        self._pending_remove_providers = {}

        self._gt = None
        self._starting = False

        self._consumers_ready = Event()
        super(QueueConsumer, self).__init__()

    def _handle_thread_exited(self, gt):
        """ Link callback for the consumer thread.

        If the thread exits before the consumers became ready, whoever is
        blocked in :meth:`start` is released with the thread's exception.
        """
        exc = None
        try:
            gt.wait()
        except Exception as e:
            exc = e

        if not self._consumers_ready.ready():
            self._consumers_ready.send_exception(exc)

    def setup(self):
        """ Read the broker URI, accepted content types and prefetch count
        (the container's ``max_workers``) from the container and verify the
        URI.
        """
        self.amqp_uri = self.container.config[AMQP_URI_CONFIG_KEY]
        self.accept = self.container.accept
        self.prefetch_count = self.container.max_workers
        verify_amqp_uri(self.amqp_uri)

    def start(self):
        """ Start the consumer thread (first call only) and block until the
        consumers are ready or the thread has failed or been stopped.

        Start-up failures are logged, not raised.
        """
        if not self._starting:
            self._starting = True

            _log.debug('starting %s', self)
            self._gt = self.container.spawn_managed_thread(
                self.run, protected=True)
            self._gt.link(self._handle_thread_exited)

        # every caller waits for readiness, not just the first one
        try:
            _log.debug('waiting for consumer ready %s', self)
            self._consumers_ready.wait()
        except QueueConsumerStopped:
            _log.debug('consumer was stopped before it started %s', self)
        except Exception as exc:
            _log.debug('consumer failed to start %s (%s)', self, exc)
        else:
            _log.debug('started %s', self)

    def stop(self):
        """ Stop the queue-consumer gracefully.

        Wait until the last provider has been unregistered and for
        the ConsumerMixin's greenthread to exit (i.e. until all pending
        messages have been acked or requeued and all consumers stopped).
        """
        if not self._consumers_ready.ready():
            _log.debug('stopping while consumer is starting %s', self)

            stop_exc = QueueConsumerStopped()

            # stopping before we have started successfully by brutally
            # killing the consumer thread as we don't have a way to hook
            # into the pre-consumption startup process
            self._gt.kill(stop_exc)

        self.wait_for_providers()

        try:
            _log.debug('waiting for consumer death %s', self)
            self._gt.wait()
        except QueueConsumerStopped:
            pass

        super(QueueConsumer, self).stop()
        _log.debug('stopped %s', self)

    def kill(self):
        """ Kill the queue-consumer.

        Unlike `stop()` any pending message ack or requeue-requests,
        requests to remove providers, etc are lost and the consume thread is
        asked to terminate as soon as possible.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self._gt is not None and not self._gt.dead:
            # we can't just kill the thread because we have to give
            # ConsumerMixin a chance to close the sockets properly.
            self._providers = set()
            self._pending_messages = set()
            self._pending_ack_messages = []
            self._pending_requeue_messages = []
            self._pending_remove_providers = {}
            self.should_stop = True
            try:
                self._gt.wait()
            except Exception as exc:
                # discard the exception since we're already being killed
                _log.warning(
                    'QueueConsumer %s raised `%s` during kill', self, exc)

            super(QueueConsumer, self).kill()
            _log.debug('killed %s', self)

    def unregister_provider(self, provider):
        """ Cancel ``provider``'s consumer and unregister the provider.

        Blocks until the consume loop has cancelled the consumer. While the
        consumer is still starting up, nothing is cancelled and the
        "last provider unregistered" event is sent instead.
        """
        if not self._consumers_ready.ready():
            # we cannot handle the situation where we are starting up and
            # want to remove a consumer at the same time
            # TODO: With the upcoming error handling mechanism, this needs
            # TODO: to be thought through again.
            self._last_provider_unregistered.send()
            return

        removed_event = Event()
        # we can only cancel a consumer from within the consumer thread
        self._pending_remove_providers[provider] = removed_event
        # so we will just register the consumer to be canceled
        removed_event.wait()

        super(QueueConsumer, self).unregister_provider(provider)

    def ack_message(self, message):
        """ Stash ``message`` to be acked on the next loop iteration. """
        _log.debug("stashing message-ack: %s", message)
        self._pending_messages.remove(message)
        self._pending_ack_messages.append(message)

    def requeue_message(self, message):
        """ Stash ``message`` to be requeued on the next loop iteration. """
        _log.debug("stashing message-requeue: %s", message)
        self._pending_messages.remove(message)
        self._pending_requeue_messages.append(message)

    def _on_message(self, body, message):
        """ Consumer callback: track ``message`` as pending until it is
        acked or requeued.
        """
        _log.debug("received message: %s", message)
        self._pending_messages.add(message)

    def _cancel_consumers_if_requested(self):
        """ Cancel the consumers of all providers waiting to be removed and
        wake up the threads blocked in :meth:`unregister_provider`.
        """
        # swap in a fresh dict first so requests arriving while we work
        # are kept for the next iteration
        provider_remove_events = self._pending_remove_providers.items()
        self._pending_remove_providers = {}

        for provider, removed_event in provider_remove_events:
            consumer = self._consumers.pop(provider)

            _log.debug('cancelling consumer [%s]: %s', provider, consumer)
            consumer.cancel()
            removed_event.send()

    def _process_pending_message_acks(self):
        """ Ack and requeue all stashed messages, yielding to other
        green threads after each one.
        """
        messages = self._pending_ack_messages
        if messages:
            _log.debug('ack() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.ack()
                eventlet.sleep()

        messages = self._pending_requeue_messages
        if messages:
            _log.debug('requeue() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.requeue()
                eventlet.sleep()

    @property
    def connection(self):
        """ Kombu requirement """
        if self.amqp_uri is None:
            return  # don't cache a connection during introspection

        if self._connection is None:
            self._connection = Connection(self.amqp_uri)

        return self._connection

    def get_consumers(self, Consumer, channel):
        """ Kombu callback to set up consumers.

        Called after any (re)connection to the broker.
        """
        _log.debug('setting up consumers %s', self)

        for provider in self._providers:
            # `_on_message` runs first so the message is tracked as pending
            # before the provider handles it
            callbacks = [self._on_message, provider.handle_message]

            consumer = Consumer(queues=[provider.queue], callbacks=callbacks,
                                accept=self.accept)
            consumer.qos(prefetch_count=self.prefetch_count)

            self._consumers[provider] = consumer

        return self._consumers.values()

    def on_iteration(self):
        """ Kombu callback for each `drain_events` loop iteration."""
        self._cancel_consumers_if_requested()

        self._process_pending_message_acks()

        num_consumers = len(self._consumers)
        num_pending_messages = len(self._pending_messages)

        # nothing left to consume and nothing left to ack or requeue
        if num_consumers + num_pending_messages == 0:
            _log.debug('requesting stop after iteration')
            self.should_stop = True

    def on_connection_error(self, exc, interval):
        """ Log a failed broker connection and the delay before the retry.
        """
        _log.warning(
            "Error connecting to broker at {} ({}).\n"
            "Retrying in {} seconds.".format(self.amqp_uri, exc, interval))

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """ Kombu callback when consumers are ready to accept messages.

        Called after any (re)connection to the broker.
        """
        if not self._consumers_ready.ready():
            _log.debug('consumer started %s', self)
            self._consumers_ready.send(None)

        # providers may optionally define their own `on_consume_ready` hook
        for provider in self._providers:
            try:
                callback = provider.on_consume_ready
            except AttributeError:
                pass
            else:
                callback()

    def consume(self, limit=None, timeout=None, safety_interval=0.1, **kwargs):
        """ Lifted from Kombu.

        We switch the order of the `break` and `self.on_iteration()` to
        avoid waiting on a drain_events timeout before breaking the loop.
        """
        elapsed = 0

        with self.consumer_context(**kwargs) as (conn, channel, consumers):
            # kept exactly as in kombu: a falsy `limit` means "no limit"
            for _ in limit and range(limit) or count():
                self.on_iteration()
                if self.should_stop:
                    break
                try:
                    conn.drain_events(timeout=safety_interval)
                except socket.timeout:
                    elapsed += safety_interval
                    # Excluding the following clause from coverage,
                    # as timeout never appears to be set - This method
                    # is a lift from kombu so will leave in place for now.
                    if timeout and elapsed >= timeout:  # pragma: no cover
                        raise
                except socket.error:
                    # socket errors are expected while shutting down
                    if not self.should_stop:
                        raise
                else:
                    yield
                    elapsed = 0
class ScheduledMessageService(object):
    """Store deferred messages and publish them when they become due.

    Two green threads cooperate:

    * the *listener* consumes scheduler commands from the message queue and
      wakes the dispatcher after each one;
    * the *dispatcher* publishes deferred messages whose time has come and
      then sleeps until the next one is due (or until it is woken).
    """

    # bounds for how long the dispatcher sleeps between checks
    MIN_SLEEP_TIME = timedelta(seconds=1)
    MAX_SLEEP_TIME = timedelta(minutes=60)
    # claims older than this are considered dead and are released by cleanup()
    MAX_CLAIM_TIME = timedelta(minutes=5)

    _SECONDS_PER_DAY = 86400

    def __init__(self, context):
        self.context = context
        # used purely as a wake-up signal for the dispatcher
        self.service_queue = Event()
        self._listener = None
        self._dispatcher = None

    @staticmethod
    def _whole_seconds(delta):
        """Return ``delta`` as a whole number of seconds (microseconds are
        ignored)."""
        return delta.days * ScheduledMessageService._SECONDS_PER_DAY + delta.seconds

    def run(self):
        """Run listener and dispatcher until both finish or we are killed.

        Whatever was started is killed and waited for on the way out, even
        if start-up itself failed part-way.
        """
        procs = []
        try:
            with self.context:
                self._listener = self._start_listener()
                procs.append(self._listener)
                self._dispatcher = spawn(self.run_dispatcher)
                procs.append(self._dispatcher)
                waitall(procs)
        except GreenletExit:
            pass
        finally:
            if procs:
                killall(procs)
                waitall(procs)

    ################################################################
    # The listener consumes messages on the scheduled message queue
    # and stores the deferred messages in the database.
    ################################################################

    def _start_listener(self):
        """Start a worker consuming scheduler commands and return it."""
        @always_ack
        def cb(message_data, message):
            with self.context:
                _handle_scheduler_command(message_data, message, self.context)
                # the command may have changed what is due next
                self.wakeup_dispatcher()

        dispatch = MessageDispatch(self.context)
        return dispatch.start_worker(SCHEDULER_COMMAND, cb)

    ##############################################################
    # The dispatcher consumes deferred messages from the database
    # when their scheduled time arrives and spits them out
    # to the message broker
    ##############################################################

    def run_dispatcher(self):
        """Dispatcher loop: send what is ready, then sleep until the next
        message is due or until :meth:`wakeup_dispatcher` is called."""
        try:
            # cleanup any mess left over last time...
            with self.context:
                self.cleanup()
                while True:
                    log.info("checking for ready messages...")
                    last_time = self.send_ready_messages()
                    sleep_time = self._calc_sleep(last_time)
                    log.info("sleeping for %s" % sleep_time)
                    sleep_secs = self._whole_seconds(sleep_time)
                    try:
                        with_timeout(sleep_secs, self.service_queue.wait)
                    except TimeoutError:
                        pass
                    # re-arm the wake-up signal if it was what woke us
                    if self.service_queue.ready():
                        self.service_queue.reset()
        except GreenletExit:
            log.debug("ScheduledMessageService dispatcher exiting...")

    def wakeup_dispatcher(self):
        """Wake the dispatcher if it has not been signalled already."""
        if not self.service_queue.ready():
            self.service_queue.send(True)

    def _calc_sleep(self, after=None):
        """Return how long to sleep before the next check, as a timedelta.

        The result is a whole number of seconds, clamped to
        ``[MIN_SLEEP_TIME, MAX_SLEEP_TIME]``. With nothing scheduled the
        maximum is returned.
        """
        next_time = self.find_next_send_time(after=after)
        if next_time is None:
            sleep_time = self.MAX_SLEEP_TIME
        else:
            sleep_time = next_time - datetime.utcnow()
            # round up to the next whole second
            sleep_time += timedelta(seconds=1)
            sleep_time -= timedelta(microseconds=sleep_time.microseconds)
            if sleep_time < self.MIN_SLEEP_TIME:
                sleep_time = self.MIN_SLEEP_TIME
            if sleep_time > self.MAX_SLEEP_TIME:
                sleep_time = self.MAX_SLEEP_TIME
        return sleep_time

    def find_next_send_time(self, after=None):
        """Return the timestamp of the first message in the deferred view
        after ``after`` (default: now), or ``None`` if there is none."""
        if after is None:
            after = datetime.utcnow()
        after_str = DateTimeField()._to_json(after)
        next_query = dict(
            startkey=[False, after_str, {}],
            endkey=[True, None],
            include_docs=False,
            descending=False,
            limit=1,
        )
        rows = view_deferred_messages_by_timestamp(self.context.db,
                                                   **next_query)
        for row in rows:
            return DateTimeField()._to_python(row.key[1])
        return None

    def send_ready_messages(self):
        """Dispatch due messages in batches until none are left.

        Returns the time used for the last (empty) query.
        """
        while True:
            now = datetime.utcnow()
            now_str = DateTimeField()._to_json(now)
            query = dict(
                startkey=[False, None],
                endkey=[False, now_str, {}],
                include_docs=True,
                descending=False,
                limit=100,
            )
            vr = view_deferred_messages_by_timestamp(self.context.db, **query)
            batch = [DeferredAMQPMessage.wrap(r.doc) for r in vr]
            if not batch:
                break

            dispatch_count = 0
            for message in batch:
                try:
                    if self._dispatch_message(message):
                        dispatch_count += 1
                except GreenletExit:
                    # asked to stop, go ahead and quit.
                    raise
                except Exception:
                    log.error("Unexected error dispatching message %s: %s" %
                              (message, traceback.format_exc()))
            log.info("Dispatched %d messages" % dispatch_count)
        return now

    def _dispatch_message(self, message):
        """Claim and publish one deferred message.

        Returns True if the message was published (and deleted), a falsy
        value if it could not be claimed or publishing failed. A failed
        message is rescheduled.
        """
        if not message.claim(self.context.db):
            return False

        try:
            publisher = Publisher(self.context.broker,
                                  exchange=message.options.exchange,
                                  exchange_type=message.options.exchange_type)
            try:
                publisher.send(message.message,
                               routing_key=message.options.routing_key,
                               delivery_mode=message.options.delivery_mode,
                               mandatory=message.options.mandatory,
                               priority=message.options.priority)
            finally:
                # release the publisher even when sending failed
                publisher.close()
        except GreenletExit:
            # a kill request must reach the caller; never treat it as a
            # publishing error
            raise
        except Exception:
            log.error("Error dispatching deferred message %s: %s" %
                      (message, traceback.format_exc()))
            self.error_reschedule(message)
            return False
        else:
            log.debug("Dispatched message %s" % message)
            # sent with no problems, done with it.
            self.context.db.delete(message)
            return True

    def error_reschedule(self, message):
        """Release the claim on ``message`` and schedule a retry.

        The delay doubles with every error (2**error_count seconds) and is
        fixed at ten minutes from the tenth error on.
        """
        message.error_count += 1

        if message.error_count < 10:
            delay = 2**message.error_count
        else:
            delay = 60*10
        resched_time = datetime.utcnow() + timedelta(seconds=delay)
        message.unclaim(self.context.db, resched_time)
        log.warn("Rescheduled message %s for %s" % (message.id, resched_time))

    def cleanup(self):
        """Release claims that have been held for longer than
        MAX_CLAIM_TIME by rescheduling the affected messages."""
        log.info("Performing cleanup of claimed items...")

        # anything older than this has held the claim for too long
        # and is considered dead.
        cutoff = datetime.utcnow() - self.MAX_CLAIM_TIME
        cutoff_str = DateTimeField()._to_json(cutoff)
        query = dict(
            startkey=[True, cutoff_str, {}],
            endkey=[True],
            limit=100,
            include_docs=True,
            descending=True,
        )

        unclaim_count = 0
        while True:
            vr = view_deferred_messages_by_timestamp(self.context.db, **query)
            batch = [DeferredAMQPMessage.wrap(r.doc) for r in vr]
            if not batch:
                break

            for message in batch:
                self.error_reschedule(message)
                unclaim_count += 1

        if unclaim_count > 0:
            log.warn('Cleanup unclaimed %d items' % unclaim_count)
class Client(object):
    """Scripted etcd client stand-in that replays a queue of canned results.

    Results are queued with the ``add_*`` methods and consumed in FIFO order
    by :meth:`read` and :meth:`write`.  The remaining entry points (``get``,
    ``watch_once``, ``put``, ``transaction``) are thin wrappers over those
    two.
    """

    def __init__(self):
        # Canned results, consumed from the front.
        self.results = []
        # Waited on (for up to 5 seconds) once the results run out.
        self.stop = Event()
        # Triggered the first time a read/write finds the queue empty.
        self.no_more_results = Event()
        # Description of the first unexpected-operation failure, if any.
        self.failure = None
        self.next_lease_id = 100000
        # Paths for which write() returned successfully.
        self.keys_written = set()

    @staticmethod
    def _mod_revision(result):
        """Return ``result.etcd_index``, or 10 when that index is 0."""
        if result.etcd_index != 0:
            return result.etcd_index
        return 10

    def _next_result(self, expected_op, caller):
        """Pop the next canned result and check it is for ``expected_op``.

        When the queue is empty, ``no_more_results`` is triggered (once), the
        call waits on ``stop`` under a 5 second ``eventlet.with_timeout`` and
        then raises ``NoMoreResults``.

        :param expected_op: operation the popped result must carry.
        :param caller: method name used in the recorded failure text.
        :raises NoMoreResults: the queue was empty.
        :raises UnexpectedResultType: the next result is for another
            operation; the reason is stored in ``self.failure``.
        """
        try:
            result = self.results.pop(0)
        except IndexError:
            if not self.no_more_results.ready():
                self.no_more_results.send()
            eventlet.with_timeout(5, self.stop.wait)
            raise NoMoreResults()
        if result.op != expected_op:
            self.failure = "Unexpected result type for %s(): %s" % (
                caller, result.op)
            raise UnexpectedResultType()
        return result

    def get(self, key, metadata=False):
        """Return ``[(value, {'mod_revision': str})]`` for the next read.

        Returns an empty list when the read raises ``etcdv3.KeyNotFound``.
        ``metadata`` must be true.
        """
        assert metadata, "Always expect get() call with metadata=True"
        try:
            result = self.read(key)
        except etcdv3.KeyNotFound:
            return []
        mod_revision = self._mod_revision(result)
        return [(result.value, {'mod_revision': str(mod_revision)})]

    def watch_once(self, key, timeout=None, **kwargs):
        """Return the next read result wrapped as ``{'kv': {...}}``.

        ``timeout`` and any extra keyword arguments are ignored.
        """
        result = self.read(key)
        return {'kv': {'value': result.value,
                       'mod_revision': self._mod_revision(result)}}

    def read(self, path, **kwargs):
        """Consume the next canned result, which must be a READ result.

        Raises the result's exception if it carries one; otherwise returns the
        result object.  ``path`` and keyword arguments are not inspected.
        """
        result = self._next_result(READ, "read")
        if result.exception is not None:
            log.debug("Raise read exception %s",
                      type(result.exception).__name__)
            raise result.exception
        log.debug("Return read result %s", result)
        return result

    def put(self, key, value, lease=None):
        """Perform a :meth:`write` and return ``True``; ``lease`` is ignored."""
        self.write(key, value)
        return True

    def transaction(self, txn):
        """Apply the first ``request_put`` of ``txn['success']`` via ``put``.

        :returns: ``{'succeeded': <result of put>}``.
        """
        put_request = txn['success'][0]['request_put']
        succeeded = self.put(_decode(put_request['key']),
                             _decode(put_request['value']))
        return {'succeeded': succeeded}

    def lease(self, ttl):
        """Return a new ``Lease`` with the next lease id; ``ttl`` is ignored."""
        new_lease = Lease(self.next_lease_id, self)
        self.next_lease_id += 1
        return new_lease

    def write(self, path, value, **kwargs):
        """Consume the next canned result, which must be a WRITE result.

        Raises the result's exception if it carries one; otherwise records
        ``path`` in ``keys_written`` and returns the result object.
        """
        log.debug("Write of %s to %s", value, path)
        result = self._next_result(WRITE, "write")
        if result.exception is not None:
            log.debug("Raise write exception %s", result.exception)
            raise result.exception
        log.debug("Return write result")
        self.keys_written.add(path)
        return result

    def assert_key_written(self, key):
        """Assert that a successful write to ``key`` has been recorded."""
        assert (key in self.keys_written)

    def add_read_exception(self, exception):
        """Queue a result that makes the next read raise ``exception``."""
        assert (isinstance(exception, Exception))
        self.results.append(EtcdResult(exception=exception))

    def add_read_result(self, **kwargs):
        """Queue a result built from ``EtcdResult(**kwargs)``."""
        self.results.append(EtcdResult(**kwargs))

    def add_write_result(self):
        """Queue a successful write result."""
        # Write results have no useful content.
        self.results.append(EtcdResult(op=WRITE))

    def add_write_exception(self, exception):
        """Queue a result that makes the next write raise ``exception``."""
        self.results.append(EtcdResult(op=WRITE, exception=exception))
class Service(ConsumerMixin):
    """Consumer-mixin based service that runs a controller's handlers.

    Incoming messages are handed to green threads from ``procpool``.  Message
    acks/requeues and consumer cancellation are deferred to
    :meth:`on_iteration`, which runs inside the consuming greenlet.
    """

    def __init__(
            self, controllercls, connection_factory, exchange, topic,
            pool=None, poolsize=1000):
        """Build the controller, queues, connection pool and timers.

        :param controllercls: called with no arguments to create the
            controller.
        :param connection_factory: called with no arguments to create
            connections (once here, and by the connection pool).
        :param exchange: exchange used for the two topic queues.
        :param topic: base topic; a per-node ``topic.nodeid`` queue and a
            fanout queue are derived from it.
        :param pool: existing worker pool; a ``GreenPool`` of ``poolsize`` is
            created when omitted.
        :param poolsize: size of the created pool, also stored as
            ``max_workers``.
        """
        self.nodeid = UIDGEN()

        self.max_workers = poolsize
        if pool is None:
            self.procpool = GreenPool(size=poolsize)
        else:
            self.procpool = pool

        self.controller = controllercls()
        self.service = self.controller
        self.topic = topic
        self.greenlet = None
        self.consume_ready = Event()

        node_topic = "{}.{}".format(self.topic, self.nodeid)
        self.nova_queues = [
            entities.get_topic_queue(exchange, topic),
            entities.get_topic_queue(exchange, node_topic),
            entities.get_fanout_queue(topic),
        ]

        self._channel = None
        self._consumers = None

        self.connection = connection_factory()
        self.connection_factory = connection_factory

        inject_dependencies(self.controller, self)

        self._connection_pool = Pool(
            max_size=self.procpool.size,
            create=connection_factory
        )

        self.workers = set()
        self._pending_ack_messages = []
        self._pending_requeue_messages = []
        self._do_cancel_consumers = False
        self._consumers_cancelled = Event()

        self._timers = list(get_timers(self.controller))

    def start(self):
        """Start the timers and spawn the consuming greenlet.

        :raises RuntimeError: the service greenlet is already running.
        """
        # Check before touching the timers, so a rejected second start() has
        # no side effects.
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self.greenlet is not None and not self.greenlet.dead:
            raise RuntimeError('service is already running')
        self.start_timers()
        self.greenlet = eventlet.spawn(self.run)

    def start_timers(self):
        """Start every timer collected from the controller."""
        for timer in self._timers:
            timer.start()

    def get_consumers(self, Consumer, channel):
        """Return the consumers to run, each with its prefetch count set.

        One consumer covers ``nova_queues``; the rest come from the
        module-level ``get_consumers`` helper.  The prefetch count is the
        worker pool size.
        """
        nova_consumer = Consumer(
            self.nova_queues, callbacks=[self.on_nova_message, ])

        consume_consumers = get_consumers(
            Consumer, self, self.on_consume_message)

        consumers = [nova_consumer] + list(consume_consumers)

        prefetch_count = self.procpool.size
        for consumer in consumers:
            consumer.qos(prefetch_count=prefetch_count)

        return consumers

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        """Record the active channel/consumers and trigger ``consume_ready``."""
        self._consumers = consumers
        self._channel = channel
        # eventlet's Event.send() fails on an already-triggered event, so
        # only send when on_consume_end() has reset it (or on first use).
        if not self.consume_ready.ready():
            self.consume_ready.send(None)

    def on_consume_end(self, connection, channel):
        """Reset ``consume_ready`` so it can be triggered again."""
        self.consume_ready.reset()

    def on_nova_message(self, body, message):
        """Spawn a pool worker for ``body``; ``message`` is acked once done."""
        _log.debug('spawning RPC worker (%d free)', self.procpool.free())

        gt = self.procpool.spawn(self.handle_rpc_message, body)
        gt.link(self.handle_rpc_message_processed, message)
        self.workers.add(gt)

    def on_consume_message(self, consumer_method_config, body, message):
        """Spawn a pool worker that runs the configured consumer method."""
        _log.debug('spawning consume worker (%d free)', self.procpool.free())

        gt = self.procpool.spawn(
            self.handle_consume_message, consumer_method_config, body, message)
        gt.link(self.handle_consume_message_processed)
        self.workers.add(gt)

    def handle_rpc_message(self, body):
        """Process ``body`` with a connection borrowed from the pool."""
        # item is patched on for python with ``with``, pylint can't find it
        # pylint: disable=E1102
        with self._connection_pool.item() as connection:
            process_rpc_message(connection, self.controller, body)

    def handle_rpc_message_processed(self, gt, message):
        """Forget the finished worker and queue ``message`` for ack."""
        self.workers.discard(gt)
        self._pending_ack_messages.append(message)

    def handle_consume_message(self, consumer_method_config, body, message):
        """Run the consumer method and queue ``message`` for ack or requeue.

        ``consumer_method_config`` is a ``(method, config)`` pair.  When the
        method raises, the message is requeued if ``config.requeue_on_error``
        is true and acked otherwise; on success it is acked.
        """
        with log_time(_log.debug, 'processed consume message in %0.3fsec'):
            consumer_method, consumer_config = consumer_method_config

            try:
                consumer_method(body)
            except Exception as e:
                if consumer_config.requeue_on_error:
                    _log.exception(
                        'failed to consume message, requeueing message: '
                        '%s(): %s', consumer_method, e)
                    self._pending_requeue_messages.append(message)
                else:
                    _log.exception(
                        'failed to consume message, ignoring message: '
                        '%s(): %s', consumer_method, e)
                    self._pending_ack_messages.append(message)
            else:
                self._pending_ack_messages.append(message)

    def handle_consume_message_processed(self, gt):
        """Forget the finished consume worker."""
        self.workers.discard(gt)

    def on_iteration(self):
        """Per-iteration housekeeping: cancellation, pending acks, shutdown."""
        self.process_consumer_cancellation()
        # we need to make sure we process any pending messages before shutdown
        self.process_pending_message_acks()
        self.process_shutdown()

    def process_consumer_cancellation(self):
        """Cancel the consumers if requested and signal that it is done."""
        if self._do_cancel_consumers:
            self._do_cancel_consumers = False
            if self._consumers:
                _log.debug('cancelling consumers')
                for consumer in self._consumers:
                    consumer.cancel()
            # Guard against triggering the event twice, which eventlet
            # rejects.
            if not self._consumers_cancelled.ready():
                self._consumers_cancelled.send(True)

    def process_pending_message_acks(self):
        """Ack, then requeue, every message queued by the workers.

        Yields to other green threads after each message, so the lists are
        re-checked on every pass rather than iterated.
        """
        messages = self._pending_ack_messages
        if messages:
            _log.debug('ack() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.ack()
                eventlet.sleep()

        messages = self._pending_requeue_messages
        if messages:
            _log.debug('requeue() %d processed messages', len(messages))
            while messages:
                msg = messages.pop()
                msg.requeue()
                eventlet.sleep()

    def consume(self, limit=None, timeout=None, safety_interval=0.1, **kwargs):
        """ Lifted from kombu so we are able to break the loop immediately
            after a shutdown is triggered rather than waiting for the timeout.

            Generator: yields once for every successful ``drain_events`` call.

            :param limit: maximum number of loop iterations; unbounded when
                falsy.
            :param timeout: raise ``socket.timeout`` once this many seconds of
                consecutive drain timeouts have accumulated.
            :param safety_interval: timeout passed to each ``drain_events``.
        """
        elapsed = 0
        with self.Consumer() as (connection, channel, consumers):
            with self.extra_context(connection, channel):
                self.on_consume_ready(connection, channel, consumers, **kwargs)
                # A negative limit now yields no iterations; the former
                # ``limit and xrange(limit) or count()`` fell through to the
                # unbounded counter because an empty xrange is falsy.
                iterations = xrange(limit) if limit else count()
                for _ in iterations:
                    # moved from after the following `should_stop` condition to
                    # avoid waiting on a drain_events timeout before breaking
                    # the loop.
                    self.on_iteration()
                    if self.should_stop:
                        break

                    try:
                        connection.drain_events(timeout=safety_interval)
                    except socket.timeout:
                        elapsed += safety_interval
                        # Excluding the following clause from coverage,
                        # as timeout never appears to be set - This method
                        # is a lift from kombu so will leave in place for now.
                        if timeout and elapsed >= timeout:  # pragma: no cover
                            raise socket.timeout()
                    except socket.error:
                        if not self.should_stop:
                            raise
                    else:
                        yield
                        elapsed = 0

    def process_shutdown(self):
        """Set ``should_stop`` once nothing is left to do.

        That requires: consumers cancelled, no timers left, no running pool
        workers and no messages waiting for ack or requeue.
        """
        consumers_cancelled = self._consumers_cancelled.ready()

        no_active_timers = (len(self._timers) == 0)

        no_active_workers = (self.procpool.running() < 1)

        no_pending_message_acks = not (
            self._pending_ack_messages or
            self._pending_requeue_messages
        )

        ready_to_stop = (
            consumers_cancelled and
            no_active_timers and
            no_active_workers and
            no_pending_message_acks
        )

        if ready_to_stop:
            _log.debug('notifying service to stop')
            self.should_stop = True

    def cancel_consumers(self):
        """Ask the consuming greenlet to cancel its consumers and wait for it.

        Does nothing apart from logging when the greenlet is absent or dead.
        """
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self.greenlet is not None and not self.greenlet.dead:
            # since consumers were started in a separate thread,
            # we will just notify the thread to avoid getting
            # "Second simultaneous read" errors
            _log.debug('notifying consumers to be cancelled')
            self._do_cancel_consumers = True
            self._consumers_cancelled.wait()
        else:
            _log.debug('consumer thread already dead')

    def cancel_timers(self):
        """Stop and discard every remaining timer."""
        if self._timers:
            _log.debug('stopping %d timers', len(self._timers))
            while self._timers:
                self._timers.pop().stop()

    def kill_workers(self):
        """Kill and discard every tracked worker green thread."""
        _log.debug('force killing %d workers', len(self.workers))
        while self.workers:
            self.workers.pop().kill()

    def wait_for_workers(self):
        """Block until all pool workers have finished."""
        pool = self.procpool
        _log.debug('waiting for %d workers to complete', pool.running())
        pool.waitall()

    def shut_down(self):
        """Wait for the service greenlet to finish, then close the channel."""
        # greenlet has a magic attribute ``dead`` - pylint: disable=E1101
        if self.greenlet is not None and not self.greenlet.dead:
            _log.debug('stopping service')
            self.greenlet.wait()

        # TODO: when is this ever not None?
        if self._channel is not None:
            _log.debug('closing channel')
            self._channel.close()

    def kill(self, force=False):
        """Stop the service.

        Cancels consumers and timers, then either kills the workers
        (``force=True``) or waits for them, and finally shuts down.
        """
        _log.debug('killing service')

        self.cancel_consumers()

        self.cancel_timers()

        if force:
            self.kill_workers()
        else:
            self.wait_for_workers()

        self.shut_down()

    def link(self, *args, **kwargs):
        """Forward to ``link`` on the service greenlet."""
        return self.greenlet.link(*args, **kwargs)