def post(self, queue, messages, client_uuid, project=None):
    """Write a batch of messages to Redis and index them on the queue.

    Each message is turned into a ``Message`` record and queued on a
    single pipeline, which is executed once after the whole batch has
    been prepared. The collected IDs are then handed to
    ``_index_messages`` together with the message-set and rank-counter
    keys of the queue.

    :param queue: name of the queue the messages are posted to
    :param messages: iterable of dicts; each must provide ``ttl`` and
        may provide ``delay`` and ``body``
    :param client_uuid: ID of the posting client, stored on each record
    :param project: project the queue is scoped to, if any
    :returns: list with the IDs of the prepared messages, in input order
    """
    set_key = utils.msgset_key(queue, project)
    rank_counter_key = utils.scope_queue_index(
        queue, project, MESSAGE_RANK_COUNTER_SUFFIX)

    posted_ids = []
    timestamp = timeutils.utcnow_ts()

    with self._client.pipeline() as pipe:
        for item in messages:
            # A fresh message is unclaimed: no claim ID, a claim that
            # "expired" at creation time, and a zero claim count.
            fields = {
                'ttl': item['ttl'],
                'created': timestamp,
                'client_uuid': client_uuid,
                'claim_id': None,
                'claim_expires': timestamp,
                'claim_count': 0,
                'delay_expires': timestamp + item.get('delay', 0),
                'body': item.get('body', {}),
            }

            # The checksum is only computed when enabled in the driver
            # configuration; otherwise the field is stored as None.
            if self.driver.conf.enable_checksum:
                fields['checksum'] = s_utils.get_checksum(
                    item.get('body', None))
            else:
                fields['checksum'] = None

            record = Message(**fields)
            record.to_redis(pipe)
            posted_ids.append(record.id)

        pipe.execute()

    # NOTE(kgriffs): If this call fails, we will return
    # an error to the client and the messages will be
    # orphaned, but Redis will remove them when they
    # expire, so we will just pretend they don't exist
    # in that case.
    self._index_messages(set_key, rank_counter_key, posted_ids)

    return posted_ids
def _create_msg(self, queue, msg, client_uuid, project):
    """Serialize one message to JSON and store it under a new slug.

    The object is written through ``utils._put_or_create_container``
    into the container returned by ``utils._message_container`` for the
    queue/project pair. The client ID and the message TTL are passed as
    object headers.

    :param queue: name of the queue the message belongs to
    :param msg: dict that must provide ``ttl`` and may provide ``delay``
        and ``body``
    :param client_uuid: ID of the posting client
    :param project: project the queue is scoped to
    :returns: the slug (a UUID1 string) the message was stored under
    """
    slug = str(uuid.uuid1())
    created_at = timeutils.utcnow_ts()

    # Built key by key so the serialized field order stays the same:
    # body, claim_id, ttl, claim_count, delay_expires[, checksum].
    payload = {}
    payload['body'] = msg.get('body', {})
    payload['claim_id'] = None
    payload['ttl'] = msg['ttl']
    payload['claim_count'] = 0
    payload['delay_expires'] = created_at + msg.get('delay', 0)

    # The checksum is optional and controlled by driver configuration.
    if self.driver.conf.enable_checksum:
        payload['checksum'] = s_utils.get_checksum(msg.get('body', None))

    serialized = jsonutils.dumps(payload)

    container = utils._message_container(queue, project)
    object_headers = {
        'x-object-meta-clientid': str(client_uuid),
        'x-delete-after': msg['ttl'],
    }
    utils._put_or_create_container(
        self._client,
        container,
        slug,
        contents=serialized,
        content_type='application/json',
        headers=object_headers)

    return slug
def post(self, queue_name, messages, client_uuid, project=None):
    """Insert a batch of messages into the queue's collection.

    The queue counter is incremented by the batch size up front; the
    value returned by ``_inc_counter`` minus the batch size becomes the
    marker of the first message, and the following messages get
    consecutive markers.

    :param queue_name: name of the queue to post to
    :param messages: iterable of dicts; each must provide ``ttl`` and
        may provide ``delay`` and ``body``
    :param client_uuid: ID of the posting client, stored on each document
    :param project: project the queue is scoped to, if any
    :returns: list of the inserted document IDs as strings
    :raises errors.QueueDoesNotExist: if the queue does not exist
    """
    if not self._queue_ctrl.exists(queue_name, project):
        raise errors.QueueDoesNotExist(queue_name, project)

    # NOTE(flaper87): Make sure the counter exists. This method
    # is an upsert.
    self._get_counter(queue_name, project)

    posted_ts = timeutils.utcnow_ts()
    posted_dt = datetime.datetime.utcfromtimestamp(posted_ts)
    collection = self._collection(queue_name, project)

    # Materialize the input so it can be both counted and iterated.
    batch = list(messages)
    batch_size = len(batch)

    # Reserve one marker per message in a single counter update.
    counter_after = self._inc_counter(queue_name, project,
                                      amount=batch_size)
    first_marker = counter_after - batch_size

    def build_document(offset, message):
        """Return the document stored for the message at ``offset``."""
        document = {
            PROJ_QUEUE: utils.scope_queue_name(queue_name, project),
            't': message['ttl'],
            'e': posted_dt + datetime.timedelta(seconds=message['ttl']),
            'u': client_uuid,
            # New messages start out unclaimed.
            'c': {'id': None, 'e': posted_ts, 'c': 0},
            'd': posted_ts + message.get('delay', 0),
            'b': message.get('body', {}),
            'k': first_marker + offset,
            'tx': None,
        }

        # The checksum is optional and controlled by driver configuration.
        if self.driver.conf.enable_checksum:
            document['cs'] = s_utils.get_checksum(message.get('body', None))

        return document

    documents = [build_document(offset, message)
                 for offset, message in enumerate(batch)]

    result = collection.insert_many(documents,
                                    bypass_document_validation=True)

    return [str(inserted_id) for inserted_id in result.inserted_ids]
def post(self, queue_name, messages, client_uuid, project=None):
    """Post a batch of messages to a queue in one bulk insert.

    Markers are reserved before inserting: the queue counter is bumped
    by the number of messages, and the messages are numbered
    consecutively starting at the returned counter value minus the
    batch size.

    :param queue_name: name of the queue to post to
    :param messages: iterable of dicts; each must provide ``ttl`` and
        may provide ``delay`` and ``body``
    :param client_uuid: ID of the posting client, stored on each document
    :param project: project the queue is scoped to, if any
    :returns: list of the inserted document IDs as strings
    :raises errors.QueueDoesNotExist: if the queue does not exist
    """
    queue_exists = self._queue_ctrl.exists(queue_name, project)
    if not queue_exists:
        raise errors.QueueDoesNotExist(queue_name, project)

    # NOTE(flaper87): Make sure the counter exists. This method
    # is an upsert.
    self._get_counter(queue_name, project)

    now = timeutils.utcnow_ts()
    now_dt = datetime.datetime.utcfromtimestamp(now)
    collection = self._collection(queue_name, project)

    # The input may be any iterable; a list is needed to know its size.
    pending = list(messages)
    count = len(pending)
    base_marker = self._inc_counter(queue_name, project,
                                    amount=count) - count

    docs = []
    # Let enumerate() hand out the markers directly.
    for marker, item in enumerate(pending, base_marker):
        unclaimed = {'id': None, 'e': now, 'c': 0}
        doc = {
            PROJ_QUEUE: utils.scope_queue_name(queue_name, project),
            't': item['ttl'],
            'e': now_dt + datetime.timedelta(seconds=item['ttl']),
            'u': client_uuid,
            'c': unclaimed,
            'd': now + item.get('delay', 0),
            'b': item['body'] if 'body' in item else {},
            'k': marker,
            'tx': None,
        }

        # Only attach a checksum when the driver is configured to do so.
        if self.driver.conf.enable_checksum:
            doc['cs'] = s_utils.get_checksum(item.get('body', None))

        docs.append(doc)

    outcome = collection.insert_many(docs, bypass_document_validation=True)

    inserted = []
    for object_id in outcome.inserted_ids:
        inserted.append(str(object_id))
    return inserted
def _create_msg(self, queue, msg, client_uuid, project):
    """Store a single message as a JSON object and return its slug.

    A UUID1-based slug names the object. The JSON payload carries the
    body, TTL, claim bookkeeping and delay expiry of the message (plus a
    checksum when enabled in the driver configuration). The client ID
    and TTL are also sent as object headers.

    :param queue: name of the queue the message belongs to
    :param msg: dict that must provide ``ttl`` and may provide ``delay``
        and ``body``
    :param client_uuid: ID of the posting client
    :param project: project the queue is scoped to
    :returns: the slug the message was stored under
    """
    object_name = str(uuid.uuid1())
    timestamp = timeutils.utcnow_ts()

    document = dict([
        ('body', msg.get('body', {})),
        ('claim_id', None),
        ('ttl', msg['ttl']),
        ('claim_count', 0),
        ('delay_expires', timestamp + msg.get('delay', 0)),
    ])

    checksum_enabled = self.driver.conf.enable_checksum
    if checksum_enabled:
        document['checksum'] = s_utils.get_checksum(msg.get('body', None))

    body_json = jsonutils.dumps(document)

    # Keyword arguments are collected first so the storage call reads
    # as "client, container, object name, options".
    put_options = {
        'contents': body_json,
        'content_type': 'application/json',
        'headers': {
            'x-object-meta-clientid': str(client_uuid),
            'x-delete-after': msg['ttl'],
        },
    }
    utils._put_or_create_container(
        self._client,
        utils._message_container(queue, project),
        object_name,
        **put_options)

    return object_name
def post(self, queue_name, messages, client_uuid, project=None):
    """Post a batch of messages to a queue, retrying on marker conflicts.

    Markers are taken from the queue counter *without* incrementing it
    first; the counter is only advanced after the batch has been
    inserted. A competing writer that used the same markers fails on the
    insert, backs off, re-reads the counter and retries with fresh
    markers.

    :param queue_name: name of the queue to post to
    :param messages: iterable of dicts; each must provide ``ttl`` and
        may provide ``delay`` and ``body``
    :param client_uuid: ID of the posting client, stored on each document
    :param project: project the queue is scoped to, if any
    :returns: list of the inserted document IDs as strings
    :raises errors.QueueDoesNotExist: if the queue does not exist
    :raises errors.MessageConflict: if the batch could not be inserted
        within the retry range or within MAX_RETRY_POST_DURATION
    """
    # NOTE(flaper87): This method should be safe to retry on
    # autoreconnect, since we've a 2-step insert for messages.
    # The worst-case scenario is that we'll increase the counter
    # several times and we'd end up with some non-active messages.

    if not self._queue_ctrl.exists(queue_name, project):
        raise errors.QueueDoesNotExist(queue_name, project)

    # NOTE(flaper87): Make sure the counter exists. This method
    # is an upsert.
    self._get_counter(queue_name, project)

    now = timeutils.utcnow_ts()
    now_dt = datetime.datetime.utcfromtimestamp(now)
    collection = self._collection(queue_name, project)

    # Set the next basis marker for the first attempt.
    #
    # Note that we don't increment the counter right away because
    # if 2 concurrent posts happen and the one with the higher counter
    # ends before the one with the lower counter, there's a window
    # where a client paging through the queue may get the messages
    # with the higher counter and skip the previous ones. This would
    # make our FIFO guarantee unsound.
    next_marker = self._get_counter(queue_name, project)

    # Unique transaction ID to facilitate atomic batch inserts
    #
    # NOTE(review): every document below is created with 'tx': None,
    # so the finalizing update_many() filter on this ID does not match
    # the documents inserted here. Confirm whether 'tx' was meant to
    # carry the transaction ID before relying on the 2-phase behaviour.
    transaction = objectid.ObjectId()

    prepared_messages = []
    for index, message in enumerate(messages):
        msg = {
            PROJ_QUEUE: utils.scope_queue_name(queue_name, project),
            't': message['ttl'],
            'e': now_dt + datetime.timedelta(seconds=message['ttl']),
            'u': client_uuid,
            'c': {'id': None, 'e': now, 'c': 0},
            'd': now + message.get('delay', 0),
            'b': message['body'] if 'body' in message else {},
            'k': next_marker + index,
            'tx': None
        }
        if self.driver.conf.enable_checksum:
            msg['cs'] = s_utils.get_checksum(message.get('body', None))

        prepared_messages.append(msg)

    # NOTE(kgriffs): Don't take the time to do a 2-phase insert
    # if there is no way for it to partially succeed.
    if len(prepared_messages) == 1:
        transaction = None
        prepared_messages[0]['tx'] = None

    # Use a retry range for sanity, although we expect
    # to rarely, if ever, reach the maximum number of
    # retries.
    #
    # NOTE(kgriffs): With the default configuration (100 ms
    # max sleep, 1000 max attempts), the max stall time
    # before the operation is abandoned is 49.95 seconds.
    for attempt in self._retry_range:
        try:
            res = collection.insert_many(prepared_messages,
                                         bypass_document_validation=True)

            # Log a message if we retried, for debugging perf issues
            if attempt != 0:
                msgtmpl = _(u'%(attempts)d attempt(s) required to post '
                            u'%(num_messages)d messages to queue '
                            u'"%(queue)s" under project %(project)s')

                LOG.debug(msgtmpl,
                          dict(queue=queue_name,
                               attempts=attempt + 1,
                               num_messages=len(res.inserted_ids),
                               project=project))

            # Update the counter in preparation for the next batch
            #
            # NOTE(kgriffs): Due to the unique index on the messages
            # collection, competing inserts will fail as a whole,
            # and keep retrying until the counter is incremented
            # such that the competing markers will start at a
            # unique number, 1 past the max of the messages just
            # inserted above.
            self._inc_counter(queue_name, project,
                              amount=len(res.inserted_ids))

            # NOTE(kgriffs): Finalize the insert once we can say that
            # all the messages made it. This makes bulk inserts
            # atomic, assuming queries filter out any non-finalized
            # messages.
            if transaction is not None:
                collection.update_many({'tx': transaction},
                                       {'$set': {'tx': None}},
                                       upsert=False)

            return [str(id_) for id_ in res.inserted_ids]

        except (pymongo.errors.DuplicateKeyError,
                pymongo.errors.BulkWriteError):
            # TODO(kgriffs): Record stats of how often retries happen,
            # and how many attempts, on average, are required to insert
            # messages.

            # NOTE(kgriffs): This can be used in conjunction with the
            # log line, above, that is emitted after all messages have
            # been posted, to gauge how long it is taking for messages
            # to be posted to a given queue, or overall.
            #
            # TODO(kgriffs): Add transaction ID to help match up loglines
            if attempt == 0:
                msgtmpl = _(u'First attempt failed while '
                            u'adding messages to queue '
                            u'"%(queue)s" under project %(project)s')

                LOG.debug(msgtmpl, dict(queue=queue_name, project=project))

            # NOTE(kgriffs): Never retry past the point that competing
            # messages expire and are GC'd, since once they are gone,
            # the unique index no longer protects us from getting out
            # of order, which could cause an observer to miss this
            # message. The code below provides a sanity-check to ensure
            # this situation can not happen.
            elapsed = timeutils.utcnow_ts() - now
            if elapsed > MAX_RETRY_POST_DURATION:
                msgtmpl = (u'Exceeded maximum retry duration for queue '
                           u'"%(queue)s" under project %(project)s')

                LOG.warning(msgtmpl,
                            dict(queue=queue_name, project=project))
                break

            # Chill out for a moment to mitigate thrashing/thundering
            self._backoff_sleep(attempt)

            # NOTE(kgriffs): Perhaps we failed because a worker crashed
            # after inserting messages, but before incrementing the
            # counter; that would cause all future requests to stall,
            # since they would keep getting the same base marker that is
            # conflicting with existing messages, until the messages that
            # "won" expire, at which time we would end up reusing markers,
            # and that could make some messages invisible to an observer
            # that is querying with a marker that is larger than the ones
            # being reused.
            #
            # To mitigate this, we apply a heuristic to determine whether
            # a counter has stalled. We attempt to increment the counter,
            # but only if it hasn't been updated for a few seconds, which
            # should mean that nobody is left to update it!
            #
            # Note that we increment one at a time until the logjam is
            # broken, since we don't know how many messages were posted
            # by the worker before it crashed.
            next_marker = self._inc_counter(
                queue_name, project, window=COUNTER_STALL_WINDOW)

            # Retry the entire batch with a new sequence of markers.
            #
            # NOTE(kgriffs): Due to the unique index, and how
            # MongoDB works with batch requests, we will never
            # end up with a partially-successful update. The first
            # document in the batch will fail to insert, and the
            # remainder of the documents will not be attempted.
            if next_marker is None:
                # NOTE(kgriffs): Usually we will end up here, since
                # it should be rare that a counter becomes stalled.
                next_marker = self._get_counter(
                    queue_name, project)
            else:
                # The two sentences are separate literals; the space
                # after the period keeps them from running together.
                msgtmpl = (u'Detected a stalled message counter '
                           u'for queue "%(queue)s" under '
                           u'project %(project)s. '
                           u'The counter was incremented to %(value)d.')

                LOG.warning(msgtmpl,
                            dict(queue=queue_name,
                                 project=project,
                                 value=next_marker))

            for index, message in enumerate(prepared_messages):
                message['k'] = next_marker + index

        except Exception as ex:
            # Anything that is not a marker conflict (including
            # documents that cannot be encoded) is logged and
            # propagated unchanged.
            LOG.exception(ex)
            raise

    # Reached when the retry range is exhausted or the loop was left
    # because the maximum retry duration was exceeded.
    msgtmpl = (u'Hit maximum number of attempts (%(max)s) for queue '
               u'"%(queue)s" under project %(project)s')

    LOG.warning(msgtmpl,
                dict(max=self.driver.mongodb_conf.max_attempts,
                     queue=queue_name,
                     project=project))

    raise errors.MessageConflict(queue_name, project)
def post(self, topic_name, messages, client_uuid, project=None):
    """Post a batch of messages to a topic, retrying on marker conflicts.

    Markers are taken from the topic counter *without* incrementing it
    first; the counter is only advanced after the batch has been
    inserted. A competing writer that used the same markers fails on the
    insert, backs off, re-reads the counter and retries with fresh
    markers.

    :param topic_name: name of the topic to post to
    :param messages: iterable of dicts; each must provide ``ttl`` and
        may provide ``delay`` and ``body``
    :param client_uuid: ID of the posting client, stored on each document
    :param project: project the topic is scoped to, if any
    :returns: list of the inserted document IDs as strings
    :raises errors.TopicDoesNotExist: if the topic does not exist
    :raises errors.MessageConflict: if the batch could not be inserted
        within the retry range or within MAX_RETRY_POST_DURATION
    """
    # NOTE(flaper87): This method should be safe to retry on
    # autoreconnect, since we've a 2-step insert for messages.
    # The worst-case scenario is that we'll increase the counter
    # several times and we'd end up with some non-active messages.

    if not self._topic_ctrl.exists(topic_name, project):
        raise errors.TopicDoesNotExist(topic_name, project)

    # NOTE(flaper87): Make sure the counter exists. This method
    # is an upsert.
    self._get_counter(topic_name, project)

    now = timeutils.utcnow_ts()
    now_dt = datetime.datetime.utcfromtimestamp(now)
    collection = self._collection(topic_name, project)

    # Set the next basis marker for the first attempt.
    #
    # Note that we don't increment the counter right away because
    # if 2 concurrent posts happen and the one with the higher counter
    # ends before the one with the lower counter, there's a window
    # where a client paging through the topic may get the messages
    # with the higher counter and skip the previous ones. This would
    # make our FIFO guarantee unsound.
    next_marker = self._get_counter(topic_name, project)

    # Unique transaction ID to facilitate atomic batch inserts
    #
    # NOTE(review): every document below is created with 'tx': None,
    # so the finalizing update_many() filter on this ID does not match
    # the documents inserted here. Confirm whether 'tx' was meant to
    # carry the transaction ID before relying on the 2-phase behaviour.
    transaction = objectid.ObjectId()

    prepared_messages = []
    for index, message in enumerate(messages):
        msg = {
            PROJ_TOPIC: utils.scope_queue_name(topic_name, project),
            't': message['ttl'],
            'e': now_dt + datetime.timedelta(seconds=message['ttl']),
            'u': client_uuid,
            'd': now + message.get('delay', 0),
            'b': message['body'] if 'body' in message else {},
            'k': next_marker + index,
            'tx': None
        }
        if self.driver.conf.enable_checksum:
            msg['cs'] = s_utils.get_checksum(message.get('body', None))

        prepared_messages.append(msg)

    # NOTE(kgriffs): Don't take the time to do a 2-phase insert
    # if there is no way for it to partially succeed.
    if len(prepared_messages) == 1:
        transaction = None
        prepared_messages[0]['tx'] = None

    # Use a retry range for sanity, although we expect
    # to rarely, if ever, reach the maximum number of
    # retries.
    #
    # NOTE(kgriffs): With the default configuration (100 ms
    # max sleep, 1000 max attempts), the max stall time
    # before the operation is abandoned is 49.95 seconds.
    for attempt in self._retry_range:
        try:
            res = collection.insert_many(prepared_messages,
                                         bypass_document_validation=True)

            # Log a message if we retried, for debugging perf issues
            if attempt != 0:
                # This controller posts to topics, so the message names
                # a topic rather than a queue.
                msgtmpl = _(u'%(attempts)d attempt(s) required to post '
                            u'%(num_messages)d messages to topic '
                            u'"%(topic)s" under project %(project)s')

                LOG.debug(
                    msgtmpl,
                    dict(topic=topic_name,
                         attempts=attempt + 1,
                         num_messages=len(res.inserted_ids),
                         project=project))

            # Update the counter in preparation for the next batch
            #
            # NOTE(kgriffs): Due to the unique index on the messages
            # collection, competing inserts will fail as a whole,
            # and keep retrying until the counter is incremented
            # such that the competing markers will start at a
            # unique number, 1 past the max of the messages just
            # inserted above.
            self._inc_counter(topic_name, project,
                              amount=len(res.inserted_ids))

            # NOTE(kgriffs): Finalize the insert once we can say that
            # all the messages made it. This makes bulk inserts
            # atomic, assuming queries filter out any non-finalized
            # messages.
            if transaction is not None:
                collection.update_many({'tx': transaction},
                                       {'$set': {'tx': None}},
                                       upsert=False)

            return [str(id_) for id_ in res.inserted_ids]

        except (pymongo.errors.DuplicateKeyError,
                pymongo.errors.BulkWriteError):
            # TODO(kgriffs): Record stats of how often retries happen,
            # and how many attempts, on average, are required to insert
            # messages.

            # NOTE(kgriffs): This can be used in conjunction with the
            # log line, above, that is emitted after all messages have
            # been posted, to gauge how long it is taking for messages
            # to be posted to a given topic, or overall.
            #
            # TODO(kgriffs): Add transaction ID to help match up loglines
            if attempt == 0:
                msgtmpl = _(u'First attempt failed while '
                            u'adding messages to topic '
                            u'"%(topic)s" under project %(project)s')

                LOG.debug(msgtmpl, dict(topic=topic_name, project=project))

            # NOTE(kgriffs): Never retry past the point that competing
            # messages expire and are GC'd, since once they are gone,
            # the unique index no longer protects us from getting out
            # of order, which could cause an observer to miss this
            # message. The code below provides a sanity-check to ensure
            # this situation can not happen.
            elapsed = timeutils.utcnow_ts() - now
            if elapsed > MAX_RETRY_POST_DURATION:
                msgtmpl = (u'Exceeded maximum retry duration for topic '
                           u'"%(topic)s" under project %(project)s')

                LOG.warning(msgtmpl,
                            dict(topic=topic_name, project=project))
                break

            # Chill out for a moment to mitigate thrashing/thundering
            self._backoff_sleep(attempt)

            # NOTE(kgriffs): Perhaps we failed because a worker crashed
            # after inserting messages, but before incrementing the
            # counter; that would cause all future requests to stall,
            # since they would keep getting the same base marker that is
            # conflicting with existing messages, until the messages that
            # "won" expire, at which time we would end up reusing markers,
            # and that could make some messages invisible to an observer
            # that is querying with a marker that is larger than the ones
            # being reused.
            #
            # To mitigate this, we apply a heuristic to determine whether
            # a counter has stalled. We attempt to increment the counter,
            # but only if it hasn't been updated for a few seconds, which
            # should mean that nobody is left to update it!
            #
            # Note that we increment one at a time until the logjam is
            # broken, since we don't know how many messages were posted
            # by the worker before it crashed.
            next_marker = self._inc_counter(topic_name,
                                            project,
                                            window=COUNTER_STALL_WINDOW)

            # Retry the entire batch with a new sequence of markers.
            #
            # NOTE(kgriffs): Due to the unique index, and how
            # MongoDB works with batch requests, we will never
            # end up with a partially-successful update. The first
            # document in the batch will fail to insert, and the
            # remainder of the documents will not be attempted.
            if next_marker is None:
                # NOTE(kgriffs): Usually we will end up here, since
                # it should be rare that a counter becomes stalled.
                next_marker = self._get_counter(topic_name, project)
            else:
                # The two sentences are separate literals; the space
                # after the period keeps them from running together.
                msgtmpl = (u'Detected a stalled message counter '
                           u'for topic "%(topic)s" under '
                           u'project %(project)s. '
                           u'The counter was incremented to %(value)d.')

                LOG.warning(
                    msgtmpl,
                    dict(topic=topic_name,
                         project=project,
                         value=next_marker))

            for index, message in enumerate(prepared_messages):
                message['k'] = next_marker + index

        except Exception as ex:
            # Anything that is not a marker conflict (including
            # documents that cannot be encoded) is logged and
            # propagated unchanged.
            LOG.exception(ex)
            raise

    # Reached when the retry range is exhausted or the loop was left
    # because the maximum retry duration was exceeded.
    msgtmpl = (u'Hit maximum number of attempts (%(max)s) for topic '
               u'"%(topic)s" under project %(project)s')

    LOG.warning(
        msgtmpl,
        dict(max=self.driver.mongodb_conf.max_attempts,
             topic=topic_name,
             project=project))

    raise errors.MessageConflict(topic_name, project)