class QueueProcessingWorker(object):
    def __init__(self):
        self.q = SimpleQueueClient()

    def consume_wrapper(self, data):
        try:
            self.consume(data)
        except Exception:
            self._log_problem()
            if not os.path.exists(settings.QUEUE_ERROR_DIR):
                os.mkdir(settings.QUEUE_ERROR_DIR)
            fname = '%s.errors' % (self.queue_name,)
            fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
            line = '%s\t%s\n' % (time.asctime(), ujson.dumps(data))
            lock_fn = fn + '.lock'
            with lockfile(lock_fn):
                with open(fn, 'a') as f:
                    f.write(line)
        reset_queries()

    def _log_problem(self):
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def start(self):
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self):
        self.q.stop_consuming()

class QueueProcessingWorker(ABC):
    queue_name = None  # type: str

    def __init__(self) -> None:
        self.q = None  # type: SimpleQueueClient
        if self.queue_name is None:
            raise WorkerDeclarationException(
                "Queue worker declared without queue_name")

    @abstractmethod
    def consume(self, data: Dict[str, Any]) -> None:
        pass

    def do_consume(self, consume_func: Callable[[List[Dict[str, Any]]], None],
                   events: List[Dict[str, Any]]) -> None:
        try:
            consume_func(events)
        except Exception:
            self._handle_consume_exception(events)
        finally:
            flush_per_request_caches()
            reset_queries()

    def consume_wrapper(self, data: Dict[str, Any]) -> None:
        consume_func = lambda events: self.consume(events[0])
        self.do_consume(consume_func, [data])

    def _handle_consume_exception(self, events: List[Dict[str, Any]]) -> None:
        self._log_problem()
        if not os.path.exists(settings.QUEUE_ERROR_DIR):
            os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
        fname = '%s.errors' % (self.queue_name,)
        fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
        line = '%s\t%s\n' % (time.asctime(), ujson.dumps(events))
        lock_fn = fn + '.lock'
        with lockfile(lock_fn):
            with open(fn, 'ab') as f:
                f.write(line.encode('utf-8'))
        check_and_send_restart_signal()

    def _log_problem(self) -> None:
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def setup(self) -> None:
        self.q = SimpleQueueClient()

    def start(self) -> None:
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self) -> None:  # nocoverage
        self.q.stop_consuming()

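# --- Illustrative sketch (not part of the original source) ---
# A minimal concrete worker against the ABC above, using a hypothetical
# EchoWorker class and 'echo' queue name. It shows the contract the base
# class enforces: subclasses set queue_name and implement consume(), and
# callers run setup() (which opens the queue connection) before start().

import logging
from typing import Any, Dict

class EchoWorker(QueueProcessingWorker):
    queue_name = 'echo'  # hypothetical queue name

    def consume(self, data: Dict[str, Any]) -> None:
        # Each JSON event from the queue arrives here, one at a time,
        # via consume_wrapper -> do_consume.
        logging.info("echo event: %s", data)

# worker = EchoWorker()
# worker.setup()  # self.q = SimpleQueueClient()
# worker.start()  # blocks, consuming events until stop() is called
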
class QueueProcessingWorker:
    queue_name = None  # type: str

    def __init__(self):
        # type: () -> None
        self.q = None  # type: SimpleQueueClient
        if self.queue_name is None:
            raise WorkerDeclarationException(
                "Queue worker declared without queue_name")

    def consume(self, data):
        # type: (Dict[str, Any]) -> None
        raise WorkerDeclarationException("No consumer defined!")

    def consume_wrapper(self, data):
        # type: (Dict[str, Any]) -> None
        try:
            self.consume(data)
        except Exception:
            self._log_problem()
            if not os.path.exists(settings.QUEUE_ERROR_DIR):
                os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
            fname = '%s.errors' % (self.queue_name,)
            fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
            line = '%s\t%s\n' % (time.asctime(), ujson.dumps(data))
            lock_fn = fn + '.lock'
            with lockfile(lock_fn):
                with open(fn, 'ab') as f:
                    f.write(line.encode('utf-8'))
            check_and_send_restart_signal()
        finally:
            reset_queries()

    def _log_problem(self):
        # type: () -> None
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def setup(self):
        # type: () -> None
        self.q = SimpleQueueClient()

    def start(self):
        # type: () -> None
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self):  # nocoverage
        # type: () -> None
        self.q.stop_consuming()

class QueueProcessingWorker(object):
    queue_name = None  # type: str

    def __init__(self):
        # type: () -> None
        self.q = None  # type: SimpleQueueClient
        if self.queue_name is None:
            raise WorkerDeclarationException("Queue worker declared without queue_name")

    def consume(self, data):
        # type: (Mapping[str, Any]) -> None
        raise WorkerDeclarationException("No consumer defined!")

    def consume_wrapper(self, data):
        # type: (Mapping[str, Any]) -> None
        try:
            self.consume(data)
        except Exception:
            self._log_problem()
            if not os.path.exists(settings.QUEUE_ERROR_DIR):
                os.mkdir(settings.QUEUE_ERROR_DIR)
            fname = '%s.errors' % (self.queue_name,)
            fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
            line = u'%s\t%s\n' % (time.asctime(), ujson.dumps(data))
            lock_fn = fn + '.lock'
            with lockfile(lock_fn):
                with open(fn, 'ab') as f:
                    f.write(line.encode('utf-8'))
            check_and_send_restart_signal()
        finally:
            reset_queries()

    def _log_problem(self):
        # type: () -> None
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def setup(self):
        # type: () -> None
        self.q = SimpleQueueClient()

    def start(self):
        # type: () -> None
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self):
        # type: () -> None
        self.q.stop_consuming()

class QueueProcessingWorker(object):
    queue_name = None

    def __init__(self):
        self.q = SimpleQueueClient()
        if self.queue_name is None:
            raise WorkerDeclarationException("Queue worker declared without queue_name")

    def consume(self, data):
        raise WorkerDeclarationException("No consumer defined!")

    def consume_wrapper(self, data):
        try:
            self.consume(data)
        except Exception:
            self._log_problem()
            if not os.path.exists(settings.QUEUE_ERROR_DIR):
                os.mkdir(settings.QUEUE_ERROR_DIR)
            fname = "%s.errors" % (self.queue_name,)
            fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
            line = "%s\t%s\n" % (time.asctime(), ujson.dumps(data))
            lock_fn = fn + ".lock"
            with lockfile(lock_fn):
                # Binary append mode, matching the encoded bytes written below.
                with open(fn, "ab") as f:
                    f.write(line.encode("utf-8"))
        reset_queries()

    def _log_problem(self):
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def start(self):
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self):
        self.q.stop_consuming()

class QueueProcessingWorker(ABC):
    queue_name: str
    MAX_CONSUME_SECONDS: Optional[int] = 30
    ENABLE_TIMEOUTS = False
    CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM = 50
    MAX_SECONDS_BEFORE_UPDATE_STATS = 30

    def __init__(self) -> None:
        self.q: Optional[SimpleQueueClient] = None
        if not hasattr(self, "queue_name"):
            raise WorkerDeclarationException("Queue worker declared without queue_name")
        self.initialize_statistics()

    def initialize_statistics(self) -> None:
        self.queue_last_emptied_timestamp = time.time()
        self.consumed_since_last_emptied = 0
        self.recent_consume_times: MutableSequence[Tuple[int, float]] = deque(maxlen=50)
        self.consume_iteration_counter = 0
        self.idle = True
        self.last_statistics_update_time = 0.0

        self.update_statistics(0)

    def update_statistics(self, remaining_local_queue_size: int) -> None:
        total_seconds = sum(seconds for _, seconds in self.recent_consume_times)
        total_events = sum(events_number for events_number, _ in self.recent_consume_times)
        if total_events == 0:
            recent_average_consume_time = None
        else:
            recent_average_consume_time = total_seconds / total_events
        stats_dict = dict(
            update_time=time.time(),
            recent_average_consume_time=recent_average_consume_time,
            current_queue_size=remaining_local_queue_size,
            queue_last_emptied_timestamp=self.queue_last_emptied_timestamp,
            consumed_since_last_emptied=self.consumed_since_last_emptied,
        )

        os.makedirs(settings.QUEUE_STATS_DIR, exist_ok=True)

        fname = f"{self.queue_name}.stats"
        fn = os.path.join(settings.QUEUE_STATS_DIR, fname)
        with lockfile(fn + ".lock"):
            tmp_fn = fn + ".tmp"
            with open(tmp_fn, "wb") as f:
                f.write(
                    orjson.dumps(stats_dict, option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2)
                )
            os.rename(tmp_fn, fn)
        self.last_statistics_update_time = time.time()

    def get_remaining_local_queue_size(self) -> int:
        if self.q is not None:
            return self.q.local_queue_size()
        else:
            # This is a special case that will happen if we're operating without
            # using RabbitMQ (e.g. in tests). In that case there's no queuing to speak of
            # and the only reasonable size to return is 0.
            return 0

    @abstractmethod
    def consume(self, data: Dict[str, Any]) -> None:
        pass

    def do_consume(
        self, consume_func: Callable[[List[Dict[str, Any]]], None], events: List[Dict[str, Any]]
    ) -> None:
        consume_time_seconds: Optional[float] = None
        with configure_scope() as scope:
            scope.clear_breadcrumbs()
            add_breadcrumb(
                type="debug",
                category="queue_processor",
                message=f"Consuming {self.queue_name}",
                data={"events": events, "local_queue_size": self.get_remaining_local_queue_size()},
            )
        try:
            if self.idle:
                # We're reactivating after having gone idle due to emptying the queue.
                # We should update the stats file to keep it fresh and to make it clear
                # that the queue started processing, in case the event we're about to process
                # makes us freeze.
                self.idle = False
                self.update_statistics(self.get_remaining_local_queue_size())

            time_start = time.time()
            if self.MAX_CONSUME_SECONDS and self.ENABLE_TIMEOUTS:
                try:
                    signal.signal(
                        signal.SIGALRM,
                        functools.partial(self.timer_expired, self.MAX_CONSUME_SECONDS, events),
                    )
                    try:
                        signal.alarm(self.MAX_CONSUME_SECONDS * len(events))
                        consume_func(events)
                    finally:
                        signal.alarm(0)
                finally:
                    signal.signal(signal.SIGALRM, signal.SIG_DFL)
            else:
                consume_func(events)
            consume_time_seconds = time.time() - time_start
            self.consumed_since_last_emptied += len(events)
        except Exception as e:
            self._handle_consume_exception(events, e)
        finally:
            flush_per_request_caches()
            reset_queries()

            if consume_time_seconds is not None:
                self.recent_consume_times.append((len(events), consume_time_seconds))

            remaining_local_queue_size = self.get_remaining_local_queue_size()
            if remaining_local_queue_size == 0:
                self.queue_last_emptied_timestamp = time.time()
                self.consumed_since_last_emptied = 0
                # We've cleared all the events from the queue, so we don't
                # need to worry about the small overhead of doing a disk write.
                # We take advantage of this to update the stats file to keep it fresh,
                # especially since the queue might go idle until new events come in.
                self.update_statistics(0)
                self.idle = True
                return

            self.consume_iteration_counter += 1
            if (
                self.consume_iteration_counter >= self.CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM
                or time.time() - self.last_statistics_update_time >= self.MAX_SECONDS_BEFORE_UPDATE_STATS
            ):
                self.consume_iteration_counter = 0
                self.update_statistics(remaining_local_queue_size)

    def consume_single_event(self, event: Dict[str, Any]) -> None:
        consume_func = lambda events: self.consume(events[0])
        self.do_consume(consume_func, [event])

    def timer_expired(
        self, limit: int, events: List[Dict[str, Any]], signal: int, frame: FrameType
    ) -> None:
        raise WorkerTimeoutException(self.queue_name, limit, len(events))

    def _handle_consume_exception(self, events: List[Dict[str, Any]], exception: Exception) -> None:
        if isinstance(exception, InterruptConsumeException):
            # The exception signals that no further error handling
            # is needed and the worker can proceed.
            return

        with configure_scope() as scope:
            scope.set_context(
                "events",
                {
                    "data": events,
                    "queue_name": self.queue_name,
                },
            )
            if isinstance(exception, WorkerTimeoutException):
                with sentry_sdk.push_scope() as scope:
                    scope.fingerprint = ["worker-timeout", self.queue_name]
                    logging.exception(exception, stack_info=True)
            else:
                logging.exception(
                    "Problem handling data on queue %s", self.queue_name, stack_info=True
                )
        if not os.path.exists(settings.QUEUE_ERROR_DIR):
            os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
        # Use 'mark_sanitized' to prevent Pysa from detecting this false positive
        # flow. 'queue_name' is always a constant string.
        fname = mark_sanitized(f"{self.queue_name}.errors")
        fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
        line = f"{time.asctime()}\t{orjson.dumps(events).decode()}\n"
        lock_fn = fn + ".lock"
        with lockfile(lock_fn):
            with open(fn, "a") as f:
                f.write(line)
        check_and_send_restart_signal()

    def setup(self) -> None:
        self.q = SimpleQueueClient()

    def start(self) -> None:
        assert self.q is not None
        self.initialize_statistics()
        self.q.start_json_consumer(
            self.queue_name,
            lambda events: self.consume_single_event(events[0]),
        )

    def stop(self) -> None:  # nocoverage
        assert self.q is not None
        self.q.stop_consuming()

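# --- Illustrative sketch (not part of the original source) ---
# The MAX_CONSUME_SECONDS watchdog above, reduced to a self-contained
# Unix-only demo: signal.alarm() delivers SIGALRM once the budget
# expires, and a handler bound with functools.partial (as in do_consume)
# raises to abort the stuck callable. All names below are local to this
# sketch, not from the original source.

import functools
import signal
import time
from types import FrameType
from typing import Callable, Optional

class DemoTimeoutError(Exception):
    pass

def demo_timer_expired(limit: int, signum: int, frame: Optional[FrameType]) -> None:
    # Handler signature is (signum, frame); partial() prepends `limit`.
    raise DemoTimeoutError(f"callable exceeded {limit}s budget")

def run_with_budget(func: Callable[[], None], limit: int) -> None:
    old_handler = signal.signal(signal.SIGALRM, functools.partial(demo_timer_expired, limit))
    try:
        signal.alarm(limit)  # schedule SIGALRM in `limit` seconds
        func()
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)

# run_with_budget(lambda: time.sleep(2), limit=1)  # raises DemoTimeoutError
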
class QueueProcessingWorker(ABC):
    queue_name: str
    CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM = 50

    def __init__(self) -> None:
        self.q: Optional[SimpleQueueClient] = None
        if not hasattr(self, "queue_name"):
            raise WorkerDeclarationException("Queue worker declared without queue_name")
        self.initialize_statistics()

    def initialize_statistics(self) -> None:
        self.queue_last_emptied_timestamp = time.time()
        self.consumed_since_last_emptied = 0
        self.recent_consume_times: MutableSequence[Tuple[int, float]] = deque(maxlen=50)
        self.consume_iteration_counter = 0

        self.update_statistics(0)

    def update_statistics(self, remaining_queue_size: int) -> None:
        total_seconds = sum(seconds for _, seconds in self.recent_consume_times)
        total_events = sum(events_number for events_number, _ in self.recent_consume_times)
        if total_events == 0:
            recent_average_consume_time = None
        else:
            recent_average_consume_time = total_seconds / total_events
        stats_dict = dict(
            update_time=time.time(),
            recent_average_consume_time=recent_average_consume_time,
            current_queue_size=remaining_queue_size,
            queue_last_emptied_timestamp=self.queue_last_emptied_timestamp,
            consumed_since_last_emptied=self.consumed_since_last_emptied,
        )

        os.makedirs(settings.QUEUE_STATS_DIR, exist_ok=True)

        fname = f'{self.queue_name}.stats'
        fn = os.path.join(settings.QUEUE_STATS_DIR, fname)
        with lockfile(fn + '.lock'):
            tmp_fn = fn + '.tmp'
            with open(tmp_fn, 'wb') as f:
                f.write(
                    orjson.dumps(stats_dict, option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2)
                )
            os.rename(tmp_fn, fn)

    @abstractmethod
    def consume(self, data: Dict[str, Any]) -> None:
        pass

    def do_consume(self, consume_func: Callable[[List[Dict[str, Any]]], None],
                   events: List[Dict[str, Any]]) -> None:
        consume_time_seconds: Optional[float] = None
        try:
            time_start = time.time()
            consume_func(events)
            consume_time_seconds = time.time() - time_start
            self.consumed_since_last_emptied += len(events)
        except Exception:
            self._handle_consume_exception(events)
        finally:
            flush_per_request_caches()
            reset_queries()

            if consume_time_seconds is not None:
                self.recent_consume_times.append((len(events), consume_time_seconds))

            if self.q is not None:
                remaining_queue_size = self.q.queue_size()
            else:
                remaining_queue_size = 0

            if remaining_queue_size == 0:
                self.queue_last_emptied_timestamp = time.time()
                self.consumed_since_last_emptied = 0

            self.consume_iteration_counter += 1
            if self.consume_iteration_counter >= self.CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM:
                self.consume_iteration_counter = 0
                self.update_statistics(remaining_queue_size)

    def consume_wrapper(self, data: Dict[str, Any]) -> None:
        consume_func = lambda events: self.consume(events[0])
        self.do_consume(consume_func, [data])

    def _handle_consume_exception(self, events: List[Dict[str, Any]]) -> None:
        self._log_problem()
        if not os.path.exists(settings.QUEUE_ERROR_DIR):
            os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
        # Use 'mark_sanitized' to prevent Pysa from detecting this false positive
        # flow. 'queue_name' is always a constant string.
        fname = mark_sanitized(f'{self.queue_name}.errors')
        fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
        line = f'{time.asctime()}\t{orjson.dumps(events).decode()}\n'
        lock_fn = fn + '.lock'
        with lockfile(lock_fn):
            with open(fn, 'ab') as f:
                f.write(line.encode('utf-8'))
        check_and_send_restart_signal()

    def _log_problem(self) -> None:
        logging.exception("Problem handling data on queue %s", self.queue_name, stack_info=True)

    def setup(self) -> None:
        self.q = SimpleQueueClient()

    def start(self) -> None:
        assert self.q is not None
        self.initialize_statistics()
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self) -> None:  # nocoverage
        assert self.q is not None
        self.q.stop_consuming()

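# --- Illustrative sketch (not part of the original source) ---
# update_statistics() above writes one orjson document per queue into
# settings.QUEUE_STATS_DIR, renamed into place atomically. A monitoring
# script could read it back like this; the directory path and queue name
# in the usage comment are hypothetical stand-ins.

import os
from typing import Any, Dict

import orjson

def read_queue_stats(stats_dir: str, queue_name: str) -> Dict[str, Any]:
    # Mirrors the "<queue_name>.stats" naming scheme used by update_statistics().
    with open(os.path.join(stats_dir, f'{queue_name}.stats'), 'rb') as f:
        return orjson.loads(f.read())

# stats = read_queue_stats('/var/lib/queue_stats', 'user_activity')
# stats['current_queue_size'], stats['recent_average_consume_time'], ...
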
class QueueProcessingWorker(ABC):
    queue_name: str = None
    CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM = 50

    def __init__(self) -> None:
        self.q: SimpleQueueClient = None
        if self.queue_name is None:
            raise WorkerDeclarationException(
                "Queue worker declared without queue_name")
        self.initialize_statistics()

    def initialize_statistics(self) -> None:
        self.queue_last_emptied_timestamp = time.time()
        self.consumed_since_last_emptied = 0
        self.recent_consume_times: MutableSequence[Tuple[int, float]] = deque(maxlen=50)
        self.consume_iteration_counter = 0

        self.update_statistics(0)

    def update_statistics(self, remaining_queue_size: int) -> None:
        total_seconds = sum(seconds for _, seconds in self.recent_consume_times)
        total_events = sum(events_number for events_number, _ in self.recent_consume_times)
        if total_events == 0:
            recent_average_consume_time = None
        else:
            recent_average_consume_time = total_seconds / total_events
        stats_dict = dict(
            update_time=time.time(),
            recent_average_consume_time=recent_average_consume_time,
            current_queue_size=remaining_queue_size,
            queue_last_emptied_timestamp=self.queue_last_emptied_timestamp,
            consumed_since_last_emptied=self.consumed_since_last_emptied,
        )

        os.makedirs(settings.QUEUE_STATS_DIR, exist_ok=True)

        fname = '%s.stats' % (self.queue_name,)
        fn = os.path.join(settings.QUEUE_STATS_DIR, fname)
        with lockfile(fn + '.lock'):
            tmp_fn = fn + '.tmp'
            with open(tmp_fn, 'w') as f:
                serialized_dict = ujson.dumps(stats_dict, indent=2)
                serialized_dict += '\n'
                f.write(serialized_dict)
            os.rename(tmp_fn, fn)

    @abstractmethod
    def consume(self, data: Dict[str, Any]) -> None:
        pass

    def do_consume(self, consume_func: Callable[[List[Dict[str, Any]]], None],
                   events: List[Dict[str, Any]]) -> None:
        try:
            time_start = time.time()
            consume_func(events)
            consume_time_seconds: Optional[float] = time.time() - time_start
            self.consumed_since_last_emptied += len(events)
        except Exception:
            self._handle_consume_exception(events)
            consume_time_seconds = None
        finally:
            flush_per_request_caches()
            reset_queries()

            if consume_time_seconds is not None:
                self.recent_consume_times.append((len(events), consume_time_seconds))

            if self.q is not None:
                remaining_queue_size = self.q.queue_size()
            else:
                remaining_queue_size = 0

            if remaining_queue_size == 0:
                self.queue_last_emptied_timestamp = time.time()
                self.consumed_since_last_emptied = 0

            self.consume_iteration_counter += 1
            if self.consume_iteration_counter >= self.CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM:
                self.consume_iteration_counter = 0
                self.update_statistics(remaining_queue_size)

    def consume_wrapper(self, data: Dict[str, Any]) -> None:
        consume_func = lambda events: self.consume(events[0])
        self.do_consume(consume_func, [data])

    def _handle_consume_exception(self, events: List[Dict[str, Any]]) -> None:
        self._log_problem()
        if not os.path.exists(settings.QUEUE_ERROR_DIR):
            os.mkdir(settings.QUEUE_ERROR_DIR)  # nocoverage
        fname = '%s.errors' % (self.queue_name,)
        fn = os.path.join(settings.QUEUE_ERROR_DIR, fname)
        line = '%s\t%s\n' % (time.asctime(), ujson.dumps(events))
        lock_fn = fn + '.lock'
        with lockfile(lock_fn):
            with open(fn, 'ab') as f:
                f.write(line.encode('utf-8'))
        check_and_send_restart_signal()

    def _log_problem(self) -> None:
        logging.exception("Problem handling data on queue %s" % (self.queue_name,))

    def setup(self) -> None:
        self.q = SimpleQueueClient()

    def start(self) -> None:
        self.initialize_statistics()
        self.q.register_json_consumer(self.queue_name, self.consume_wrapper)
        self.q.start_consuming()

    def stop(self) -> None:  # nocoverage
        self.q.stop_consuming()
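
# --- Illustrative sketch (not part of the original source) ---
# The tmp-file-plus-rename idiom used by update_statistics() above, in
# isolation: writing to a sibling .tmp file and then os.rename()ing it
# into place means readers never observe a half-written stats file,
# since rename within a single POSIX filesystem is atomic. The helper
# name is hypothetical; json stands in for ujson/orjson here.

import json
import os
from typing import Any, Dict

def atomic_write_json(path: str, data: Dict[str, Any]) -> None:
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(data, f, indent=2)
        f.write('\n')
    os.rename(tmp_path, path)  # atomic replacement on POSIX filesystems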