class TimingMiddleware(threading.local, MiddlewareMixin): dest = '/var/log/tower/profile' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.prof = AWXProfiler("TimingMiddleware") def process_request(self, request): self.start_time = time.time() if settings.AWX_REQUEST_PROFILE: self.prof.start() def process_response(self, request, response): if not hasattr( self, 'start_time'): # some tools may not invoke process_request return response total_time = time.time() - self.start_time response['X-API-Total-Time'] = '%0.3fs' % total_time if settings.AWX_REQUEST_PROFILE: response['X-API-Profile-File'] = self.prof.stop() perf_logger.info( f'request: {request}, response_time: {response["X-API-Total-Time"]}', extra=dict(python_objects=dict( request=request, response=response, X_API_TOTAL_TIME=response["X-API-Total-Time"]))) return response
class CallbackBrokerWorker(BaseWorker): """ A worker implementation that deserializes callback event data and persists it into the database. The code that *generates* these types of messages is found in the ansible-runner display callback plugin. """ MAX_RETRIES = 2 last_stats = time.time() last_flush = time.time() total = 0 last_event = '' prof = None def __init__(self): self.buff = {} self.pid = os.getpid() self.redis = redis.Redis.from_url(settings.BROKER_URL) self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False) self.queue_pop = 0 self.queue_name = settings.CALLBACK_QUEUE self.prof = AWXProfiler("CallbackBrokerWorker") for key in self.redis.keys('awx_callback_receiver_statistics_*'): self.redis.delete(key) def read(self, queue): try: res = self.redis.blpop(self.queue_name, timeout=1) if res is None: return {'event': 'FLUSH'} self.total += 1 self.queue_pop += 1 self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1) self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1) return json.loads(res[1]) except redis.exceptions.RedisError: logger.exception("encountered an error communicating with redis") time.sleep(1) except (json.JSONDecodeError, KeyError): logger.exception("failed to decode JSON message from redis") finally: self.record_statistics() self.record_read_metrics() return {'event': 'FLUSH'} def record_read_metrics(self): if self.queue_pop == 0: return if self.subsystem_metrics.should_pipe_execute() is True: queue_size = self.redis.llen(self.queue_name) self.subsystem_metrics.set( 'callback_receiver_events_queue_size_redis', queue_size) self.subsystem_metrics.pipe_execute() self.queue_pop = 0 def record_statistics(self): # buffer stat recording to once per (by default) 5s if time.time( ) - self.last_stats > settings.JOB_EVENT_STATISTICS_INTERVAL: try: self.redis.set(f'awx_callback_receiver_statistics_{self.pid}', self.debug()) self.last_stats = time.time() except Exception: logger.exception( "encountered an error communicating with redis") self.last_stats = time.time() def debug(self): return f'. worker[pid:{self.pid}] sent={self.total} rss={self.mb}MB {self.last_event}' @property def mb(self): return '{:0.3f}'.format( psutil.Process(self.pid).memory_info().rss / 1024.0 / 1024.0) def toggle_profiling(self, *args): if not self.prof.is_started(): self.prof.start() logger.error('profiling is enabled') else: filepath = self.prof.stop() logger.error(f'profiling is disabled, wrote {filepath}') def work_loop(self, *args, **kw): if settings.AWX_CALLBACK_PROFILE: signal.signal(signal.SIGUSR1, self.toggle_profiling) return super(CallbackBrokerWorker, self).work_loop(*args, **kw) def flush(self, force=False): now = tz_now() if force or (time.time() - self.last_flush ) > settings.JOB_EVENT_BUFFER_SECONDS or any([ len(events) >= 1000 for events in self.buff.values() ]): bulk_events_saved = 0 singular_events_saved = 0 metrics_events_batch_save_errors = 0 for cls, events in self.buff.items(): logger.debug( f'{cls.__name__}.objects.bulk_create({len(events)})') for e in events: if not e.created: e.created = now e.modified = now duration_to_save = time.perf_counter() try: cls.objects.bulk_create(events) bulk_events_saved += len(events) except Exception: # if an exception occurs, we should re-attempt to save the # events one-by-one, because something in the list is # broken/stale metrics_events_batch_save_errors += 1 for e in events: try: e.save() singular_events_saved += 1 except Exception: logger.exception('Database Error Saving Job Event') duration_to_save = time.perf_counter() - duration_to_save for e in events: if not getattr(e, '_skip_websocket_message', False): emit_event_detail(e) self.buff = {} self.last_flush = time.time() # only update metrics if we saved events if (bulk_events_saved + singular_events_saved) > 0: self.subsystem_metrics.inc( 'callback_receiver_batch_events_errors', metrics_events_batch_save_errors) self.subsystem_metrics.inc( 'callback_receiver_events_insert_db_seconds', duration_to_save) self.subsystem_metrics.inc( 'callback_receiver_events_insert_db', bulk_events_saved + singular_events_saved) self.subsystem_metrics.observe( 'callback_receiver_batch_events_insert_db', bulk_events_saved) self.subsystem_metrics.inc( 'callback_receiver_events_in_memory', -(bulk_events_saved + singular_events_saved)) if self.subsystem_metrics.should_pipe_execute() is True: self.subsystem_metrics.pipe_execute() def perform_work(self, body): try: flush = body.get('event') == 'FLUSH' if flush: self.last_event = '' if not flush: event_map = { 'job_id': JobEvent, 'ad_hoc_command_id': AdHocCommandEvent, 'project_update_id': ProjectUpdateEvent, 'inventory_update_id': InventoryUpdateEvent, 'system_job_id': SystemJobEvent, } job_identifier = 'unknown job' for key, cls in event_map.items(): if key in body: job_identifier = body[key] break self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})' # noqa if body.get('event') == 'EOF': try: if 'guid' in body: GuidMiddleware.set_guid(body['guid']) final_counter = body.get('final_counter', 0) logger.info( 'Event processing is finished for Job {}, sending notifications' .format(job_identifier)) # EOF events are sent when stdout for the running task is # closed. don't actually persist them to the database; we # just use them to report `summary` websocket events as an # approximation for when a job is "done" emit_channel_notification( 'jobs-summary', dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)) # Additionally, when we've processed all events, we should # have all the data we need to send out success/failure # notification templates uj = UnifiedJob.objects.get(pk=job_identifier) if isinstance(uj, Job): # *actual playbooks* send their success/failure # notifications in response to the playbook_on_stats # event handling code in main.models.events pass elif hasattr(uj, 'send_notification_templates'): handle_success_and_failure_notifications.apply_async( [uj.id]) except Exception: logger.exception( 'Worker failed to emit notifications: Job {}'. format(job_identifier)) finally: self.subsystem_metrics.inc( 'callback_receiver_events_in_memory', -1) GuidMiddleware.set_guid('') return skip_websocket_message = body.pop('skip_websocket_message', False) event = cls.create_from_data(**body) if skip_websocket_message: event._skip_websocket_message = True self.buff.setdefault(cls, []).append(event) retries = 0 while retries <= self.MAX_RETRIES: try: self.flush(force=flush) break except (OperationalError, InterfaceError, InternalError): if retries >= self.MAX_RETRIES: logger.exception( 'Worker could not re-establish database connectivity, giving up on one or more events.' ) return delay = 60 * retries logger.exception( 'Database Error Saving Job Event, retry #{i} in {delay} seconds:' .format(i=retries + 1, delay=delay)) django_connection.close() time.sleep(delay) retries += 1 except DatabaseError: logger.exception('Database Error Saving Job Event') break except Exception as exc: tb = traceback.format_exc() logger.error('Callback Task Processor Raised Exception: %r', exc) logger.error('Detail: {}'.format(tb))
class CallbackBrokerWorker(BaseWorker): """ A worker implementation that deserializes callback event data and persists it into the database. The code that *generates* these types of messages is found in the ansible-runner display callback plugin. """ MAX_RETRIES = 2 last_stats = time.time() last_flush = time.time() total = 0 last_event = '' prof = None def __init__(self): self.buff = {} self.redis = redis.Redis.from_url(settings.BROKER_URL) self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False) self.queue_pop = 0 self.queue_name = settings.CALLBACK_QUEUE self.prof = AWXProfiler("CallbackBrokerWorker") for key in self.redis.keys('awx_callback_receiver_statistics_*'): self.redis.delete(key) @cached_property def pid(self): """This needs to be obtained after forking, or else it will give the parent process""" return os.getpid() def read(self, queue): try: res = self.redis.blpop(self.queue_name, timeout=1) if res is None: return {'event': 'FLUSH'} self.total += 1 self.queue_pop += 1 self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1) self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1) return json.loads(res[1]) except redis.exceptions.RedisError: logger.exception("encountered an error communicating with redis") time.sleep(1) except (json.JSONDecodeError, KeyError): logger.exception("failed to decode JSON message from redis") finally: self.record_statistics() self.record_read_metrics() return {'event': 'FLUSH'} def record_read_metrics(self): if self.queue_pop == 0: return if self.subsystem_metrics.should_pipe_execute() is True: queue_size = self.redis.llen(self.queue_name) self.subsystem_metrics.set( 'callback_receiver_events_queue_size_redis', queue_size) self.subsystem_metrics.pipe_execute() self.queue_pop = 0 def record_statistics(self): # buffer stat recording to once per (by default) 5s if time.time( ) - self.last_stats > settings.JOB_EVENT_STATISTICS_INTERVAL: try: self.redis.set(f'awx_callback_receiver_statistics_{self.pid}', self.debug()) self.last_stats = time.time() except Exception: logger.exception( "encountered an error communicating with redis") self.last_stats = time.time() def debug(self): return f'. worker[pid:{self.pid}] sent={self.total} rss={self.mb}MB {self.last_event}' @property def mb(self): return '{:0.3f}'.format( psutil.Process(self.pid).memory_info().rss / 1024.0 / 1024.0) def toggle_profiling(self, *args): if not self.prof.is_started(): self.prof.start() logger.error('profiling is enabled') else: filepath = self.prof.stop() logger.error(f'profiling is disabled, wrote {filepath}') def work_loop(self, *args, **kw): if settings.AWX_CALLBACK_PROFILE: signal.signal(signal.SIGUSR1, self.toggle_profiling) return super(CallbackBrokerWorker, self).work_loop(*args, **kw) def flush(self, force=False): now = tz_now() if force or (time.time() - self.last_flush ) > settings.JOB_EVENT_BUFFER_SECONDS or any([ len(events) >= 1000 for events in self.buff.values() ]): metrics_bulk_events_saved = 0 metrics_singular_events_saved = 0 metrics_events_batch_save_errors = 0 metrics_events_broadcast = 0 metrics_events_missing_created = 0 metrics_total_job_event_processing_seconds = datetime.timedelta( seconds=0) for cls, events in self.buff.items(): logger.debug( f'{cls.__name__}.objects.bulk_create({len(events)})') for e in events: e.modified = now # this can be set before created because now is set above on line 149 if not e.created: e.created = now metrics_events_missing_created += 1 else: # only calculate the seconds if the created time already has been set metrics_total_job_event_processing_seconds += e.modified - e.created metrics_duration_to_save = time.perf_counter() try: cls.objects.bulk_create(events) metrics_bulk_events_saved += len(events) except Exception: # if an exception occurs, we should re-attempt to save the # events one-by-one, because something in the list is # broken/stale metrics_events_batch_save_errors += 1 for e in events: try: e.save() metrics_singular_events_saved += 1 except Exception: logger.exception('Database Error Saving Job Event') metrics_duration_to_save = time.perf_counter( ) - metrics_duration_to_save for e in events: if not getattr(e, '_skip_websocket_message', False): metrics_events_broadcast += 1 emit_event_detail(e) if getattr(e, '_notification_trigger_event', False): job_stats_wrapup(getattr(e, e.JOB_REFERENCE), event=e) self.buff = {} self.last_flush = time.time() # only update metrics if we saved events if (metrics_bulk_events_saved + metrics_singular_events_saved) > 0: self.subsystem_metrics.inc( 'callback_receiver_batch_events_errors', metrics_events_batch_save_errors) self.subsystem_metrics.inc( 'callback_receiver_events_insert_db_seconds', metrics_duration_to_save) self.subsystem_metrics.inc( 'callback_receiver_events_insert_db', metrics_bulk_events_saved + metrics_singular_events_saved) self.subsystem_metrics.observe( 'callback_receiver_batch_events_insert_db', metrics_bulk_events_saved) self.subsystem_metrics.inc( 'callback_receiver_events_in_memory', -(metrics_bulk_events_saved + metrics_singular_events_saved)) self.subsystem_metrics.inc( 'callback_receiver_events_broadcast', metrics_events_broadcast) self.subsystem_metrics.set( 'callback_receiver_event_processing_avg_seconds', metrics_total_job_event_processing_seconds.total_seconds() / (metrics_bulk_events_saved + metrics_singular_events_saved - metrics_events_missing_created), ) if self.subsystem_metrics.should_pipe_execute() is True: self.subsystem_metrics.pipe_execute() def perform_work(self, body): try: flush = body.get('event') == 'FLUSH' if flush: self.last_event = '' if not flush: job_identifier = 'unknown job' for cls in (JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent): if cls.JOB_REFERENCE in body: job_identifier = body[cls.JOB_REFERENCE] break self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})' # noqa notification_trigger_event = bool( body.get('event') == cls.WRAPUP_EVENT) if body.get('event') == 'EOF': try: if 'guid' in body: set_guid(body['guid']) final_counter = body.get('final_counter', 0) logger.info( 'Starting EOF event processing for Job {}'.format( job_identifier)) # EOF events are sent when stdout for the running task is # closed. don't actually persist them to the database; we # just use them to report `summary` websocket events as an # approximation for when a job is "done" emit_channel_notification( 'jobs-summary', dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)) if notification_trigger_event: job_stats_wrapup(job_identifier) except Exception: logger.exception( 'Worker failed to perform EOF tasks: Job {}'. format(job_identifier)) finally: self.subsystem_metrics.inc( 'callback_receiver_events_in_memory', -1) set_guid('') return skip_websocket_message = body.pop('skip_websocket_message', False) event = cls.create_from_data(**body) if skip_websocket_message: # if this event sends websocket messages, fire them off on flush event._skip_websocket_message = True if notification_trigger_event: # if this is an Ansible stats event, ensure notifications on flush event._notification_trigger_event = True self.buff.setdefault(cls, []).append(event) retries = 0 while retries <= self.MAX_RETRIES: try: self.flush(force=flush) break except (OperationalError, InterfaceError, InternalError): if retries >= self.MAX_RETRIES: logger.exception( 'Worker could not re-establish database connectivity, giving up on one or more events.' ) return delay = 60 * retries logger.exception( 'Database Error Saving Job Event, retry #{i} in {delay} seconds:' .format(i=retries + 1, delay=delay)) django_connection.close() time.sleep(delay) retries += 1 except DatabaseError: logger.exception('Database Error Saving Job Event') break except Exception as exc: tb = traceback.format_exc() logger.error('Callback Task Processor Raised Exception: %r', exc) logger.error('Detail: {}'.format(tb))