def test_monitor_experiment_logs(self):
    """Monitoring an experiment uuid for logs turns the flag on; removing turns it off."""
    experiment_uuid = uuid.uuid4().hex

    RedisToStream.monitor_experiment_logs(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is True

    RedisToStream.remove_experiment_logs(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is False
def test_experiment_monitoring(self):
    """Resource and log monitoring flags for a uuid each follow an off -> on -> off lifecycle."""
    uid = uuid.uuid4().hex

    # Resources monitoring: off by default, on after monitor(), off after remove().
    assert RedisToStream.is_monitored_experiment_resources(uid) is False
    RedisToStream.monitor_experiment_resources(uid)
    assert RedisToStream.is_monitored_experiment_resources(uid) is True
    RedisToStream.remove_experiment_resources(uid)
    assert RedisToStream.is_monitored_experiment_resources(uid) is False

    # Logs monitoring: same lifecycle, independent flag.
    assert RedisToStream.is_monitored_experiment_logs(uid) is False
    RedisToStream.monitor_experiment_logs(uid)
    assert RedisToStream.is_monitored_experiment_logs(uid) is True
    RedisToStream.remove_experiment_logs(uid)
    assert RedisToStream.is_monitored_experiment_logs(uid) is False
def publish_experiment_job_log(self,
                               log_lines,
                               status,
                               experiment_uuid,
                               experiment_name,
                               job_uuid,
                               task_type=None,
                               task_idx=None):
    """Fan out experiment-job log lines.

    Always enqueues a celery task to handle the lines; additionally publishes
    them on the internal exchange when the job or the experiment is currently
    being watched (per the Redis monitoring flags).
    """
    self._logger.debug("Publishing log event for task: %s.%s, %s",
                       task_type, task_idx, experiment_name)
    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
        kwargs={'experiment_name': experiment_name,
                'experiment_uuid': experiment_uuid,
                'job_uuid': job_uuid,
                'log_lines': log_lines,
                'task_type': task_type,
                'task_idx': task_idx})

    # A Redis outage must not break log handling — treat it as "not watched".
    try:
        should_stream = (
            RedisToStream.is_monitored_job_logs(job_uuid) or
            RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        should_stream = False

    if not should_stream:
        return

    self._logger.info("Streaming new log event for experiment: %s job: %s",
                      experiment_uuid, job_uuid)
    payload = {'experiment_uuid': experiment_uuid,
               'job_uuid': job_uuid,
               'log_lines': log_lines,
               'status': status,
               'task_type': task_type,
               'task_idx': task_idx}
    routing_key = '{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS_EXPERIMENTS,
                                    experiment_uuid,
                                    job_uuid)
    with celery_app.producer_or_acquire(None) as producer:
        try:
            producer.publish(payload,
                             retry=True,
                             routing_key=routing_key,
                             exchange=settings.INTERNAL_EXCHANGE)
        except (TimeoutError, AMQPError):
            # Best-effort streaming: dropping a live-stream message is acceptable.
            pass
def publish_experiment_job_log(self,
                               log_lines,
                               experiment_uuid,
                               experiment_name,
                               job_uuid,
                               send_task=True):
    """Fan out experiment-job log lines.

    Optionally enqueues a worker task to handle the lines, and publishes them
    on the internal exchange when the job or the experiment is currently being
    watched (per the Redis monitoring flags).
    """
    self._logger.debug("Publishing log event for task: %s, %s",
                       job_uuid, experiment_name)
    if send_task:
        # NOTE(review): the task kwargs do not include job_uuid — presumably
        # intentional for temp logs ('temp': True), but worth confirming.
        workers.send(LogsCeleryTasks.LOGS_HANDLE_EXPERIMENT_JOB,
                     kwargs={'experiment_name': experiment_name,
                             'experiment_uuid': experiment_uuid,
                             'log_lines': log_lines,
                             'temp': True},
                     countdown=None)

    # A Redis outage must not break log handling — treat it as "not watched".
    try:
        should_stream = (
            RedisToStream.is_monitored_job_logs(job_uuid) or
            RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        should_stream = False

    if not should_stream:
        return

    self._logger.info("Streaming new log event for experiment: %s job: %s",
                      experiment_uuid, job_uuid)
    payload = {'experiment_uuid': experiment_uuid,
               'job_uuid': job_uuid,
               'log_lines': log_lines}
    routing_key = '{}.{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS,
                                    experiment_uuid,
                                    job_uuid)
    with workers.app.producer_or_acquire(None) as producer:
        try:
            producer.publish(payload,
                             retry=True,
                             routing_key=routing_key,
                             exchange=settings.INTERNAL_EXCHANGE)
        except (TimeoutError, AMQPError):
            # Best-effort streaming: dropping a live-stream message is acceptable.
            pass
async def experiment_logs(request,  # pylint:disable=too-many-branches
                          ws,
                          username,
                          project_name,
                          experiment_id):
    """Websocket handler streaming an experiment's logs to the client.

    Validates access to the experiment, records an audit event, registers a
    Redis monitoring flag and an AMQP consumer for the experiment's sidecar
    log stream, then (1) streams status changes until the experiment is
    running or done, and (2) streams log messages until the socket closes or
    the experiment finishes.
    """
    from streams.consumers.consumers import Consumer

    experiment, message = validate_experiment(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id)
    if experiment is None:
        # Validation failed: report the error to the client and bail out.
        await ws.send(get_error_message(message))
        return
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_LOGS_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    # Flag this experiment's logs as monitored so publishers start streaming.
    if not RedisToStream.is_monitored_experiment_logs(
            experiment_uuid=experiment_uuid):
        logger.info('Experiment uuid `%s` logs is now being monitored',
                    experiment_uuid)
        RedisToStream.monitor_experiment_logs(experiment_uuid=experiment_uuid)

    # start consumer — reuse an existing one for this experiment if the app
    # already has it; otherwise create, register, and run a new one.
    if experiment_uuid in request.app.experiment_logs_consumers:
        consumer = request.app.experiment_logs_consumers[experiment_uuid]
    else:
        logger.info('Add experiment log consumer for %s', experiment_uuid)
        consumer = Consumer(
            routing_key='{}.{}.*'.format(
                RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS,
                experiment_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS,
                                 experiment_uuid))
        request.app.experiment_logs_consumers[experiment_uuid] = consumer
        consumer.run()

    def should_disconnect():
        # True when no sockets remain on the consumer; also clears the Redis
        # monitoring flag so publishers stop streaming.
        if not consumer.ws:
            logger.info('Stopping logs monitor for experiment uuid %s',
                        experiment_uuid)
            RedisToStream.remove_experiment_logs(
                experiment_uuid=experiment_uuid)
            # if experiment_uuid in request.app.experiment_logs_consumers:
            #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    # NOTE(review): `should_quite` is likely a typo for `should_quit` —
    # harmless (purely local), but worth renaming in a dedicated change.
    should_quite = False
    num_message_retries = 0

    # Stream phase changes: poll the DB and push each status transition to the
    # client until the experiment is RUNNING or reaches a terminal state.
    status = None
    while status != ExperimentLifeCycle.RUNNING and not ExperimentLifeCycle.is_done(
            status):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify(ws_manager=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    # Experiment finished before (or without) running: send the final status,
    # clear the monitoring flag, and close.
    if ExperimentLifeCycle.is_done(status):
        await notify(ws_manager=consumer, message=get_status_message(status))
        RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
        return

    # Main streaming loop: forward consumer messages to connected sockets.
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(ws_manager=consumer, message=message)

        # After trying a couple of time, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done',
                    experiment_uuid)
                consumer.ws = set([])
            else:
                # Back off the retry counter so the DB check repeats only
                # every CHECK_DELAY empty iterations.
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for experiment uuid %s',
                        experiment_uuid)
            consumer.remove_sockets({ws, })
            should_quite = True

        if should_disconnect():
            should_quite = True

        if should_quite:
            return

        await asyncio.sleep(SOCKET_SLEEP)