def test_set_latest_job_resources(self):
    """Round-trip a job-resources payload and check the retrieved copy
    carries the job name injected by the getter."""
    gpu = {
        'index': 0,
        'bus_id': '0000:00:1E.1',
        'memory_free': 1000,
        'memory_total': 12883853312,
        'memory_used': 8388608000,
        'memory_utilization': 0,
        'minor': 1,
        'name': 'GeForce GTX TITAN 0',
        'power_draw': 125,
        'power_limit': 250,
        'processes': [
            {'command': 'python',
             'gpu_memory_usage': 4000,
             'pid': 48448,
             'username': '******'},
            {'command': 'python',
             'gpu_memory_usage': 4000,
             'pid': 153223,
             'username': '******'},
        ],
        'serial': '0322917092147',
        'temperature_gpu': 80,
        'utilization_gpu': 76,
        'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107',
    }
    payload = {
        'job_uuid': uuid.uuid4().hex,
        'experiment_uuid': uuid.uuid4().hex,
        'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
        'cpu_percentage': 0.6947691836734693,
        'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
        'memory_used': 84467712,
        'memory_limit': 2096160768,
        'gpu_resources': gpu,
    }

    RedisToStream.set_latest_job_resources(payload['job_uuid'], payload)

    # The getter (third arg presumably an as-dict/deserialize flag — TODO
    # confirm against RedisToStream) adds the job name to the stored payload.
    payload['job_name'] = 'master.0'
    retrieved = RedisToStream.get_latest_job_resources(
        payload['job_uuid'], 'master.0', True)
    assert retrieved == payload
def run(containers, node, persist):
    """Collect resource payloads for every known job container and cache the
    latest values for jobs/experiments that are actively being streamed.

    Relies on module-level helpers (get_gpu_resources, update_cluster_node,
    get_container, get_container_resources) and the Redis-backed registries.
    """
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Index GPU info by device index for per-container lookup.
        gpu_resources = {res['index']: res for res in gpu_resources}
    update_cluster_node(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        try:
            payload = get_container_resources(node,
                                              containers[container_id],
                                              gpu_resources)
        except KeyError:
            payload = None
        if not payload:
            continue
        payload = payload.to_dict()
        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})
        job_uuid = payload['job_uuid']
        # Check if we should stream the payload
        # Check if we have this container already in place
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        monitored = (
            RedisToStream.is_monitored_job_resources(job_uuid) or
            RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if monitored:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def handle_experiment_disconnected_ws(ws):
    """Drop `ws` from the socket manager; when no sockets remain, stop
    monitoring the experiment's resources and discard its manager.

    Closure: relies on `ws_manager`, `experiment_uuid`, `logger` and
    `request` from the enclosing scope (not visible in this chunk).
    """
    ws_manager.remove_sockets(ws)
    if not ws_manager.ws:
        # Last client gone: stop caching resources for this experiment.
        logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
        RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
        request.app.experiment_resources_ws_managers.pop(experiment_uuid, None)
    # Logged for every disconnecting socket, not only the last one.
    logger.info('Quitting resources socket for uuid %s', experiment_uuid)
def handle_job_disconnected_ws(ws):
    """Drop `ws` from the socket manager; when no sockets remain, stop
    monitoring the job's resources and discard its manager.

    Closure: relies on `ws_manager`, `job_uuid`, `job_name`, `logger` and
    `request` from the enclosing scope (not visible in this chunk).
    """
    ws_manager.remove_sockets(ws)
    if not ws_manager.ws:
        # Last client gone: stop caching resources for this job.
        logger.info('Stopping resources monitor for job %s', job_name)
        RedisToStream.remove_job_resources(job_uuid=job_uuid)
        request.app.job_resources_ws_managers.pop(job_uuid, None)
    # Logged for every disconnecting socket, not only the last one.
    logger.info('Quitting resources socket for job %s', job_name)
def should_disconnect():
    """Return True when the consumer has no live sockets left.

    Side effect: stops monitoring the job's logs before signalling the
    caller to disconnect. Closure: relies on `consumer`, `job_uuid` and
    `logger` from the enclosing scope (not visible in this chunk).
    """
    if not consumer.ws:
        logger.info('Stopping logs monitor for job uuid %s', job_uuid)
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        # Consumer teardown deliberately disabled for now:
        # if job_uuid in request.app.job_logs_consumers:
        #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
        #     if consumer:
        #         consumer.stop()
        return True
    return False
def should_disconnect():
    """Return True when the consumer has no live sockets left.

    Side effect: stops monitoring the experiment's logs before signalling
    the caller to disconnect. Closure: relies on `consumer`,
    `experiment_uuid` and `_logger` from the enclosing scope (not visible
    in this chunk).
    """
    if not consumer.ws:
        _logger.info('Stopping logs monitor for experiment uuid %s', experiment_uuid)
        RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
        # Consumer teardown deliberately disabled for now:
        # if experiment_uuid in request.app.experiment_logs_consumers:
        #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
        #     if consumer:
        #         consumer.stop()
        return True
    return False
def publish_experiment_job_log(self,
                               log_lines,
                               status,
                               experiment_uuid,
                               experiment_name,
                               job_uuid,
                               task_type=None,
                               task_idx=None):
    """Fan out an experiment-job log event.

    Always queues the event for persistence via celery; additionally
    streams it over the internal exchange when the job or its experiment
    is currently being watched.
    """
    self._logger.debug("Publishing log event for task: %s.%s, %s",
                       task_type, task_idx, experiment_name)
    celery_app.send_task(
        EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
        kwargs={'experiment_name': experiment_name,
                'experiment_uuid': experiment_uuid,
                'job_uuid': job_uuid,
                'log_lines': log_lines,
                'task_type': task_type,
                'task_idx': task_idx})
    try:
        stream = (RedisToStream.is_monitored_job_logs(job_uuid) or
                  RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        # Streaming is best-effort; a Redis hiccup must not block persistence.
        stream = False
    if not stream:
        return
    self._logger.info("Streaming new log event for experiment: %s job: %s",
                      experiment_uuid, job_uuid)
    with celery_app.producer_or_acquire(None) as producer:
        try:
            producer.publish(
                {'experiment_uuid': experiment_uuid,
                 'job_uuid': job_uuid,
                 'log_lines': log_lines,
                 'status': status,
                 'task_type': task_type,
                 'task_idx': task_idx},
                retry=True,
                routing_key='{}.{}.{}'.format(
                    RoutingKeys.LOGS_SIDECARS_EXPERIMENTS,
                    experiment_uuid,
                    job_uuid),
                exchange=settings.INTERNAL_EXCHANGE,
            )
        except (TimeoutError, AMQPError):
            # Dropping a streamed line is acceptable; the persisted copy remains.
            pass
def run(cls):
    """Ping every Redis connection the platform depends on.

    Returns:
        dict: service key -> failing health ``Result`` for every unhealthy
        connection; when all checks pass, ``{'REDIS': Result()}`` as an
        overall-healthy marker.
    """
    # Data-driven replacement for five copy-pasted check stanzas;
    # order and keys are identical to the original sequence.
    checks = (
        ('REDIS_EPH_TOKENS', RedisEphemeralTokens),
        ('REDIS_SESSIONS', RedisSessions),
        ('REDIS_TTL', RedisTTL),
        ('REDIS_TO_STREAM', RedisToStream),
        ('REDIS_CONTAINERS', RedisJobContainers),
    )
    results = {}
    for key, service in checks:
        result = cls.redis_health(service.connection())
        if not result.is_healthy:
            results[key] = result
    if not results:
        # No failures recorded: report overall health with a default Result.
        results = {'REDIS': Result()}
    return results
def publish_experiment_job_log(self,
                               log_lines,
                               experiment_uuid,
                               experiment_name,
                               job_uuid,
                               send_task=True):
    """Queue (optionally) and stream a temporary experiment-job log event.

    NOTE(review): `job_uuid` is used only for the monitoring check, the
    streamed payload and the routing key — it is not forwarded in the
    celery task kwargs; confirm the task does not need it.
    """
    self._logger.debug("Publishing log event for task: %s, %s",
                       job_uuid, experiment_name)
    if send_task:
        workers.send(LogsCeleryTasks.LOGS_HANDLE_EXPERIMENT_JOB,
                     kwargs={'experiment_name': experiment_name,
                             'experiment_uuid': experiment_uuid,
                             'log_lines': log_lines,
                             'temp': True},
                     countdown=None)
    try:
        stream = (RedisToStream.is_monitored_job_logs(job_uuid) or
                  RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        # Streaming is best-effort; a Redis hiccup must not block publishing.
        stream = False
    if not stream:
        return
    self._logger.info("Streaming new log event for experiment: %s job: %s",
                      experiment_uuid, job_uuid)
    with workers.app.producer_or_acquire(None) as producer:
        try:
            producer.publish(
                {'experiment_uuid': experiment_uuid,
                 'job_uuid': job_uuid,
                 'log_lines': log_lines},
                retry=True,
                routing_key='{}.{}.{}'.format(
                    RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS,
                    experiment_uuid,
                    job_uuid),
                exchange=settings.INTERNAL_EXCHANGE,
            )
        except (TimeoutError, AMQPError):
            # Dropping a streamed line is acceptable.
            pass
def test_monitor_experiment_logs(self):
    """The experiment-logs monitoring flag toggles on and off.

    Fixes the misspelled local `expeirment_uuid` and adds the baseline
    "starts unmonitored" assertion used by the sibling monitoring tests.
    """
    experiment_uuid = uuid.uuid4().hex
    # A fresh uuid must not be monitored yet.
    assert RedisToStream.is_monitored_experiment_logs(
        experiment_uuid) is False
    RedisToStream.monitor_experiment_logs(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_logs(
        experiment_uuid) is True
    RedisToStream.remove_experiment_logs(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_logs(
        experiment_uuid) is False
def _stream_job_log(self, job_uuid, log_lines, routing_key):
    """Publish log lines on the internal exchange when the job's logs are
    being watched; silently skip on Redis or broker errors (best-effort)."""
    try:
        monitored = RedisToStream.is_monitored_job_logs(job_uuid)
    except RedisError:
        monitored = False
    if not monitored:
        return
    self._logger.info("Streaming new log event for job: %s", job_uuid)
    with celery_app.producer_or_acquire(None) as producer:
        try:
            producer.publish(
                {'job_uuid': job_uuid,
                 'log_lines': log_lines},
                routing_key='{}.{}'.format(routing_key, job_uuid),
                exchange=settings.INTERNAL_EXCHANGE,
            )
        except (TimeoutError, AMQPError):
            # Dropping a streamed line is acceptable.
            pass
def test_job_monitoring(self):
    """Each job monitoring channel (resources, logs) toggles off -> on -> off."""
    job_uuid = uuid.uuid4().hex
    channels = [
        (RedisToStream.monitor_job_resources,
         RedisToStream.is_monitored_job_resources,
         RedisToStream.remove_job_resources),
        (RedisToStream.monitor_job_logs,
         RedisToStream.is_monitored_job_logs,
         RedisToStream.remove_job_logs),
    ]
    for monitor, is_monitored, remove in channels:
        assert is_monitored(job_uuid) is False
        monitor(job_uuid)
        assert is_monitored(job_uuid) is True
        remove(job_uuid)
        assert is_monitored(job_uuid) is False
async def experiment_job_resources(request,
                                   ws,
                                   username,
                                   project_name,
                                   experiment_id,
                                   job_id):
    """Websocket handler: stream the latest cached resources for one
    experiment job to `ws` until the client disconnects or the job is done.
    """
    job, _, message = validate_experiment_job(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id,
                                              job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.id)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)
    # Flag the job so the resources monitor starts caching its payloads.
    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        _logger.info('Job resources with uuid `%s` is now being monitored',
                     job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # NOTE(review): 'ws_mangers' (sic) — spelling matches every use in this
    # handler; confirm the app setup defines the attribute with the same name.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        # Drop the socket; when none remain, stop monitoring and discard
        # the manager for this job.
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)
        _logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid,
                                                           job_name=job_name)
        should_check += 1

        # After trying a couple of time, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                _logger.info(
                    'removing all socket because the job `%s` is done',
                    job_name)
                ws_manager.ws = set([])
                handle_job_disconnected_ws(ws)
                return
            else:
                # Back off the counter so the DB check runs periodically.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
async def experiment_resources(request, ws, username, project_name, experiment_id):
    """Websocket handler: stream the latest cached resources for all jobs of
    an experiment to `ws` until the client disconnects or the experiment is
    done.
    """
    experiment, message = validate_experiment(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id)
    if experiment is None:
        await ws.send(get_error_message(message))
        return
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_RESOURCES_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)
    # Flag the experiment so the resources monitor starts caching payloads.
    if not RedisToStream.is_monitored_experiment_resources(experiment_uuid=experiment_uuid):
        logger.info('Experiment resource with uuid `%s` is now being monitored',
                    experiment_uuid)
        RedisToStream.monitor_experiment_resources(experiment_uuid=experiment_uuid)

    # Reuse the per-experiment socket manager if one already exists.
    if experiment_uuid in request.app.experiment_resources_ws_managers:
        ws_manager = request.app.experiment_resources_ws_managers[experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_resources_ws_managers[experiment_uuid] = ws_manager

    def handle_experiment_disconnected_ws(ws):
        # Drop the socket; when none remain, stop monitoring and discard
        # the manager for this experiment.
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_managers.pop(experiment_uuid, None)
        logger.info('Quitting resources socket for uuid %s', experiment_uuid)

    # Build the job descriptors (uuid + "role.id" name) used for lookup.
    jobs = []
    for job in experiment.jobs.values('uuid', 'role', 'id'):
        job['uuid'] = job['uuid'].hex
        job['name'] = '{}.{}'.format(job.pop('role'), job.pop('id'))
        jobs.append(job)
    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_experiment_resources(jobs)
        should_check += 1

        # After trying a couple of time, we must check the status of the experiment
        if should_check > RESOURCES_CHECK:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done',
                    experiment_uuid)
                ws_manager.ws = set([])
                handle_experiment_disconnected_ws(ws)
                return
            else:
                # Back off the counter so the DB check runs periodically.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_experiment_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_experiment_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_monitor_job_logs(self):
    """The job-logs monitoring flag toggles on and off.

    Adds the baseline "starts unmonitored" assertion used by the sibling
    monitoring tests (test_job_monitoring, test_experiment_monitoring).
    """
    job_uuid = uuid.uuid4().hex
    # A fresh uuid must not be monitored yet.
    assert RedisToStream.is_monitored_job_logs(job_uuid) is False
    RedisToStream.monitor_job_logs(job_uuid)
    assert RedisToStream.is_monitored_job_logs(job_uuid) is True
    RedisToStream.remove_job_logs(job_uuid)
    assert RedisToStream.is_monitored_job_logs(job_uuid) is False
async def job_logs(request,  # pylint:disable=too-many-branches
                   ws,
                   username,
                   project_name,
                   job_id):
    """Websocket handler: stream a job's status changes until it is running,
    then stream its log lines from the sidecar consumer until the client
    disconnects or the job finishes.
    """
    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return
    job_uuid = job.uuid.hex
    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)
    # Flag the job so sidecars start publishing its logs for streaming.
    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        # True when no sockets remain; also stops the logs monitoring.
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quite = False  # NOTE(review): sic — "quit"; local-only name.
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        # Job ended before producing a log stream: send the final status.
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            # Reset the idle counter on every delivered message.
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After trying a couple of time, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done',
                            job_uuid)
                consumer.ws = set([])
            else:
                # Back off the counter so the DB check runs periodically.
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quite = True

        if should_disconnect():
            should_quite = True

        if should_quite:
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_experiment_monitoring(self):
    """Each experiment monitoring channel (resources, logs) toggles
    off -> on -> off."""
    experiment_uuid = uuid.uuid4().hex
    channels = [
        (RedisToStream.monitor_experiment_resources,
         RedisToStream.is_monitored_experiment_resources,
         RedisToStream.remove_experiment_resources),
        (RedisToStream.monitor_experiment_logs,
         RedisToStream.is_monitored_experiment_logs,
         RedisToStream.remove_experiment_logs),
    ]
    for monitor, is_monitored, remove in channels:
        assert is_monitored(experiment_uuid) is False
        monitor(experiment_uuid)
        assert is_monitored(experiment_uuid) is True
        remove(experiment_uuid)
        assert is_monitored(experiment_uuid) is False