def test_monitor_experiment_resources(self):
    """Monitoring an experiment's resources flips the is-monitored flag,
    and removing the experiment clears it again."""
    experiment_uuid = uuid.uuid4().hex

    RedisToStream.monitor_experiment_resources(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_resources(
        experiment_uuid) is True

    RedisToStream.remove_experiment_resources(experiment_uuid)
    assert RedisToStream.is_monitored_experiment_resources(
        experiment_uuid) is False
def run(containers, node, persist):
    """Collect resource usage for every tracked container on this node.

    Updates the cluster-node GPU info, then for each container known to
    Redis builds a resources payload and caches it as the job's latest
    resources when that job (or its experiment) is being streamed.
    """
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Re-key GPU info by device index for per-container lookups.
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    update_cluster_node(gpu_resources)

    for container_id in RedisJobContainers.get_containers():
        if not get_container(containers, container_id):
            continue
        try:
            resources = get_container_resources(node,
                                                containers[container_id],
                                                gpu_resources)
        except KeyError:
            resources = None
        if not resources:
            continue

        payload = resources.to_dict()
        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})
        job_uuid = payload['job_uuid']
        # Only cache the payload when someone is actively streaming this
        # job's resources or its experiment's resources.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        if (RedisToStream.is_monitored_job_resources(job_uuid) or
                RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
            RedisToStream.set_latest_job_resources(job_uuid, payload)
async def experiment_resources(request, ws, username, project_name, experiment_id):
    """Stream an experiment's latest resource payloads over a websocket.

    Validates access to the experiment, registers the socket with a shared
    per-experiment SocketManager, and loops: read the latest resources from
    Redis, push them to the client, and periodically re-check the experiment
    status so finished experiments stop being streamed.
    """
    experiment, message = validate_experiment(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id)
    if experiment is None:
        # Validation failed: report the reason and close out.
        await ws.send(get_error_message(message))
        return
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_RESOURCES_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)
    # Start Redis-side monitoring for this experiment if no one else has.
    if not RedisToStream.is_monitored_experiment_resources(experiment_uuid=experiment_uuid):
        logger.info('Experiment resource with uuid `%s` is now being monitored',
                    experiment_uuid)
        RedisToStream.monitor_experiment_resources(experiment_uuid=experiment_uuid)

    # One SocketManager is shared by all clients watching the same experiment.
    if experiment_uuid in request.app.experiment_resources_ws_managers:
        ws_manager = request.app.experiment_resources_ws_managers[experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_resources_ws_managers[experiment_uuid] = ws_manager

    def handle_experiment_disconnected_ws(ws):
        # Drop this socket; when it was the last one, tear down monitoring
        # and forget the manager so state does not accumulate.
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_managers.pop(experiment_uuid, None)

        logger.info('Quitting resources socket for uuid %s', experiment_uuid)

    # Build the job descriptors used to look up resources in Redis:
    # hex uuid plus a "role.id" display name.
    jobs = []
    for job in experiment.jobs.values('uuid', 'role', 'id'):
        job['uuid'] = job['uuid'].hex
        job['name'] = '{}.{}'.format(job.pop('role'), job.pop('id'))
        jobs.append(job)

    ws_manager.add_socket(ws)
    # Counter driving the periodic experiment-status re-check below.
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_experiment_resources(jobs)
        should_check += 1

        # After trying a couple of time, we must check the status of the experiment
        if should_check > RESOURCES_CHECK:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done',
                    experiment_uuid)
                # Clear every registered socket, then run the disconnect
                # path so monitoring is fully torn down.
                ws_manager.ws = set([])
                handle_experiment_disconnected_ws(ws)
                return
            else:
                # Not done: push the next status check CHECK_DELAY ticks out.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_experiment_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_experiment_disconnected_ws(ws)
            return
        await asyncio.sleep(SOCKET_SLEEP)