async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream the latest resource readings of a job over a websocket.

    The handler resolves the project, experiment and job from the route
    parameters, checks that the requesting user has ``GET`` permission on the
    project, records an audit event, and registers ``ws`` with the socket
    manager kept per job uuid on ``request.app.job_resources_ws_mangers``.
    It then polls ``RedisToStream`` in a loop, sending any non-empty payload
    to ``ws`` and sleeping ``SOCKET_SLEEP`` between iterations.

    The loop ends (and the handler returns) when the job is reported done,
    when sending raises ``ConnectionClosed``, or when the socket reports a
    lost connection.

    Args:
        request: Incoming request; ``request.app.user`` is the acting user.
        ws: Websocket the resources are pushed to.
        username: Owner name used to look up the project.
        project_name: Name used to look up the project.
        experiment_sequence: Sequence used to look up the experiment.
        job_sequence: Sequence used to look up the job.

    Raises:
        exceptions.Forbidden: If the user lacks ``GET`` permission on the
            project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception has to be raised: only instantiating it would let the
        # handler carry on and stream resources to an unauthorized user.
        raise exceptions.Forbidden("You don't have access to this project")

    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        _logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager is shared by every websocket watching the same job.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Unregister ``ws``; stop monitoring the job once no socket is left."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        _logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                _logger.info('removing all socket because the job `%s` is done', job_name)
                # Empty the manager first so the disconnect handler also
                # stops the monitor and drops the manager from the app.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                # Back the counter off so the next status check happens
                # after a further CHECK_DELAY iterations.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_set_latest_job_resources(self):
    """Round-trip a job resources payload through ``RedisToStream``.

    A payload is stored under its job uuid; reading it back with the job
    name must return the same payload extended with a ``job_name`` entry.
    """
    # Two GPU processes that differ only by pid.
    gpu_processes = [
        {
            'command': 'python',
            'gpu_memory_usage': 4000,
            'pid': pid,
            'username': '******'
        }
        for pid in (48448, 153223)
    ]
    gpu_payload = {
        'index': 0,
        'bus_id': '0000:00:1E.1',
        'memory_free': 1000,
        'memory_total': 12883853312,
        'memory_used': 8388608000,
        'memory_utilization': 0,
        'minor': 1,
        'name': 'GeForce GTX TITAN 0',
        'power_draw': 125,
        'power_limit': 250,
        'processes': gpu_processes,
        'serial': '0322917092147',
        'temperature_gpu': 80,
        'utilization_gpu': 76,
        'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
    }

    job_uuid = uuid.uuid4().hex
    job_name = 'master.0'
    payload = {
        'job_uuid': job_uuid,
        'experiment_uuid': uuid.uuid4().hex,
        'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
        'cpu_percentage': 0.6947691836734693,
        'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
        'memory_used': 84467712,
        'memory_limit': 2096160768,
        'gpu_resources': gpu_payload
    }

    RedisToStream.set_latest_job_resources(job_uuid, payload)

    # The job name is only added after storing, so it is expected to come
    # from the read call itself.
    payload['job_name'] = job_name
    stored = RedisToStream.get_latest_job_resources(job_uuid, job_name, True)
    assert payload == stored
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Stream the latest resource readings of a job over a websocket.

    The handler resolves the project, experiment and job from the route
    parameters, checks that the requesting user has ``GET`` permission on the
    project, records an audit event, and registers ``ws`` with the socket
    manager kept per job uuid on ``request.app.job_resources_ws_mangers``.
    It then polls ``RedisToStream`` in a loop, sending any non-empty payload
    to ``ws`` and sleeping ``SOCKET_SLEEP`` between iterations.

    The loop ends (and the handler returns) when the job is reported done,
    when sending raises ``ConnectionClosed``, or when the socket reports a
    lost connection.

    Args:
        request: Incoming request; ``request.app.user`` is the acting user.
        ws: Websocket the resources are pushed to.
        username: Owner name used to look up the project.
        project_name: Name used to look up the project.
        experiment_sequence: Sequence used to look up the experiment.
        job_sequence: Sequence used to look up the job.

    Raises:
        exceptions.Forbidden: If the user lacks ``GET`` permission on the
            project.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception has to be raised: only instantiating it would let the
        # handler carry on and stream resources to an unauthorized user.
        raise exceptions.Forbidden("You don't have access to this project")

    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager is shared by every websocket watching the same job.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Unregister ``ws``; stop monitoring the job once no socket is left."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done', job_name)
                # Empty the manager first so the disconnect handler also
                # stops the monitor and drops the manager from the app.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                # Back the counter off so the next status check happens
                # after a further CHECK_DELAY iterations.
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_set_latest_job_resources(self):
    """Check that stored job resources are read back with the job name added.

    The payload written through ``RedisToStream.set_latest_job_resources``
    is compared against the value returned by
    ``RedisToStream.get_latest_job_resources`` for the same job uuid.
    """
    # Two GPU processes that differ only by pid.
    gpu_processes = [
        {
            'command': 'python',
            'gpu_memory_usage': 4000,
            'pid': pid,
            'username': '******'
        }
        for pid in (48448, 153223)
    ]
    gpu_payload = {
        'index': 0,
        'bus_id': '0000:00:1E.1',
        'memory_free': 1000,
        'memory_total': 12883853312,
        'memory_used': 8388608000,
        'memory_utilization': 0,
        'minor': 1,
        'name': 'GeForce GTX TITAN 0',
        'power_draw': 125,
        'power_limit': 250,
        'processes': gpu_processes,
        'serial': '0322917092147',
        'temperature_gpu': 80,
        'utilization_gpu': 76,
        'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
    }

    job_uuid = uuid.uuid4().hex
    job_name = 'master.0'
    payload = {
        'job_uuid': job_uuid,
        'experiment_uuid': uuid.uuid4().hex,
        'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
        'cpu_percentage': 0.6947691836734693,
        'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
        'memory_used': 84467712,
        'memory_limit': 2096160768,
        'gpu_resources': gpu_payload
    }

    RedisToStream.set_latest_job_resources(job_uuid, payload)

    # The job name is only added after storing, so it is expected to come
    # from the read call itself.
    payload['job_name'] = job_name
    stored = RedisToStream.get_latest_job_resources(job_uuid, job_name, True)
    assert payload == stored