def run(containers, node, persist):
    """Sample resource usage for every tracked job container and stream it.

    For each container id registered in Redis, builds a resources payload
    (CPU/memory plus GPU stats when available) and, when the job or its
    experiment is being monitored, stores it as the job's latest resources.

    Args:
        containers: mapping of container id -> container handle, refreshed
            by `get_container` as containers come and go.
        node: the cluster node the containers run on.
        persist: forwarded to the (currently disabled) publishing task.
    """
    tracked_ids = RedisJobContainers.get_containers()

    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Re-key the GPU list by device index so per-container lookups are O(1).
        gpu_resources = {entry['index']: entry for entry in gpu_resources}
    update_cluster_node(gpu_resources)

    for container_id in tracked_ids:
        # Skip containers that have disappeared since they were registered.
        if not get_container(containers, container_id):
            continue

        try:
            payload = get_container_resources(node,
                                              containers[container_id],
                                              gpu_resources)
        except KeyError:
            # Container vanished between the check above and the lookup.
            payload = None

        if not payload:
            continue
        payload = payload.to_dict()

        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})

        job_uuid = payload['job_uuid']
        # Only stream when someone is watching this job or its experiment.
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        is_watched = (RedisToStream.is_monitored_job_resources(job_uuid) or
                      RedisToStream.is_monitored_experiment_resources(experiment_uuid))
        if is_watched:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def test_set_latest_job_resources(self):
    """Resources stored for a job round-trip through RedisToStream.

    The getter is expected to enrich the stored payload with the job name
    passed at retrieval time, so the original dict is updated with
    'job_name' before comparing.
    """
    gpu_stats = {
        'index': 0,
        'bus_id': '0000:00:1E.1',
        'memory_free': 1000,
        'memory_total': 12883853312,
        'memory_used': 8388608000,
        'memory_utilization': 0,
        'minor': 1,
        'name': 'GeForce GTX TITAN 0',
        'power_draw': 125,
        'power_limit': 250,
        'processes': [
            {'command': 'python',
             'gpu_memory_usage': 4000,
             'pid': 48448,
             'username': '******'},
            {'command': 'python',
             'gpu_memory_usage': 4000,
             'pid': 153223,
             'username': '******'},
        ],
        'serial': '0322917092147',
        'temperature_gpu': 80,
        'utilization_gpu': 76,
        'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107',
    }
    resources = {
        'job_uuid': uuid.uuid4().hex,
        'experiment_uuid': uuid.uuid4().hex,
        'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
        'cpu_percentage': 0.6947691836734693,
        'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
        'memory_used': 84467712,
        'memory_limit': 2096160768,
        'gpu_resources': gpu_stats,
    }

    RedisToStream.set_latest_job_resources(resources['job_uuid'], resources)

    # The getter injects the job name into the returned payload.
    resources['job_name'] = 'master.0'
    fetched = RedisToStream.get_latest_job_resources(
        resources['job_uuid'], 'master.0', True)
    assert fetched == resources