示例#1
0
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            handle_events_resources.delay(payload=payload, persist=persist)

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid)
                    or RedisToStream.is_monitored_experiment_resources(
                        experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
    def test_set_latest_job_resources(self):
        gpu_resources = {
            'index':
            0,
            'bus_id':
            '0000:00:1E.1',
            'memory_free':
            1000,
            'memory_total':
            12883853312,
            'memory_used':
            8388608000,
            'memory_utilization':
            0,
            'minor':
            1,
            'name':
            'GeForce GTX TITAN 0',
            'power_draw':
            125,
            'power_limit':
            250,
            'processes': [{
                'command': 'python',
                'gpu_memory_usage': 4000,
                'pid': 48448,
                'username': '******'
            }, {
                'command': 'python',
                'gpu_memory_usage': 4000,
                'pid': 153223,
                'username': '******'
            }],
            'serial':
            '0322917092147',
            'temperature_gpu':
            80,
            'utilization_gpu':
            76,
            'uuid':
            'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
        }

        config_dict = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id':
            '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_resources
        }

        RedisToStream.set_latest_job_resources(config_dict['job_uuid'],
                                               config_dict)
        config_dict['job_name'] = 'master.0'
        assert config_dict == RedisToStream.get_latest_job_resources(
            config_dict['job_uuid'], 'master.0', True)
示例#3
0
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    update_cluster_node(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.debug("Publishing resources event")
            celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                                 kwargs={
                                     'payload': payload,
                                     'persist': persist
                                 })

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            set_last_resources_cond = (
                RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(
                    experiment_uuid))
            if set_last_resources_cond:
                RedisToStream.set_latest_job_resources(job_uuid, payload)
示例#4
0
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {gpu_resource['index']: gpu_resource for gpu_resource in gpu_resources}
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id], gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            celery_app.send_task(
                EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                kwargs={'payload': payload, 'persist': persist})

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid) or
                    RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
    def test_set_latest_job_resources(self):
        gpu_resources = {
            'index': 0,
            'bus_id': '0000:00:1E.1',
            'memory_free': 1000,
            'memory_total': 12883853312,
            'memory_used': 8388608000,
            'memory_utilization': 0,
            'minor': 1,
            'name': 'GeForce GTX TITAN 0',
            'power_draw': 125,
            'power_limit': 250,
            'processes': [{'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 48448,
                           'username': '******'},
                          {'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 153223,
                           'username': '******'}],
            'serial': '0322917092147',
            'temperature_gpu': 80,
            'utilization_gpu': 76,
            'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
        }

        config_dict = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_resources
        }

        RedisToStream.set_latest_job_resources(config_dict['job_uuid'], config_dict)
        config_dict['job_name'] = 'master.0'
        assert config_dict == RedisToStream.get_latest_job_resources(
            config_dict['job_uuid'], 'master.0', True)