예제 #1
0
    def test_job_monitoring(self):
        """Exercise the monitor/remove cycle for job resources, then job logs.

        For a freshly generated uuid each flag is expected to read False,
        become True after the matching ``monitor_*`` call, and read False
        again after the matching ``remove_*`` call.
        """
        fresh_uuid = uuid.uuid4().hex

        # Resources flag: off -> on -> off.
        before = RedisToStream.is_monitored_job_resources(fresh_uuid)
        assert before is False
        RedisToStream.monitor_job_resources(fresh_uuid)
        during = RedisToStream.is_monitored_job_resources(fresh_uuid)
        assert during is True
        RedisToStream.remove_job_resources(fresh_uuid)
        after = RedisToStream.is_monitored_job_resources(fresh_uuid)
        assert after is False

        # Logs flag: off -> on -> off.
        before = RedisToStream.is_monitored_job_logs(fresh_uuid)
        assert before is False
        RedisToStream.monitor_job_logs(fresh_uuid)
        during = RedisToStream.is_monitored_job_logs(fresh_uuid)
        assert during is True
        RedisToStream.remove_job_logs(fresh_uuid)
        after = RedisToStream.is_monitored_job_logs(fresh_uuid)
        assert after is False
예제 #2
0
    def test_job_monitoring(self):
        """Verify that job resources and job logs monitoring can be toggled.

        The test uses a random uuid and checks the reported state before
        monitoring, after monitoring, and after removal.
        """
        job_id = uuid.uuid4().hex

        def resources_monitored():
            # Current resources-monitoring state for the generated uuid.
            return RedisToStream.is_monitored_job_resources(job_id)

        def logs_monitored():
            # Current logs-monitoring state for the generated uuid.
            return RedisToStream.is_monitored_job_logs(job_id)

        assert resources_monitored() is False
        RedisToStream.monitor_job_resources(job_id)
        assert resources_monitored() is True
        RedisToStream.remove_job_resources(job_id)
        assert resources_monitored() is False

        assert logs_monitored() is False
        RedisToStream.monitor_job_logs(job_id)
        assert logs_monitored() is True
        RedisToStream.remove_job_logs(job_id)
        assert logs_monitored() is False
예제 #3
0
def run(containers, node, persist):
    """Publish a resources event for every tracked container.

    Walks the container ids returned by ``RedisJobContainers.get_containers()``,
    collects the resources payload of each one and hands it to
    ``handle_events_resources``. When the job (or its experiment) is flagged
    as monitored in ``RedisToStream``, the payload is also stored as the
    job's latest resources.

    Args:
        containers: mapping indexed by container id; passed to
            ``get_container`` and read back with ``containers[container_id]``.
        node: forwarded unchanged to ``get_container_resources``.
        persist: forwarded unchanged to the resources event task.
    """
    tracked_ids = RedisJobContainers.get_containers()
    gpus = get_gpu_resources()
    if gpus:
        # Re-key the GPU entries by their 'index' field.
        gpus = {gpu['index']: gpu for gpu in gpus}
    # update cluster and current node
    update_cluster(gpus)

    for cid in tracked_ids:
        if not get_container(containers, cid):
            continue

        resources = get_container_resources(node, containers[cid], gpus)
        if not resources:
            continue

        payload = resources.to_dict()
        logger.info("Publishing resources event")
        handle_events_resources.delay(payload=payload, persist=persist)

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        # Only keep the payload for streaming when the job itself or its
        # experiment is currently being monitored.
        should_stream = (
            RedisToStream.is_monitored_job_resources(job_uuid)
            or RedisToStream.is_monitored_experiment_resources(experiment_uuid)
        )
        if should_stream:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
예제 #4
0
def run(containers, node, persist):
    """Collect container resources and dispatch them as celery events.

    For each container id reported by ``RedisJobContainers.get_containers()``
    a resources payload is computed and sent to the
    ``EventsCeleryTasks.EVENTS_HANDLE_RESOURCES`` task. If the job or its
    experiment is monitored according to ``RedisToStream``, the payload is
    additionally saved as the job's latest resources.

    Args:
        containers: mapping indexed by container id; passed to
            ``get_container`` and read back with ``containers[container_id]``.
        node: forwarded unchanged to ``get_container_resources``.
        persist: forwarded unchanged in the task kwargs.
    """
    ids_to_check = RedisJobContainers.get_containers()
    gpu_by_index = get_gpu_resources()
    if gpu_by_index:
        # Turn the GPU entries into a dict keyed by each entry's 'index'.
        gpu_by_index = {entry['index']: entry for entry in gpu_by_index}
    update_cluster_node(gpu_by_index)

    for container_id in ids_to_check:
        if not get_container(containers, container_id):
            continue

        stats = get_container_resources(node, containers[container_id], gpu_by_index)
        if not stats:
            continue

        payload = stats.to_dict()
        logger.debug("Publishing resources event")
        task_kwargs = {'payload': payload, 'persist': persist}
        celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES, kwargs=task_kwargs)

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        # Store the payload only if the job, or failing that its experiment,
        # is flagged as monitored.
        if (RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
            RedisToStream.set_latest_job_resources(job_uuid, payload)
예제 #5
0
파일: api.py 프로젝트: qzl1221117/polyaxon
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Push the latest resources of an experiment job to a websocket.

    The job is resolved from the project/experiment/job identifiers, the view
    is recorded with the auditor, the job is flagged as monitored in
    ``RedisToStream`` and then the coroutine loops: it reads the latest
    resources, sends them on ``ws`` when there are any, and sleeps for
    ``SOCKET_SLEEP`` between iterations.

    The loop ends (and the socket is unregistered) when the job is reported
    done, when sending raises ``ConnectionClosed``, or when the socket
    reports a lost connection.

    Args:
        request: incoming request; ``request.app.user`` and
            ``request.app.job_resources_ws_mangers`` are read from it.
        ws: websocket the resources are sent to.
        username: passed to ``_get_project``.
        project_name: passed to ``_get_project``.
        experiment_sequence: passed to ``_get_validated_experiment``.
        job_sequence: passed to ``_get_job``.

    Raises:
        exceptions.Forbidden: if ``has_project_permissions`` denies 'GET'
            access on the project for the requesting user.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception has to be raised; only instantiating it would let the
        # request fall through and stream data to an unauthorized user.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        _logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager per job uuid is kept on the app object so that
    # several sockets watching the same job share it.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Unregister ``ws``; stop monitoring once no socket is left."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        _logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                _logger.info('removing all socket because the job `%s` is done', job_name)
                # Drop every registered socket so the handler below also
                # stops the monitoring for this job.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            # Job still running: push the next status check back.
            should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return
        await asyncio.sleep(SOCKET_SLEEP)
예제 #6
0
def run(containers, node, persist):
    """Send a resources event for each known container.

    Container ids come from ``RedisJobContainers.get_containers()``. For every
    container that resolves and yields a payload, the payload is sent to the
    ``EventsCeleryTasks.EVENTS_HANDLE_RESOURCES`` task and, when the job or
    its experiment is monitored in ``RedisToStream``, recorded as the job's
    latest resources.

    Args:
        containers: mapping indexed by container id; passed to
            ``get_container`` and read back with ``containers[container_id]``.
        node: forwarded unchanged to ``get_container_resources``.
        persist: forwarded unchanged in the task kwargs.
    """
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        # Index the GPU entries by their 'index' field.
        indexed = {}
        for gpu in gpu_resources:
            indexed[gpu['index']] = gpu
        gpu_resources = indexed
    # update cluster and current node
    update_cluster(gpu_resources)

    for container_id in container_ids:
        if not get_container(containers, container_id):
            continue

        container_payload = get_container_resources(
            node, containers[container_id], gpu_resources)
        if not container_payload:
            continue

        payload = container_payload.to_dict()
        logger.info("Publishing resources event")
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
            kwargs={'payload': payload, 'persist': persist},
        )

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        # Check the job first; only consult the experiment flag when the job
        # itself is not monitored.
        monitored = RedisToStream.is_monitored_job_resources(job_uuid)
        if not monitored:
            monitored = RedisToStream.is_monitored_experiment_resources(experiment_uuid)
        if monitored:
            RedisToStream.set_latest_job_resources(job_uuid, payload)
예제 #7
0
파일: api.py 프로젝트: ttsvetanov/polyaxon
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    """Push the latest resources of an experiment job to a websocket.

    The job is resolved from the project/experiment/job identifiers, the view
    is recorded with the auditor, the job is flagged as monitored in
    ``RedisToStream`` and then the coroutine loops: it reads the latest
    resources, sends them on ``ws`` when there are any, and sleeps for
    ``SOCKET_SLEEP`` between iterations.

    The loop ends (and the socket is unregistered) when the job is reported
    done, when sending raises ``ConnectionClosed``, or when the socket
    reports a lost connection.

    Args:
        request: incoming request; ``request.app.user`` and
            ``request.app.job_resources_ws_mangers`` are read from it.
        ws: websocket the resources are sent to.
        username: passed to ``_get_project``.
        project_name: passed to ``_get_project``.
        experiment_sequence: passed to ``_get_validated_experiment``.
        job_sequence: passed to ``_get_job``.

    Raises:
        exceptions.Forbidden: if ``has_project_permissions`` denies 'GET'
            access on the project for the requesting user.
    """
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        # The exception has to be raised; only instantiating it would let the
        # request fall through and stream data to an unauthorized user.
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        logger.info('Job resources with uuid `%s` is now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    # One socket manager per job uuid is kept on the app object so that
    # several sockets watching the same job share it.
    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        """Unregister ``ws``; stop monitoring once no socket is left."""
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done', job_name)
                # Drop every registered socket so the handler below also
                # stops the monitoring for this job.
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            # Job still running: push the next status check back.
            should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return
        await asyncio.sleep(SOCKET_SLEEP)