Example #1
    def test_set_latest_job_resources(self):
        gpu_resources = {
            'index': 0,
            'bus_id': '0000:00:1E.1',
            'memory_free': 1000,
            'memory_total': 12883853312,
            'memory_used': 8388608000,
            'memory_utilization': 0,
            'minor': 1,
            'name': 'GeForce GTX TITAN 0',
            'power_draw': 125,
            'power_limit': 250,
            'processes': [{'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 48448,
                           'username': '******'},
                          {'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 153223,
                           'username': '******'}],
            'serial': '0322917092147',
            'temperature_gpu': 80,
            'utilization_gpu': 76,
            'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
        }

        config_dict = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_resources
        }

        RedisToStream.set_latest_job_resources(config_dict['job_uuid'],
                                               config_dict)
        config_dict['job_name'] = 'master.0'
        assert config_dict == RedisToStream.get_latest_job_resources(
            config_dict['job_uuid'], 'master.0', True)
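
The test above round-trips a resources payload through Redis. Below is a minimal sketch of how such storage could work, assuming a redis-py client and JSON serialization under a per-job key; the key name and the `job_name` injection are inferred from the assertion, not taken from the actual RedisToStream implementation:

import json

import redis

r = redis.Redis()

def set_latest_job_resources(job_uuid, payload):
    # Hypothetical key layout; illustrative only.
    r.set('stream:latest_resources:{}'.format(job_uuid), json.dumps(payload))

def get_latest_job_resources(job_uuid, job_name, as_dict=False):
    data = r.get('stream:latest_resources:{}'.format(job_uuid))
    if not data:
        return None
    payload = json.loads(data)
    payload['job_name'] = job_name  # the test expects the name to be injected
    return payload if as_dict else json.dumps(payload)
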
Example #2
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            handle_events_resources.delay(payload=payload, persist=persist)

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid)
                    or RedisToStream.is_monitored_experiment_resources(
                        experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
Example #3
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {
            gpu_resource['index']: gpu_resource
            for gpu_resource in gpu_resources
        }
    update_cluster_node(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id],
                                          gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.debug("Publishing resources event")
            celery_app.send_task(EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                                 kwargs={
                                     'payload': payload,
                                     'persist': persist
                                 })

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(
                job_uuid)
            set_last_resources_cond = (
                RedisToStream.is_monitored_job_resources(job_uuid)
                or RedisToStream.is_monitored_experiment_resources(
                    experiment_uuid))
            if set_last_resources_cond:
                RedisToStream.set_latest_job_resources(job_uuid, payload)
Example #4
def run(containers, node, persist):
    container_ids = RedisJobContainers.get_containers()
    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {gpu_resource['index']: gpu_resource for gpu_resource in gpu_resources}
    # update cluster and current node
    update_cluster(gpu_resources)
    for container_id in container_ids:
        container = get_container(containers, container_id)
        if not container:
            continue
        payload = get_container_resources(node, containers[container_id], gpu_resources)
        if payload:
            payload = payload.to_dict()
            logger.info("Publishing resources event")
            celery_app.send_task(
                EventsCeleryTasks.EVENTS_HANDLE_RESOURCES,
                kwargs={'payload': payload, 'persist': persist})

            job_uuid = payload['job_uuid']
            # Check if we should stream the payload
            # Check if we have this container already in place
            experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
            if (RedisToStream.is_monitored_job_resources(job_uuid) or
                    RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
                RedisToStream.set_latest_job_resources(job_uuid, payload)
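
All three `run` variants above re-key the GPU list returned by `get_gpu_resources()` into a dict indexed by GPU number, so each container's devices can be looked up in constant time. A small self-contained illustration of that re-keying (field values abbreviated from Example #1's payload):

gpu_list = [
    {'index': 0, 'name': 'GeForce GTX TITAN 0'},
    {'index': 1, 'name': 'GeForce GTX TITAN 1'},
]
gpu_resources = {gpu['index']: gpu for gpu in gpu_list}
assert gpu_resources[1]['name'] == 'GeForce GTX TITAN 1'
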
Example #5
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        _logger.info('Resources for job `%s` are now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        _logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                _logger.info('removing all sockets because the job `%s` is done', job_name)
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return
        await asyncio.sleep(SOCKET_SLEEP)
Example #6
async def job_resources(request, ws, username, project_name, experiment_sequence, job_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex
    job_name = '{}.{}'.format(job.role, job.sequence)
    auditor.record(event_type=EXPERIMENT_JOB_RESOURCES_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_resources(job_uuid=job_uuid):
        logger.info('Resources for job `%s` are now being monitored', job_name)
        RedisToStream.monitor_job_resources(job_uuid=job_uuid)

    if job_uuid in request.app.job_resources_ws_mangers:
        ws_manager = request.app.job_resources_ws_mangers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_resources_ws_mangers[job_uuid] = ws_manager

    def handle_job_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)

    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_job_resources(job=job_uuid, job_name=job_name)
        should_check += 1

        # After trying a couple of times, we must check the status of the job
        if should_check > RESOURCES_CHECK:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all sockets because the job `%s` is done', job_name)
                ws_manager.ws = set()
                handle_job_disconnected_ws(ws)
                return
            else:
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_job_disconnected_ws(ws)
                return

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_job_disconnected_ws(ws)
            return
        await asyncio.sleep(SOCKET_SLEEP)
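
The two resource handlers above depend on a `SocketManager` whose code is not shown. Below is a minimal sketch consistent with the calls they make (`add_socket`, `remove_sockets`, and direct access to the `ws` set); this is an assumption about the interface, not the project's actual class:

class SocketManager(object):
    def __init__(self):
        self.ws = set()  # currently connected websockets

    def add_socket(self, ws):
        self.ws.add(ws)

    def remove_sockets(self, ws):
        # The handlers pass either a single socket or a set of sockets.
        if isinstance(ws, (set, frozenset)):
            self.ws -= ws
        else:
            self.ws.discard(ws)
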
Example #7
    def handle_experiment_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if len(ws_manager.ws) == 0:
            logger.info('Stopping resources monitor for uuid {}'.format(experiment_uuid))
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_mangers.pop(experiment_uuid, None)

        logger.info('Quitting resources socket for uuid {}'.format(experiment_uuid))
Example #8
    def handle_experiment_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_mangers.pop(experiment_uuid, None)

        _logger.info('Quitting resources socket for uuid %s', experiment_uuid)
Example #9
    def handle_job_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)
Example #10
    def handle_job_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for job %s', job_name)
            RedisToStream.remove_job_resources(job_uuid=job_uuid)
            request.app.job_resources_ws_mangers.pop(job_uuid, None)

        logger.info('Quitting resources socket for job %s', job_name)
Example #11
    def handle_experiment_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            logger.info('Stopping resources monitor for uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_resources(experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_mangers.pop(experiment_uuid, None)

        logger.info('Quitting resources socket for uuid %s', experiment_uuid)
Example #12
    def should_disconnect():
        if not consumer.ws:
            _logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False
Example #13
    def publish_experiment_job_log(self,
                                   log_line,
                                   status,
                                   experiment_uuid,
                                   experiment_name,
                                   job_uuid,
                                   task_type=None,
                                   task_idx=None):
        try:
            log_line = log_line.decode('utf-8')
        except AttributeError:
            pass

        self._logger.info("Publishing log event for task: %s.%s, %s",
                          task_type, task_idx, experiment_name)
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
            kwargs={
                'experiment_name': experiment_name,
                'experiment_uuid': experiment_uuid,
                'job_uuid': job_uuid,
                'log_line': log_line,
                'task_type': task_type,
                'task_idx': task_idx
            })
        try:
            should_stream = (
                RedisToStream.is_monitored_job_logs(job_uuid)
                or RedisToStream.is_monitored_experiment_logs(experiment_uuid))
        except RedisError:
            should_stream = False
        if should_stream:
            self._logger.info("Streaming new log event for experiment: %s",
                              experiment_uuid)

            with celery_app.producer_or_acquire(None) as producer:
                try:
                    producer.publish(
                        {
                            'experiment_uuid': experiment_uuid,
                            'job_uuid': job_uuid,
                            'log_line': log_line,
                            'status': status,
                            'task_type': task_type,
                            'task_idx': task_idx
                        },
                        routing_key='{}.{}.{}'.format(
                            RoutingKeys.LOGS_SIDECARS, experiment_uuid,
                            job_uuid),
                        exchange=settings.INTERNAL_EXCHANGE,
                    )
                except (TimeoutError, AMQPError):
                    pass
Example #14
    def should_disconnect():
        if not consumer.ws:
            _logger.info('Stopping logs monitor for experiment uuid %s',
                         experiment_uuid)
            RedisToStream.remove_experiment_logs(
                experiment_uuid=experiment_uuid)
            # if experiment_uuid in request.app.experiment_logs_consumers:
            #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False
Example #15
def publish_log(log_line,
                status,
                experiment_uuid,
                experiment_name,
                job_uuid,
                task_type=None,
                task_idx=None):
    try:
        log_line = log_line.decode('utf-8')
    except AttributeError:
        pass

    logger.info("Publishing log event for task: {}.{}, {}".format(
        task_type, task_idx, experiment_name))
    handle_events_job_logs.delay(experiment_name=experiment_name,
                                 experiment_uuid=experiment_uuid,
                                 job_uuid=job_uuid,
                                 log_line=log_line,
                                 task_type=task_type,
                                 task_idx=task_idx)
    try:
        should_stream = (
            RedisToStream.is_monitored_job_logs(job_uuid)
            or RedisToStream.is_monitored_experiment_logs(experiment_uuid))
    except RedisError:
        should_stream = False
    if should_stream:
        logger.info("Streaming new log event for experiment: {}".format(
            experiment_uuid))

        with celery_app.producer_or_acquire(None) as producer:
            try:
                producer.publish(
                    {
                        'experiment_uuid': experiment_uuid,
                        'job_uuid': job_uuid,
                        'log_line': log_line,
                        'status': status,
                        'task_type': task_type,
                        'task_idx': task_idx
                    },
                    routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                                  experiment_uuid, job_uuid),
                    exchange=settings.INTERNAL_EXCHANGE,
                )
            except (TimeoutError, AMQPError):
                pass
Example #16
    def publish_experiment_job_log(self,
                                   log_lines,
                                   status,
                                   experiment_uuid,
                                   experiment_name,
                                   job_uuid,
                                   task_type=None,
                                   task_idx=None):

        self._logger.debug("Publishing log event for task: %s.%s, %s",
                           task_type, task_idx, experiment_name)
        celery_app.send_task(
            EventsCeleryTasks.EVENTS_HANDLE_LOGS_EXPERIMENT_JOB,
            kwargs={
                'experiment_name': experiment_name,
                'experiment_uuid': experiment_uuid,
                'job_uuid': job_uuid,
                'log_lines': log_lines,
                'task_type': task_type,
                'task_idx': task_idx})
        try:
            should_stream = (RedisToStream.is_monitored_job_logs(job_uuid) or
                             RedisToStream.is_monitored_experiment_logs(experiment_uuid))
        except RedisError:
            should_stream = False
        if should_stream:
            self._logger.info("Streaming new log event for experiment: %s", experiment_uuid)

            with celery_app.producer_or_acquire(None) as producer:
                try:
                    producer.publish(
                        {
                            'experiment_uuid': experiment_uuid,
                            'job_uuid': job_uuid,
                            'log_lines': log_lines,
                            'status': status,
                            'task_type': task_type,
                            'task_idx': task_idx
                        },
                        routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                                      experiment_uuid,
                                                      job_uuid),
                        exchange=settings.INTERNAL_EXCHANGE,
                    )
                except (TimeoutError, AMQPError):
                    pass
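
The publishers above send each log payload to a topic exchange with a routing key of the form `<RoutingKeys.LOGS_SIDECARS>.<experiment_uuid>.<job_uuid>`; the streaming endpoints later consume them through a `Consumer` wrapper. Below is a standalone sketch of the subscribing side with plain kombu, assuming a RabbitMQ broker; the broker URL, exchange name, queue name, and key prefix are illustrative:

import socket

from kombu import Connection, Exchange, Queue

def on_log_message(body, message):
    # Payloads carry either 'log_line' (Examples #13/#15) or 'log_lines' (#16).
    print(body.get('log_line') or body.get('log_lines'))
    message.ack()

exchange = Exchange('internal', type='topic')
# '#' matches the experiment uuid and job uuid segments of the routing key.
queue = Queue('stream.logs.sidecars', exchange=exchange,
              routing_key='logs_sidecars.#')

with Connection('amqp://guest:guest@localhost//') as conn:
    with conn.Consumer(queue, callbacks=[on_log_message]):
        try:
            conn.drain_events(timeout=5)
        except socket.timeout:
            pass  # no messages arrived within the window
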
Example #17
    def test_set_latest_job_resources(self):
        gpu_resources = {
            'index': 0,
            'bus_id': '0000:00:1E.1',
            'memory_free': 1000,
            'memory_total': 12883853312,
            'memory_used': 8388608000,
            'memory_utilization': 0,
            'minor': 1,
            'name': 'GeForce GTX TITAN 0',
            'power_draw': 125,
            'power_limit': 250,
            'processes': [{'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 48448,
                           'username': '******'},
                          {'command': 'python',
                           'gpu_memory_usage': 4000,
                           'pid': 153223,
                           'username': '******'}],
            'serial': '0322917092147',
            'temperature_gpu': 80,
            'utilization_gpu': 76,
            'uuid': 'GPU-10fb0fbd-2696-43f3-467f-d280d906a107'
        }

        config_dict = {
            'job_uuid': uuid.uuid4().hex,
            'experiment_uuid': uuid.uuid4().hex,
            'container_id': '3175e88873af9077688cee20eaadc0c07746efb84d01ae696d6d17ed9bcdfbc4',
            'cpu_percentage': 0.6947691836734693,
            'percpu_percentage': [0.4564075715616173, 0.23836161211185192],
            'memory_used': 84467712,
            'memory_limit': 2096160768,
            'gpu_resources': gpu_resources
        }

        RedisToStream.set_latest_job_resources(config_dict['job_uuid'], config_dict)
        config_dict['job_name'] = 'master.0'
        assert config_dict == RedisToStream.get_latest_job_resources(
            config_dict['job_uuid'], 'master.0', True)
Example #18
    def test_monitor_experiment_logs(self):
        experiment_uuid = uuid.uuid4().hex
        RedisToStream.monitor_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(
            experiment_uuid) is True
        RedisToStream.remove_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(
            experiment_uuid) is False
Example #19
    def _stream_job_log(self, job_uuid, log_lines, routing_key):
        try:
            should_stream = RedisToStream.is_monitored_job_logs(job_uuid)
        except RedisError:
            should_stream = False
        if should_stream:
            self._logger.info("Streaming new log event for job: %s", job_uuid)

            with celery_app.producer_or_acquire(None) as producer:
                try:
                    producer.publish(
                        {
                            'job_uuid': job_uuid,
                            'log_lines': log_lines,
                        },
                        routing_key='{}.{}'.format(routing_key, job_uuid),
                        exchange=settings.INTERNAL_EXCHANGE,
                    )
                except (TimeoutError, AMQPError):
                    pass
Example #20
    def test_experiment_monitoring(self):
        experiment_uuid = uuid.uuid4().hex
        assert RedisToStream.is_monitored_experiment_resources(
            experiment_uuid) is False
        RedisToStream.monitor_experiment_resources(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_resources(
            experiment_uuid) is True
        RedisToStream.remove_experiment_resources(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_resources(
            experiment_uuid) is False

        assert RedisToStream.is_monitored_experiment_logs(
            experiment_uuid) is False
        RedisToStream.monitor_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(
            experiment_uuid) is True
        RedisToStream.remove_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(
            experiment_uuid) is False
Example #21
    def test_experiment_monitoring(self):
        experiment_uuid = uuid.uuid4().hex
        assert RedisToStream.is_monitored_experiment_resources(experiment_uuid) is False
        RedisToStream.monitor_experiment_resources(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_resources(experiment_uuid) is True
        RedisToStream.remove_experiment_resources(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_resources(experiment_uuid) is False

        assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is False
        RedisToStream.monitor_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is True
        RedisToStream.remove_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is False
Example #22
    def test_job_monitoring(self):
        job_uuid = uuid.uuid4().hex
        assert RedisToStream.is_monitored_job_resources(job_uuid) is False
        RedisToStream.monitor_job_resources(job_uuid)
        assert RedisToStream.is_monitored_job_resources(job_uuid) is True
        RedisToStream.remove_job_resources(job_uuid)
        assert RedisToStream.is_monitored_job_resources(job_uuid) is False

        assert RedisToStream.is_monitored_job_logs(job_uuid) is False
        RedisToStream.monitor_job_logs(job_uuid)
        assert RedisToStream.is_monitored_job_logs(job_uuid) is True
        RedisToStream.remove_job_logs(job_uuid)
        assert RedisToStream.is_monitored_job_logs(job_uuid) is False
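
The monitoring tests above flip a per-uuid flag on and off. One plausible backing for such flags is Redis set membership; the sketch below is an assumption about the mechanism (key name and client wiring are illustrative), not the actual RedisToStream internals:

import redis

r = redis.Redis()
MONITORED_JOB_RESOURCES = 'monitored:job_resources'  # hypothetical key

def monitor_job_resources(job_uuid):
    r.sadd(MONITORED_JOB_RESOURCES, job_uuid)

def is_monitored_job_resources(job_uuid):
    return bool(r.sismember(MONITORED_JOB_RESOURCES, job_uuid))

def remove_job_resources(job_uuid):
    r.srem(MONITORED_JOB_RESOURCES, job_uuid)
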
Example #23
    def test_monitor_experiment_logs(self):
        experiment_uuid = uuid.uuid4().hex
        RedisToStream.monitor_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is True
        RedisToStream.remove_experiment_logs(experiment_uuid)
        assert RedisToStream.is_monitored_experiment_logs(experiment_uuid) is False
Example #24
async def experiment_logs(request, ws, username, project_name, experiment_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_LOGS_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_experiment_logs(experiment_uuid=experiment_uuid):
        logger.info('Experiment uuid `%s` logs are now being monitored', experiment_uuid)
        RedisToStream.monitor_experiment_logs(experiment_uuid=experiment_uuid)

    # start consumer
    if experiment_uuid in request.app.experiment_logs_consumers:
        consumer = request.app.experiment_logs_consumers[experiment_uuid]
    else:
        logger.info('Add experiment log consumer for %s', experiment_uuid)
        consumer = Consumer(
            routing_key='{}.{}.*'.format(RoutingKeys.LOGS_SIDECARS, experiment_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, experiment_uuid))
        request.app.experiment_logs_consumers[experiment_uuid] = consumer
        consumer.run()

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After trying a couple of times, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all sockets because the experiment `%s` is done', experiment_uuid)
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for experiment uuid %s', experiment_uuid)
            consumer.remove_sockets({ws})
            should_quit = True

        if not consumer.ws:
            logger.info('Stopping logs monitor for experiment uuid %s', experiment_uuid)
            RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
            # if experiment_uuid in request.app.experiment_logs_consumers:
            #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
            #     if consumer:
            #         consumer.stop()
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
Example #25
async def job_logs(request, ws, username, project_name, experiment_sequence,
                   job_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    job = _get_job(experiment, job_sequence)
    job_uuid = job.uuid.hex

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info(
            'Job uuid `{}` logs are now being monitored'.format(job_uuid))
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for {}'.format(job_uuid))
        consumer = Consumer(
            routing_key='{}.{}.{}'.format(RoutingKeys.LOGS_SIDECARS,
                                          experiment.uuid.hex, job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After trying a couple of times, we must check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info(
                    'removing all sockets because the job `{}` is done'.format(job_uuid))
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if the connection was closed
        if ws._connection_lost:
            logger.info(
                'Quitting logs socket for job uuid {}'.format(job_uuid))
            consumer.remove_sockets({ws})
            should_quit = True

        if len(consumer.ws) == 0:
            logger.info(
                'Stopping logs monitor for job uuid {}'.format(job_uuid))
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
Example #26
async def experiment_resources(request, ws, username, project_name,
                               experiment_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    experiment_uuid = experiment.uuid.hex

    if not RedisToStream.is_monitored_experiment_resources(
            experiment_uuid=experiment_uuid):
        logger.info(
            'Experiment resources with uuid `{}` are now being monitored'.format(
                experiment_uuid))
        RedisToStream.monitor_experiment_resources(
            experiment_uuid=experiment_uuid)

    if experiment_uuid in request.app.experiment_resources_ws_mangers:
        ws_manager = request.app.experiment_resources_ws_mangers[
            experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_resources_ws_mangers[
            experiment_uuid] = ws_manager

    def handle_experiment_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if len(ws_manager.ws) == 0:
            logger.info('Stopping resources monitor for uuid {}'.format(
                experiment_uuid))
            RedisToStream.remove_experiment_resources(
                experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_mangers.pop(
                experiment_uuid, None)

        logger.info(
            'Quitting resources socket for uuid {}'.format(experiment_uuid))

    jobs = []
    for job in experiment.jobs.values('uuid', 'role', 'sequence'):
        job['uuid'] = job['uuid'].hex
        job['name'] = '{}.{}'.format(job.pop('role'), job.pop('sequence'))
        jobs.append(job)
    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_experiment_resources(jobs)
        should_check += 1

        # After trying a couple of times, we must check the status of the experiment
        if should_check > RESOURCES_CHECK:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all sockets because the experiment `{}` is done'.format(
                        experiment_uuid))
                ws_manager.ws = set()
                handle_experiment_disconnected_ws(ws)
                return
            else:
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_experiment_disconnected_ws(ws)
                return

        # Just to check if the connection was closed
        if ws._connection_lost:
            handle_experiment_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
Example #27
    def test_monitor_job_resources(self):
        job_uuid = uuid.uuid4().hex
        RedisToStream.monitor_job_resources(job_uuid)
        assert RedisToStream.is_monitored_job_resources(job_uuid) is True
        RedisToStream.remove_job_resources(job_uuid)
        assert RedisToStream.is_monitored_job_resources(job_uuid) is False
Example #28
async def job_logs(request, ws, username, project_name, job_id):
    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return
    job_uuid = job.uuid.hex
    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        _logger.info('Job uuid `%s` logs are now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        _logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After trying a couple of times, we must check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                _logger.info(
                    'removing all sockets because the job `%s` is done',
                    job_uuid)
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            _logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws})
            should_quit = True

        if not consumer.ws:
            _logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
Example #29
async def experiment_logs(request, ws, username, project_name,
                          experiment_sequence):
    project = _get_project(username, project_name)
    if not has_project_permissions(request.app.user, project, 'GET'):
        raise exceptions.Forbidden("You don't have access to this project")
    experiment = _get_validated_experiment(project, experiment_sequence)
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_LOGS_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id)

    if not RedisToStream.is_monitored_experiment_logs(
            experiment_uuid=experiment_uuid):
        logger.info('Experiment uuid `%s` logs are now being monitored',
                    experiment_uuid)
        RedisToStream.monitor_experiment_logs(experiment_uuid=experiment_uuid)

    # start consumer
    if experiment_uuid in request.app.experiment_logs_consumers:
        consumer = request.app.experiment_logs_consumers[experiment_uuid]
    else:
        logger.info('Add experiment log consumer for %s', experiment_uuid)
        consumer = Consumer(
            routing_key='{}.{}.*'.format(RoutingKeys.LOGS_SIDECARS,
                                         experiment_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS,
                                 experiment_uuid))
        request.app.experiment_logs_consumers[experiment_uuid] = consumer
        consumer.run()

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            disconnected_ws = set()
            for _ws in consumer.ws:
                try:
                    await _ws.send(message)
                except ConnectionClosed:
                    disconnected_ws.add(_ws)
            consumer.remove_sockets(disconnected_ws)

        # After trying a couple of times, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all sockets because the experiment `%s` is done',
                    experiment_uuid)
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for experiment uuid %s',
                        experiment_uuid)
            consumer.remove_sockets({ws})
            should_quit = True

        if not consumer.ws:
            logger.info('Stopping logs monitor for experiment uuid %s',
                        experiment_uuid)
            RedisToStream.remove_experiment_logs(
                experiment_uuid=experiment_uuid)
            # if experiment_uuid in request.app.experiment_logs_consumers:
            #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
            #     if consumer:
            #         consumer.stop()
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
Example #30
    def test_monitor_job_logs(self):
        job_uuid = uuid.uuid4().hex
        RedisToStream.monitor_job_logs(job_uuid)
        assert RedisToStream.is_monitored_job_logs(job_uuid) is True
        RedisToStream.remove_job_logs(job_uuid)
        assert RedisToStream.is_monitored_job_logs(job_uuid) is False
Example #31
async def build_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        build_id):
    job, message = validate_build(request=request,
                                  username=username,
                                  project_name=project_name,
                                  build_id=build_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=BUILD_JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        _logger.info('Job uuid `%s` logs are now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        _logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.LOGS_SIDECARS_BUILDS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        if not consumer.ws:
            _logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After trying a couple of times, we must check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                _logger.info(
                    'removing all sockets because the job `%s` is done',
                    job_uuid)
                consumer.ws = set()
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            _logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws})
            should_quit = True

        if should_disconnect():
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)
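
Example #31 delegates fan-out to a `notify` coroutine that the other listings inline by hand. Below is a minimal sketch that mirrors the inline loop from Examples #24, #25, and #28; the real helper is not shown in these excerpts:

from websockets.exceptions import ConnectionClosed  # assumed from the handlers above

async def notify(consumer, message):
    # Send the message to every connected socket and prune the
    # ones whose connection has closed.
    disconnected_ws = set()
    for _ws in consumer.ws:
        try:
            await _ws.send(message)
        except ConnectionClosed:
            disconnected_ws.add(_ws)
    consumer.remove_sockets(disconnected_ws)
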
Example #32
async def experiment_resources(request, ws, username, project_name,
                               experiment_id):
    experiment, message = validate_experiment(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id)
    if experiment is None:
        await ws.send(get_error_message(message))
        return
    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_RESOURCES_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_experiment_resources(
            experiment_uuid=experiment_uuid):
        _logger.info(
            'Experiment resources with uuid `%s` are now being monitored',
            experiment_uuid)
        RedisToStream.monitor_experiment_resources(
            experiment_uuid=experiment_uuid)

    if experiment_uuid in request.app.experiment_resources_ws_mangers:
        ws_manager = request.app.experiment_resources_ws_mangers[
            experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_resources_ws_mangers[
            experiment_uuid] = ws_manager

    def handle_experiment_disconnected_ws(ws):
        ws_manager.remove_sockets(ws)
        if not ws_manager.ws:
            _logger.info('Stopping resources monitor for uuid %s',
                         experiment_uuid)
            RedisToStream.remove_experiment_resources(
                experiment_uuid=experiment_uuid)
            request.app.experiment_resources_ws_mangers.pop(
                experiment_uuid, None)

        _logger.info('Quitting resources socket for uuid %s', experiment_uuid)

    jobs = []
    for job in experiment.jobs.values('uuid', 'role', 'id'):
        job['uuid'] = job['uuid'].hex
        job['name'] = '{}.{}'.format(job.pop('role'), job.pop('id'))
        jobs.append(job)
    ws_manager.add_socket(ws)
    should_check = 0
    while True:
        resources = RedisToStream.get_latest_experiment_resources(jobs)
        should_check += 1

        # After trying a couple of times, we must check the status of the experiment
        if should_check > RESOURCES_CHECK:
            experiment.refresh_from_db()
            if experiment.is_done:
                _logger.info(
                    'removing all sockets because the experiment `%s` is done',
                    experiment_uuid)
                ws_manager.ws = set()
                handle_experiment_disconnected_ws(ws)
                return
            else:
                should_check -= CHECK_DELAY

        if resources:
            try:
                await ws.send(resources)
            except ConnectionClosed:
                handle_experiment_disconnected_ws(ws)
                return

        # Just to check if the connection was closed
        if ws._connection_lost:  # pylint:disable=protected-access
            handle_experiment_disconnected_ws(ws)
            return

        await asyncio.sleep(SOCKET_SLEEP)
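
A closing note on the polling cadence used throughout these handlers: `should_check` (or `num_message_retries`) grows by one per loop pass, and each database check subtracts `CHECK_DELAY`, so the status is first checked after `RESOURCES_CHECK` passes and then roughly every `CHECK_DELAY` passes. A tiny simulation with illustrative constants:

RESOURCES_CHECK = 5  # illustrative value
CHECK_DELAY = 3      # illustrative value

should_check = 0
check_iterations = []
for iteration in range(1, 16):
    should_check += 1
    if should_check > RESOURCES_CHECK:
        check_iterations.append(iteration)  # refresh_from_db() would run here
        should_check -= CHECK_DELAY

print(check_iterations)  # [6, 9, 12, 15]: first check after 6 passes, then every 3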