Exemplo n.º 1
0
    def test_master_success_influences_other_experiment_workers_status(self):
        with patch('scheduler.tasks.experiments.experiments_build.apply_async'
                   ) as _:  # noqa
            with patch.object(Experiment, 'set_status') as _:  # noqa
                experiment = ExperimentFactory()

        assert ExperimentLifeCycle.is_done(experiment.last_status) is False
        # Add jobs
        master = ExperimentJobFactory(experiment=experiment,
                                      role=TaskType.MASTER)
        assert JobLifeCycle.is_done(master.last_status) is False
        workers = [
            ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER)
            for _ in range(2)
        ]
        for worker in workers:
            worker.refresh_from_db()
            assert JobLifeCycle.is_done(worker.last_status) is False

        # Set master to succeeded
        ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

        # All worker should have a success status
        for worker in workers:
            worker.refresh_from_db()
            assert worker.last_status == JobLifeCycle.SUCCEEDED

        # Experiment last status should be success
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
Exemplo n.º 2
0
def build_experiment(self, experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(
            status_from=experiment.last_status,
            status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status,
                     ExperimentLifeCycle.BUILDING)
        return None

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Building the docker image
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(
            ExperimentLifeCycle.FAILED,
            message='No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.error(
            'Failed to build experiment, unexpected error occurred.\n%s',
            traceback.format_exc())
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    if not status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    # Now we can start the experiment
    celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                         kwargs={'experiment_id': experiment_id})
Exemplo n.º 3
0
 def calculated_status(self):
     master_status = self.jobs.filter(role=TaskType.MASTER)[0].last_status
     calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
     if calculated_status is None:
         calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
     if calculated_status is None:
         return self.last_status
     return calculated_status
Exemplo n.º 4
0
 def calculated_status(self):
     master_status = self.jobs.filter(role=TaskType.MASTER)[0].last_status
     calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
     if calculated_status is None:
         calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
     if calculated_status is None:
         return self.last_status
     return calculated_status
Exemplo n.º 5
0
    def post(self, request, *args, **kwargs):
        experiment = self.get_object()

        if not ExperimentLifeCycle.is_stoppable(experiment.last_status):
            return Response(status=status.HTTP_403_FORBIDDEN)

        token, _ = Token.objects.get_or_create(user=experiment.user)
        return Response({'token': token.key}, status=status.HTTP_200_OK)
Exemplo n.º 6
0
 def set_status(self, status, message=None, traceback=None, **kwargs):
     if status in ExperimentLifeCycle.HEARTBEAT_STATUS:
         RedisHeartBeat.experiment_ping(self.id)
     if ExperimentLifeCycle.can_transition(status_from=self.last_status,
                                           status_to=status):
         ExperimentStatus.objects.create(experiment=self,
                                         status=status,
                                         message=message,
                                         traceback=traceback)
Exemplo n.º 7
0
 def calculated_status(self) -> str:
     master_status = self.jobs.order_by('created_at').first().last_status
     calculated_status = master_status if JobLifeCycle.is_done(
         master_status) else None
     if calculated_status is None:
         calculated_status = ExperimentLifeCycle.jobs_status(
             self.last_job_statuses)
     if calculated_status is None:
         return self.last_status
     return calculated_status
Exemplo n.º 8
0
 def set_status(self, status, created_at=None, message=None, traceback=None, **kwargs):
     if status in ExperimentLifeCycle.HEARTBEAT_STATUS:
         RedisHeartBeat.experiment_ping(self.id)
     last_status = self.last_status_before(status_date=created_at)
     if ExperimentLifeCycle.can_transition(status_from=last_status, status_to=status):
         params = {'created_at': created_at} if created_at else {}
         ExperimentStatus.objects.create(experiment=self,
                                         status=status,
                                         message=message,
                                         traceback=traceback,
                                         **params)
Exemplo n.º 9
0
async def log_experiment(request, ws, experiment, namespace, container):
    experiment_uuid = experiment.uuid.hex
    if experiment_uuid in request.app.experiment_logs_ws_managers:
        ws_manager = request.app.experiment_logs_ws_managers[experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_logs_ws_managers[experiment_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != ExperimentLifeCycle.RUNNING and not ExperimentLifeCycle.is_done(
            status):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if ExperimentLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    log_requests = []
    for job in experiment.jobs.all():
        pod_id = job.pod_id
        log_requests.append(
            log_job_pod(k8s_api=k8s_api,
                        ws=ws,
                        ws_manager=ws_manager,
                        pod_id=pod_id,
                        container=container,
                        namespace=namespace,
                        task_type=job.role,
                        task_idx=job.sequence))
    await asyncio.wait(log_requests)
Exemplo n.º 10
0
def build_experiment(self, experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return None

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Building the docker image
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build experiment %s', e)
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Failed to build image for experiment.')
        return

    if not status:
        return

    # Now we can start the experiment
    celery_app.send_task(
        SchedulerCeleryTasks.EXPERIMENTS_START,
        kwargs={'experiment_id': experiment_id})
Exemplo n.º 11
0
def experiment_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    experiment = instance.experiment
    previous_status = experiment.last_status

    # update experiment last_status
    experiment.status = instance
    set_started_at(instance=experiment,
                   status=instance.status,
                   starting_statuses=[
                       ExperimentLifeCycle.STARTING,
                       ExperimentLifeCycle.RUNNING
                   ],
                   running_status=ExperimentLifeCycle.RUNNING)
    set_finished_at(instance=experiment,
                    status=instance.status,
                    is_done=ExperimentLifeCycle.is_done)
    experiment.save(update_fields=['status', 'started_at', 'finished_at'])
    auditor.record(event_type=EXPERIMENT_NEW_STATUS,
                   instance=experiment,
                   previous_status=previous_status)

    if instance.status == ExperimentLifeCycle.SUCCEEDED:
        # update all workers with succeeded status, since we will trigger a stop mechanism
        for job in experiment.jobs.all():
            if not job.is_done:
                job.set_status(JobLifeCycle.SUCCEEDED,
                               message='Master is done.')
        auditor.record(event_type=EXPERIMENT_SUCCEEDED,
                       instance=experiment,
                       previous_status=previous_status)
    if instance.status == ExperimentLifeCycle.FAILED:
        auditor.record(event_type=EXPERIMENT_FAILED,
                       instance=experiment,
                       previous_status=previous_status)

    if instance.status == ExperimentLifeCycle.STOPPED:
        auditor.record(event_type=EXPERIMENT_STOPPED,
                       instance=experiment,
                       previous_status=previous_status)

    if ExperimentLifeCycle.is_done(instance.status):
        auditor.record(event_type=EXPERIMENT_DONE,
                       instance=experiment,
                       previous_status=previous_status)
        # Check if it's part of an experiment group, and start following tasks
        if not experiment.is_independent:
            celery_app.send_task(
                HPCeleryTasks.HP_START,
                kwargs={'experiment_group_id': experiment.experiment_group.id},
                countdown=1)
Exemplo n.º 12
0
def experiments_start(experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.SCHEDULED):
        _logger.info('Experiment `%s` cannot transition from `%s` to `%s`.',
                     experiment.unique_name, experiment.last_status, ExperimentLifeCycle.SCHEDULED)
        return None

    experiment_scheduler.start_experiment(experiment)
Exemplo n.º 13
0
def experiment_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    experiment = instance.experiment
    previous_status = experiment.last_status

    # update experiment last_status
    experiment.status = instance
    set_started_at(instance=experiment,
                   status=instance.status,
                   starting_statuses=[
                       ExperimentLifeCycle.STARTING,
                       ExperimentLifeCycle.RUNNING
                   ],
                   running_status=ExperimentLifeCycle.RUNNING)
    set_finished_at(instance=experiment,
                    status=instance.status,
                    is_done=ExperimentLifeCycle.is_done)
    experiment.save(
        update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=EXPERIMENT_NEW_STATUS,
                   instance=experiment,
                   previous_status=previous_status)

    if instance.status == ExperimentLifeCycle.CREATED:
        auditor.record(event_type=EXPERIMENT_CREATED, instance=experiment)
    elif instance.status == ExperimentLifeCycle.SUCCEEDED:
        # update all workers with succeeded status, since we will trigger a stop mechanism
        for job in experiment.jobs.all():
            if not job.is_done:
                job.set_status(JobLifeCycle.SUCCEEDED,
                               message='Master is done.')
        auditor.record(event_type=EXPERIMENT_SUCCEEDED,
                       instance=experiment,
                       previous_status=previous_status)
    elif instance.status == ExperimentLifeCycle.FAILED:
        auditor.record(event_type=EXPERIMENT_FAILED,
                       instance=experiment,
                       previous_status=previous_status)
    elif instance.status == ExperimentLifeCycle.STOPPED:
        auditor.record(event_type=EXPERIMENT_STOPPED,
                       instance=experiment,
                       previous_status=previous_status)

    if ExperimentLifeCycle.is_done(instance.status):
        auditor.record(event_type=EXPERIMENT_DONE,
                       instance=experiment,
                       previous_status=previous_status)
Exemplo n.º 14
0
def experiments_build(experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id},
                             countdown=conf.get('GLOBAL_COUNTDOWN'))
        return

    last_status = experiment.last_status
    if not ExperimentLifeCycle.can_transition(
            status_from=last_status, status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, last_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        configmap_refs=experiment.specification.configmap_refs,
        secret_refs=experiment.specification.secret_refs,
        code_reference=experiment.code_reference)

    experiment.build_job = build_job
    experiment.save(update_fields=['build_job'])
    if image_exists:
        # The image already exists, so we can start the experiment right away
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id},
                             countdown=conf.get('GLOBAL_COUNTDOWN'))
        return

    if not build_status:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start build process.')
        return

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)
Exemplo n.º 15
0
def experiments_start(experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.SCHEDULED):
        _logger.info('Experiment `%s` cannot transition from `%s` to `%s`.',
                     experiment.unique_name, experiment.last_status, ExperimentLifeCycle.SCHEDULED)
        return None

    # Check if we need to copy an experiment
    if experiment.is_copy:
        copy_experiment(experiment)
    else:
        create_experiment_outputs_path(experiment.unique_name)

    experiment_scheduler.start_experiment(experiment)
Exemplo n.º 16
0
def experiments_build(experiment_id):
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # No need to build the image, start the experiment directly
    if not (experiment.specification.build and experiment.specification.run):
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        code_reference=experiment.code_reference)

    experiment.build_job = build_job
    experiment.save()
    if image_exists:
        # The image already exists, so we can start the experiment right away
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})
        return

    if not build_status:
        experiment.set_status(ExperimentLifeCycle.FAILED, message='Could not start build process.')
        return

    # Update experiment status to show that its building
    experiment.set_status(ExperimentLifeCycle.BUILDING)
Exemplo n.º 17
0
async def experiment_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        experiment_id):
    from streams.consumers.consumers import Consumer

    experiment, message = validate_experiment(request=request,
                                              username=username,
                                              project_name=project_name,
                                              experiment_id=experiment_id)
    if experiment is None:
        await ws.send(get_error_message(message))
        return

    experiment_uuid = experiment.uuid.hex

    auditor.record(event_type=EXPERIMENT_LOGS_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_experiment_logs(
            experiment_uuid=experiment_uuid):
        logger.info('Experiment uuid `%s` logs is now being monitored',
                    experiment_uuid)
        RedisToStream.monitor_experiment_logs(experiment_uuid=experiment_uuid)

    # start consumer
    if experiment_uuid in request.app.experiment_logs_consumers:
        consumer = request.app.experiment_logs_consumers[experiment_uuid]
    else:
        logger.info('Add experiment log consumer for %s', experiment_uuid)
        consumer = Consumer(routing_key='{}.{}.*'.format(
            RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS, experiment_uuid),
                            queue='{}.{}'.format(
                                CeleryQueues.STREAM_LOGS_SIDECARS,
                                experiment_uuid))
        request.app.experiment_logs_consumers[experiment_uuid] = consumer
        consumer.run()

    def should_disconnect():
        if not consumer.ws:
            logger.info('Stopping logs monitor for experiment uuid %s',
                        experiment_uuid)
            RedisToStream.remove_experiment_logs(
                experiment_uuid=experiment_uuid)
            # if experiment_uuid in request.app.experiment_logs_consumers:
            #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quite = False
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != ExperimentLifeCycle.RUNNING and not ExperimentLifeCycle.is_done(
            status):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify(ws_manager=consumer,
                         message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if ExperimentLifeCycle.is_done(status):
        await notify(ws_manager=consumer, message=get_status_message(status))
        RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(ws_manager=consumer, message=message)

        # After trying a couple of time, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done',
                    experiment_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for experiment uuid %s',
                        experiment_uuid)
            consumer.remove_sockets({
                ws,
            })
            should_quite = True

        if should_disconnect():
            should_quite = True

        if should_quite:
            return

        await asyncio.sleep(SOCKET_SLEEP)