def test_master_success_influences_other_experiment_workers_status(self):
    """A succeeded master job marks the workers and the experiment as succeeded."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        # with patch.object(Experiment, 'set_status') as _:  # noqa
        experiment = ExperimentFactory()

    assert ExperimentLifeCycle.is_done(experiment.last_status) is False

    # Attach one master job and two worker jobs; none of them is finished yet.
    master_job = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
    assert JobLifeCycle.is_done(master_job.last_status) is False

    worker_jobs = []
    for _ in range(2):
        worker_jobs.append(
            ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER))
    for worker_job in worker_jobs:
        worker_job.refresh_from_db()
        assert JobLifeCycle.is_done(worker_job.last_status) is False

    # Record a succeeded status on the master job.
    ExperimentJobStatusFactory(job=master_job, status=JobLifeCycle.SUCCEEDED)

    # Every worker must now report success ...
    for worker_job in worker_jobs:
        worker_job.refresh_from_db()
        assert worker_job.last_status == JobLifeCycle.SUCCEEDED

    # ... and so must the experiment itself.
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
def post(self, request, *args, **kwargs):
    """Return the auth token of the experiment's user.

    Responds with HTTP 403 when `ExperimentLifeCycle.is_stoppable` rejects the
    experiment's last status; otherwise responds with HTTP 200 and a payload
    of the form `{'token': <key>}`, creating the token if none exists yet.
    """
    experiment = self.get_object()
    if ExperimentLifeCycle.is_stoppable(experiment.last_status):
        user_token = Token.objects.get_or_create(user=experiment.user)[0]
        return Response({'token': user_token.key}, status=status.HTTP_200_OK)
    return Response(status=status.HTTP_403_FORBIDDEN)
def calculated_status(self) -> str:
    """Derive the experiment status from the statuses of its jobs.

    Resolution order:

    1. The last status of the earliest-created job (treated as the master),
       when `JobLifeCycle.is_done` reports it as done.
    2. Otherwise, whatever `ExperimentLifeCycle.jobs_status` computes from
       `self.last_job_statuses`.
    3. Otherwise, the experiment's own `last_status`.

    An experiment without any job no longer raises `AttributeError`: step 1
    is skipped and resolution continues with steps 2 and 3.
    """
    # `first()` yields None on an empty queryset, so guard before dereferencing.
    master = self.jobs.order_by('created_at').first()
    master_status = master.last_status if master is not None else None

    calculated_status = (
        master_status
        if master is not None and JobLifeCycle.is_done(master_status)
        else None)
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status
async def log_experiment(request, ws, experiment, namespace, container):
    """Stream an experiment's status changes, then its pod logs, to a websocket.

    The websocket is registered on the per-experiment `SocketManager` kept in
    `request.app.experiment_logs_ws_managers` (created on first use). Status
    messages are pushed until the experiment is RUNNING or done. For a done
    experiment the final status is sent and the coroutine returns; otherwise
    one `log_job_pod` request per experiment job is started and awaited.

    Args:
        request: incoming request; `request.app` holds the socket managers.
        ws: the websocket to notify.
        experiment: experiment whose status and jobs are followed.
        namespace: forwarded to `log_job_pod`.
        container: forwarded to `log_job_pod`.
    """
    experiment_uuid = experiment.uuid.hex
    if experiment_uuid in request.app.experiment_logs_ws_managers:
        ws_manager = request.app.experiment_logs_ws_managers[experiment_uuid]
    else:
        ws_manager = SocketManager()
        request.app.experiment_logs_ws_managers[experiment_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != ExperimentLifeCycle.RUNNING and not ExperimentLifeCycle.is_done(status):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
            if should_disconnect(ws=ws, ws_manager=ws_manager):
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if ExperimentLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    # Wrap every request in a future: `asyncio.wait` rejects bare coroutines
    # on Python 3.11+ (assumes `log_job_pod` returns an awaitable).
    log_requests = [
        asyncio.ensure_future(
            log_job_pod(k8s_api=k8s_api,
                        ws=ws,
                        ws_manager=ws_manager,
                        pod_id=job.pod_id,
                        container=container,
                        namespace=namespace,
                        task_type=job.role,
                        task_idx=job.sequence))
        for job in experiment.jobs.all()
    ]
    # `asyncio.wait` raises ValueError on an empty collection, so an
    # experiment without jobs simply has nothing to stream.
    if not log_requests:
        return
    await asyncio.wait(log_requests)
def experiment_status_post_save(sender, **kwargs):
    """React to a saved experiment status record (`kwargs['instance']`).

    Presumably wired as a post-save signal receiver; `sender` is unused.
    The handler:

    * points the experiment at the new status record, updates its
      started/finished timestamps and saves those fields;
    * records the generic new-status audit event plus the event matching the
      concrete status (created / succeeded / failed / stopped / done);
    * on success, marks every job that is not done yet as succeeded;
    * forwards the status to `new_operation_run_status`.
    """
    instance = kwargs['instance']
    experiment = instance.experiment
    new_status = instance.status
    # Must be captured before the experiment is pointed at the new record.
    previous_status = experiment.last_status

    # update experiment last_status
    experiment.status = instance
    set_started_at(
        instance=experiment,
        status=new_status,
        starting_statuses=[ExperimentLifeCycle.STARTING, ExperimentLifeCycle.RUNNING],
        running_status=ExperimentLifeCycle.RUNNING)
    set_finished_at(
        instance=experiment,
        status=new_status,
        is_done=ExperimentLifeCycle.is_done)
    experiment.save(
        update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(
        event_type=EXPERIMENT_NEW_STATUS,
        instance=experiment,
        previous_status=previous_status)

    # Event that is recorded together with the previous status, if any.
    outcome_event = None
    if new_status == ExperimentLifeCycle.CREATED:
        auditor.record(event_type=EXPERIMENT_CREATED, instance=experiment)
    elif new_status == ExperimentLifeCycle.SUCCEEDED:
        # update all workers with succeeded status, since we will trigger a stop mechanism
        for job in experiment.jobs.all():
            if job.is_done:
                continue
            job.set_status(JobLifeCycle.SUCCEEDED, message='Master is done.')
        outcome_event = EXPERIMENT_SUCCEEDED
    elif new_status == ExperimentLifeCycle.FAILED:
        outcome_event = EXPERIMENT_FAILED
    elif new_status == ExperimentLifeCycle.STOPPED:
        outcome_event = EXPERIMENT_STOPPED

    if outcome_event is not None:
        auditor.record(
            event_type=outcome_event,
            instance=experiment,
            previous_status=previous_status)

    if ExperimentLifeCycle.is_done(new_status):
        auditor.record(
            event_type=EXPERIMENT_DONE,
            instance=experiment,
            previous_status=previous_status)

    new_operation_run_status(
        entity_type=content_types.EXPERIMENT,
        entity=experiment,
        status=new_status)
def experiments_start(experiment_id):
    """Hand the experiment to the scheduler if it may move to SCHEDULED.

    Logs and returns without doing anything when `get_valid_experiment` finds
    no experiment for `experiment_id`, or when the experiment's last status
    cannot transition to `ExperimentLifeCycle.SCHEDULED`.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    target_status = ExperimentLifeCycle.SCHEDULED
    transition_allowed = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=target_status)
    if transition_allowed:
        experiment_scheduler.start_experiment(experiment)
        return

    _logger.info('Experiment `%s` cannot transition from `%s` to `%s`.',
                 experiment.unique_name, experiment.last_status, target_status)
    return None
def set_status(self, status: str, created_at: AwareDT = None, message: str = None, traceback: Dict = None, **kwargs):
    """Create a new `ExperimentStatus` record when the transition is allowed.

    A status listed in `ExperimentLifeCycle.HEARTBEAT_STATUS` first pings the
    experiment heartbeat. The status in effect before `created_at` is then
    checked against `status`; if the transition is not allowed nothing is
    created.

    Args:
        status: the status to record.
        created_at: optional timestamp stored on the record; when falsy the
            record is created without an explicit `created_at`.
        message: optional message stored on the record.
        traceback: optional traceback stored on the record.
        **kwargs: accepted but not used.
    """
    if status in ExperimentLifeCycle.HEARTBEAT_STATUS:
        RedisHeartBeat.experiment_ping(self.id)

    status_before = self.last_status_before(status_date=created_at)
    if not ExperimentLifeCycle.can_transition(status_from=status_before, status_to=status):
        return

    # Only pass `created_at` through when the caller actually supplied one.
    extra_fields = {}
    if created_at:
        extra_fields['created_at'] = created_at
    ExperimentStatus.objects.create(
        experiment=self,
        status=status,
        message=message,
        traceback=traceback,
        **extra_fields)
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it when no build is needed.

    Flow:

    * no valid experiment: return;
    * specification lacks `build` or `run`: enqueue the start task and return;
    * experiment cannot transition to BUILDING: log and return;
    * otherwise create the build job and attach it to the experiment, then
      - image already exists: enqueue the start task,
      - build could not be started: mark the experiment FAILED,
      - else: mark the experiment BUILDING.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    def schedule_start():
        # Enqueue the start task after the configured global countdown.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))

    # No need to build the image, start the experiment directly
    needs_build = experiment.specification.build and experiment.specification.run
    if not needs_build:
        schedule_start()
        return

    last_status = experiment.last_status
    can_build = ExperimentLifeCycle.can_transition(
        status_from=last_status,
        status_to=ExperimentLifeCycle.BUILDING)
    if not can_build:
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, last_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        configmap_refs=experiment.specification.configmap_refs,
        secret_refs=experiment.specification.secret_refs,
        code_reference=experiment.code_reference)
    experiment.build_job = build_job
    experiment.save(update_fields=['build_job'])

    if image_exists:
        # The image already exists, so we can start the experiment right away
        schedule_start()
        return

    if build_status:
        # Update experiment status to show that its building
        experiment.set_status(ExperimentLifeCycle.BUILDING)
    else:
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Could not start build process.')
async def experiment_logs(request,  # pylint:disable=too-many-branches
                          ws,
                          username,
                          project_name,
                          experiment_id):
    """Relay an experiment's status changes and sidecar log messages to `ws`.

    After validating the experiment (an error message is sent on failure),
    the handler records a logs-viewed audit event, makes sure the experiment
    logs are marked as monitored, and attaches `ws` to the per-experiment
    consumer stored in `request.app.experiment_logs_consumers` (created and
    started on first use). It then:

    1. pushes status messages until the experiment is RUNNING or done; a done
       experiment gets its final status and monitoring is removed;
    2. otherwise forwards consumer messages in a polling loop until the
       websocket connection is lost or the consumer has no sockets left.
    """
    from streams.consumers.consumers import Consumer

    experiment, error = validate_experiment(request=request,
                                            username=username,
                                            project_name=project_name,
                                            experiment_id=experiment_id)
    if experiment is None:
        await ws.send(get_error_message(error))
        return

    experiment_uuid = experiment.uuid.hex
    auditor.record(event_type=EXPERIMENT_LOGS_VIEWED,
                   instance=experiment,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    already_monitored = RedisToStream.is_monitored_experiment_logs(
        experiment_uuid=experiment_uuid)
    if not already_monitored:
        logger.info('Experiment uuid `%s` logs is now being monitored', experiment_uuid)
        RedisToStream.monitor_experiment_logs(experiment_uuid=experiment_uuid)

    # start consumer
    if experiment_uuid not in request.app.experiment_logs_consumers:
        logger.info('Add experiment log consumer for %s', experiment_uuid)
        routing_key = '{}.{}.*'.format(
            RoutingKeys.STREAM_LOGS_SIDECARS_EXPERIMENTS, experiment_uuid)
        queue = '{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, experiment_uuid)
        consumer = Consumer(routing_key=routing_key, queue=queue)
        request.app.experiment_logs_consumers[experiment_uuid] = consumer
        consumer.run()
    else:
        consumer = request.app.experiment_logs_consumers[experiment_uuid]

    def should_disconnect():
        # True once the consumer has no sockets left; monitoring is removed then.
        if consumer.ws:
            return False
        logger.info('Stopping logs monitor for experiment uuid %s', experiment_uuid)
        RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
        # if experiment_uuid in request.app.experiment_logs_consumers:
        #     consumer = request.app.experiment_logs_consumers.pop(experiment_uuid, None)
        #     if consumer:
        #         consumer.stop()
        return True

    # add socket manager
    consumer.add_socket(ws)

    # Stream phase changes
    status = None
    while (status != ExperimentLifeCycle.RUNNING and
           not ExperimentLifeCycle.is_done(status)):
        experiment.refresh_from_db()
        if status != experiment.last_status:
            status = experiment.last_status
            await notify(ws_manager=consumer, message=get_status_message(status))
            if should_disconnect():
                return
        await asyncio.sleep(SOCKET_SLEEP)

    if ExperimentLifeCycle.is_done(status):
        await notify(ws_manager=consumer, message=get_status_message(status))
        RedisToStream.remove_experiment_logs(experiment_uuid=experiment_uuid)
        return

    # Number of consecutive polls that produced no message.
    idle_polls = 0
    while True:
        idle_polls += 1
        for log_message in consumer.get_messages():
            idle_polls = 0
            await notify(ws_manager=consumer, message=log_message)

        # After trying a couple of time, we must check the status of the experiment
        if idle_polls > MAX_RETRIES:
            experiment.refresh_from_db()
            if experiment.is_done:
                logger.info(
                    'removing all socket because the experiment `%s` is done',
                    experiment_uuid)
                consumer.ws = set()
            else:
                idle_polls -= CHECK_DELAY

        # Just to check if connection closed
        connection_lost = ws._connection_lost  # pylint:disable=protected-access
        if connection_lost:
            logger.info('Quitting logs socket for experiment uuid %s', experiment_uuid)
            consumer.remove_sockets({ws})

        # Always evaluated: it also removes monitoring when no socket is left.
        no_sockets_left = should_disconnect()
        if connection_lost or no_sockets_left:
            return

        await asyncio.sleep(SOCKET_SLEEP)
def test_experiment_statuses_transition(self):
    """Check `ExperimentLifeCycle.can_transition` for every (from, to) pair.

    Each entry below pairs a target status with the exact set of source
    statuses that may transition to it; every other source status must be
    rejected.
    """
    every_status = ExperimentLifeCycle.VALUES
    unfinished = every_status - ExperimentLifeCycle.DONE_STATUS

    allowed_sources = [
        # Cannot transition to `CREATED`
        (ExperimentLifeCycle.CREATED, set()),
        (ExperimentLifeCycle.RESUMING, {
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.WARNING,
            ExperimentLifeCycle.SUCCEEDED,
            ExperimentLifeCycle.SKIPPED,
            ExperimentLifeCycle.STOPPED,
        }),
        (ExperimentLifeCycle.BUILDING, {
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.RESUMING,
            ExperimentLifeCycle.WARNING,
            ExperimentLifeCycle.UNKNOWN,
        }),
        (ExperimentLifeCycle.SCHEDULED, {
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.RESUMING,
            ExperimentLifeCycle.BUILDING,
            ExperimentLifeCycle.WARNING,
            ExperimentLifeCycle.UNKNOWN,
        }),
        (ExperimentLifeCycle.STARTING, {
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.RESUMING,
            ExperimentLifeCycle.BUILDING,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.WARNING,
        }),
        (ExperimentLifeCycle.RUNNING, {
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.RESUMING,
            ExperimentLifeCycle.BUILDING,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.STARTING,
            ExperimentLifeCycle.UNKNOWN,
            ExperimentLifeCycle.WARNING,
        }),
        (ExperimentLifeCycle.SKIPPED, unfinished),
        (ExperimentLifeCycle.SUCCEEDED, unfinished),
        (ExperimentLifeCycle.FAILED, unfinished),
        (ExperimentLifeCycle.UPSTREAM_FAILED, unfinished),
        (ExperimentLifeCycle.STOPPED, every_status - {
            ExperimentLifeCycle.STOPPED,
            ExperimentLifeCycle.SKIPPED,
        }),
        (ExperimentLifeCycle.WARNING,
         unfinished - {ExperimentLifeCycle.WARNING, }),
        (ExperimentLifeCycle.UNKNOWN,
         unfinished - {ExperimentLifeCycle.UNKNOWN, }),
    ]

    for target, sources in allowed_sources:
        for status in every_status:
            can_transition = ExperimentLifeCycle.can_transition(
                status_from=status, status_to=target)
            expected = status in sources
            assert can_transition is expected