async def log_job(request, ws, job, pod_id, namespace, container):
    job_uuid = job.uuid.hex

    if job_uuid in request.app.job_logs_ws_managers:
        ws_manager = request.app.job_logs_ws_managers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_logs_ws_managers[job_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
        if should_disconnect(ws=ws, ws_manager=ws_manager):
            return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    await log_job_pod(k8s_api=k8s_api,
                      ws=ws,
                      ws_manager=ws_manager,
                      pod_id=pod_id,
                      container=container,
                      namespace=namespace)

def test_master_success_influences_other_experiment_workers_status(self):
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        # with patch.object(Experiment, 'set_status') as _:  # noqa
        experiment = ExperimentFactory()

    assert ExperimentLifeCycle.is_done(experiment.last_status) is False
    # Add jobs
    master = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
    assert JobLifeCycle.is_done(master.last_status) is False
    workers = [ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER)
               for _ in range(2)]
    for worker in workers:
        worker.refresh_from_db()
        assert JobLifeCycle.is_done(worker.last_status) is False

    # Set master to succeeded
    ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

    # All workers should have a success status
    for worker in workers:
        worker.refresh_from_db()
        assert worker.last_status == JobLifeCycle.SUCCEEDED

    # Experiment last status should be success
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED

def tensorboard_job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=TENSORBOARD_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=TENSORBOARD_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=TENSORBOARD_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=TENSORBOARD_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')

    if JobLifeCycle.is_done(instance.status):
        RedisStatuses.delete_status(job.uuid.hex)

    new_operation_run_status(entity_type=content_types.TENSORBOARD_JOB,
                             entity=job,
                             status=instance.status)

def update_job_containers(event: Mapping, status: str, job_container_name: str) -> None:
    job_containers = RedisJobContainers()
    if JobLifeCycle.is_done(status):
        # Remove the job monitoring
        job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', job_uuid)
        job_containers.remove_job(job_uuid)

    if event['status']['container_statuses'] is None:
        return

    def get_container_id(container_id):
        if not container_id:
            return None
        if container_id.startswith('docker://'):
            return container_id[len('docker://'):]
        return container_id

    for container_status in event['status']['container_statuses']:
        if container_status['name'] != job_container_name:
            continue

        container_id = get_container_id(container_status['container_id'])
        if container_id:
            job_uuid = event['metadata']['labels']['job_uuid']
            if container_status['state']['running'] is not None:
                logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                            container_id, job_uuid)
                job_containers.monitor(container_id=container_id, job_uuid=job_uuid)
            else:
                job_containers.remove_container(container_id=container_id)

def _set_status(self,
                status_model,
                status: str,
                created_at: AwareDT = None,
                message: str = None,
                traceback: Dict = None,
                details: Dict = None) -> bool:
    current_status = self.last_status_before(status_model=status_model,
                                             status_date=created_at)
    if self.is_done:
        # We should not update statuses anymore
        _logger.debug(
            'Received a new status `%s` for job `%s`. '
            'But the job is already done with status `%s`',
            status, self.unique_name, current_status)
        return False

    if status in JobLifeCycle.HEARTBEAT_STATUS:
        self._ping_heartbeat()

    if JobLifeCycle.can_transition(status_from=current_status, status_to=status):
        # Add new status to the job
        params = {'created_at': created_at} if created_at else {}
        status_model.objects.create(job=self,
                                    status=status,
                                    message=message,
                                    traceback=traceback,
                                    details=details,
                                    **params)
        return True

    return False

def jobs_build(job_id):
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        configmap_refs=job.specification.configmap_refs,
        secret_refs=job.specification.secret_refs,
        code_reference=job.code_reference)

    job.build_job = build_job
    job.save(update_fields=['build_job'])
    if image_exists:
        # The image already exists, so we can start the job right away
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_START,
            kwargs={'job_id': job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
        return

    if not build_status:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
        return

    # Update the job status to show that it's building the docker image
    job.set_status(JobLifeCycle.BUILDING, message='Building container')

def projects_notebook_build(notebook_job_id):
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.BUILDING):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job, notebook_job.last_status, JobLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=notebook_job.user,
        project=notebook_job.project,
        config=notebook_job.specification.build,
        configmap_refs=notebook_job.specification.configmap_refs,
        secret_refs=notebook_job.specification.secret_refs,
        code_reference=notebook_job.code_reference)

    notebook_job.build_job = build_job
    notebook_job.save(update_fields=['build_job'])
    if image_exists:
        # The image already exists, so we can start the notebook right away
        workers.send(
            SchedulerCeleryTasks.PROJECTS_NOTEBOOK_START,
            kwargs={'notebook_job_id': notebook_job_id})
        return

    if not build_status:
        notebook_job.set_status(JobLifeCycle.FAILED,
                                message='Could not start build process.')
        return

    # Update the job status to show that it's building the docker image
    notebook_job.set_status(JobLifeCycle.BUILDING, message='Building container')

def job_status_post_save(sender, **kwargs):
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'updated_at', 'finished_at'])
    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=JOB_CREATED, instance=job)
    elif instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    elif instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)

    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        RedisStatuses.delete_status(job.uuid.hex)

    new_operation_run_status(entity_type=content_types.JOB,
                             entity=job,
                             status=instance.status)

def calculated_status(self) -> str:
    master_status = self.jobs.order_by('created_at').first().last_status
    calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status

def post(self, request, *args, **kwargs):
    job = self.get_object()
    if not JobLifeCycle.is_stoppable(job.last_status):
        return Response(status=status.HTTP_403_FORBIDDEN)
    token, _ = Token.objects.get_or_create(user=job.user)
    return Response({'token': token.key}, status=status.HTTP_200_OK)

def post(self, request, *args, **kwargs):
    project = self.project
    if not project.has_notebook or not JobLifeCycle.is_stoppable(
            project.notebook.last_status):
        return Response(status=status.HTTP_403_FORBIDDEN)
    token, _ = Token.objects.get_or_create(user=project.user)
    return Response({'token': token.key}, status=status.HTTP_200_OK)

def projects_notebook_start(notebook_job_id):
    notebook_job = get_valid_notebook(notebook_job_id=notebook_job_id)
    if not notebook_job:
        return None

    if not JobLifeCycle.can_transition(status_from=notebook_job.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Notebook `%s` cannot transition from `%s` to `%s`.',
                     notebook_job.unique_name, notebook_job.last_status,
                     JobLifeCycle.SCHEDULED)
        return None

    notebook_scheduler.start_notebook(notebook_job)

def should_handle_job_status(pod_state: Any, status: str) -> bool:
    job_uuid = pod_state['details']['labels']['job_uuid']
    current_status = RedisStatuses.get_status(job=job_uuid)
    if not current_status:
        # If the status does not exist or is evicted
        return True
    try:
        return JobLifeCycle.can_transition(
            status_from=RedisStatuses.get_status(job=job_uuid),
            status_to=status)
    except redis.connection.ConnectionError:
        return True

def tensorboards_start(tensorboard_job_id):
    tensorboard = get_valid_tensorboard(tensorboard_job_id=tensorboard_job_id)
    if not tensorboard:
        return None

    if not JobLifeCycle.can_transition(status_from=tensorboard.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Tensorboard `%s` cannot transition from `%s` to `%s`.',
                     tensorboard.unique_name, tensorboard.last_status,
                     JobLifeCycle.SCHEDULED)
        return None

    try:
        tensorboard_scheduler.start_tensorboard(tensorboard)
    except StoreNotFoundError:
        tensorboard.set_status(status=JobLifeCycle.FAILED,
                               message='Tensorboard failed to start, '
                                       'the outputs volume/storage was not found.')

def jobs_start(job_id):
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if job.last_status == JobLifeCycle.RUNNING:
        _logger.warning('Job is already running.')
        return None

    if not JobLifeCycle.can_transition(status_from=job.last_status,
                                       status_to=JobLifeCycle.SCHEDULED):
        _logger.info('Job `%s` cannot transition from `%s` to `%s`.',
                     job.unique_name, job.last_status, JobLifeCycle.SCHEDULED)
        return None

    job_scheduler.start_job(job)

def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task',
                                              payload: Dict) -> None:
    """Experiment jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    restart_count = payload.get('restart_count', 0)
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, payload['status'])

    try:
        job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid `%s` does not exist', job_uuid)
        return

    try:
        experiment = job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    if job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    max_restarts = experiment.max_restarts or conf.get(MAX_RESTARTS_EXPERIMENTS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(job, details['node_name'])
        job.set_status(status=payload['status'],
                       message=payload['message'],
                       created_at=payload.get('created_at'),
                       traceback=payload.get('traceback'),
                       details=details)
        logger.debug('status %s is set for job %s %s',
                     payload['status'], job_uuid, job.id)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        logger.info('Retry job status %s handling %s', payload['status'], job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

def k8s_events_handle_build_job_statuses(self: 'workers.app.task',
                                         payload: Dict) -> None:
    """Project Plugin jobs statuses"""
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = details['labels'].get('project_name')
    logger.debug('handling events status for build job %s %s', job_name, app)

    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return

    try:
        build_job.project
    except Project.DoesNotExist:
        logger.debug('`%s` does not exist anymore', project_name)

    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

def test_job_statuses_transition(self):  # pylint:disable=too-many-branches
    # pylint:disable=too-many-statements
    # Cannot transition to `CREATED`
    for status in JobLifeCycle.VALUES:
        assert JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.CREATED) is False

    # -> BUILDING
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.BUILDING)
        if status in {JobLifeCycle.CREATED,
                      JobLifeCycle.RESUMING,
                      JobLifeCycle.SCHEDULED,
                      JobLifeCycle.UNSCHEDULABLE,
                      JobLifeCycle.WARNING,
                      JobLifeCycle.UNKNOWN}:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> SCHEDULED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.SCHEDULED)
        if status in {JobLifeCycle.CREATED,
                      JobLifeCycle.RESUMING,
                      JobLifeCycle.BUILDING,
                      JobLifeCycle.WARNING,
                      JobLifeCycle.UNSCHEDULABLE,
                      JobLifeCycle.UNKNOWN}:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> RUNNING
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.RUNNING)
        if status in {JobLifeCycle.CREATED,
                      JobLifeCycle.SCHEDULED,
                      JobLifeCycle.RESUMING,
                      JobLifeCycle.BUILDING,
                      JobLifeCycle.UNSCHEDULABLE,
                      JobLifeCycle.UNKNOWN,
                      JobLifeCycle.WARNING}:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> SKIPPED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.SKIPPED)
        if status not in JobLifeCycle.DONE_STATUS:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> SUCCEEDED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.SUCCEEDED)
        if status not in JobLifeCycle.DONE_STATUS:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> FAILED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.FAILED)
        if status not in JobLifeCycle.DONE_STATUS:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> UPSTREAM_FAILED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.UPSTREAM_FAILED)
        if status not in JobLifeCycle.DONE_STATUS:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> STOPPED
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.STOPPED)
        if status not in JobLifeCycle.DONE_STATUS:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> WARNING
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.WARNING)
        cond = status in (JobLifeCycle.VALUES -
                          JobLifeCycle.DONE_STATUS -
                          {JobLifeCycle.WARNING, })
        if cond:
            assert can_transition is True
        else:
            assert can_transition is False

    # -> UNKNOWN
    for status in JobLifeCycle.VALUES:
        can_transition = JobLifeCycle.can_transition(
            status_from=status, status_to=JobLifeCycle.UNKNOWN)
        if status not in {JobLifeCycle.UNKNOWN, }:
            assert can_transition is True
        else:
            assert can_transition is False

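# The test above exercises a table-driven transition check (can_transition) over a
# fixed set of lifecycle statuses. Below is a minimal, self-contained sketch of how
# such a lifecycle class can be structured: the class name, the status strings, and
# the contents of TRANSITION_MATRIX are illustrative assumptions for the example,
# not Polyaxon's actual JobLifeCycle implementation.
class _SketchLifeCycle:
    CREATED = 'created'
    SCHEDULED = 'scheduled'
    RUNNING = 'running'
    SUCCEEDED = 'succeeded'
    FAILED = 'failed'

    VALUES = {CREATED, SCHEDULED, RUNNING, SUCCEEDED, FAILED}
    DONE_STATUS = {SUCCEEDED, FAILED}

    # Allowed source statuses, keyed by target status. There is deliberately no
    # entry for CREATED: nothing may transition back into the initial status.
    TRANSITION_MATRIX = {
        SCHEDULED: {CREATED},
        RUNNING: {CREATED, SCHEDULED},
        SUCCEEDED: VALUES - DONE_STATUS,
        FAILED: VALUES - DONE_STATUS,
    }

    @classmethod
    def can_transition(cls, status_from, status_to):
        # A transition is valid only if the source status appears in the
        # allowed set for the requested target status.
        return status_from in cls.TRANSITION_MATRIX.get(status_to, set())

    @classmethod
    def is_done(cls, status):
        return status in cls.DONE_STATUS


# Usage of the sketch, mirroring the assertions in the test above.
assert _SketchLifeCycle.can_transition(_SketchLifeCycle.CREATED, _SketchLifeCycle.SCHEDULED)
assert not _SketchLifeCycle.can_transition(_SketchLifeCycle.SUCCEEDED, _SketchLifeCycle.RUNNING)
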
async def job_logs(request,  # pylint:disable=too-many-branches
                   ws,
                   username,
                   project_name,
                   job_id):
    from streams.consumers.consumers import Consumer

    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs are now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS, job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quit = False
    num_message_retries = 0

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(ws_manager=consumer, message=get_status_message(status))
        if should_disconnect():
            return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        await notify(ws_manager=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(ws_manager=consumer, message=message)

        # After trying a couple of times, we must check the status of the job
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('Removing all sockets because the job `%s` is done', job_uuid)
                consumer.ws = set([])
            else:
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quit = True

        if should_disconnect():
            should_quit = True

        if should_quit:
            return

        await asyncio.sleep(SOCKET_SLEEP)