def k8s_events_handle_build_job_statuses(self, payload):
    """Handle a k8s status event for a build job and record the new status.

    Args:
        self: the bound Celery task (used for `retry`).
        payload: dict with keys 'status', 'message', optional 'traceback',
            and 'details' carrying k8s labels ('app', 'job_uuid', 'job_name',
            optionally 'project_name') and 'node_name'.
    """
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    project_name = details['labels'].get('project_name')
    # Fixed log-message typo: "build jon" -> "build job".
    logger.debug('handling events status for build job %s %s', job_name, app)
    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return
    try:
        # Touch the FK purely to detect a deleted parent project.
        build_job.project
    except Project.DoesNotExist:
        # NOTE(review): unlike the plain-job handler this branch does not
        # `return` — presumably the status should still be recorded even if
        # the project is gone; confirm this is intentional.
        logger.debug('`%s` does not exist anymore', project_name)

    # Set the new status
    try:
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_reconcile_job_statuses(job_id, status, created_at) -> None:
    """Reconcile a Job's status from a k8s event, skipping finished jobs.

    Args:
        job_id: primary key of the Job row to reconcile.
        status: the status value reported by the reconciliation pass.
        created_at: timestamp to attach to the reconciled status.
    """
    try:
        job = Job.objects.get(id=job_id)
    except Job.DoesNotExist:
        # Bug fix: previously caught ExperimentJob.DoesNotExist, which is a
        # different per-model exception class in Django — a missing Job
        # raised Job.DoesNotExist and escaped this handler uncaught.
        logger.debug('Job `%s` does not exist', job_id)
        return

    if job.is_done:
        # Terminal statuses must not be overwritten by reconciliation.
        return

    job.set_status(status=status,
                   message='Status was reconciled.',
                   created_at=created_at)
def k8s_events_reconcile_plugin_job_statuses(job_id, app, status, created_at) -> None:
    """Reconcile the status of a plugin (notebook/tensorboard) job.

    Looks the job up by app label and uuid; finished jobs are left untouched.
    """
    plugin_job = get_plugin_job(app=app, job_uuid=job_id)
    if not plugin_job:
        logger.debug('Job `%s` does not exist', job_id)
        return
    if plugin_job.is_done:
        return
    plugin_job.set_status(status=status,
                          message='Status was reconciled.',
                          created_at=created_at)
def k8s_events_handle_experiment_job_statuses(self, payload):
    """Experiment jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        # Access the FK to detect an experiment deleted in the meantime.
        experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # The event may arrive before the job has any recorded status; give the
    # row a couple of short retries to appear.
    if experiment_job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    # Set the new status
    try:
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(status=new_status,
                                  message=payload['message'],
                                  created_at=payload.get('created_at'),
                                  traceback=payload.get('traceback'),
                                  details=details)
        logger.debug('status %s is set for job %s %s',
                     new_status, job_uuid, experiment_job.id)
    except IntegrityError:
        # Concurrent writers can race on the status row; retry later.
        logger.info('Retry job status %s handling %s', new_status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Project jobs statuses"""
    details = payload['details']
    labels = details['labels']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s', job_name)

    try:
        project_job = Job.objects.get(uuid=job_uuid)
    except Job.DoesNotExist:
        logger.debug('Job `%s` does not exist', job_name)
        return

    try:
        # Access the FK to detect a project deleted in the meantime.
        project_job.project
    except Project.DoesNotExist:
        logger.debug('Project for job `%s` does not exist', project_name)
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(project_job, details['node_name'])
        project_job.set_status(status=payload['status'],
                               message=payload['message'],
                               traceback=payload.get('traceback'),
                               details=details)
    except IntegrityError:
        # Concurrent writers can race on the status row; retry later.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_build_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Handle a k8s status event for a build job, honoring restart limits.

    A failure event is ignored while the container's restart count is still
    below the job's allowed maximum, so k8s can restart it before we mark
    the job failed.

    Args:
        self: the bound Celery task (used for `retry`).
        payload: dict with keys 'status', 'message', optional 'traceback',
            optional 'restart_count', and 'details' carrying k8s labels
            ('app', 'job_uuid', 'job_name', optionally 'project_name')
            and 'node_name'.
    """
    details = payload['details']
    app = details['labels']['app']
    job_uuid = details['labels']['job_uuid']
    job_name = details['labels']['job_name']
    restart_count = payload.get('restart_count', 0)
    project_name = details['labels'].get('project_name')
    # Fixed log-message typo: "build jon" -> "build job".
    logger.debug('handling events status for build job %s %s', job_name, app)
    try:
        build_job = BuildJob.objects.get(uuid=job_uuid)
    except BuildJob.DoesNotExist:
        logger.info('Build job `%s` does not exist', job_name)
        return
    try:
        # Touch the FK purely to detect a deleted parent project.
        build_job.project
    except Project.DoesNotExist:
        # NOTE(review): this branch does not `return` — presumably the status
        # should still be recorded even if the project is gone; confirm.
        logger.debug('`%s` does not exist anymore', project_name)

    # Ignore failures while restarts remain: k8s will restart the container.
    max_restarts = build_job.max_restarts or conf.get(MAX_RESTARTS_BUILD_JOBS)
    if JobLifeCycle.failed(payload['status']) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(build_job, details['node_name'])
        build_job.set_status(status=payload['status'],
                             message=payload['message'],
                             traceback=payload.get('traceback'),
                             details=details)
    except IntegrityError:
        # Due to concurrency this could happen, we just retry it
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_experiment_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Experiment jobs statuses"""
    details = payload['details']
    job_uuid = details['labels']['job_uuid']
    restart_count = payload.get('restart_count', 0)
    new_status = payload['status']
    logger.debug('handling events status for job_uuid: %s, status: %s',
                 job_uuid, new_status)

    try:
        experiment_job = ExperimentJob.objects.get(uuid=job_uuid)
    except ExperimentJob.DoesNotExist:
        logger.debug('Job uuid`%s` does not exist', job_uuid)
        return

    try:
        experiment = experiment_job.experiment
    except Experiment.DoesNotExist:
        logger.debug('Experiment for job `%s` does not exist anymore', job_uuid)
        return

    # The event may arrive before the job has any recorded status; give the
    # row a couple of short retries to appear.
    if experiment_job.last_status is None and self.request.retries < 2:
        self.retry(countdown=1)

    # Ignore failures while restarts remain: k8s will restart the container.
    max_restarts = experiment.max_restarts or conf.get(
        MAX_RESTARTS_EXPERIMENTS)
    if JobLifeCycle.failed(new_status) and restart_count < max_restarts:
        return

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, new_status)
        set_node_scheduling(experiment_job, details['node_name'])
        experiment_job.set_status(status=new_status,
                                  message=payload['message'],
                                  created_at=payload.get('created_at'),
                                  traceback=payload.get('traceback'),
                                  details=details)
        logger.debug('status %s is set for job %s %s',
                     new_status, job_uuid, experiment_job.id)
    except IntegrityError:
        # Concurrent writers can race on the status row; retry later.
        logger.info('Retry job status %s handling %s', new_status, job_uuid)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_events_handle_plugin_job_statuses(self: 'workers.app.task', payload: Dict) -> None:
    """Project Plugin jobs statuses"""
    details = payload['details']
    labels = details['labels']
    app = labels['app']
    job_uuid = labels['job_uuid']
    job_name = labels['job_name']
    project_name = labels.get('project_name')
    logger.debug('handling events status for job %s %s', job_name, app)

    # Dispatch on the app label to the matching plugin-job model.
    try:
        if app == conf.get(APP_LABELS_TENSORBOARD):
            plugin_job = TensorboardJob.objects.get(uuid=job_uuid)
        elif app == conf.get(APP_LABELS_NOTEBOOK):
            plugin_job = NotebookJob.objects.get(uuid=job_uuid)
        else:
            logger.info('Plugin job `%s` does not exist', app)
            return
    except (NotebookJob.DoesNotExist, TensorboardJob.DoesNotExist):
        logger.debug('`%s - %s` does not exist', app, job_name)
        return

    try:
        # Touch the FK purely to detect a deleted parent project.
        plugin_job.project
    except Project.DoesNotExist:
        # NOTE(review): this branch does not `return` — presumably the status
        # should still be recorded even if the project is gone; confirm.
        logger.debug('`%s` does not exist anymore', project_name)

    # Set the new status
    try:
        RedisStatuses.set_status(job_uuid, payload['status'])
        set_node_scheduling(plugin_job, details['node_name'])
        plugin_job.set_status(status=payload['status'],
                              message=payload['message'],
                              traceback=payload.get('traceback'),
                              details=details)
    except IntegrityError:
        # Concurrent writers can race on the status row; retry later.
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
def k8s_handle_events_namespace(cluster_id: int, payload: Dict) -> None:
    """Persist a namespace-level k8s event for the given cluster.

    Best-effort: a DB ``OperationalError`` must not crash the worker, but it
    is now logged (previously it was silently swallowed with a bare ``pass``).

    Args:
        cluster_id: id of the cluster the event belongs to.
        payload: keyword fields for the ClusterEvent row.
    """
    logger.debug('handling events namespace for cluster: %s', cluster_id)
    try:
        ClusterEvent.objects.create(cluster_id=cluster_id, **payload)
    except OperationalError:
        # Keep best-effort semantics but leave a trace for operators.
        logger.warning('Could not persist cluster event for cluster: %s',
                       cluster_id, exc_info=True)