def jobs_stop(self,
              project_name,
              project_uuid,
              job_name,
              job_uuid,
              update_status=True,
              collect_logs=True,
              message=None):
    """Stop a job through the job scheduler and optionally mark it as stopped.

    Steps, in order:

    1. When `collect_logs` is true, try to collect the job's logs; a failure
       with one of the handled exceptions is logged as a warning and ignored.
    2. Ask `job_scheduler.stop_job` to stop the job. If it reports a falsy
       result and fewer than two retries were made so far, the task is
       retried via `self.retry` and nothing else happens in this run.
    3. When `update_status` is true and the job can still be looked up
       (deleted jobs included), set its status to `JobLifeCycle.STOPPED`.

    Args:
        self: the bound task instance (provides `request.retries` and `retry`).
        project_name: forwarded to `job_scheduler.stop_job`.
        project_uuid: forwarded to `job_scheduler.stop_job`.
        job_name: forwarded to `job_scheduler.stop_job`; also used in log lines.
        job_uuid: used to collect logs, stop the job and look the job up.
        update_status: whether to record the STOPPED status afterwards.
        collect_logs: whether to attempt log collection before stopping.
        message: status message; falls back to 'Job was stopped.' when falsy.

    Returns:
        None in every case.
    """
    if collect_logs:
        try:
            logs_collect_job(job_uuid=job_uuid)
        except (OSError, VolumeNotFoundError, PolyaxonStoresException):
            # Log collection is best-effort: stopping must still proceed.
            _logger.warning(
                'Scheduler could not collect the logs for job `%s`.', job_name)

    stopped = job_scheduler.stop_job(project_name=project_name,
                                     project_uuid=project_uuid,
                                     job_name=job_name,
                                     job_uuid=job_uuid)

    if not stopped:
        # Only consult the retry counter when the stop did not succeed.
        if self.request.retries < 2:
            _logger.info('Trying again to delete job `%s`.', job_name)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
            return

    if update_status:
        job = get_valid_job(job_uuid=job_uuid, include_deleted=True)
        if job:
            # Record on the job that it has been stopped.
            job.set_status(status=JobLifeCycle.STOPPED,
                           message=message or 'Job was stopped.')
    return None
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if it is stoppable, and optionally queue its deletion.

    Args:
        job_id: id used to look the job up (deleted jobs included).
        immediate: when true, also send the `DELETE_ARCHIVED_JOB` task.

    Returns:
        None in every case; nothing is done when the job cannot be found.
    """
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None

    job.archive()

    if job.is_stoppable:
        owner = job.project
        stop_kwargs = {
            'project_name': owner.unique_name,
            'project_uuid': owner.uuid.hex,
            'job_name': job.unique_name,
            'job_uuid': job.uuid.hex,
            'update_status': True,
            # Logs are not collected for a job that is on its way to deletion.
            'collect_logs': False,
            'is_managed': job.is_managed,
            'message': 'Job is scheduled for deletion.'
        }
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_STOP,
            kwargs=stop_kwargs,
            countdown=conf.get('GLOBAL_COUNTDOWN'))

    if immediate:
        delete_kwargs = {
            'job_id': job_id,
        }
        celery_app.send_task(
            SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
            kwargs=delete_kwargs,
            countdown=conf.get('GLOBAL_COUNTDOWN_DELAYED'))
def jobs_build(job_id):
    """Create the build job for a job and move the job to its next state.

    After the build job is created and attached to the job, one of three
    things happens:

    * the image already exists: the `JOBS_START` task is sent for this job;
    * the build was started: the job status becomes `JobLifeCycle.BUILDING`;
    * otherwise: the job status becomes `JobLifeCycle.FAILED`.

    Nothing is done when the job cannot be found or when it may not
    transition from its last status to `JobLifeCycle.BUILDING`.

    Args:
        job_id: id used to look the job up and to start it later.

    Returns:
        None in every case.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    transition_allowed = JobLifeCycle.can_transition(
        status_from=job.last_status,
        status_to=JobLifeCycle.BUILDING)
    if not transition_allowed:
        _logger.info('Job id `%s` cannot transition from `%s` to `%s`.',
                     job_id, job.last_status, JobLifeCycle.BUILDING)
        return

    created_build, image_ready, build_started = dockerizer_scheduler.create_build_job(
        user=job.user,
        project=job.project,
        config=job.specification.build,
        configmap_refs=job.specification.configmap_refs,
        secret_refs=job.specification.secret_refs,
        code_reference=job.code_reference)

    # Attach the build job before deciding what happens next.
    job.build_job = created_build
    job.save(update_fields=['build_job'])

    if image_ready:
        # No build needed: the job can be started right away.
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_START,
            kwargs={'job_id': job_id},
            countdown=conf.get('GLOBAL_COUNTDOWN'))
    elif build_started:
        # Reflect on the job that its docker image is being built.
        job.set_status(JobLifeCycle.BUILDING, message='Building container')
    else:
        job.set_status(JobLifeCycle.FAILED, message='Could not start build process.')
def jobs_stop(self,
              project_name,
              project_uuid,
              job_name,
              job_uuid,
              update_status=True):
    """Stop a job through the job scheduler and optionally mark it as stopped.

    If `job_scheduler.stop_job` reports a falsy result and fewer than two
    retries were made so far, the task is retried via `self.retry` and
    nothing else happens in this run. Otherwise, when `update_status` is
    true and the job can be looked up, its status is set to
    `JobLifeCycle.STOPPED`.

    Args:
        self: the bound task instance (provides `request.retries` and `retry`).
        project_name: forwarded to `job_scheduler.stop_job`.
        project_uuid: forwarded to `job_scheduler.stop_job`.
        job_name: forwarded to `job_scheduler.stop_job`; also used in the log line.
        job_uuid: used to stop the job and to look it up afterwards.
        update_status: whether to record the STOPPED status afterwards.

    Returns:
        None in every case.
    """
    stopped = job_scheduler.stop_job(project_name=project_name,
                                     project_uuid=project_uuid,
                                     job_name=job_name,
                                     job_uuid=job_uuid)

    if not stopped:
        # Only consult the retry counter when the stop did not succeed.
        if self.request.retries < 2:
            _logger.info('Trying again to delete job `%s`.', job_name)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
            return

    if update_status:
        job = get_valid_job(job_uuid=job_uuid)
        if job:
            # Record on the job that it has been stopped.
            job.set_status(status=JobLifeCycle.STOPPED, message='Job was stopped')
    return None
def jobs_schedule_deletion(job_id, immediate=False):
    """Archive a job, stop it if it is running, and optionally queue its deletion.

    Args:
        job_id: id used to look the job up (deleted jobs included).
        immediate: when true, also send the `DELETE_ARCHIVED_JOB` task.

    Returns:
        None in every case; nothing is done when the job cannot be found.
    """
    job = get_valid_job(job_id=job_id, include_deleted=True)
    if not job:
        return None

    job.archive()

    if job.is_running:
        owner = job.project
        stop_kwargs = {
            'project_name': owner.unique_name,
            'project_uuid': owner.uuid.hex,
            'job_name': job.unique_name,
            'job_uuid': job.uuid.hex,
            'update_status': True,
            # Logs are not collected for a job that is on its way to deletion.
            'collect_logs': False,
            'message': 'Job is scheduled for deletion.'
        }
        celery_app.send_task(SchedulerCeleryTasks.JOBS_STOP, kwargs=stop_kwargs)

    if immediate:
        delete_kwargs = {
            'job_id': job_id,
        }
        celery_app.send_task(SchedulerCeleryTasks.DELETE_ARCHIVED_JOB,
                             kwargs=delete_kwargs)
def jobs_check_heartbeat(job_id):
    """Mark a job as failed when `RedisHeartBeat` does not report it alive.

    Nothing is done when the job is reported alive or cannot be found.

    Args:
        job_id: id used for the heartbeat check and the job lookup.

    Returns:
        None in every case.
    """
    if not RedisHeartBeat.job_is_alive(job_id=job_id):
        job = get_valid_job(job_id=job_id)
        if job:
            # No heartbeat was reported: treat the job as a zombie.
            job.set_status(JobLifeCycle.FAILED,
                           message='Job is in zombie state (no heartbeat was reported).')
def jobs_start(job_id):
    """Hand a job to the job scheduler if it is in a startable state.

    The job is started only when it is found, is not already in
    `JobLifeCycle.RUNNING`, and may transition from its last status to
    `JobLifeCycle.SCHEDULED`. The two rejected cases are logged.

    Args:
        job_id: id used to look the job up.

    Returns:
        None in every case.
    """
    job = get_valid_job(job_id=job_id)
    if not job:
        return None

    if job.last_status == JobLifeCycle.RUNNING:
        _logger.warning('Job is already running.')
    elif JobLifeCycle.can_transition(status_from=job.last_status,
                                     status_to=JobLifeCycle.SCHEDULED):
        job_scheduler.start_job(job)
    else:
        _logger.info('Job `%s` cannot transition from `%s` to `%s`.',
                     job.unique_name, job.last_status, JobLifeCycle.SCHEDULED)
    return None
def jobs_stop(project_name,
              project_uuid,
              job_name,
              job_uuid,
              specification,
              update_status=True):
    """Stop a job through the job scheduler and optionally mark it as stopped.

    Args:
        project_name: forwarded to `job_scheduler.stop_job`.
        project_uuid: forwarded to `job_scheduler.stop_job`.
        job_name: forwarded to `job_scheduler.stop_job`.
        job_uuid: used to stop the job and to look it up afterwards.
        specification: forwarded to `job_scheduler.stop_job`.
        update_status: whether to record the STOPPED status afterwards.

    Returns:
        None in every case.
    """
    job_scheduler.stop_job(project_name=project_name,
                           project_uuid=project_uuid,
                           job_name=job_name,
                           job_uuid=job_uuid,
                           specification=specification)

    if update_status:
        job = get_valid_job(job_uuid=job_uuid)
        if job:
            # Record on the job that it has been stopped.
            job.set_status(status=JobLifeCycle.STOPPED, message='job was stopped')
    return None
def jobs_stop(job_id):
    """Look a job up by id and ask the job scheduler to stop it.

    The scheduler is called with `update_status=True`. Nothing is done when
    the job cannot be found.

    Args:
        job_id: id used to look the job up.

    Returns:
        None in every case.
    """
    job = get_valid_job(job_id=job_id)
    if job:
        job_scheduler.stop_job(job, update_status=True)
    return None