def stop_job(self, job_id, job_status, comment=None):
    """Stop a running job inside a model transaction.

    The job is looked up with ``self._get_job``. It must be running and
    must have no active tasks. If ``self.job_locks`` has no entry for
    ``job_id``, the job's coordinator lockfile (when it has one) is
    acquired without blocking; otherwise the tracked lock is checked to
    be held by this process. Finally ``self.model.stop_job`` is called.

    Args:
        job_id: Identifier of the job; also the key used in
            ``self.job_locks``.
        job_status: Status value handed through to ``self.model.stop_job``.
        comment: Optional comment handed through to
            ``self.model.stop_job``.

    Raises:
        DjmApiError: If the job is not running, still has running tasks,
            or its coordinator lockfile could not be acquired.
        AssertionError: If the lock tracked in ``self.job_locks`` is not
            held by this process.

    On any exception the transaction is aborted, a lock acquired by this
    call is released, and the exception is re-raised.
    """
    m = self.model
    m.begin_transaction()
    lock = None
    # True only when this call itself took the coordinator lock, so the
    # error path never releases a lock owned by self.job_locks or one
    # that was never obtained.
    acquired_here = False
    try:
        job = self._get_job(job_id, "stop_job")
        if not job.is_running():
            raise DjmApiError("stop_job",
                              "Job %s is already stopped" % job_id)
        if not job.does_not_have_active_tasks():
            raise DjmApiError("stop_job",
                              "Job %s cannot be stopped as it still has running tasks" % job_id)
        if job_id not in self.job_locks:
            lockfile_name = job.get_coordinator_lockfile()
            if lockfile_name:
                lock = locking.lock_from_path(lockfile_name)
                got_lock = lock.acquire_if_available()
                if not got_lock:
                    raise DjmApiError("stop_job",
                                      "Unable to stop job %s, someone seems to have the lock" % job_id)
                acquired_here = True
                logger.debug("Coordinator for job %s seems to have died, got the lock" % job_id)
        else:
            lock = self.job_locks[job_id]
            assert lock.is_locked_by_me(), "Lock %s is not locked!" % lock.path
        m.stop_job(job, job_status, comment=comment)
    except Exception as e:
        logger.exception("Aborting transaction due to exception %s" % e)
        m.abort_transaction()
        if acquired_here:
            # Mirror cleanup_dead_coordinators: do not keep holding a
            # coordinator lock for a job we failed to stop.
            lock.release()
        raise
    # NOTE(review): nothing in this method commits the transaction or
    # releases the lock on the success path -- confirm that is handled
    # elsewhere.
def cleanup_dead_coordinators(self):
    """Clean up any dead coordinators from previous runs.

    Every job returned by ``self.model.query_jobs()`` that has no status
    yet but does have a coordinator lockfile is examined. If the lockfile
    can be acquired without blocking, the coordinator is treated as dead:
    each task that is not stopped is stopped with
    ``TaskStatus.TASK_UNKNOWN`` and the job is stopped with
    ``JobStatus.JOB_FAILED``. Jobs whose lockfile cannot be acquired are
    left alone.

    Raises:
        DjmApiError: If ``self.job_locks`` is not empty, i.e. this
            process is still tracking job locks.

    On any exception during the scan the transaction is aborted, every
    lock acquired so far is released, and the exception is re-raised.
    """
    # Check the precondition before opening a transaction so that a
    # refusal does not leave a transaction dangling.
    if self.job_locks:
        raise DjmApiError("cleanup_dead_coordinators",
                          "Cannot run cleanup_dead_coordinators while jobs are running: %s" %
                          ' '.join(self.job_locks.keys()))
    m = self.model
    m.begin_transaction()
    locks = []
    # NOTE(review): job_ids is filled in but not read anywhere in this
    # method -- confirm whether it was meant to be returned.
    job_ids = []
    try:
        jobs = m.query_jobs()
        logger.debug("checking %d jobs" % len(jobs))
        for job in jobs:
            # Only jobs with no status and a lockfile are candidates.
            if job.get_status() is not None:
                continue
            lockfile_name = job.get_coordinator_lockfile()
            if lockfile_name is None:
                continue
            lock = locking.lock_from_path(lockfile_name)
            logger.debug("checking job %s, lockfile %s" %
                         (job.get_id(), lockfile_name))
            got_lock = lock.acquire_if_available()
            if not got_lock:
                logger.debug("Job %s is still running" % job.get_id())
                continue
            locks.append(lock)
            job_ids.append(job.get_id())
            logger.info("Coordinator for job %s died - cleaning up" %
                        job.get_id())
            for task in job.get_all_tasks():
                if not task.is_stopped():
                    m.stop_task(task, TaskStatus.TASK_UNKNOWN)
            m.stop_job(job, JobStatus.JOB_FAILED,
                       comment="Coordinator process crashed")
    except Exception as e:
        logger.exception("Aborting transaction due to exception %s" % e)
        m.abort_transaction()
        for lock in locks:
            lock.release()
        raise
    # NOTE(review): nothing in this method commits the transaction or
    # releases the acquired locks on the success path -- confirm that is
    # handled elsewhere.