Example #1
def stop_job(self, job_id, job_status, comment=None):
    m = self.model
    m.begin_transaction()
    lock = None
    try:
        job = self._get_job(job_id, "stop_job")
        if not job.is_running():
            raise DjmApiError("stop_job",
                              "Job %s is already stopped" % job_id)
        if not job.does_not_have_active_tasks():
            # The job still has running tasks; refuse to stop it.
            raise DjmApiError("stop_job",
                              "Job %s cannot be stopped as it still has running tasks" %
                              job_id)
        if job_id not in self.job_locks:
            # We do not already hold the coordinator lock; try to take it over.
            lockfile_name = job.get_coordinator_lockfile()
            if lockfile_name:
                lock = locking.lock_from_path(lockfile_name)
                got_lock = lock.acquire_if_available()
                if not got_lock:
                    raise DjmApiError("stop_job",
                                      "Unable to stop job %s, someone seems to have the lock" %
                                      job_id)
                logger.debug("Coordinator for job %s seems to have died, got the lock" %
                             job_id)
        else:
            lock = self.job_locks[job_id]
            assert lock.is_locked_by_me(), "Lock %s is not held by this process!" % lock.path
        m.stop_job(job, job_status, comment=comment)
    except Exception as e:
        logger.exception("Aborting transaction due to exception %s" % e)
        m.abort_transaction()
        raise
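
As a usage illustration for the method above, here is a minimal, hypothetical caller sketch. The api variable, the concrete job id, and the choice of JobStatus.JOB_FAILED as the terminal status are assumptions made for illustration; only stop_job, DjmApiError, JobStatus, and logger come from the snippets themselves.

    # Hypothetical caller (names other than stop_job/DjmApiError/JobStatus/logger
    # are assumed): stop a job whose coordinator appears to have died.
    try:
        api.stop_job("job-42", JobStatus.JOB_FAILED,
                     comment="Stopped manually after coordinator crash")
    except DjmApiError as e:
        # stop_job raises DjmApiError when the job is already stopped, still has
        # running tasks, or another process holds the coordinator lock.
        logger.error("Could not stop job: %s" % e)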
Example #2
def cleanup_dead_coordinators(self):
    """Clean up any dead coordinators from previous runs.
    """
    m = self.model
    if self.job_locks:
        # Refuse to run while this process is itself coordinating jobs.
        raise DjmApiError("cleanup_dead_coordinators",
                          "Cannot run cleanup_dead_coordinators while jobs are running: %s" %
                          ' '.join(self.job_locks.keys()))
    m.begin_transaction()
    locks = []
    job_ids = []
    try:
        jobs = m.query_jobs()
        logger.debug("checking %d jobs" % len(jobs))
        for job in jobs:
            if job.get_status() is None and \
               job.get_coordinator_lockfile() is not None:
                lockfile_name = job.get_coordinator_lockfile()
                lock = locking.lock_from_path(lockfile_name)
                logger.debug("checking job %s, lockfile %s" %
                             (job.get_id(), lockfile_name))
                got_lock = lock.acquire_if_available()
                if not got_lock:
                    # A live coordinator still holds the lock; skip this job.
                    logger.debug("Job %s is still running" % job.get_id())
                    continue
                locks.append(lock)
                job_ids.append(job.get_id())
                logger.info("Coordinator for job %s died - cleaning up" %
                            job.get_id())
                tasks = job.get_all_tasks()
                for task in tasks:
                    if not task.is_stopped():
                        m.stop_task(task, TaskStatus.TASK_UNKNOWN)
                m.stop_job(job, JobStatus.JOB_FAILED,
                           comment="Coordinator process crashed")
            ## else:
            ##     logger.debug("skipping job %s, status=%s, lockfile=%s" %
            ##                  (job.get_id(), job.get_status(),
            ##                   job.get_coordinator_lockfile()))
    except Exception as e:
        logger.exception("Aborting transaction due to exception %s" % e)
        m.abort_transaction()
        for lock in locks:
            lock.release()
        raise
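
The cleanup relies on a simple liveness convention: a running coordinator holds its lockfile for the lifetime of the job, so successfully acquiring that lock implies the coordinator has died. Below is a hypothetical sketch of that check in isolation, using only the locking helpers seen above; the lockfile path and the log messages are placeholders, not taken from the source.

    # Hypothetical illustration of the liveness check (path and messages are
    # placeholders for illustration only).
    lock = locking.lock_from_path("/tmp/djm-job-42.lock")
    if lock.acquire_if_available():
        # Nobody held the lock, so the coordinator is gone; it is safe to
        # clean up, then release the lock we just took over.
        logger.info("coordinator is dead, cleaning up")
        lock.release()
    else:
        # A live coordinator still holds the lock; leave the job alone.
        logger.debug("coordinator is still alive")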