def _join_handler(self): set_thread_title() session = self._create_session() # TODO make it more robust to exceptions with self._lock: while self._running: #self._log.debug("Checking for the state of jobs ...") for job in session.query(SagaJob).filter( SagaJob.saga_state != None, ~SagaJob.state.in_(runstates.TERMINAL_STATES)): #self._log.debug("Checking state for [{}] {} ... ".format(job.id, job.name)) try: if job.saga_getjob_start is None: job.saga_getjob_start = time.time() session.commit() #self._qlock.release() saga_job = self._job_service.get_job(job.saga_id) #self._qlock.acquire() job.saga_state = saga_job.state next_state = self.__JOB_STATE_FROM_SAGA[job.saga_state] if job.state == next_state: continue if job.saga_state in [saga.job.DONE, saga.job.FAILED, saga.job.CANCELED]: job.exitcode = saga_job.exit_code job.created = datetime_from_posix(saga_job.created) job.started = datetime_from_posix(saga_job.started) job.finished = datetime_from_posix(saga_job.finished) job.hostname = saga_job.execution_hosts except saga.NoSuccess: # wait up to N seconds before failing if time.time() - job.saga_getjob_start >= 120: self._log.error("Couldn't reconnect to job {}".format(job.saga_id)) # what should we do ? failing or finishing ? job.state = runstates.FAILED # let's be paranoid ! session.commit() self._lock.release() self._fire_job_updated(job) self._lock.acquire() continue if job.saga_state == saga.job.DONE and job.exitcode != 0: next_state = runstates.FAILED if job.state != runstates.WAITING and next_state == runstates.WAITING: self._queued_count += 1 elif job.state == runstates.WAITING and next_state != runstates.WAITING: self._queued_count -= 1 job.state = next_state session.commit() self._lock.release() self._fire_job_updated(job) self._lock.acquire() if not self._running: break if self._running: self._lock.release() time.sleep(self._state_check_interval) self._lock.acquire() session.close() self._log.debug("Join thread finished")
def _run_handler(self): set_thread_title() session = self._create_session() with self._lock: while self._running: if self._max_queued != 0 and self._queued_count >= self._max_queued: self._lock.release() time.sleep(1) self._lock.acquire() continue job = None try: #self._log.debug("Looking for waiting jobs ...") job = session.query(SagaJob).filter( SagaJob.state == runstates.WAITING, SagaJob.saga_state == None).order_by(SagaJob.priority).first() if job is not None: pass #self._queued_count += 1 #self._log.debug("Found: {}".format(repr(job))) except Exception as e: self._log.exception(e) if job is None: self._lock.release() time.sleep(1) self._lock.acquire() continue self._lock.release() jd = saga.job.Description() jd.name = job.name jd.executable = job.script jd.arguments = [] jd.environment = job.env jd.spmd_variation = self._task_conf(job, "pe", self._pe) jd.total_cpu_count = self._task_conf(job, "cpu_count", self._cpu_count) jd.queue = self._task_conf(job, "queue", self._queue) jd.project = self._task_conf(job, "project", self._project) jd.working_directory = self._task_conf(job, "working_directory", self._working_directory) jd.output = jd.error = os.path.join(self._remote_output_path, job.case_name, "{}.txt".format(job.name)) # remove previous output file try: self._log.debug("Removing remote output for [{}] {} ...".format(job.id, job.name)) self._remote_dir.remove(jd.output) except saga.DoesNotExist: pass except Exception as ex: self._log.error("Couldn't remove remote job output file: {}".format(ex)) self._lock.acquire() # create and run the job try: self._log.debug("Running job [{}] {} ...".format(job.id, job.name)) saga_job = None saga_job = self._job_service.create_job(jd) saga_job.run() job.saga_id = saga_job.id job.saga_state = saga_job.state job.output = jd.output session.commit() self._queued_count += 1 except Exception as ex: self._log.exception(ex) if saga_job is not None: # and saga_job.state in [saga.job.RUNNING, saga.job.SUSPENDED]: try: saga_job.cancel() finally: session.rollback() session.close() self._log.debug("Run thread finished")
def _join(self): "Joiner thread" set_thread_title() self._num_alive_threads += 1 _log = logger.get_logger("wok.engine.join") _log.debug("Engine join thread ready") session = None num_exc = 0 while self._running: try: set_thread_title("waiting") job_info = self.__queue_adaptative_get(self._join_queue) if job_info is None: continue workitem_id, job_id = job_info with self._lock: session = db.Session() workitem = session.query(db.WorkItem).filter(db.WorkItem.id == workitem_id).one() case = self._cases_by_name[workitem.case.name] task = case.component(workitem.task.cname) set_thread_title(task.cname) #_log.debug("Joining work-item %s ..." % task.cname) jr = task.platform.jobs.join(job_id) wr = Data.element(dict( hostname=jr.hostname, created=jr.created.strftime(_DT_FORMAT) if jr.created is not None else None, started=jr.started.strftime(_DT_FORMAT) if jr.started is not None else None, finished=jr.finished.strftime(_DT_FORMAT) if jr.finished is not None else None, exitcode=jr.exitcode.code if jr.exitcode is not None else None)) r = task.platform.data.load_workitem_result(case.name, task.cname, workitem.index) if r is not None: if r.exception is not None: wr["exception"] = r.exception if r.trace is not None: wr["trace"] = r.trace workitem.substate = None workitem.result = wr case.num_active_workitems -= 1 session.commit() # check if there are still more work-items num_workitems = session.query(func.count(db.WorkItem.id)).filter( ~db.WorkItem.state.in_(runstates.TERMINAL_STATES)).scalar() if self._single_run and num_workitems == 0: stop_engine = True for case in self._cases: stop_engine = stop_engine and (case.state in runstates.TERMINAL_STATES) #self._running = not stop_engine if stop_engine: self._finished_event.set() _log.debug("[{}] Joined work-item {}".format(case.name, workitem.cname)) # check stopping instances if case in self._stopping_cases: job_ids = self._stopping_cases[case] if job_id in job_ids: job_ids.remove(job_id) if len(job_ids) == 0: del self._stopping_cases[case] if case.state == runstates.ABORTING: workitem.case.state = case.state = runstates.ABORTED session.commit() if case.removed: self.__remove_case(session, case) session.commit() else: _log.debug("Still waiting for {} jobs to stop".format(len(job_ids))) if case.state in runstates.TERMINAL_STATES and case.num_active_workitems == 0: _log.info("[{}] Case {}. Total time: {}".format(case.name, case.state.title, str(case.elapsed))) self._lock.release() try: self.case_finished.send(case) finally: self._lock.acquire() except BaseException as ex: num_exc += 1 _log.warn("Exception in join thread ({}): {}".format(num_exc, str(ex))) from traceback import format_exc _log.debug(format_exc()) try: if session is not None: session.rollback() except Exception as ex: _log.warn("Session rollback failed") _log.exception(ex) finally: try: if session is not None: session.close() except Exception as ex: _log.warn("Session close failed") _log.exception(ex) self._num_alive_threads -= 1 _log.debug("Engine join thread finished")
def _logs(self, index): "Log retrieval thread" set_thread_title() self._num_alive_threads += 1 _log = logger.get_logger("wok.engine.logs-{}".format(index)) _log.debug("Engine logs thread ready") num_exc = 0 while self._running: set_thread_title("waiting") # get the next task to retrieve the logs job_info = self.__queue_adaptative_get(self._logs_queue) if job_info is None: continue workitem_id, job_id = job_info session = db.Session() task = None try: workitem = session.query(db.WorkItem).filter(db.WorkItem.id == workitem_id).one() case = self._cases_by_name[workitem.case.name] task = case.component(workitem.task.cname) set_thread_title(workitem.cname) _log.debug("[{}] Reading logs for work-item {} ...".format(case.name, workitem.cname)) output = task.platform.jobs.output(job_id) if output is None: output = StringIO.StringIO() path = os.path.join(self._work_path, "logs", case.name, task.cname) if not os.path.isdir(path): try: os.makedirs(path) except: if not os.path.isdir(path): raise path = os.path.join(path, "{:08}.db".format(workitem.index)) if os.path.isfile(path): os.remove(path) logs_db = LogsDb(path) logs_db.open() logs_db.add(case.name, task.cname, workitem.index, output) logs_db.close() _log.debug("[{}] Done with logs of work-item {}".format(case.name, workitem.cname)) except BaseException as ex: num_exc += 1 session.rollback() _log.info("Exception in logs thread ({}): {}".format(num_exc, str(ex))) from traceback import format_exc _log.debug(format_exc()) finally: workitem.substate = runstates.JOINING self._join_queue.put(job_info) session.commit() session.close() self._num_alive_threads -= 1 _log.debug("Engine logs thread finished")
def _run(self): set_thread_title() num_exc = 0 self._running = True self._num_alive_threads += 1 # Start the logs threads for i in range(self._num_log_threads): t = threading.Thread(target=self._logs, args=(i, ), name="wok-engine-logs-%d" % i) self._logs_threads += [t] t.start() # Start the join thread self._join_thread = threading.Thread(target=self._join, name="wok-engine-join") self._join_thread.start() _log = logger.get_logger("wok.engine.run") _log.debug("Engine run thread ready") while self._running: session = db.Session() try: #_log.debug("Scheduling new tasks ...") set_thread_title("scheduling") updated_tasks = set() # schedule tasks ready to be executed and save new workitems into the db for case in self._cases: tasks = case.schedule(session) updated_tasks.update(tasks) session.commit() # submit workitems ready to be executed for platform in self._platforms: job_submissions = self.__job_submissions(session, platform) for js, job_id, job_state in platform.submit(job_submissions): workitem = session.query(db.WorkItem).filter(db.WorkItem.id == js.workitem_id).one() workitem.job_id = job_id workitem.state = job_state js.task.dirty = True session.commit() updated_tasks.add(js.task) session.close() session = None #_log.debug("Waiting for events ...") set_thread_title("waiting") while len(updated_tasks) == 0 and not self._notified and self._running: self._cvar.wait(1) self._notified = False if not self._running: break session = db.Session() # there is a session.close() in the finished block #_log.debug("Stopping jobs for aborting instances ...") set_thread_title("working") # check stopping instances for case in self._cases: if (case.state == runstates.ABORTING or case.removed) and case not in self._stopping_cases: num_job_ids = session.query(db.WorkItem.job_id).filter(db.WorkItem.case_id == case.id)\ .filter(~db.WorkItem.state.in_(runstates.TERMINAL_STATES)).count() if num_job_ids == 0: if case.state == runstates.ABORTING: _log.debug("Aborted case {} with no running jobs".format(case.name)) dbcase = session.query(db.Case).filter(db.Case.id == case.id) dbcase.state = case.state = runstates.ABORTED session.commit() else: _log.debug("Stopped case {} with no running jobs".format(case.name)) if case.removed: _log.debug("Removing case {} with no running jobs".format(case.name)) self.__remove_case(session, case) session.commit() else: _log.info("Stopping {} jobs for case {} ...".format(num_job_ids, case.name)) self._stopping_cases[case] = set() for platform in self._platforms: job_ids = [int(r[0]) for r in session.query(db.WorkItem.job_id) .filter(db.WorkItem.case_id == case.id)\ .filter(db.WorkItem.platform == platform.name)\ .filter(~db.WorkItem.state.in_(runstates.TERMINAL_STATES))] self._stopping_cases[case].update(job_ids) platform.jobs.abort(job_ids) #_log.debug("Checking job state changes ...") # detect workitems which state has changed for platform in self._platforms: for job_id, state in platform.jobs.state(): try: workitem = session.query(db.WorkItem).filter(db.WorkItem.job_id == job_id).one() except NoResultFound: _log.warn("No work-item available for the job {0} while retrieving state".format(job_id)) platform.jobs.abort([job_id]) platform.jobs.join(job_id) continue if workitem.state != state: case = self._cases_by_name[workitem.case.name] task = case.component(workitem.task.cname) task.dirty = True workitem.state = state workitem.substate = runstates.LOGS_RETRIEVAL session.commit() updated_tasks.add(task) # if workitem has finished, queue it for logs retrieval if state in runstates.TERMINAL_STATES: self._logs_queue.put((workitem.id, job_id)) _log.debug("[{}] Work-Item {} changed state to {}".format(case.name, workitem.cname, state)) #_log.debug("Updating components state ...") # update affected components state updated_cases = set([task.case for task in updated_tasks]) for case in updated_cases: case.update_states(session) case.update_count_by_state(session) case.clean_components(session) session.commit() if case.state == runstates.RUNNING: self._lock.release() try: self.case_started.send(case) finally: self._lock.acquire() for task in updated_tasks: case = task.case #_log.debug("[{}] Component {} updated state to {} ...".format( # component.case.name, component.cname, component.state)) count = task.workitem_count_by_state sb = ["[{}] {} ({})".format(case.name, task.cname, task.state.title)] sep = " " for state in runstates.STATES: if state in count: sb += [sep, "{}={}".format(state.symbol, count[state])] if sep == " ": sep = ", " if task.state == runstates.FINISHED and task.state in count: elapsed = str(task.elapsed) elapsed = elapsed.split(".")[0] sb += [" ", "<{}>".format(elapsed)] self._log.info("".join(sb)) except BaseException as ex: num_exc += 1 _log.warn("Exception in run thread ({}): {}".format(num_exc, str(ex))) #if num_exc > 3: # raise #else: from traceback import format_exc _log.debug(format_exc()) try: if session is not None: session.rollback() except Exception as ex: _log.warn("Session rollback failed") _log.exception(ex) finally: try: if session is not None: session.close() except Exception as ex: _log.warn("Session close failed") _log.exception(ex) session = None set_thread_title("finishing") try: # print cases state before leaving the thread #for case in self._cases: # _log.debug("Case state:\n" + repr(case)) for t in self._logs_threads: t.join() self._lock.release() self._join_thread.join() self._lock.acquire() _log.debug("Engine run thread finished") except Exception as ex: _log.exception(ex) self._running = False self._num_alive_threads -= 1
def _run(self): self._log.debug("Run thread ready") session = self._create_session() try: while self._running: set_thread_title() job = self._next_job(session) if job is None: break set_thread_title("{}".format(job.name)) self._log.debug("Running task [{}] {} ...".format(job.id, job.name)) # Prepare execution script, env = job.script, job.env sb = ["Script:\n", script] if len(env) > 0: for k, v in env.iteritems(): sb += ["\n{}={}".format(k, v)] self._log.debug("".join(sb)) o = open(job.output, "w") cwd = self._conf.get("working_directory") if cwd is not None: cwd = os.path.abspath(cwd) job.hostname = self.hostname job.started = datetime.now() self._update_job(session, job) # Prepare the script file fd, script_path = tempfile.mkstemp(prefix=job.name + "-", suffix=".sh") with os.fdopen(fd, "w") as f: f.write(script) # Run the script exception_trace = None try: process = subprocess.Popen( args=["/bin/bash", script_path], stdin=None, stdout=o, stderr=subprocess.STDOUT, cwd=cwd, env=env, preexec_fn=os.setsid) set_thread_title("{} PID={}".format(job.name, process.pid)) session.refresh(job, ["state"]) timeout = ProgressiveTimeout(0.5, 6.0, 0.5) while job.state == runstates.RUNNING and process.poll() is None and not self._kill_threads: timeout.sleep() session.refresh(job, ["state"]) job.finished = datetime.now() if process.poll() is None: self._log.info("Killing job [{}] {} ...".format(job.id, job.name)) os.killpg(process.pid, signal.SIGTERM) timeout = ProgressiveTimeout(0.5, 6.0, 0.5) while process.poll() is None: timeout.sleep() job.state = runstates.ABORTED job.exitcode = process.returncode elif job.state == runstates.ABORTING: job.state = runstates.ABORTED else: job.state = runstates.FINISHED if process.returncode == 0 else runstates.FAILED job.exitcode = process.returncode except Exception as e: self._log.exception(e) job.state = runstates.FAILED job.finished = datetime.now() job.exitcode = exit_codes.RUN_THREAD_EXCEPTION finally: try: if process.poll() is None: self._log.info("Killing job [{}] {} ...".format(job.id, job.name)) os.killpg(process.pid, signal.SIGKILL) process.wait() except: self._log.exception("Exception while waiting for process {} to finish".format(process.pid)) o.close() if os.path.exists(script_path): os.remove(script_path) if job.state == runstates.FINISHED: self._log.debug("Job finished [{}] {}".format(job.id, job.name)) elif job.state == runstates.ABORTED: self._log.debug("Job aborted [{}] {}".format(job.id, job.name)) else: self._log.debug("Job failed [{}] {}".format(job.id, job.name)) self._update_job(session, job) except: self._log.exception("Unexpected exception in thread {}".format(current_thread().name)) finally: session.close() self._log.debug("Run thread finished")