def shutdown(self, msg): """Shutdown the scheduler.""" try: self.cleanup() self.history.append("Completed on %s" % time.asctime()) self.history.append("Elapsed time %s" % self.get_delta_etime()) if self.debug: print(">>>>> shutdown: Number of open file descriptors: %s" % get_open_fds()) retcode = self.send_email(msg) if self.debug: print("send_mail retcode", retcode) # Write file with the list of exceptions: if self.exceptions: dump_file = os.path.join(self.flow.workdir, "_exceptions") with open(dump_file, "w") as fh: fh.writelines(self.exceptions) fh.write("Shutdown message:\n%s" % msg) lines = [] app = lines.append app("Submitted on %s" % time.ctime(self.start_time)) app("Completed on %s" % time.asctime()) app("Elapsed time %s" % str(self.get_delta_etime())) if self.flow.all_ok: app("Flow completed successfully") else: app("Flow didn't complete successfully") app("Shutdown message:\n%s" % msg) print("") print("\n".join(lines)) self._do_customer_service() if self.flow.all_ok: print("Calling flow.finalize()") self.flow.finalize() finally: # Shutdown the scheduler thus allowing the process to exit. logger.debug('this should be the shutdown of the scheduler') # Unschedule all the jobs before calling shutdown #self.sched.print_jobs() if not has_sched_v3: for job in self.sched.get_jobs(): self.sched.unschedule_job(job) #self.sched.print_jobs() self.sched.shutdown()
def shutdown(self, msg): """Shutdown the scheduler.""" try: self.cleanup() #if False and self.flow.has_db: # try: # self.flow.db_insert() # except Exception: # logger.critical("MongoDb insertion failed.") self.history.append("Completed on %s" % time.asctime()) self.history.append("Elapsed time %s" % self.get_delta_etime()) if self.DEBUG: print(">>>>> shutdown: Number of open file descriptors: %s" % get_open_fds()) retcode = self.send_email(msg) if self.DEBUG: print("send_mail retcode", retcode) # Write file with the list of exceptions: if self.exceptions: dump_file = os.path.join(self.flow.workdir, "_exceptions") with open(dump_file, "w") as fh: fh.writelines(self.exceptions) fh.write("Shutdown message:\n%s" % msg) finally: # Shutdown the scheduler thus allowing the process to exit. print('this should be the shutdown of the scheduler') # Unschedule all the jobs before calling shutdown self.sched.print_jobs() for job in self.sched.get_jobs(): self.sched.unschedule_job(job) self.sched.print_jobs() self.sched.shutdown()
def shutdown(self, msg): """Shutdown the scheduler.""" try: self.cleanup() self.history.append("Completed on: %s" % time.asctime()) self.history.append("Elapsed time: %s" % self.get_delta_etime()) if self.debug: print(">>>>> shutdown: Number of open file descriptors: %s" % get_open_fds()) retcode = self.send_email(msg) if self.debug: print("send_mail retcode", retcode) # Write file with the list of exceptions: if self.exceptions: dump_file = os.path.join(self.flow.workdir, "_exceptions") with open(dump_file, "wt") as fh: fh.writelines(self.exceptions) fh.write("Shutdown message:\n%s" % msg) lines = [] app = lines.append app("Submitted on: %s" % time.ctime(self.start_time)) app("Completed on: %s" % time.asctime()) app("Elapsed time: %s" % str(self.get_delta_etime())) if self.flow.all_ok: app("Flow completed successfully") else: app("Flow %s didn't complete successfully" % repr(self.flow.workdir)) app("use `abirun.py FLOWDIR debug` to analyze the problem.") app("Shutdown message:\n%s" % msg) print("") print("\n".join(lines)) print("") self._do_customer_service() if self.flow.all_ok: print("Calling flow.finalize()...") self.flow.finalize() #print("finalized:", self.flow.finalized) if self.rmflow: app("Flow directory will be removed...") try: self.flow.rmtree() except Exception: logger.warning("Ignoring exception while trying to remove flow dir.") finally: # Shutdown the scheduler thus allowing the process to exit. logger.debug('This should be the shutdown of the scheduler') # Unschedule all the jobs before calling shutdown #self.sched.print_jobs() if not has_sched_v3: for job in self.sched.get_jobs(): self.sched.unschedule_job(job) #self.sched.print_jobs() self.sched.shutdown()
def _callback(self): """The actual callback.""" if self.debug: # Show the number of open file descriptors print(">>>>> _callback: Number of open file descriptors: %s" % get_open_fds()) self._runem_all() # Mission accomplished. Shutdown the scheduler. all_ok = self.flow.all_ok if all_ok: return self.shutdown(msg="All tasks have reached S_OK. Will shutdown the scheduler and exit") # Handle failures. err_lines = [] # Shall we send a reminder to the user? delta_etime = self.get_delta_etime() if delta_etime.total_seconds() > self.num_reminders * self.remindme_s: self.num_reminders += 1 msg = ("Just to remind you that the scheduler with pid %s, flow %s\n has been running for %s " % (self.pid, self.flow, delta_etime)) retcode = self.send_email(msg, tag="[REMINDER]") if retcode: # Cannot send mail, shutdown now! msg += ("\nThe scheduler tried to send an e-mail to remind the user\n" + " but send_email returned %d. Aborting now" % retcode) err_lines.append(msg) #if delta_etime.total_seconds() > self.max_etime_s: # err_lines.append("\nExceeded max_etime_s %s. Will shutdown the scheduler and exit" % self.max_etime_s) # Too many exceptions. Shutdown the scheduler. if self.num_excs > self.max_num_pyexcs: msg = "Number of exceptions %s > %s. Will shutdown the scheduler and exit" % ( self.num_excs, self.max_num_pyexcs) err_lines.append(boxed(msg)) # Paranoid check: disable the scheduler if we have submitted # too many jobs (it might be due to some bug or other external reasons # such as race conditions between difference callbacks!) if self.nlaunch > self.safety_ratio * self.flow.num_tasks: msg = "Too many jobs launched %d. Total number of tasks = %s, Will shutdown the scheduler and exit" % ( self.nlaunch, self.flow.num_tasks) err_lines.append(boxed(msg)) # Count the number of tasks with status == S_ERROR. if self.flow.num_errored_tasks > self.max_num_abierrs: msg = "Number of tasks with ERROR status %s > %s. Will shutdown the scheduler and exit" % ( self.flow.num_errored_tasks, self.max_num_abierrs) err_lines.append(boxed(msg)) # Test on the presence of deadlocks. g = self.flow.find_deadlocks() if g.deadlocked: # Check the flow again so that status are updated. self.flow.check_status() g = self.flow.find_deadlocks() print("deadlocked:\n", g.deadlocked, "\nrunnables:\n", g.runnables, "\nrunning\n", g.running) if g.deadlocked and not g.runnables and not g.running: err_lines.append("No runnable job with deadlocked tasks:\n%s." % str(g.deadlocked)) if not g.runnables and not g.running: # Check the flow again so that status are updated. self.flow.check_status() g = self.flow.find_deadlocks() if not g.runnables and not g.running: err_lines.append("No task is running and cannot find other tasks to submit.") # Something wrong. Quit if err_lines: # Cancel all jobs. if self.killjobs_if_errors: cprint("killjobs_if_errors set to 'yes' in scheduler file. Will kill jobs before exiting.", "yellow") try: num_cancelled = 0 for task in self.flow.iflat_tasks(): num_cancelled += task.cancel() cprint("Killed %d tasks" % num_cancelled, "yellow") except Exception as exc: cprint("Exception while trying to kill jobs:\n%s" % str(exc), "red") self.shutdown("\n".join(err_lines)) return len(self.exceptions)
def _callback(self): """The actual callback.""" if self.debug: # Show the number of open file descriptors print(">>>>> _callback: Number of open file descriptors: %s" % get_open_fds()) self._runem_all() # Mission accomplished. Shutdown the scheduler. all_ok = self.flow.all_ok if self.verbose: print("all_ok", all_ok) if all_ok: self.shutdown(msg="All tasks have reached S_OK. Will shutdown the scheduler and exit") # Handle failures. err_msg = "" # Shall we send a reminder to the user? delta_etime = self.get_delta_etime() if delta_etime.total_seconds() > self.num_reminders * self.remindme_s: self.num_reminders += 1 msg = ("Just to remind you that the scheduler with pid %s, flow %s\n has been running for %s " % (self.pid, self.flow, delta_etime)) retcode = self.send_email(msg, tag="[REMINDER]") if retcode: # Cannot send mail, shutdown now! msg += ("\nThe scheduler tried to send an e-mail to remind the user\n" + " but send_email returned %d. Aborting now" % retcode) err_msg += msg #if delta_etime.total_seconds() > self.max_etime_s: # err_msg += "\nExceeded max_etime_s %s. Will shutdown the scheduler and exit" % self.max_etime_s # Too many exceptions. Shutdown the scheduler. if self.num_excs > self.max_num_pyexcs: msg = "Number of exceptions %s > %s. Will shutdown the scheduler and exit" % ( self.num_excs, self.max_num_pyexcs) err_msg += boxed(msg) # Paranoid check: disable the scheduler if we have submitted # too many jobs (it might be due to some bug or other external reasons # such as race conditions between difference callbacks!) if self.nlaunch > self.safety_ratio * self.flow.num_tasks: msg = "Too many jobs launched %d. Total number of tasks = %s, Will shutdown the scheduler and exit" % ( self.nlaunch, self.flow.num_tasks) err_msg += boxed(msg) # Count the number of tasks with status == S_ERROR. if self.flow.num_errored_tasks > self.max_num_abierrs: msg = "Number of tasks with ERROR status %s > %s. Will shutdown the scheduler and exit" % ( self.flow.num_errored_tasks, self.max_num_abierrs) err_msg += boxed(msg) deadlocked, runnables, running = self.flow.deadlocked_runnables_running() #print("\ndeadlocked:\n", deadlocked, "\nrunnables:\n", runnables, "\nrunning\n", running) if deadlocked and not runnables and not running: msg = "No runnable job with deadlocked tasks:\n %s\nWill shutdown the scheduler and exit" % str(deadlocked) err_msg += boxed(msg) if err_msg: # Something wrong. Quit self.shutdown(err_msg) return len(self.exceptions)
def shutdown(self, msg): """Shutdown the scheduler.""" try: self.cleanup() self.history.append("Completed on: %s" % time.asctime()) self.history.append("Elapsed time: %s" % self.get_delta_etime()) if self.debug: print(">>>>> shutdown: Number of open file descriptors: %s" % get_open_fds()) retcode = self.send_email(msg) if self.debug: print("send_mail retcode", retcode) # Write file with the list of exceptions: if self.exceptions: dump_file = os.path.join(self.flow.workdir, "_exceptions") with open(dump_file, "wt") as fh: fh.writelines(self.exceptions) fh.write("Shutdown message:\n%s" % msg) lines = [] app = lines.append app("Submitted on: %s" % time.ctime(self.start_time)) app("Completed on: %s" % time.asctime()) app("Elapsed time: %s" % str(self.get_delta_etime())) if self.flow.all_ok: app("Flow completed successfully") else: app("Flow %s didn't complete successfully" % repr(self.flow.workdir)) app("use `abirun.py FLOWDIR debug` to analyze the problem.") app("Shutdown message:\n%s" % msg) print("") print("\n".join(lines)) print("") self._do_customer_service() if self.flow.all_ok: print("Calling flow.finalize()...") self.flow.finalize() #print("finalized:", self.flow.finalized) if self.rmflow: app("Flow directory will be removed...") try: self.flow.rmtree() except Exception: logger.warning( "Ignoring exception while trying to remove flow dir." ) finally: # Shutdown the scheduler thus allowing the process to exit. logger.debug('This should be the shutdown of the scheduler') # Unschedule all the jobs before calling shutdown #self.sched.print_jobs() if not has_sched_v3: for job in self.sched.get_jobs(): self.sched.unschedule_job(job) #self.sched.print_jobs() self.sched.shutdown()
def _callback(self): """The actual callback.""" if self.debug: # Show the number of open file descriptors print(">>>>> _callback: Number of open file descriptors: %s" % get_open_fds()) self._runem_all() # Mission accomplished. Shutdown the scheduler. all_ok = self.flow.all_ok if all_ok: return self.shutdown( msg= "All tasks have reached S_OK. Will shutdown the scheduler and exit" ) # Handle failures. err_lines = [] # Shall we send a reminder to the user? delta_etime = self.get_delta_etime() if delta_etime.total_seconds() > self.num_reminders * self.remindme_s: self.num_reminders += 1 msg = ( "Just to remind you that the scheduler with pid %s, flow %s\n has been running for %s " % (self.pid, self.flow, delta_etime)) retcode = self.send_email(msg, tag="[REMINDER]") if retcode: # Cannot send mail, shutdown now! msg += ( "\nThe scheduler tried to send an e-mail to remind the user\n" + " but send_email returned %d. Aborting now" % retcode) err_lines.append(msg) #if delta_etime.total_seconds() > self.max_etime_s: # err_lines.append("\nExceeded max_etime_s %s. Will shutdown the scheduler and exit" % self.max_etime_s) # Too many exceptions. Shutdown the scheduler. if self.num_excs > self.max_num_pyexcs: msg = "Number of exceptions %s > %s. Will shutdown the scheduler and exit" % ( self.num_excs, self.max_num_pyexcs) err_lines.append(boxed(msg)) # Paranoid check: disable the scheduler if we have submitted # too many jobs (it might be due to some bug or other external reasons # such as race conditions between difference callbacks!) if self.nlaunch > self.safety_ratio * self.flow.num_tasks: msg = "Too many jobs launched %d. Total number of tasks = %s, Will shutdown the scheduler and exit" % ( self.nlaunch, self.flow.num_tasks) err_lines.append(boxed(msg)) # Count the number of tasks with status == S_ERROR. if self.flow.num_errored_tasks > self.max_num_abierrs: msg = "Number of tasks with ERROR status %s > %s. Will shutdown the scheduler and exit" % ( self.flow.num_errored_tasks, self.max_num_abierrs) err_lines.append(boxed(msg)) # Test on the presence of deadlocks. g = self.flow.find_deadlocks() if g.deadlocked: # Check the flow again so that status are updated. self.flow.check_status() g = self.flow.find_deadlocks() print("deadlocked:\n", g.deadlocked, "\nrunnables:\n", g.runnables, "\nrunning\n", g.running) if g.deadlocked and not g.runnables and not g.running: err_lines.append( "No runnable job with deadlocked tasks:\n%s." % str(g.deadlocked)) if not g.runnables and not g.running: # Check the flow again so that status are updated. self.flow.check_status() g = self.flow.find_deadlocks() if not g.runnables and not g.running: err_lines.append( "No task is running and cannot find other tasks to submit." ) # Something wrong. Quit if err_lines: # Cancel all jobs. if self.killjobs_if_errors: cprint( "killjobs_if_errors set to 'yes' in scheduler file. Will kill jobs before exiting.", "yellow") try: num_cancelled = 0 for task in self.flow.iflat_tasks(): num_cancelled += task.cancel() cprint("Killed %d tasks" % num_cancelled, "yellow") except Exception as exc: cprint( "Exception while trying to kill jobs:\n%s" % str(exc), "red") self.shutdown("\n".join(err_lines)) return len(self.exceptions)
def _callback(self): """The actual callback.""" if self.DEBUG: # Show the number of open file descriptors print(">>>>> _callback: Number of open file descriptors: %s" % get_open_fds()) #print('before _runem_all in _callback') self._runem_all() # Mission accomplished. Shutdown the scheduler. all_ok = self.flow.all_ok if self.verbose: print("all_ok", all_ok) if all_ok: self.shutdown(msg="All tasks have reached S_OK. Will shutdown the scheduler and exit") # Handle failures. err_msg = "" # Shall we send a reminder to the user? delta_etime = self.get_delta_etime() if delta_etime.total_seconds() > self.num_reminders * self.REMINDME_S: self.num_reminders += 1 msg = ("Just to remind you that the scheduler with pid %s, flow %s\n has been running for %s " % (self.pid, self.flow, delta_etime)) retcode = self.send_email(msg, tag="[REMINDER]") if retcode: # Cannot send mail, shutdown now! msg += ("\nThe scheduler tried to send an e-mail to remind the user\n" + " but send_email returned %d. Aborting now" % retcode) err_msg += msg #if delta_etime.total_seconds() > self.MAX_ETIME_S: # err_msg += "\nExceeded MAX_ETIME_S %s. Will shutdown the scheduler and exit" % self.MAX_ETIME_S # Too many exceptions. Shutdown the scheduler. if self.num_excs > self.MAX_NUM_PYEXCS: msg = "Number of exceptions %s > %s. Will shutdown the scheduler and exit" % ( self.num_excs, self.MAX_NUM_PYEXCS) err_msg += boxed(msg) # Paranoid check: disable the scheduler if we have submitted # too many jobs (it might be due to some bug or other external reasons # such as race conditions between difference callbacks!) if self.nlaunch > self.SAFETY_RATIO * self.flow.num_tasks: msg = "Too many jobs launched %d. Total number of tasks = %s, Will shutdown the scheduler and exit" % ( self.nlaunch, self.flow.num_tasks) err_msg += boxed(msg) # Count the number of tasks with status == S_ERROR. if self.flow.num_errored_tasks > self.MAX_NUM_ABIERRS: msg = "Number of tasks with ERROR status %s > %s. Will shutdown the scheduler and exit" % ( self.flow.num_errored_tasks, self.MAX_NUM_ABIERRS) err_msg += boxed(msg) # Count the number of tasks with status == S_UNCONVERGED. #if self.flow.num_unconverged_tasks: # # TODO: this is needed to avoid deadlocks, automatic restarting is not available yet # msg = ("Found %d unconverged tasks." # "Automatic restarting is not available yet. Will shutdown the scheduler and exit" # % self.flow.num_unconverged_tasks) # err_msg += boxed(msg) #deadlocks = self.detect_deadlocks() #if deadlocks: # msg = ("Detected deadlocks in flow. Will shutdown the scheduler and exit" # % self.flow.num_unconverged_tasks) # err_msg += boxed(msg) if err_msg: # Something wrong. Quit self.shutdown(err_msg) return len(self.exceptions)
def _callback(self): """The actual callback.""" if self.DEBUG: # Show the number of open file descriptors print(">>>>> _callback: Number of open file descriptors: %s" % get_open_fds()) #print('before _runem_all in _callback') self._runem_all() # Mission accomplished. Shutdown the scheduler. all_ok = self.flow.all_ok if self.verbose: print("all_ok", all_ok) if all_ok: self.shutdown( msg= "All tasks have reached S_OK. Will shutdown the scheduler and exit" ) # Handle failures. err_msg = "" # Shall we send a reminder to the user? delta_etime = self.get_delta_etime() if delta_etime.total_seconds() > self.num_reminders * self.REMINDME_S: self.num_reminders += 1 msg = ( "Just to remind you that the scheduler with pid %s, flow %s\n has been running for %s " % (self.pid, self.flow, delta_etime)) retcode = self.send_email(msg, tag="[REMINDER]") if retcode: # Cannot send mail, shutdown now! msg += ( "\nThe scheduler tried to send an e-mail to remind the user\n" + " but send_email returned %d. Aborting now" % retcode) err_msg += msg #if delta_etime.total_seconds() > self.MAX_ETIME_S: # err_msg += "\nExceeded MAX_ETIME_S %s. Will shutdown the scheduler and exit" % self.MAX_ETIME_S # Too many exceptions. Shutdown the scheduler. if self.num_excs > self.MAX_NUM_PYEXCS: msg = "Number of exceptions %s > %s. Will shutdown the scheduler and exit" % ( self.num_excs, self.MAX_NUM_PYEXCS) err_msg += boxed(msg) # Paranoid check: disable the scheduler if we have submitted # too many jobs (it might be due to some bug or other external reasons # such as race conditions between difference callbacks!) if self.nlaunch > self.SAFETY_RATIO * self.flow.num_tasks: msg = "Too many jobs launched %d. Total number of tasks = %s, Will shutdown the scheduler and exit" % ( self.nlaunch, self.flow.num_tasks) err_msg += boxed(msg) # Count the number of tasks with status == S_ERROR. if self.flow.num_errored_tasks > self.MAX_NUM_ABIERRS: msg = "Number of tasks with ERROR status %s > %s. Will shutdown the scheduler and exit" % ( self.flow.num_errored_tasks, self.MAX_NUM_ABIERRS) err_msg += boxed(msg) # Count the number of tasks with status == S_UNCONVERGED. #if self.flow.num_unconverged_tasks: # # TODO: this is needed to avoid deadlocks, automatic restarting is not available yet # msg = ("Found %d unconverged tasks." # "Automatic restarting is not available yet. Will shutdown the scheduler and exit" # % self.flow.num_unconverged_tasks) # err_msg += boxed(msg) #deadlocks = self.detect_deadlocks() #if deadlocks: # msg = ("Detected deadlocks in flow. Will shutdown the scheduler and exit" # % self.flow.num_unconverged_tasks) # err_msg += boxed(msg) if err_msg: # Something wrong. Quit self.shutdown(err_msg) return len(self.exceptions)
def test_get_open_fds(self): self.assertTrue(get_open_fds() > 0)