def _get_all_files(job_names: List[str], scheduler: BaseScheduler) -> List[str]:
    log_fnames = [scheduler.log_fname(name) for name in job_names]
    output_fnames = [scheduler.output_fnames(name) for name in job_names]
    output_fnames = sum(output_fnames, [])
    batch_fnames = [scheduler.batch_fname(name) for name in job_names]
    fnames = log_fnames + output_fnames + batch_fnames
    all_files = [
        glob.glob(f.replace(scheduler._JOB_ID_VARIABLE, "*")) for f in fnames
    ]
    return sum(all_files, [])
def output_fnames(self, scheduler: BaseScheduler) -> Dict[str, List[str]]:
    """The output log filenames as a dictionary."""
    output_fnames = {}
    for entry in self.as_dicts():
        if entry["job_id"] is None:
            continue
        job_id = scheduler.sanatize_job_id(entry["job_id"])
        output_fnames[entry["job_id"]] = [
            f.replace(scheduler._JOB_ID_VARIABLE, job_id)
            for f in scheduler.output_fnames(entry["job_name"])
        ]
    return output_fnames
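# Hedged usage sketch (not part of the source): assuming `database` is the
# object that defines `output_fnames` above (i.e. it exposes `as_dicts()`)
# and `scheduler` is a configured `BaseScheduler`, the result maps each
# job_id to the list of output log filenames written by that job. The exact
# filenames depend on the scheduler's templates.
def _print_output_fnames(database, scheduler: BaseScheduler) -> None:
    for job_id, fnames in database.output_fnames(scheduler).items():
        print(job_id, fnames)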
def _get_all_files(job_names: list[str], scheduler: BaseScheduler) -> list[str]:
    log_fnames = [scheduler.log_fname(name) for name in job_names]
    output_fnames = [scheduler.output_fnames(name) for name in job_names]
    output_fnames = sum(output_fnames, [])
    batch_fnames = [scheduler.batch_fname(name) for name in job_names]
    fnames = log_fnames + output_fnames + batch_fnames
    all_files = [
        glob.glob(f.replace(scheduler._JOB_ID_VARIABLE, "*")) for f in fnames
    ]
    all_files = sum(all_files, [])

    # For schedulers that use a single batch file
    name_prefix = job_names[0].rsplit("-", 1)[0]
    batch_file = scheduler.batch_fname(name_prefix)
    if os.path.exists(batch_file):
        all_files.append(batch_file)
    return all_files
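# Hedged usage sketch (not part of the source): `_get_all_files` is mainly
# useful for cleanup. Assuming the `_remove_or_move_files` helper imported in
# `manage_killer` below, stale log/output/batch files could be removed or
# archived like this; `_cleanup_files_sketch` and `move_to` here are
# illustrative names, not the library's API.
def _cleanup_files_sketch(
    job_names: list[str], scheduler: BaseScheduler, move_to: str | None = None
) -> None:
    from adaptive_scheduler.utils import _remove_or_move_files

    fnames = _get_all_files(job_names, scheduler)
    _remove_or_move_files(fnames, with_progress_bar=True, move_to=move_to)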
async def manage_killer(
    job_names: List[str],
    scheduler: BaseScheduler,
    error: Union[str, Callable[[List[str]], bool]] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
    db_fname: str = "running.json",
) -> Coroutine:
    # Tasks that print the error message do not always stop working; it seems
    # a task only stops when the error happens on the node where the logger runs.
    from adaptive_scheduler.utils import _remove_or_move_files

    while True:
        try:
            failed_jobs = logs_with_string_or_condition(error, db_fname, scheduler)
            to_cancel = []
            to_delete = []
            # Only cancel/delete the processes/logs of jobs that are running now.
            for job_id in scheduler.queue().keys():
                if job_id in failed_jobs:
                    job_name, fnames = failed_jobs[job_id]
                    to_cancel.append(job_name)
                    to_delete += fnames
            scheduler.cancel(
                to_cancel, with_progress_bar=False, max_tries=max_cancel_tries
            )
            _remove_or_move_files(to_delete, with_progress_bar=False, move_to=move_to)
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))
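# Hedged usage sketch (not part of the source): the kill manager is a
# long-running coroutine, so it is meant to be scheduled on an event loop
# alongside the job manager. `_start_kill_manager_sketch` and the "old_logs"
# folder are illustrative; `error` may be a literal string to grep for or a
# callable over the log lines, as the signature above allows.
def _start_kill_manager_sketch(ioloop, job_names: List[str], scheduler: BaseScheduler):
    return ioloop.create_task(
        manage_killer(
            job_names,
            scheduler,
            error="srun: error:",
            interval=600,
            move_to="old_logs",  # hypothetical destination for removed logs
            db_fname="running.json",
        )
    )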
def _delete_old_ipython_profiles(
    scheduler: BaseScheduler, with_progress_bar: bool = True
) -> None:
    if scheduler.executor_type != "ipyparallel":
        return
    # We need the job_ids because job_names alone wouldn't be enough
    # information: there might be other job_managers running.
    pattern = "profile_adaptive_scheduler_"
    profile_folders = glob.glob(os.path.expanduser(f"~/.ipython/{pattern}*"))
    running_job_ids = set(scheduler.queue().keys())
    to_delete = [
        folder
        for folder in profile_folders
        if folder.split(pattern)[1] not in running_job_ids
    ]

    with ThreadPoolExecutor(256) as ex:
        desc = "Submitting deleting old IPython profiles tasks"
        pbar = _progress(to_delete, desc=desc)
        futs = [ex.submit(shutil.rmtree, folder) for folder in pbar]
        desc = "Finishing deleting old IPython profiles"
        for fut in _progress(futs, with_progress_bar, desc=desc):
            fut.result()
async def manage_jobs(
    job_names: List[str],
    db_fname: str,
    ioloop,
    scheduler: BaseScheduler,
    interval: int = 30,
    *,
    max_simultaneous_jobs: int = 5000,
    max_fails_per_job: int = 100,
) -> Coroutine:
    n_started = 0
    max_job_starts = max_fails_per_job * len(job_names)
    with concurrent.futures.ProcessPoolExecutor() as ex:
        while True:
            try:
                running = scheduler.queue()
                _update_db(db_fname, running)  # in case some jobs died
                queued = {
                    j["job_name"]
                    for j in running.values()
                    if j["job_name"] in job_names
                }
                not_queued = set(job_names) - queued

                n_done = _get_n_jobs_done(db_fname)
                for _ in range(n_done):  # remove jobs that are finished
                    if not_queued:
                        # A job might still be running but can at the same
                        # time be marked as finished in the db. Therefore
                        # we added the `if not_queued` clause.
                        not_queued.pop()

                if n_done == len(job_names):
                    # we are finished!
                    return

                while not_queued:
                    if len(queued) < max_simultaneous_jobs:
                        job_name = not_queued.pop()
                        queued.add(job_name)
                        await ioloop.run_in_executor(ex, scheduler.start_job, job_name)
                        n_started += 1
                    else:
                        break
                if n_started > max_job_starts:
                    raise MaxRestartsReached(
                        "Too many jobs failed, your Python code probably has a bug."
                    )
                await asyncio.sleep(interval)
            except concurrent.futures.CancelledError:
                log.info("task was cancelled because of a CancelledError")
                raise
            except MaxRestartsReached as e:
                log.exception(
                    "too many jobs have failed, cancelling the job manager",
                    n_started=n_started,
                    max_fails_per_job=max_fails_per_job,
                    max_job_starts=max_job_starts,
                    exception=str(e),
                )
                raise
            except Exception as e:
                log.exception("got exception when starting a job", exception=str(e))
                await asyncio.sleep(5)
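# Hedged usage sketch (not part of the source): both managers are long-running
# coroutines, so a driver would typically schedule them together on the same
# event loop. `_start_managers_sketch` is an illustrative name; `job_names`,
# `db_fname`, and `scheduler` are assumed to be prepared by the caller.
def _start_managers_sketch(
    ioloop, job_names: List[str], db_fname: str, scheduler: BaseScheduler
):
    job_task = ioloop.create_task(
        manage_jobs(job_names, db_fname, ioloop, scheduler, interval=30)
    )
    kill_task = ioloop.create_task(
        manage_killer(job_names, scheduler, db_fname=db_fname)
    )
    return job_task, kill_task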