Example #1
def _get_all_files(job_names: List[str],
                   scheduler: BaseScheduler) -> List[str]:
    """Return all log, output, and batch files that belong to ``job_names``."""
    log_fnames = [scheduler.log_fname(name) for name in job_names]
    # ``scheduler.output_fnames`` returns a list per job; flatten into one list
    output_fnames = [scheduler.output_fnames(name) for name in job_names]
    output_fnames = sum(output_fnames, [])
    batch_fnames = [scheduler.batch_fname(name) for name in job_names]
    fnames = log_fnames + output_fnames + batch_fnames
    # Replace the job-id placeholder with a wildcard and expand it via glob.
    all_files = [
        glob.glob(f.replace(scheduler._JOB_ID_VARIABLE, "*")) for f in fnames
    ]
    return sum(all_files, [])  # flatten the per-pattern match lists
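A minimal usage sketch, assuming the snippet above: the wrapper name `_cleanup_job_files` and the call pattern are hypothetical and not part of the original code.

# Hypothetical helper built on top of _get_all_files; only _get_all_files and
# the scheduler API above come from the snippet, the rest is an assumption.
import os

def _cleanup_job_files(job_names, scheduler):
    for fname in _get_all_files(job_names, scheduler):
        try:
            os.remove(fname)
        except OSError:
            pass  # the file may already have been removed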
Example #2
def output_fnames(self, scheduler: BaseScheduler) -> Dict[str, List[str]]:
    """Return a mapping from job id to that job's output log filenames."""
    output_fnames = {}
    for entry in self.as_dicts():
        if entry["job_id"] is None:
            continue
        job_id = scheduler.sanatize_job_id(entry["job_id"])
        output_fnames[entry["job_id"]] = [
            f.replace(scheduler._JOB_ID_VARIABLE, job_id)
            for f in scheduler.output_fnames(entry["job_name"])
        ]
    return output_fnames
Example #3
def _get_all_files(job_names: list[str],
                   scheduler: BaseScheduler) -> list[str]:
    """Return all log, output, and batch files that belong to ``job_names``."""
    log_fnames = [scheduler.log_fname(name) for name in job_names]
    output_fnames = [scheduler.output_fnames(name) for name in job_names]
    output_fnames = sum(output_fnames, [])
    batch_fnames = [scheduler.batch_fname(name) for name in job_names]
    fnames = log_fnames + output_fnames + batch_fnames
    all_files = [
        glob.glob(f.replace(scheduler._JOB_ID_VARIABLE, "*")) for f in fnames
    ]
    all_files = sum(all_files, [])

    # For schedulers that write a single batch file for all jobs, derive the
    # common prefix from the first job name (e.g. "name-0" -> "name").
    name_prefix = job_names[0].rsplit("-", 1)[0]
    batch_file = scheduler.batch_fname(name_prefix)
    if os.path.exists(batch_file):
        all_files.append(batch_file)
    return all_files
Example #4
async def manage_killer(
    job_names: List[str],
    scheduler: BaseScheduler,
    error: Union[str, Callable[[List[str]], bool]] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
    db_fname: str = "running.json",
) -> Coroutine:
    """Cancel jobs whose logs match ``error`` and clean up their files."""
    # Tasks that print the error message do not always stop working; it seems
    # they only stop when the error happens on the node where the logger runs.
    from adaptive_scheduler.utils import _remove_or_move_files

    while True:
        try:
            failed_jobs = logs_with_string_or_condition(
                error, db_fname, scheduler)
            to_cancel = []
            to_delete = []

            # cancel/delete only the jobs and logs that are currently in the queue
            for job_id in scheduler.queue().keys():
                if job_id in failed_jobs:
                    job_name, fnames = failed_jobs[job_id]
                    to_cancel.append(job_name)
                    to_delete += fnames

            scheduler.cancel(to_cancel,
                             with_progress_bar=False,
                             max_tries=max_cancel_tries)
            _remove_or_move_files(to_delete,
                                  with_progress_bar=False,
                                  move_to=move_to)
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))
Example #5
def _delete_old_ipython_profiles(scheduler: BaseScheduler,
                                 with_progress_bar: bool = True) -> None:
    """Delete the IPython profile folders of jobs that are no longer running."""
    if scheduler.executor_type != "ipyparallel":
        return
    # We need the job_ids because only job_names wouldn't be
    # enough information. There might be other job_managers
    # running.
    pattern = "profile_adaptive_scheduler_"
    profile_folders = glob.glob(os.path.expanduser(f"~/.ipython/{pattern}*"))

    running_job_ids = set(scheduler.queue().keys())
    to_delete = [
        folder for folder in profile_folders
        if folder.split(pattern)[1] not in running_job_ids
    ]

    with ThreadPoolExecutor(256) as ex:
        desc = "Submitting deleting old IPython profiles tasks"
        pbar = _progress(to_delete, desc=desc)
        futs = [ex.submit(shutil.rmtree, folder) for folder in pbar]
        desc = "Finishing deleting old IPython profiles"
        for fut in _progress(futs, with_progress_bar, desc=desc):
            fut.result()
Example #6
async def manage_jobs(
    job_names: List[str],
    db_fname: str,
    ioloop,
    scheduler: BaseScheduler,
    interval: int = 30,
    *,
    max_simultaneous_jobs: int = 5000,
    max_fails_per_job: int = 100,
) -> Coroutine:
    """Keep starting the jobs in ``job_names`` until they are all done."""
    n_started = 0
    max_job_starts = max_fails_per_job * len(job_names)
    with concurrent.futures.ProcessPoolExecutor() as ex:
        while True:
            try:
                running = scheduler.queue()
                _update_db(db_fname, running)  # in case some jobs died
                queued = {
                    j["job_name"]
                    for j in running.values() if j["job_name"] in job_names
                }
                not_queued = set(job_names) - queued

                n_done = _get_n_jobs_done(db_fname)

                for _ in range(n_done):
                    # remove jobs that are finished
                    if not_queued:
                        # A job might still be running (and therefore queued)
                        # while already being marked as finished in the db;
                        # hence the `if not_queued` guard.
                        not_queued.pop()

                if n_done == len(job_names):
                    # we are finished!
                    return

                while not_queued:
                    if len(queued) < max_simultaneous_jobs:
                        job_name = not_queued.pop()
                        queued.add(job_name)
                        await ioloop.run_in_executor(ex, scheduler.start_job,
                                                     job_name)
                        n_started += 1
                    else:
                        break
                if n_started > max_job_starts:
                    raise MaxRestartsReached(
                        "Too many jobs failed, your Python code probably has a bug."
                    )
                await asyncio.sleep(interval)
            except concurrent.futures.CancelledError:
                log.info("task was cancelled because of a CancelledError")
                raise
            except MaxRestartsReached as e:
                log.exception(
                    "too many jobs have failed, cancelling the job manager",
                    n_started=n_started,
                    max_fails_per_job=max_fails_per_job,
                    max_job_starts=max_job_starts,
                    exception=str(e),
                )
                raise
            except Exception as e:
                log.exception("got exception when starting a job",
                              exception=str(e))
                await asyncio.sleep(5)
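A hypothetical launch sketch for the job manager above; the scheduler, database file, and job names are assumptions. `manage_jobs` is a long-running coroutine, so it is awaited (or wrapped in a task) on the running event loop.

# Hypothetical usage; assumes an existing scheduler instance and database file.
import asyncio

async def _start_job_manager(scheduler, job_names, db_fname="running.json"):
    ioloop = asyncio.get_running_loop()
    await manage_jobs(job_names, db_fname, ioloop, scheduler,
                      interval=30, max_simultaneous_jobs=500)

# asyncio.run(_start_job_manager(scheduler, [f"job-{i}" for i in range(10)]))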