def check_mgmt_db(self):
    """Create errors for all entries in the management db that did not complete"""
    base_proc_types = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    entries = db.command_builder(
        allowed_tasks=base_proc_types,
        allowed_states=[const.Status.unknown, const.Status.failed],
        blocked_ids=self.canceled_running,
    )

    for entry in entries:
        self.errors.append(
            Error(
                "Slurm task",
                "Run {} did not complete task {} "
                "(Status {}, JobId {})".format(
                    entry.run_name,
                    const.ProcessType(entry.proc_type),
                    const.Status(entry.status),
                    entry.job_id,
                ),
            )
        )
def cancel_running(self, proc_types: List[const.ProcessType]):
    """Looks for any running task of the specified process types
    and attempts to cancel one of each.
    """
    # Get all running jobs in the mgmt db
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    entries = db.command_builder(
        allowed_tasks=proc_types, allowed_states=[const.Status.running]
    )

    # Cancel one job for each process type
    for entry in entries:
        if entry.proc_type in proc_types:
            print(
                f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                f"for {entry.run_name} and process type {entry.proc_type}"
            )

            out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

            print("Scancel out: ", out, err)
            if "error" not in out.lower() and "error" not in err.lower():
                self.canceled_running.append(str(entry.job_id))
                proc_types.remove(entry.proc_type)
                print("Cancelled job-id {}".format(entry.job_id))

    return proc_types
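# A minimal usage sketch, not part of the original source: `checker` is assumed to be
# an instance of the class that owns cancel_running() above, and the ProcessType
# members are those listed elsewhere in this module. It tries to cancel one running
# EMOD3D and one running HF job for checkpoint testing, then reports any process
# types for which no running job could be cancelled.
def cancel_emod3d_and_hf(checker):
    remaining = checker.cancel_running(
        [const.ProcessType.EMOD3D, const.ProcessType.HF]
    )
    if remaining:
        print("No running job could be cancelled for: {}".format(remaining))
    return remaining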
def update_db(root_folder, process, status, run_name, job_id, error):
    """Update the database with the given values"""
    entry = SchedulerTask(run_name, process, status, job_id, error)
    database = MgmtDB(root_folder)
    # If we are running this manually then the retry limit should be set well above
    # any reasonable value for manual submissions
    database.update_entries_live([entry], 256)
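# A minimal sketch, not from the original source: it assumes update_db() above is used
# as a manual maintenance helper and that passing the enum .value for the process and
# status is accepted by SchedulerTask. The root folder, run name and job id are
# placeholders supplied by the caller.
def mark_im_calc_completed(root_folder, run_name, job_id):
    update_db(
        root_folder,
        const.ProcessType.IM_calculation.value,
        const.Status.completed.value,
        run_name,
        job_id,
        None,  # no error message for a successful run
    )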
def check_completed(self):
    """Checks all simulations that have completed"""
    base_proc_types = [const.ProcessType.IM_calculation]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    entries = db.command_builder(
        allowed_tasks=base_proc_types, allowed_states=[const.Status.completed]
    )

    completed_sims = [sim_t.run_name for sim_t in entries]

    # Only check the simulations that haven't been checked already
    completed_new = set(completed_sims) - (self._sim_passed | self._sim_failed)

    for sim in completed_new:
        result = self.check_sim_result(
            os.path.join(
                self.runs_dir, sim_struct.get_fault_from_realisation(sim), sim
            )
        )

        if not result:
            self._sim_failed.add(sim)

            if self._stop_on_error:
                print("Quitting as the following errors occurred: ")
                self.print_errors()
                return False
            else:
                print("The following error occurred for simulation {}:".format(sim))
                print(
                    "ERROR: {}, {}\n".format(
                        self.errors[-1].location, self.errors[-1].error
                    )
                )
        else:
            self._sim_passed.add(sim)

    print(
        "Passed/Failed/Total simulations: {}/{}/{}, ".format(
            len(self._sim_passed), len(self._sim_failed), len(self.sim_dirs)
        )
    )

    return True
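# A minimal polling sketch, not from the original source: it assumes `checker` owns
# both check_mgmt_db_progress() and check_completed() above, and that the run is
# finished once every tracked task is either completed or failed. The poll interval
# is a placeholder.
def poll_until_done(checker, poll_interval=60):
    while True:
        total, completed, failed = checker.check_mgmt_db_progress()
        if not checker.check_completed():
            return False  # stop_on_error aborted the run
        if total > 0 and completed + failed >= total:
            return failed == 0
        time.sleep(poll_interval)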
def check_mgmt_db_progress(self):
    """Checks auto submit progress in the management db"""
    base_proc_types = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    total_count = len(db.command_builder(allowed_tasks=base_proc_types))

    comp_count = len(
        db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.completed],
        )
    )

    failed_count = len(
        db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.failed, const.Status.unknown],
        )
    )

    return total_count, comp_count, failed_count
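# A minimal sketch, not from the original source: it assumes `checker` owns
# check_mgmt_db_progress() above and only consumes the returned
# (total, completed, failed) tuple to print a one-line progress summary.
def print_progress(checker):
    total, completed, failed = checker.check_mgmt_db_progress()
    in_progress = total - completed - failed
    print(
        "Tasks completed/failed/in progress/total: {}/{}/{}/{}".format(
            completed, failed, in_progress, total
        )
    )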
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC get a list of (job id, status) pairs and save them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check "
                    f"{Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: "
                    f"{', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    "Over 200 tasks were found in the queue. "
                    "Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: "
                    f"{', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:
            queue_logger.info(
                "In progress tasks in mgmt db: "
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This prevents race conditions when the failure/completion state
                    # file is created and picked up before the job actually finishes.
                    # Most notably happens on Kisti.
                    # The queued and running states are allowed through.
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # Check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = (
                                    f"fault:{entry.run_name} step:{entry.proc_type} "
                                    f"has failed with error:{entry.error}"
                                )
                            elif entry_retries >= max_retries:
                                msg = (
                                    f"@here fault:{entry.run_name} step:{entry.proc_type} "
                                    f"has failed with error:{entry.error} and met the retry cap"
                                )
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
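# A minimal invocation sketch, not from the original source: it assumes the module-level
# keepAlive flag is True when the loop starts and that a Scheduler has already been
# initialised for the current machine. The sleep time and retry cap are placeholders;
# alert_url may be a webhook endpoint, or None to disable failure alerts.
def run_monitor(root_folder):
    queue_monitor_loop(
        root_folder,
        sleep_time=60,   # poll the HPC queues once a minute
        max_retries=2,   # matches the retry cap used in the alert messages
        queue_logger=qclogging.get_basic_logger(),
        alert_url=None,
    )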
def run_main_submit_loop(
    root_folder: str,
    n_runs: Dict[str, int],
    rels_to_run: str,
    given_tasks_to_run: List[const.ProcessType],
    sleep_time: int,
    models_tuple: Tuple[est.EstModel],
    main_logger: Logger = qclogging.get_basic_logger(),
    cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))

    root_params_file = os.path.join(
        sim_struct.get_runs_dir(root_folder), "root_params.yaml"
    )
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))

    # Default values
    hf_seed = config["hf"].get(const.RootParams.seed.value, const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))

    main_logger.debug(f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug(
            "time_since_something_happened is now {}".format(
                time_since_something_happened
            )
        )
        time_since_something_happened -= 1

        # Get items in the mgmt queue. A snapshot has to be taken instead of
        # checking the directory in real time to prevent timing issues,
        # which can result in dual-submission.
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc
                )
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(len(runnable_tasks)))
            main_logger.debug("There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first tasks that are not waiting for mgmt db updates
        # (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:
            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if the limit has not been reached and there are no
            # outstanding mgmt db updates
            if (
                not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type
                )
                and task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]
            ):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Break if enough tasks for each HPC have been added
            if np.all(
                [
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run.keys()
                ]
            ):
                break

        if len(tasks_to_run) > 0:
            main_logger.info(
                "Tasks to run this iteration: "
                + ", ".join(
                    [
                        "{}-{}".format(entry[1], const.ProcessType(entry[0]).str_value)
                        for entry in tasks_to_run
                    ]
                )
            )
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete(
                    [
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                    ]
                ):
                    # If clean_up has already run, then it should be set to
                    # run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # Submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )

        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)

    main_logger.info("Nothing was running or ready to run last cycle, exiting now")
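# A minimal invocation sketch, not from the original source: n_runs is assumed to be
# keyed by HPC machine with the per-machine queue limit, models_tuple is assumed to be
# a pre-loaded tuple of est.EstModel instances for core-hour estimation, and the
# rels_to_run pattern and the limit of 20 jobs per machine are placeholders.
def run_submit_loop(root_folder, models_tuple, rels_to_run):
    run_main_submit_loop(
        root_folder,
        n_runs={hpc: 20 for hpc in HPC},  # placeholder per-machine queue limit
        rels_to_run=rels_to_run,
        given_tasks_to_run=[
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ],
        sleep_time=5,
        models_tuple=models_tuple,
        main_logger=qclogging.get_basic_logger(),
    )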