Example #1
def update_db(root_folder, process, status, run_name, job_id, error):
    """Update the database with the given values"""

    entry = SchedulerTask(run_name, process, status, job_id, error)
    database = MgmtDB(root_folder)

    # If running this manually, use a retry limit (256) well above anything a normal
    # submission would reach, so the update is always applied
    database.update_entries_live([entry], 256)
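
A minimal usage sketch (not part of the original example); the folder, run name and numeric process/status values are purely illustrative:

update_db(
    root_folder="/path/to/cybershake_root",  # hypothetical root folder
    process=1,                               # hypothetical process type id
    status=4,                                # hypothetical status value
    run_name="Run_2020",                     # hypothetical run name
    job_id=123456,
    error=None,
)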
Example #2
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
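    # keepAlive is presumably a module-level flag (e.g. cleared by a signal handler on
    # Ctrl-C, as the log message above suggests) that keeps the loop running; it is not
    # defined in this snippet.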
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC, get the list of (job id, status) pairs and store them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    task_parts = task.split()
                    queued_tasks[task_parts[0]] = task_parts[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    f"Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:

            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This prevents a race condition where the failure/completion state file is
                    # created and picked up before the job has actually finished on the HPC
                    # (most notably seen on Kisti). Queued and running states are allowed through.
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

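        # update_tasks is assumed to reconcile the in-progress mgmt db tasks against the
        # queued_tasks reported by the scheduler and return any additional status updates.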
        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # Check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            else:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
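
A minimal sketch of how the monitor loop might be started, assuming keepAlive lives in the same module as queue_monitor_loop; the signal handler and the folder path are illustrative assumptions, not part of the original example:

import signal

keepAlive = True

def stop_monitor(signum, frame):
    # Clear the module-level flag so queue_monitor_loop exits after its current iteration
    global keepAlive
    keepAlive = False

signal.signal(signal.SIGINT, stop_monitor)

queue_monitor_loop(
    root_folder="/path/to/cybershake_root",  # hypothetical root folder
    sleep_time=60,
    max_retries=2,
)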