Example #1
def run_automated_workflow(
    root_folder: str,
    log_directory: str,
    n_runs: Dict[str, int],
    n_max_retries: int,
    tasks_to_run: List[const.ProcessType],
    sleep_time: int,
    tasks_to_run_with_pattern: List[Tuple[str, List[const.ProcessType]]],
    wrapper_logger: Logger,
    debug: bool,
    alert_url=None,
):
    """Runs the automated workflow. Beings the queue monitor script and the script for tasks that apply to all
    realisations. Then while the all realisation thread is running go through each pattern and run all tasks that are
    available. When each instance of auto_submit doesn't submit anything or have anything running for an iteration it
    will automatically return, and the next pattern will have its tasks automatically submitted.
    It is advised that each task list within tasks_to_run_with_pattern be disjoint from task_types_to_run as a race condition
    may occur, and the task run twice at the same time, resulting in file writing issues.
    :param root_folder: The root directory of the cybershake folder structure
    :param log_directory: The directory the log files are to be placed in
    :param n_runs: The maximum number of processes that can be running at once. Note that this will be applied
    individually to each instance of auto_submit and so will effectively be doubled
    :param n_max_retries: The maximum number of times a task can be run before being written off as needing user input
    :param tasks_to_run: The tasks to be run for all realisations
    :param sleep_time: The amount of time to sleep between iterations of the auto_submit script
    :param tasks_to_run_with_pattern: A list of (pattern, task_list) pairs to be run. task_list must have dependencies
    already added.
    :param wrapper_logger: The logger to use for wrapper messages
    :param debug: If True, debug messages are also printed to stdout
    :param alert_url: Optional URL forwarded to the queue monitor loop
    """

    wrapper_logger.info("Loading estimation models")
    lf_est_model = est.load_full_model(
        join(platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
             "LF"),
        logger=wrapper_logger,
    )
    hf_est_model = est.load_full_model(
        join(platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
             "HF"),
        logger=wrapper_logger,
    )
    bb_est_model = est.load_full_model(
        join(platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
             "BB"),
        logger=wrapper_logger,
    )
    im_est_model = est.load_full_model(
        join(platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
             "IM"),
        logger=wrapper_logger,
    )

    bulk_logger = qclogging.get_logger(name="auto_submit_main", threaded=True)
    if debug:
        qclogging.set_stdout_level(bulk_logger, DEBUG)
    qclogging.add_general_file_handler(
        bulk_logger,
        join(
            log_directory,
            MASTER_AUTO_SUBMIT_LOG_FILE_NAME.format(datetime.now().strftime(
                const.TIMESTAMP_FORMAT)),
        ),
    )
    wrapper_logger.debug("Created logger for the main auto_submit thread")

    queue_logger = qclogging.get_logger(name="queue_monitor", threaded=True)
    if debug:
        qclogging.set_stdout_level(queue_logger, DEBUG)
    qclogging.add_general_file_handler(
        queue_logger,
        join(
            log_directory,
            QUEUE_MONITOR_LOG_FILE_NAME.format(datetime.now().strftime(
                const.TIMESTAMP_FORMAT)),
        ),
    )
    wrapper_logger.debug("Created logger for the queue_monitor thread")

    tasks_to_run_with_pattern_and_logger = [(
        pattern,
        tasks,
        qclogging.get_logger(name="pattern_{}".format(pattern), threaded=True),
    ) for pattern, tasks in tasks_to_run_with_pattern]
    for pattern, tasks, logger in tasks_to_run_with_pattern_and_logger:
        qclogging.add_general_file_handler(
            logger,
            join(
                log_directory,
                PATTERN_AUTO_SUBMIT_LOG_FILE_NAME.format(
                    pattern,
                    datetime.now().strftime(const.TIMESTAMP_FORMAT)),
            ),
        )
        wrapper_logger.debug(
            "Created logger for auto_submit with pattern {} and added to list to run"
            .format(pattern))

    queue_monitor_thread = threading.Thread(
        name="queue monitor",
        daemon=True,
        target=queue_monitor.queue_monitor_loop,
        args=(root_folder, sleep_time, n_max_retries, queue_logger, alert_url),
    )
    wrapper_logger.info("Created queue_monitor thread")

    bulk_auto_submit_thread = threading.Thread(
        name="main auto submit",
        daemon=True,
        target=run_main_submit_loop,
        args=(
            root_folder,
            n_runs,
            "%",
            tasks_to_run,
            sleep_time,
            (lf_est_model, hf_est_model, bb_est_model, im_est_model),
        ),
        kwargs={
            "main_logger": bulk_logger,
            "cycle_timeout": 2 * len(tasks_to_run_with_pattern_and_logger) + 2,
        },
    )
    wrapper_logger.info("Created main auto_submit thread")

    bulk_auto_submit_thread.start()
    if bulk_auto_submit_thread.is_alive():
        wrapper_logger.info("Started main auto_submit thread")
    else:
        thread_not_running = "The queue monitor thread has failed to start"
        wrapper_logger.log(qclogging.NOPRINTCRITICAL, thread_not_running)
        raise RuntimeError(thread_not_running)

    queue_monitor_thread.start()
    if queue_monitor_thread.is_alive():
        wrapper_logger.info("Started queue_monitor thread")
    else:
        thread_not_running = "The main auto_submit thread has failed to start"
        wrapper_logger.log(qclogging.NOPRINTCRITICAL, thread_not_running)
        raise RuntimeError(thread_not_running)
    run_sub_threads = len(tasks_to_run_with_pattern_and_logger) > 0
    while bulk_auto_submit_thread.is_alive() and run_sub_threads:
        wrapper_logger.info("Checking all patterns for tasks to be run")
        for pattern, tasks, pattern_logger in tasks_to_run_with_pattern_and_logger:
            wrapper_logger.info(
                "Loaded pattern {}. Checking for tasks to be run of types: {}".
                format(pattern, tasks))
            run_main_submit_loop(
                root_folder,
                n_runs,
                pattern,
                tasks,
                sleep_time,
                (lf_est_model, hf_est_model, bb_est_model, im_est_model),
                main_logger=pattern_logger,
                cycle_timeout=1,
            )
    bulk_auto_submit_thread.join()
    wrapper_logger.info(
        "The main auto_submit thread has terminated, and all auto_submit patterns have completed a final run through"
    )
    wrapper_logger.info("Attempting to shut down the queue monitor thread")
    queue_monitor.keepAlive = False
    queue_monitor_thread.join(2.0 * sleep_time)
    if not queue_monitor_thread.is_alive():
        wrapper_logger.info(
            "The queue monitor has been shut down successfully")
    else:
        wrapper_logger.critical(
            "The queue monitor has not successfully terminated")
Example #2
def main():
    logger = qclogging.get_logger()

    parser = argparse.ArgumentParser()

    parser.add_argument("root_folder",
                        type=str,
                        help="The cybershake root folder")
    parser.add_argument(
        "--n_runs",
        default=None,
        type=int,
        nargs="+",
        help=
        "The number of processes each machine can run at once. If a single value is given this is used for all "
        "machines, otherwise one value per machine must be given. The current order is: {}"
        .format(", ".join(x.name for x in HPC)),
    )
    parser.add_argument(
        "user",
        type=str,
        help="The username under which the jobs will be submitted.")
    parser.add_argument(
        "--sleep_time",
        type=int,
        help="Seconds sleeping between checking queue and adding more jobs",
        default=5,
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default=None,
        help=
        "Location of the log file to use. Defaults to a timestamped log file in the root_folder. "
        "Must be absolute or relative to the root_folder.",
    )
    parser.add_argument(
        "--task_types_to_run",
        nargs="+",
        help=
        "Which processes should be run. Defaults to IM_Calc and clean_up with dependencies automatically propagated",
        choices=[proc.str_value for proc in const.ProcessType],
        default=[const.ProcessType.clean_up.str_value],
    )
    parser.add_argument(
        "--rels_to_run",
        help=
        "An SQLite formatted query to match the realisations that should run.",
        default="%",
    )

    args = parser.parse_args()

    root_folder = os.path.abspath(args.root_folder)

    if args.log_file is None:
        qclogging.add_general_file_handler(
            logger,
            os.path.join(
                root_folder,
                AUTO_SUBMIT_LOG_FILE_NAME.format(datetime.now().strftime(
                    const.TIMESTAMP_FORMAT)),
            ),
        )
    else:
        qclogging.add_general_file_handler(
            logger, os.path.join(root_folder, args.log_file))
    logger.debug("Added file handler to the logger")

    logger.debug("Raw args passed in as follows: {}".format(str(args)))

    n_runs = 0
    if args.n_runs is not None:
        if len(args.n_runs) == 1:
            n_runs = {hpc: args.n_runs[0] for hpc in HPC}
            logger.debug(
                "Using {} as the maximum number of jobs per machine".format(
                    args.n_runs[0]))
        elif len(args.n_runs) == len(HPC):
            n_runs = {}
            for index, hpc in enumerate(HPC):
                logger.debug(
                    "Setting {} to have at most {} concurrently running jobs".
                    format(hpc, args.n_runs[index]))
                n_runs.update({hpc: args.n_runs[index]})
        else:
            logger.critical(
                "Expected either 1 or {} values for --n_runs, got {} values. Specifically: {}. Exiting now"
                .format(len(HPC), len(args.n_runs), args.n_runs))
            parser.error(
                "You must specify wither one common value for --n_runs, or one "
                "for each in the following list: {}".format(list(HPC)))
    else:
        n_runs = platform_config[const.PLATFORM_CONFIG.DEFAULT_N_RUNS.name]

    logger.debug(
        "Processes to be run were: {}. Getting all required dependencies now.".
        format(args.task_types_to_run))
    task_types_to_run = [
        const.ProcessType.from_str(proc) for proc in args.task_types_to_run
    ]
    for task in task_types_to_run:
        logger.debug(
            "Process {} in processes to be run, adding dependencies now.".
            format(task.str_value))
        for proc_num in task.get_remaining_dependencies(task_types_to_run):
            proc = const.ProcessType(proc_num)
            if proc not in task_types_to_run:
                logger.debug(
                    "Process {} added as a dependency of process {}".format(
                        proc.str_value, task.str_value))
                task_types_to_run.append(proc)

    mutually_exclusive_task_error = const.ProcessType.check_mutually_exclusive_tasks(
        task_types_to_run)
    if mutually_exclusive_task_error != "":
        logger.log(qclogging.NOPRINTCRITICAL, mutually_exclusive_task_error)
        parser.error(mutually_exclusive_task_error)

    logger.debug("Processed args are as follows: {}".format(str(args)))

    scheduler_logger = qclogging.get_logger(name=f"{logger.name}.scheduler")
    Scheduler.initialise_scheduler(user=args.user, logger=scheduler_logger)

    logger.info("Loading estimation models")
    lf_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "LF"),
        logger=logger,
    )
    hf_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "HF"),
        logger=logger,
    )
    bb_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "BB"),
        logger=logger,
    )
    im_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "IM"),
        logger=logger,
    )

    run_main_submit_loop(
        root_folder,
        n_runs,
        args.rels_to_run,
        task_types_to_run,
        args.sleep_time,
        (lf_est_model, hf_est_model, bb_est_model, im_est_model),
        main_logger=logger,
    )
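
The dependency-propagation loop in Example #2 relies on the fact that a Python for-loop over a list also visits items appended during iteration, so it effectively computes the transitive closure of the requested task types. The self-contained sketch below demonstrates the idiom with a toy dependency map; the map is an illustrative assumption, not the real ProcessType dependency graph.

# Toy illustration of the append-while-iterating closure used in Example #2.
toy_deps = {"clean_up": ["IM_calc"], "IM_calc": ["BB"], "BB": ["HF", "LF"], "HF": [], "LF": []}

tasks = ["clean_up"]
for task in tasks:                # also visits tasks appended inside the loop
    for dep in toy_deps[task]:
        if dep not in tasks:
            tasks.append(dep)

print(tasks)                      # ['clean_up', 'IM_calc', 'BB', 'HF', 'LF']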
Example #3
def hf_NN_model():
    return est.load_full_model(
        os.path.join(FILE_DIR, "../../estimation/models/HF/"),
        const.EstModelType.NN)
Example #4
def bb_SVR_model():
    return est.load_full_model(
        os.path.join(FILE_DIR, "../../estimation/models/BB/"),
        const.EstModelType.SVR)
Example #5
def lf_combined_model():
    return est.load_full_model(
        os.path.join(FILE_DIR, "../../estimation/models/LF/"),
        const.EstModelType.NN_SVR)
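
Examples #3 through #5 each load a single estimation model with an explicit const.EstModelType, while Examples #1 and #2 load from the platform-configured directory and rely on the default model type. A minimal sketch of the two call styles is shown below; the model directory is a placeholder assumption.

# Sketch of the est.load_full_model call styles seen on this page; the path is hypothetical.
model_dir = "/path/to/estimation/models/HF"

default_model = est.load_full_model(model_dir)                               # default model type (Examples #1 and #2)
nn_model = est.load_full_model(model_dir, const.EstModelType.NN)             # neural-network estimator (Example #3)
svr_model = est.load_full_model(model_dir, const.EstModelType.SVR)           # SVR estimator (Example #4)
combined_model = est.load_full_model(model_dir, const.EstModelType.NN_SVR)   # combined NN + SVR (Example #5)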