Example #1
def main(args, logger: Logger = get_basic_logger()):
    params = utils.load_sim_params(os.path.join(args.rel_dir, "sim_params.yaml"))
    sim_dir = params.sim_dir
    mgmt_db_loc = params.mgmt_db_location
    submit_yes = True if args.auto else confirm("Also submit the job for you?")

    # get the srf (rupture) name without extensions
    srf_name = os.path.splitext(os.path.basename(params.srf_file))[0]
    # if an srf (variation) is given via args, only create the slurm
    # script with the matching name
    if args.srf is not None and srf_name != args.srf:
        return

    write_directory = args.write_directory if args.write_directory else sim_dir

    # get lf_sim_dir
    lf_sim_dir = os.path.join(sim_dir, "LF")

    header_dict = {
        "platform_specific_args": get_platform_node_requirements(
            platform_config[const.PLATFORM_CONFIG.MERGE_TS_DEFAULT_NCORES.name]
        ),
        "wallclock_limit": default_run_time_merge_ts,
        "job_name": "merge_ts.{}".format(srf_name),
        "job_description": "post emod3d: merge_ts",
        "additional_lines": "###SBATCH -C avx",
    }

    command_template_parameters = {
        "run_command": platform_config[const.PLATFORM_CONFIG.RUN_COMMAND.name],
        "merge_ts_path": binary_version.get_unversioned_bin(
            "merge_tsP3_par", get_machine_config(args.machine)["tools_dir"]
        ),
    }

    body_template_params = (
        "{}.sl.template".format(merge_ts_name_prefix),
        {"lf_sim_dir": lf_sim_dir},
    )

    script_prefix = "{}_{}".format(merge_ts_name_prefix, srf_name)
    script_file_path = write_sl_script(
        write_directory,
        sim_dir,
        const.ProcessType.merge_ts,
        script_prefix,
        header_dict,
        body_template_params,
        command_template_parameters,
    )
    if submit_yes:
        submit_script_to_scheduler(
            script_file_path,
            const.ProcessType.merge_ts.value,
            sim_struct.get_mgmt_db_queue(mgmt_db_loc),
            sim_dir,
            srf_name,
            target_machine=args.machine,
            logger=logger,
        )
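write_sl_script itself is not shown in these examples, but the header, body-template and command dictionaries above strongly suggest plain template substitution. The sketch below only illustrates that idea under that assumption; the template text and the render_header helper are hypothetical, not part of the workflow code.

from string import Template

# Hypothetical fragment of a Slurm header template (the real
# merge_ts.sl.template is not shown in these examples).
HEADER_TEMPLATE = Template(
    "#!/bin/bash\n"
    "#SBATCH --job-name=$job_name\n"
    "#SBATCH --time=$wallclock_limit\n"
    "$additional_lines\n"
)


def render_header(header_dict: dict) -> str:
    # Substitute the header values into the (assumed) template text.
    return HEADER_TEMPLATE.safe_substitute(header_dict)


print(
    render_header(
        {
            "job_name": "merge_ts.my_srf",
            "wallclock_limit": "00:30:00",
            "additional_lines": "###SBATCH -C avx",
        }
    )
)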
Example #2
def submit_script_to_scheduler(script_name, target_machine=None, **kwargs):
    # Thin wrapper: proc_type, root_folder, sim_dir, run_name and task_logger
    # come from the enclosing scope in the original source (this snippet is an
    # excerpt) and are forwarded to the shared workflow helper.
    shared_automated_workflow.submit_script_to_scheduler(
        script_name,
        proc_type,
        sim_struct.get_mgmt_db_queue(root_folder),
        sim_dir,
        run_name,
        target_machine=target_machine,
        logger=task_logger,
    )
Example #3
def main(
        args: argparse.Namespace,
        est_model: est.EstModel = None,
        logger: Logger = get_basic_logger(),
):
    params = utils.load_sim_params(
        os.path.join(args.rel_dir, "sim_params.yaml"))

    submit_yes = True if args.auto else confirm("Also submit the job for you?")

    logger.debug("params.srf_file {}".format(params.srf_file))
    # Get the srf(rup) name without extensions
    srf_name = os.path.splitext(os.path.basename(params.srf_file))[0]
    if args.srf is None or srf_name == args.srf:
        logger.debug("not set_params_only")
        # get lf_sim_dir
        sim_dir = os.path.abspath(params.sim_dir)
        lf_sim_dir = sim_struct.get_lf_dir(sim_dir)

        # default_core will be changed if the user passes ncore
        nt = int(float(params.sim_duration) / float(params.dt))

        target_qconfig = get_machine_config(args.machine)

        retries = args.retries if hasattr(args, "retries") else None

        est_cores, est_run_time, wct = get_lf_cores_and_wct(
            est_model,
            logger,
            nt,
            params,
            sim_dir,
            srf_name,
            target_qconfig,
            args.ncore,
            retries,
        )

        binary_path = binary_version.get_lf_bin(params.emod3d.emod3d_version,
                                                target_qconfig["tools_dir"])
        # use the original estimated run time to determine the checkpoint spacing, ensuring at least 3 checkpoints
        steps_per_checkpoint = int(
            min(nt / (60.0 * est_run_time) * const.CHECKPOINT_DURATION,
                nt // 3))
        write_directory = (args.write_directory
                           if args.write_directory else params.sim_dir)

        set_runparams.create_run_params(
            sim_dir, steps_per_checkpoint=steps_per_checkpoint, logger=logger)

        header_dict = {
            "wallclock_limit": wct,
            "job_name": "emod3d.{}".format(srf_name),
            "job_description": "emod3d slurm script",
            "additional_lines": "#SBATCH --hint=nomultithread",
            "platform_specific_args":
            get_platform_node_requirements(est_cores),
        }

        command_template_parameters = {
            "run_command":
            platform_config[const.PLATFORM_CONFIG.RUN_COMMAND.name],
            "emod3d_bin": binary_path,
            "lf_sim_dir": lf_sim_dir,
        }

        body_template_params = ("run_emod3d.sl.template", {})

        script_prefix = "run_emod3d_{}".format(srf_name)
        script_file_path = write_sl_script(
            write_directory,
            params.sim_dir,
            const.ProcessType.EMOD3D,
            script_prefix,
            header_dict,
            body_template_params,
            command_template_parameters,
        )
        if submit_yes:
            submit_script_to_scheduler(
                script_file_path,
                const.ProcessType.EMOD3D.value,
                sim_struct.get_mgmt_db_queue(params.mgmt_db_location),
                params.sim_dir,
                srf_name,
                target_machine=args.machine,
                logger=logger,
            )
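The checkpoint spacing in this example follows directly from the estimate: nt / (60 * est_run_time) is the number of timesteps completed per minute of wall time, so multiplying by const.CHECKPOINT_DURATION gives the steps covered in one checkpoint interval, and nt // 3 caps the spacing so at least three checkpoints are written. A small numeric sketch with made-up values (CHECKPOINT_DURATION is assumed to be in minutes):

# Illustrative values only.
nt = 20_000               # number of LF timesteps
est_run_time = 2.0        # estimated run time in hours
CHECKPOINT_DURATION = 30  # assumed checkpoint interval in minutes

steps_per_minute = nt / (60.0 * est_run_time)          # ~166.7 steps per minute
by_duration = steps_per_minute * CHECKPOINT_DURATION   # ~5000 steps per checkpoint
at_least_three = nt // 3                               # 6666 steps, i.e. >= 3 checkpoints

steps_per_checkpoint = int(min(by_duration, at_least_three))
print(steps_per_checkpoint)  # 5000 -> a checkpoint roughly every 30 minutes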
Example #4
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC, get a list of (job id, status) pairs and store them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    f"Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:

            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This prevents race conditions where the failure/completion state file
                    # is created and picked up before the job actually finishes.
                    # This most notably happens on Kisti. The queued and running states are allowed.
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            else:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
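The queued_tasks dictionary in this loop is built by splitting each line returned by check_queues into a job id and a status. A minimal sketch with fabricated scheduler output (the real line format depends on the scheduler wrapper):

# Fabricated "<job_id> <status>" lines, matching what the loop above assumes.
squeued_tasks = [
    "1234567 R",
    "1234568 PD",
]

queued_tasks = {}
for task in squeued_tasks:
    job_id, status = task.split()[:2]
    queued_tasks[job_id] = status

print(queued_tasks)  # {'1234567': 'R', '1234568': 'PD'}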
Example #5
def main(
        args: argparse.Namespace,
        est_model: est.EstModel = None,
        logger: Logger = get_basic_logger(),
):
    params = utils.load_sim_params(
        os.path.join(args.rel_dir, "sim_params.yaml"))

    # check if args.version was given; if so, change the version

    if args.version is not None and args.version in ["mpi", "run_hf_mpi"]:
        version = args.version
        ll_name_prefix = "run_hf_mpi"
    else:
        if args.version is not None:
            logger.error(
                "{} cannot be recognize as a valid version option. version is set to default: {}"
                .format(
                    args.version,
                    platform_config[
                        const.PLATFORM_CONFIG.HF_DEFAULT_VERSION.name],
                ))
        version = platform_config[
            const.PLATFORM_CONFIG.HF_DEFAULT_VERSION.name]
        ll_name_prefix = platform_config[
            const.PLATFORM_CONFIG.HF_DEFAULT_VERSION.name]
    logger.debug("version: {}".format(version))

    # modify the logic to match install_bb:
    # sniff through params_base to get the srf names,
    # instead of walking through the file directories.

    # loop through all srf files to generate the related slurm scripts
    srf_name = os.path.splitext(os.path.basename(params.srf_file))[0]
    # if an srf (variation) is given via args, only create
    # the slurm script with the matching name
    if args.srf is None or srf_name == args.srf:
        nt = get_hf_nt(params)
        fd_count = len(shared.get_stations(params.FD_STATLIST))
        # TODO:make it read through the whole list
        #  instead of assuming every stoch has same size
        nsub_stoch, sub_fault_area = srf.get_nsub_stoch(params.hf.slip,
                                                        get_area=True)

        if est_model is None:
            est_model = os.path.join(
                platform_config[
                    const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name], "HF")
        est_core_hours, est_run_time, est_cores = est.est_HF_chours_single(
            fd_count,
            nsub_stoch,
            nt,
            args.ncore,
            est_model,
            scale_ncores=SCALE_NCORES,
            logger=logger,
        )

        # scale up the est_run_time if it is a re-run (with check-pointing)
        # create an extra variable so we keep the original estimated run time for other purposes
        est_run_time_scaled = est_run_time
        if hasattr(args, "retries") and int(args.retries) > 0:
            # check if HF.bin is readable, i.e. the run can restart from a checkpoint
            try:
                from qcore.timeseries import HFSeis

                HFSeis(sim_struct.get_hf_bin_path(params.sim_dir))
            except Exception:
                logger.debug("Retry count > 0 but HF.bin is not readable")
            else:
                est_run_time_scaled = est_run_time * (int(args.retries) + 1)

        wct = set_wct(est_run_time_scaled, est_cores, args.auto)
        hf_sim_dir = sim_struct.get_hf_dir(params.sim_dir)
        write_directory = (args.write_directory
                           if args.write_directory else params.sim_dir)
        underscored_srf = srf_name.replace("/", "__")

        header_dict = {
            "platform_specific_args":
            get_platform_node_requirements(est_cores),
            "wallclock_limit": wct,
            "job_name": "hf.{}".format(underscored_srf),
            "job_description": "HF calculation",
            "additional_lines": "###SBATCH -C avx",
        }
        command_template_parameters, add_args = gen_command_template(
            params, args.machine, seed=args.seed)

        body_template_params = (
            "{}.sl.template".format(ll_name_prefix),
            {
                "hf_sim_dir": hf_sim_dir,
                "test_hf_script": "test_hf.sh"
            },
        )

        script_prefix = "{}_{}".format(ll_name_prefix, underscored_srf)
        script_file_path = write_sl_script(
            write_directory,
            params.sim_dir,
            const.ProcessType.HF,
            script_prefix,
            header_dict,
            body_template_params,
            command_template_parameters,
            add_args,
        )

        # Submit the script
        submit_yes = True if args.auto else confirm(
            "Also submit the job for you?")
        if submit_yes:
            submit_script_to_scheduler(
                script_file_path,
                const.ProcessType.HF.value,
                sim_struct.get_mgmt_db_queue(params.mgmt_db_location),
                params.sim_dir,
                srf_name,
                target_machine=args.machine,
                logger=logger,
            )
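The retry handling in this example only stretches the wall-clock request when a readable HF.bin shows the run can restart from a checkpoint; the original estimate is kept untouched for other uses. A small sketch of that scaling with made-up numbers:

# Illustrative values only.
est_run_time = 1.5  # hours, from the estimation model
retries = 2         # this would be the third attempt

# Scale only if a readable HF.bin indicates the run is restartable.
hf_bin_readable = True
est_run_time_scaled = (
    est_run_time * (retries + 1) if hf_bin_readable else est_run_time
)
print(est_run_time_scaled)  # 4.5 hours requested; the 1.5 h estimate is kept as-is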
Example #6
def main(
    args: argparse.Namespace,
    est_model: est.EstModel = None,
    logger: Logger = get_basic_logger(),
):
    params = utils.load_sim_params(os.path.join(args.rel_dir, "sim_params.yaml"))
    ncores = platform_config[const.PLATFORM_CONFIG.BB_DEFAULT_NCORES.name]

    version = args.version
    if version in ["mpi", "run_bb_mpi"]:
        sl_name_prefix = "run_bb_mpi"
    else:
        if version is not None:
            logger.error(
                "{} cannot be recognized as a valid option. version is set to default:".format(
                    version,
                    platform_config[const.PLATFORM_CONFIG.BB_DEFAULT_VERSION.name],
                )
            )
        version = platform_config[const.PLATFORM_CONFIG.BB_DEFAULT_VERSION.name]
        sl_name_prefix = platform_config[const.PLATFORM_CONFIG.BB_DEFAULT_VERSION.name]
    logger.debug(version)

    srf_name = os.path.splitext(os.path.basename(params.srf_file))[0]
    if args.srf is None or srf_name == args.srf:
        # TODO: save status as HF. refer to submit_hf
        # Use HF nt for wct estimation
        nt = get_hf_nt(params)
        fd_count = len(shared.get_stations(params.FD_STATLIST))

        if est_model is None:
            est_model = os.path.join(
                platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name], "BB"
            )

        est_core_hours, est_run_time = est.est_BB_chours_single(
            fd_count, nt, ncores, est_model
        )

        # create an extra variable so we keep the original estimated run time for other purposes
        est_run_time_scaled = est_run_time
        if hasattr(args, "retries") and int(args.retries) > 0:
            # check if BB.bin is readable, i.e. the run can restart from a checkpoint
            try:
                from qcore.timeseries import BBSeis

                BBSeis(simulation_structure.get_bb_bin_path(params.sim_dir))
            except Exception:
                logger.debug("Retry count > 0 but BB.bin is not readable")
            else:
                est_run_time_scaled = est_run_time * (int(args.retries) + 1)

        wct = set_wct(est_run_time_scaled, ncores, args.auto)
        write_directory = (
            args.write_directory if args.write_directory else params.sim_dir
        )
        underscored_srf = srf_name.replace("/", "__")

        header_dict = {
            "wallclock_limit": wct,
            "job_name": "bb.{}".format(underscored_srf),
            "job_description": "BB calculation",
            "additional_lines": "###SBATCH -C avx",
            "platform_specific_args": get_platform_node_requirements(ncores),
        }

        body_template_params = (
            "{}.sl.template".format(sl_name_prefix),
            {"test_bb_script": "test_bb.sh"},
        )

        command_template_parameters, add_args = gen_command_template(params)

        script_prefix = "{}_{}".format(sl_name_prefix, underscored_srf)
        script_file_path = write_sl_script(
            write_directory,
            params.sim_dir,
            const.ProcessType.BB,
            script_prefix,
            header_dict,
            body_template_params,
            command_template_parameters,
            add_args,
        )

        # Submit the script
        submit_yes = True if args.auto else confirm("Also submit the job for you?")
        if submit_yes:
            submit_script_to_scheduler(
                script_file_path,
                const.ProcessType.BB.value,
                simulation_structure.get_mgmt_db_queue(params.mgmt_db_location),
                params.sim_dir,
                srf_name,
                target_machine=args.machine,
                logger=logger,
            )
Example #7
def submit_im_calc_slurm(
        sim_dir: str,
        write_dir: str = None,
        simple_out: bool = True,
        adv_ims: bool = False,
        target_machine: str = get_target_machine(
            const.ProcessType.IM_calculation).name,
        est_model: EstModel = path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "IM"),
        logger: Logger = get_basic_logger(),
):
    """Creates the IM calc slurm scrip, also submits if specified

    The options_dict is populated by the DEFAULT_OPTIONS, values can be changed by
    passing in a dict containing the entries that require changing. Merges the
    two dictionaries, the passed in one has higher priority.
    """
    # Load the yaml params
    params = utils.load_sim_params(
        sim_struct.get_sim_params_yaml_path(sim_dir), load_vm=True)
    realisation_name = params[const.SimParams.run_name.value]
    fault_name = sim_struct.get_fault_from_realisation(realisation_name)
    station_count = len(load_station_file(params["FD_STATLIST"]).index)

    header_options = {
        const.SlHdrOptConsts.description.value:
        "Calculates intensity measures.",
        const.SlHdrOptConsts.memory.value:
        "2G",
        const.SlHdrOptConsts.version.value:
        "slurm",
        "exe_time":
        const.timestamp,
        const.SlHdrOptConsts.additional.value:
        "#SBATCH --hint=nomultithread"
        if platform_config[const.PLATFORM_CONFIG.SCHEDULER.name] == "slurm"
        else [""],
    }

    body_options = {
        const.SlBodyOptConsts.component.value: "",
        "realisation_name": realisation_name,
        const.SlBodyOptConsts.fault_name.value: fault_name,
        "np":
        platform_config[const.PLATFORM_CONFIG.IM_CALC_DEFAULT_N_CORES.name],
        "sim_IM_calc_dir": sim_struct.get_im_calc_dir(sim_dir),
        "output_csv": sim_struct.get_IM_csv(sim_dir),
        "output_info": sim_struct.get_IM_info(sim_dir),
        "models": "",
        const.SlBodyOptConsts.mgmt_db.value: "",
        "n_components": "",
        "match_obs_stations": False,
    }

    command_options = {
        const.SlBodyOptConsts.sim_dir.value:
        sim_dir,
        const.SlBodyOptConsts.component.value:
        "",
        const.SlBodyOptConsts.sim_name.value:
        realisation_name,
        const.SlBodyOptConsts.fault_name.value:
        fault_name,
        const.SlBodyOptConsts.n_procs.value:
        platform_config[const.PLATFORM_CONFIG.IM_CALC_DEFAULT_N_CORES.name],
        const.SlBodyOptConsts.extended.value:
        "",
        const.SlBodyOptConsts.simple_out.value:
        "",
        const.SlBodyOptConsts.advanced_IM.value:
        "",
        "pSA_periods":
        "",
    }

    # Convert option settings to values
    if write_dir is None:
        write_dir = sim_dir

    # Simple vs adv im settings
    if adv_ims:
        # Common values
        proc_type = const.ProcessType.advanced_IM
        sl_template = "adv_im_calc.sl.template"
        script_prefix = "adv_im_calc"

        body_options["models"] = " ".join(
            params[const.SlBodyOptConsts.advanced_IM.value]["models"])
        command_options[
            const.SlBodyOptConsts.advanced_IM.
            value] = f"-a {body_options['models']} --OpenSees {qconfig['OpenSees']} "

        # create a temporary station list if "match_obs_stations" is a directory
        if path.isdir(
                str(params[const.SlBodyOptConsts.advanced_IM.value]
                    ["match_obs_stations"])):
            logger.debug(
                f"match_obs_stations specified: {params[const.SlBodyOptConsts.advanced_IM.value]['match_obs_stations']}"
            )
            # retrieve the station list from observed/fault(eventname)/Vol*/data/accBB/station
            obs_accBB_dir_glob = path.join(
                params[const.SlBodyOptConsts.advanced_IM.value]
                ["match_obs_stations"],
                f"{fault_name}/*/*/accBB",
            )
            obs_accBB_dir = glob.glob(obs_accBB_dir_glob)
            if len(obs_accBB_dir) > 1:
                logger.error(
                    "got more than one folder globbed. please double check the path to the match_obs_stations is correct."
                )
                sys.exit()
            station_names_tmp = get_observed_stations(obs_accBB_dir[0])
            # write to a tmp file
            tmp_station_file = path.join(sim_dir, "tmp_station_file")
            with open(tmp_station_file, "w") as f:
                for station in station_names_tmp:
                    f.write(f"{station} ")
            command_options[const.SlBodyOptConsts.advanced_IM.value] = (
                command_options[const.SlBodyOptConsts.advanced_IM.value] +
                f"--station_names `cat {tmp_station_file}`")
        #        header_options[const.SlHdrOptConsts.n_tasks.value] = body_options["np"] = qconfig["cores_per_node"]

        # Time for one station to run in hours
        # This should be a machine property. Or take the largest across all machines used
        time_for_one_station = 0.5
        est_run_time = (np.ceil(station_count / qconfig["cores_per_node"]) *
                        2 * time_for_one_station)

    else:
        proc_type = const.ProcessType.IM_calculation
        sl_template = "sim_im_calc.sl.template"
        script_prefix = "sim_im_calc"

        if simple_out:
            command_options[const.SlBodyOptConsts.simple_out.value] = "-s"

        if params["ims"][const.RootParams.extended_period.name]:
            command_options[const.SlBodyOptConsts.extended.value] = "-e"
            period_count = len(
                np.unique(
                    np.append(params["ims"]["pSA_periods"], const.EXT_PERIOD)))
        else:
            period_count = len(params["ims"]["pSA_periods"])

        if "pSA_periods" in params["ims"]:
            command_options[
                "pSA_periods"] = f"-p {' '.join(str(p) for p in params['ims']['pSA_periods'])}"

        comps_to_store = params["ims"][const.SlBodyOptConsts.component.value]
        command_options[const.SlBodyOptConsts.component.
                        value] = "-c " + " ".join(comps_to_store)
        body_options["n_components"] = len(comps_to_store)

        # Get wall clock estimation
        logger.info(
            "Running wall clock estimation for IM sim for realisation {}".
            format(realisation_name))
        _, est_run_time = est_IM_chours_single(
            station_count,
            int(float(params["sim_duration"]) / float(params["dt"])),
            comps_to_store,
            period_count,
            body_options["np"],
            est_model,
        )

    # Header options requiring upstream settings
    # special treatment for im_calc, as the scaling feature in the estimation is not suitable;
    # cap the WCT, otherwise the job cannot be submitted
    est_run_time = min(est_run_time * CH_SAFETY_FACTOR, qconfig["MAX_JOB_WCT"])
    # set ch_safety_factor=1 as we scale it already.
    header_options["wallclock_limit"] = get_wct(est_run_time,
                                                ch_safety_factor=1)
    logger.debug("Using WCT for IM_calc: {header_options['wallclock_limit']}")
    header_options["job_name"] = "{}_{}".format(proc_type.str_value,
                                                fault_name)
    header_options["platform_specific_args"] = get_platform_node_requirements(
        body_options["np"])

    script_file_path = write_sl_script(
        write_dir,
        sim_dir,
        proc_type,
        script_prefix,
        header_options,
        (sl_template, body_options),
        command_options,
    )

    submit_script_to_scheduler(
        script_file_path,
        proc_type.value,
        sim_struct.get_mgmt_db_queue(params["mgmt_db_location"]),
        sim_dir,
        realisation_name,
        target_machine=target_machine,
        logger=logger,
    )
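For advanced IMs the wall-clock estimate above is a simple product: stations are spread across the cores of one node, each batch is assumed to take time_for_one_station hours, and a factor of two is added on top; the result is then scaled by CH_SAFETY_FACTOR and capped at the machine's maximum job wall-clock time. A numeric sketch, where cores_per_node, CH_SAFETY_FACTOR and MAX_JOB_WCT are assumed values:

import numpy as np

# Assumed configuration values, for illustration only.
station_count = 500
cores_per_node = 40
time_for_one_station = 0.5  # hours
CH_SAFETY_FACTOR = 1.5      # assumed safety factor
MAX_JOB_WCT = 24.0          # assumed cap on the wall-clock request, in hours

est_run_time = np.ceil(station_count / cores_per_node) * 2 * time_for_one_station
# ceil(500 / 40) = 13 station batches -> 13 * 2 * 0.5 = 13.0 hours
est_run_time = min(est_run_time * CH_SAFETY_FACTOR, MAX_JOB_WCT)
print(est_run_time)  # min(19.5, 24.0) = 19.5 hours requested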
Example #8
def run_main_submit_loop(
        root_folder: str,
        n_runs: Dict[str, int],
        rels_to_run: str,
        given_tasks_to_run: List[const.ProcessType],
        sleep_time: int,
        models_tuple: Tuple[est.EstModel],
        main_logger: Logger = qclogging.get_basic_logger(),
        cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    root_params_file = os.path.join(sim_struct.get_runs_dir(root_folder),
                                    "root_params.yaml")
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))
    # Default values

    hf_seed = config["hf"].get(const.RootParams.seed.value,
                               const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))

    main_logger.debug(
        f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug("time_since_something_happened is now {}".format(
            time_since_something_happened))
        time_since_something_happened -= 1
        # Get items in the mgmt queue, have to get a snapshot instead of
        # checking the directory real-time to prevent timing issues,
        # which can result in dual-submission
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc)
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(
                len(runnable_tasks)))
            main_logger.debug(
                "There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first ntask_to_run that are not waiting
        # for mgmt db updates (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:

            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if limit has not been reached and there are no
            # outstanding mgmt db updates
            if (not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type) and
                    task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Open to better suggestions
            # Break if enough tasks for each HPC have been added
            if np.all([
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run.keys()
            ]):
                break

        if len(tasks_to_run) > 0:
            main_logger.info("Tasks to run this iteration: " + ", ".join([
                "{}-{}".format(entry[1],
                               const.ProcessType(entry[0]).str_value)
                for entry in tasks_to_run
            ]))
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete([
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                ]):
                    # If clean_up has already run, then we should set it to
                    # be run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )
        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)
    main_logger.info(
        "Nothing was running or ready to run last cycle, exiting now")