Example #1
    def check_mgmt_db(self):
        """Create errors for all entries in management db that did not complete"""
        base_proc_types = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        entries = db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.unknown, const.Status.failed],
            blocked_ids=self.canceled_running,
        )

        for entry in entries:
            self.errors.append(
                Error(
                    "Slurm task",
                    "Run {} did not complete task {} "
                    "(Status {}, JobId {}".format(
                        entry.run_name,
                        const.ProcessType(entry.proc_type),
                        const.Status(entry.status),
                        entry.job_id,
                    ),
                ))
Example #2
    def cancel_running(self, proc_types: List[const.ProcessType]):
        """Looks for any running task of the specified process types
        and attempts to cancel one of each.
        """
        # Get all running jobs in the mgmt db
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(allowed_tasks=proc_types,
                                     allowed_states=[const.Status.running])

        # Cancel one for each process type
        for entry in entries:
            if entry.proc_type in proc_types:
                print(
                    f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                    f"for {entry.run_name} and process type {entry.proc_type}")

                out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

                print("Scancel out: ", out, err)
                if "error" not in out.lower() and "error" not in err.lower():
                    self.canceled_running.append(str(entry.job_id))
                    proc_types.remove(entry.proc_type)
                    print("Cancelled job-id {}".format(entry.job_id))

        return proc_types
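Note that cancel_running calls proc_types.remove(...) on the list it is given, so the caller's list shrinks as jobs are cancelled. A minimal sketch of that in-place mutation, using a stand-in function and plain strings instead of the real scheduler and ProcessType enum:

# Stand-in for cancel_running: it mutates the list it receives, exactly like the
# proc_types.remove(...) call in the example above.
def fake_cancel_running(proc_types):
    proc_types.remove(proc_types[0])  # pretend the first type had a running job we cancelled
    return proc_types

wanted = ["EMOD3D", "HF"]
remaining = fake_cancel_running(list(wanted))  # pass a copy to keep `wanted` intact
print(wanted, remaining)  # ['EMOD3D', 'HF'] ['HF']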
Example #3
    def install(self):
        print("Installing database")
        create_mgmt_db.create_mgmt_db(
            [
                "{}_REL{:0>2}".format(self.realisation_name, i)
                for i in range(1, 1 + self.realisations)
            ],
            sim_struct.get_mgmt_db(self.stage_dir),
        )
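The list comprehension above generates one realisation name per run, with a zero-padded two-digit suffix. A standalone sketch of the same format string, with a hypothetical fault name and count:

# Same "{}_REL{:0>2}" pattern as install() above; the fault name and count are made up.
realisation_name = "FaultA"
realisations = 3
names = ["{}_REL{:0>2}".format(realisation_name, i) for i in range(1, 1 + realisations)]
print(names)  # ['FaultA_REL01', 'FaultA_REL02', 'FaultA_REL03']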
Example #4
    def check_mgmt_db_progress(self):
        """Checks auto submit progress in the management db"""
        with connect_db_ctx(sim_struct.get_mgmt_db(self.stage_dir)) as cur:
            comp_count = [
                cur.execute(
                    "SELECT COUNT(*) "
                    "FROM state "
                    "WHERE status = ? "
                    "AND proc_type in (?{})".format(",?" *
                                                    (len(self.tasks) - 1)),
                    (i, *self.tasks),
                ).fetchone()[0] for i in range(1, 5)
            ]
            total_count = cur.execute(
                "SELECT COUNT(*) FROM state "
                "WHERE proc_type in (?{})".format(",?" *
                                                  (len(self.tasks) - 1)),
                (*self.tasks, ),
            ).fetchone()[0]

        return comp_count, total_count
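The queries above build their IN clause by repeating "?" placeholders to match len(self.tasks), so the task ids stay parameterised instead of being interpolated into the SQL string. A standalone sqlite3 sketch of the same placeholder expansion, using an in-memory database and made-up rows:

import sqlite3

tasks = (1, 2, 4)  # hypothetical proc_type ids
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE state (proc_type INTEGER, status INTEGER)")
cur.executemany("INSERT INTO state VALUES (?, ?)", [(1, 4), (2, 3), (4, 4), (5, 4)])

# "(?{})".format(",?" * 2) expands to "(?,?,?)" for three tasks.
placeholders = "(?{})".format(",?" * (len(tasks) - 1))
completed = cur.execute(
    "SELECT COUNT(*) FROM state WHERE status = ? AND proc_type in " + placeholders,
    (4, *tasks),
).fetchone()[0]
print(completed)  # 2
conn.close()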
Example #5
    def check_completed(self):
        """Checks all simulations that have completed"""
        base_proc_types = [const.ProcessType.IM_calculation]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(allowed_tasks=base_proc_types,
                                     allowed_states=[const.Status.completed])

        completed_sims = [sim_t.run_name for sim_t in entries]

        # Only check the ones that haven't been checked already
        completed_new = set(completed_sims) - (self._sim_passed
                                               | self._sim_failed)

        for sim in completed_new:
            result = self.check_sim_result(
                os.path.join(self.runs_dir,
                             sim_struct.get_fault_from_realisation(sim), sim))

            if not result:
                self._sim_failed.add(sim)

                if self._stop_on_error:
                    print("Quitting as the following errors occured: ")
                    self.print_errors()
                    return False
                else:
                    print("The following error occured for simulation {}:".
                          format(sim))
                    print("ERROR: {}, {}\n".format(self.errors[-1].location,
                                                   self.errors[-1].error))

            else:
                self._sim_passed.add(sim)

        print("Passed/Failed/Total simulations: {}/{}/{}, ".format(
            len(self._sim_passed), len(self._sim_failed), len(self.sim_dirs)))

        return True
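check_completed only inspects runs it has not seen before, by subtracting the already-passed and already-failed sets from the fresh list of completed simulations. A minimal sketch of that set difference with made-up run names:

# Hypothetical run names; in the real class the two sets are filled by earlier calls.
completed_sims = ["RunA_REL01", "RunB_REL01", "RunC_REL01"]
sim_passed = {"RunA_REL01"}
sim_failed = {"RunB_REL01"}

completed_new = set(completed_sims) - (sim_passed | sim_failed)
print(completed_new)  # {'RunC_REL01'}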
Example #6
    def check_mgmt_db_progress(self):
        """Checks auto submit progress in the management db"""
        base_proc_types = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        total_count = len(db.command_builder(allowed_tasks=base_proc_types))

        comp_count = len(
            db.command_builder(allowed_tasks=base_proc_types,
                               allowed_states=[const.Status.completed]))

        failed_count = len(
            db.command_builder(
                allowed_tasks=base_proc_types,
                allowed_states=[const.Status.failed, const.Status.unknown],
            ))

        return total_count, comp_count, failed_count
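The three counts returned above are enough for a quick progress summary. A minimal sketch, with hypothetical values standing in for a real call to check_mgmt_db_progress():

# Hypothetical counts standing in for total/completed/failed task numbers.
total_count, comp_count, failed_count = 40, 25, 3

in_progress = total_count - comp_count - failed_count
percent_done = 100.0 * comp_count / total_count if total_count else 0.0
print("Completed {}/{} ({:.0f}%), failed {}, remaining {}".format(
    comp_count, total_count, percent_done, failed_count, in_progress))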
Example #7
def install_fault(
    fault_name,
    n_rel,
    root_folder,
    version,
    stat_file_path,
    seed=HF_DEFAULT_SEED,
    extended_period=False,
    vm_perturbations=False,
    ignore_vm_perturbations=False,
    vm_qpqs_files=False,
    ignore_vm_qpqs_files=False,
    keep_dup_station=True,
    components=None,
    logger: Logger = get_basic_logger(),
):

    config_dict = utils.load_yaml(
        os.path.join(
            platform_config[PLATFORM_CONFIG.TEMPLATES_DIR.name],
            "gmsim",
            version,
            ROOT_DEFAULTS_FILE_NAME,
        )
    )
    # Load variables from cybershake config

    v1d_full_path = os.path.join(
        platform_config[PLATFORM_CONFIG.VELOCITY_MODEL_DIR.name],
        "Mod-1D",
        config_dict.get("v_1d_mod"),
    )
    site_v1d_dir = config_dict.get("site_v1d_dir")
    hf_stat_vs_ref = config_dict.get("hf_stat_vs_ref")

    vs30_file_path = stat_file_path.replace(".ll", ".vs30")
    vs30ref_file_path = stat_file_path.replace(".ll", ".vs30ref")

    # this variable has to be empty
    # TODO: fix this legacy issue, very low priority
    event_name = ""

    # get all srf from source
    srf_dir = simulation_structure.get_srf_dir(root_folder, fault_name)

    list_srf = glob.glob(os.path.join(srf_dir, "*_REL*.srf"))
    if len(list_srf) == 0:
        list_srf = glob.glob(os.path.join(srf_dir, "*.srf"))

    list_srf.sort()
    if n_rel is not None and len(list_srf) != n_rel:
        message = (
            "Error: fault {} failed. Number of realisations do "
            "not match number of SRF files".format(fault_name)
        )
        logger.log(NOPRINTCRITICAL, message)
        raise RuntimeError(message)

    # Get & validate velocity model directory
    vel_mod_dir = simulation_structure.get_fault_VM_dir(root_folder, fault_name)
    valid_vm, message = validate_vm.validate_vm(vel_mod_dir, srf=list_srf[0])
    if not valid_vm:
        message = "Error: VM {} failed {}".format(fault_name, message)
        logger.log(NOPRINTCRITICAL, message)
        raise RuntimeError(message)
    # Load the variables from vm_params.yaml
    vm_params_path = os.path.join(vel_mod_dir, VM_PARAMS_FILE_NAME)
    vm_params_dict = utils.load_yaml(vm_params_path)
    yes_model_params = (
        False  # statgrid should normally be already generated with Velocity Model
    )

    sim_root_dir = simulation_structure.get_runs_dir(root_folder)
    fault_yaml_path = simulation_structure.get_fault_yaml_path(sim_root_dir, fault_name)
    root_yaml_path = simulation_structure.get_root_yaml_path(sim_root_dir)
    for srf in list_srf:
        logger.info("Installing {}".format(srf))
        # try to find the Stoch file with the same basename
        realisation_name = os.path.splitext(os.path.basename(srf))[0]
        stoch_file_path = simulation_structure.get_stoch_path(
            root_folder, realisation_name
        )
        sim_params_file = simulation_structure.get_source_params_path(
            root_folder, realisation_name
        )

        if not os.path.isfile(stoch_file_path):
            message = "Error: Corresponding Stoch file is not found: {}".format(
                stoch_file_path
            )
            logger.log(NOPRINTCRITICAL, message)
            raise RuntimeError(message)

        # install pairs one by one to fit the new structure
        sim_dir = simulation_structure.get_sim_dir(root_folder, realisation_name)

        (root_params_dict, fault_params_dict, sim_params_dict) = install_simulation(
            version=version,
            sim_dir=sim_dir,
            rel_name=realisation_name,
            run_dir=sim_root_dir,
            vel_mod_dir=vel_mod_dir,
            srf_file=srf,
            stoch_file=stoch_file_path,
            stat_file_path=stat_file_path,
            vs30_file_path=vs30_file_path,
            vs30ref_file_path=vs30ref_file_path,
            yes_statcords=False,
            fault_yaml_path=fault_yaml_path,
            root_yaml_path=root_yaml_path,
            cybershake_root=root_folder,
            site_v1d_dir=site_v1d_dir,
            hf_stat_vs_ref=hf_stat_vs_ref,
            v1d_full_path=v1d_full_path,
            sim_params_file=sim_params_file,
            seed=seed,
            logger=logger,
            extended_period=extended_period,
            vm_perturbations=vm_perturbations,
            ignore_vm_perturbations=ignore_vm_perturbations,
            vm_qpqs_files=vm_qpqs_files,
            ignore_vm_qpqs_files=ignore_vm_qpqs_files,
            components=components,
        )

        if (
            root_params_dict is None
            or fault_params_dict is None
            or sim_params_dict is None
        ):
            # Something has gone wrong, returning without saving anything
            logger.critical(f"Critical Error some params dictionary are None")
            return

        if root_params_dict is not None and not isclose(
            vm_params_dict["flo"], root_params_dict["flo"]
        ):
            logger.critical(
                "The parameter 'flo' does not match in the VM params and root params files. "
                "Please ensure you are installing the correct gmsim version"
            )
            return

        create_mgmt_db.create_mgmt_db(
            [], simulation_structure.get_mgmt_db(root_folder), srf_files=srf
        )
        utils.setup_dir(os.path.join(root_folder, "mgmt_db_queue"))
        root_params_dict["mgmt_db_location"] = root_folder

        # Generate the fd files, create these at the fault level
        fd_statcords, fd_statlist = generate_fd_files(
            simulation_structure.get_fault_dir(root_folder, fault_name),
            vm_params_dict,
            stat_file=stat_file_path,
            logger=logger,
            keep_dup_station=keep_dup_station,
        )

        fault_params_dict[FaultParams.stat_coords.value] = fd_statcords
        fault_params_dict[FaultParams.FD_STATLIST.value] = fd_statlist

        #     root_params_dict['hf_stat_vs_ref'] = cybershake_cfg['hf_stat_vs_ref']
        dump_all_yamls(sim_dir, root_params_dict, fault_params_dict, sim_params_dict)

        # test if the params are accepted by steps HF and BB
        sim_params = utils.load_sim_params(os.path.join(sim_dir, "sim_params.yaml"))
        # check hf

        # temporarily change the script name to hf_sim.py, due to how error messages are shown
        main_script_name = sys.argv[0]
        sys.argv[0] = "hf_sim.py"

        command_template, add_args = hf_gen_command_template(
            sim_params, list(HPC)[0].name, seed
        )
        run_command = gen_args_cmd(
            ProcessType.HF.command_template, command_template, add_args
        )
        hf_args_parser(cmd=run_command)

        # check bb
        sys.argv[0] = "bb_sim.py"

        command_template, add_args = bb_gen_command_template(sim_params)
        run_command = gen_args_cmd(
            ProcessType.BB.command_template, command_template, add_args
        )
        bb_args_parser(cmd=run_command)
        # change back, to prevent unexpected errors
        sys.argv[0] = main_script_name
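A hypothetical invocation of install_fault, only to show which arguments are typically supplied; the fault name, paths, realisation count, and version string below are placeholders, not values taken from the source.

# Hypothetical call; every path, name, and version here is a placeholder.
install_fault(
    fault_name="FaultA",
    n_rel=3,
    root_folder="/path/to/cybershake_root",
    version="gmsim_version",
    stat_file_path="/path/to/stations.ll",
    extended_period=False,
    keep_dup_station=True,
)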
Example #8
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each hpc get a list of job id and status', and for each pair save them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    "Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:

            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This will prevent race conditions if the failure/completion state file is made and picked up before the job actually finishes
                    # Most notably happens on Kisti
                    # The queued and running states are allowed
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # check for jobs that matches alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            else:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
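The queue snapshot above assumes each scheduler line has the form "<job_id> <status>"; the loop splits each line and keys the dictionary by job id. A standalone sketch of that parsing, with made-up squeue-style lines:

# Hypothetical scheduler output lines in "<job_id> <status>" form.
squeued_tasks = ["1234567 R", "1234568 PD", "1234569 R"]

queued_tasks = {}
for task in squeued_tasks:
    queued_tasks[task.split()[0]] = task.split()[1]

print(queued_tasks)  # {'1234567': 'R', '1234568': 'PD', '1234569': 'R'}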
Example #9
def run_main_submit_loop(
        root_folder: str,
        n_runs: Dict[str, int],
        rels_to_run: str,
        given_tasks_to_run: List[const.ProcessType],
        sleep_time: int,
        models_tuple: Tuple[est.EstModel],
        main_logger: Logger = qclogging.get_basic_logger(),
        cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    root_params_file = os.path.join(sim_struct.get_runs_dir(root_folder),
                                    "root_params.yaml")
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))
    # Default values

    hf_seed = config["hf"].get(const.RootParams.seed.value,
                               const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))

    main_logger.debug(
        f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug("time_since_something_happened is now {}".format(
            time_since_something_happened))
        time_since_something_happened -= 1
        # Get items in the mgmt queue, have to get a snapshot instead of
        # checking the directory real-time to prevent timing issues,
        # which can result in dual-submission
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc)
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(
                len(runnable_tasks)))
            main_logger.debug(
                "There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first ntask_to_run that are not waiting
        # for mgmt db updates (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:

            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if limit has not been reached and there are no
            # outstanding mgmt db updates
            if (not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type) and
                    task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Open to better suggestions
            # Break if enough tasks for each HPC have been added
            if np.all([
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run
            ]):
                break

        if len(tasks_to_run) > 0:
            main_logger.info("Tasks to run this iteration: " + ", ".join([
                "{}-{}".format(entry[1],
                               const.ProcessType(entry[0]).str_value)
                for entry in tasks_to_run
            ]))
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete([
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                ]):
                    # If clean_up has already run, then we should set it to
                    # be run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )
        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)
    main_logger.info(
        "Nothing was running or ready to run last cycle, exiting now")