Exemplo n.º 1
0
    def check_mgmt_db(self):
        """Record an error for every management db entry that did not complete.

        Queries the management db found under ``self.stage_dir`` for EMOD3D,
        HF, BB and IM_calculation entries whose status is ``unknown`` or
        ``failed`` and appends one ``Error`` per returned entry to
        ``self.errors``.

        ``self.canceled_running`` is passed to the query as ``blocked_ids``
        (presumably so that jobs cancelled on purpose are not reported --
        confirm against ``MgmtDB.command_builder``).
        """
        base_proc_types = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        entries = db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.unknown, const.Status.failed],
            blocked_ids=self.canceled_running,
        )

        for entry in entries:
            # Convert the stored proc_type/status values to their enum
            # members so the message shows names rather than raw values.
            message = (
                "Run {} did not complete task {} "
                "(Status {}, JobId {})".format(
                    entry.run_name,
                    const.ProcessType(entry.proc_type),
                    const.Status(entry.status),
                    entry.job_id,
                )
            )
            self.errors.append(Error("Slurm task", message))
Exemplo n.º 2
0
    def cancel_running(self, proc_types: List[const.ProcessType]):
        """Try to cancel one running job for each of the given process types.

        Looks up the running management db entries for ``proc_types`` and asks
        the scheduler to cancel them one at a time. A cancellation counts as
        successful when neither the stdout nor the stderr text returned by the
        scheduler contains "error" (case-insensitive). On success the job id
        is appended (as a string) to ``self.canceled_running`` and the process
        type is removed from ``proc_types``, so later entries of the same type
        are skipped. If the cancellation fails, the next running entry of that
        type is tried.

        Args:
            proc_types: Process types to cancel a job for. NOTE: this list is
                modified in place; successfully cancelled types are removed.

        Returns:
            The same ``proc_types`` list, now holding only the process types
            for which no job was cancelled.
        """
        # Get all running jobs in the mgmt db
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(
            allowed_tasks=proc_types, allowed_states=[const.Status.running]
        )

        # Cancel one for each process type
        for entry in entries:
            # Types that were already cancelled have been removed from
            # proc_types, so their remaining entries are skipped here.
            if entry.proc_type not in proc_types:
                continue

            # Both literals need the f prefix; without it the second half is
            # printed with its braces verbatim.
            print(
                f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                f"for {entry.run_name} and process type {entry.proc_type}"
            )

            out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

            print("Scancel out: ", out, err)
            if "error" not in out.lower() and "error" not in err.lower():
                self.canceled_running.append(str(entry.job_id))
                proc_types.remove(entry.proc_type)
                print("Cancelled job-id {}".format(entry.job_id))

        return proc_types
Exemplo n.º 3
0
    def check_mgmt_db_progress(self):
        """Report how far the tracked tasks have progressed in the management db.

        Only EMOD3D, HF, BB and IM_calculation entries are considered.

        Returns:
            A ``(total, completed, failed)`` tuple of entry counts, where
            "failed" covers entries whose status is failed or unknown.
        """
        tracked_tasks = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        mgmt_db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        def count_entries(**filters):
            # Number of tracked-task entries matching the extra filters
            matching = mgmt_db.command_builder(
                allowed_tasks=tracked_tasks, **filters
            )
            return len(matching)

        n_total = count_entries()
        n_completed = count_entries(allowed_states=[const.Status.completed])
        n_failed = count_entries(
            allowed_states=[const.Status.failed, const.Status.unknown]
        )

        return n_total, n_completed, n_failed
Exemplo n.º 4
0
    def check_completed(self):
        """Verify the results of simulations that have newly completed.

        A simulation is picked up when its IM_calculation entry is marked
        completed in the management db and it is in neither
        ``self._sim_passed`` nor ``self._sim_failed``. Each such simulation is
        handed to ``self.check_sim_result`` and then added to the matching set.

        Returns:
            False as soon as a simulation fails its check while
            ``self._stop_on_error`` is set (after printing the errors);
            True otherwise, after printing the passed/failed/total summary.
        """
        mgmt_db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        finished = {
            row.run_name
            for row in mgmt_db.command_builder(
                allowed_tasks=[const.ProcessType.IM_calculation],
                allowed_states=[const.Status.completed],
            )
        }

        # Skip simulations whose result was already checked earlier
        already_checked = self._sim_passed | self._sim_failed

        for sim_name in finished - already_checked:
            sim_dir = os.path.join(
                self.runs_dir,
                sim_struct.get_fault_from_realisation(sim_name),
                sim_name,
            )

            if self.check_sim_result(sim_dir):
                self._sim_passed.add(sim_name)
                continue

            self._sim_failed.add(sim_name)

            if self._stop_on_error:
                print("Quitting as the following errors occured: ")
                self.print_errors()
                return False

            print(
                "The following error occured for simulation {}:".format(sim_name)
            )
            # Report the last entry of self.errors for this simulation
            last_error = self.errors[-1]
            print("ERROR: {}, {}\n".format(last_error.location, last_error.error))

        counts = (
            len(self._sim_passed),
            len(self._sim_failed),
            len(self.sim_dirs),
        )
        print("Passed/Failed/Total simulations: {}/{}/{}, ".format(*counts))

        return True