Exemplo n.º 1
0
class SimulationContext(object):
    def __init__(self, ert, sim_fs, mask, itr, verbose=False):
        self._ert = ert
        """ :type: res.enkf.EnKFMain """
        max_runtime = ert.analysisConfig().get_max_runtime()
        self._mask = mask

        job_queue = ert.get_queue_config().create_job_queue()
        job_queue.set_max_job_duration(max_runtime)
        self._queue_manager = JobQueueManager(job_queue)
        self._queue_manager.startQueue(mask.count(), verbose=verbose)
        self._run_args = {}
        """ :type: dict[int, RunArg] """

        self._thread_pool = CThreadPool(8)
        self._thread_pool.addTaskFunction("submitJob", RES_LIB,
                                          "enkf_main_isubmit_job__")

        subst_list = self._ert.getDataKW()
        path_fmt = self._ert.getModelConfig().getRunpathFormat()
        jobname_fmt = self._ert.getModelConfig().getJobnameFormat()
        self._run_context = ErtRunContext(EnkfRunType.ENSEMBLE_EXPERIMENT,
                                          sim_fs, None, mask, path_fmt,
                                          jobname_fmt, subst_list, itr)
        self._ert.initRun(self._run_context)

    def __len__(self):
        return self._mask.count()

    def addSimulation(self, iens, geo_id):
        if not (0 <= iens < len(self._run_context)):
            raise UserWarning("Realization number out of range: %d >= %d" %
                              (iens, len(self._run_context)))

        if not self._mask[iens]:
            raise UserWarning("Realization number: '%d' is not active" % iens)

        if iens in self._run_args:
            raise UserWarning("Realization number: '%d' already queued" % iens)

        run_arg = self._run_context[iens]
        run_arg.geo_id = geo_id
        self._run_args[iens] = run_arg

        self._ert.createRunpath(self._run_context, iens=iens)

        queue = self._queue_manager.get_job_queue()
        self._thread_pool.submitJob(ArgPack(self._ert, run_arg, queue))

    def isRunning(self):
        return self._queue_manager.isRunning()

    def getNumPending(self):
        return self._queue_manager.getNumPending()

    def getNumRunning(self):
        return self._queue_manager.getNumRunning()

    def getNumSuccess(self):
        return self._queue_manager.getNumSuccess()

    def getNumFailed(self):
        return self._queue_manager.getNumFailed()

    def getNumWaiting(self):
        return self._queue_manager.getNumWaiting()

    def didRealizationSucceed(self, iens):
        queue_index = self._run_args[iens].getQueueIndex()
        return self._queue_manager.didJobSucceed(queue_index)

    def didRealizationFail(self, iens):
        # For the purposes of this class, a failure should be anything (killed job, etc) that is not an explicit success.
        return not self.didRealizationSucceed(iens)

    def isRealizationQueued(self, iens):
        return iens in self._run_args

    def isRealizationFinished(self, iens):
        run_arg = self._run_args[iens]

        if run_arg.isSubmitted():
            queue_index = run_arg.getQueueIndex()
            return self._queue_manager.isJobComplete(queue_index)
        else:
            return False

    def __repr__(self):
        running = 'running' if self.isRunning() else 'not running'
        numRunn = self.getNumRunning()
        numSucc = self.getNumSuccess()
        numFail = self.getNumFailed()
        numWait = self.getNumWaiting()
        fmt = '%s, #running = %d, #success = %d, #failed = %d, #waiting = %d'
        fmt = fmt % (running, numRunn, numSucc, numFail, numWait)
        return 'SimulationContext(%s)' % fmt

    def get_sim_fs(self):
        return self._run_context.get_sim_fs()

    def get_run_context(self):
        return self._run_context

    def stop(self):
        self._queue_manager.stop_queue()

    def job_progress(self, iens):
        """Will return a detailed progress of the job.

        The progress report is obtained by reading a file from the filesystem,
        that file is typically created by another process running on another
        machine, and reading might fail due to NFS issues, simultanoues write
        and so on. If loading valid json fails the function will sleep 0.10
        seconds and retry - eventually giving up and returning None. Also for
        jobs which have not yet started the method will return None.

        When the method succeeds in reading the progress file from the file
        system the return value will be an object with properties like this:|

           progress.start_time
           progress.end_time
           progress.run_id
           progress.jobs =[ (job1.name, job1.start_time, job1.end_time, job1.status, job1.error_msg),
                             (job2.name, job2.start_time, job2.end_time, job2.status, job2.error_msg),
                              ....
                             (jobN.name, jobN.start_time, jobN.end_time, jobN.status, jobN.error_msg) ]

        """
        if not iens in self._run_args:
            raise KeyError("No such simulation: %s" % iens)

        run_arg = self._run_args[iens]
        try:
            # will throw if not yet submitted (is in a limbo state)
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return None
        if self._queue_manager.isJobWaiting(queue_index):
            return None

        return ForwardModelStatus.load(run_arg.runpath)

    def run_path(self, iens):
        """
        Will return the path to the simulation.
        """
        if not iens in self._run_args:
            raise KeyError("No such simulation: %s" % iens)

        run_arg = self._run_args[iens]
        return run_arg.runpath

    def job_status(self, iens):
        """Will query the queue system for the status of the job.
        """
        if not iens in self._run_args:
            raise KeyError("No such simulation: %s" % iens)

        run_arg = self._run_args[iens]
        try:
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return None
        return self._queue_manager.getJobStatus(queue_index)

    def status_timestamp(self):
        """Will return a timestamp for the last status change of the simulations.

        The timestamp is related to status changes when a simulation has
        started, completed and failed.

        """
        return self._queue_manager.status_timestamp()

    def progress_timestamp(self, iens=None):
        """
        Will return a timestamp for when the simulation has progressed to a new forward model step.
        """

        if iens is None:
            return self._queue_manager.progress_timestamp()

        if not iens in self._run_args:
            raise KeyError("No such simulation: %s" % iens)

        run_arg = self._run_args[iens]
        queue_index = run_arg.getQueueIndex()
        return self._queue_manager.progress_timestamp(queue_index)
Exemplo n.º 2
0
class SimulationContext(object):
    def __init__(self, ert, sim_fs, mask, itr, verbose=False):
        self._ert = ert
        """ :type: res.enkf.EnKFMain """
        max_runtime = ert.analysisConfig().get_max_runtime()
        self._mask = mask

        job_queue = ert.get_queue_config().create_job_queue()
        self._queue_manager = JobQueueManager(job_queue)
        self._queue_manager.startQueue(mask.count(), verbose=verbose)
        self._run_args = {}
        """ :type: dict[int, RunArg] """

        self._thread_pool = CThreadPool(8)
        self._thread_pool.addTaskFunction("submitJob", ENKF_LIB,
                                          "enkf_main_isubmit_job__")

        subst_list = self._ert.getDataKW()
        path_fmt = self._ert.getModelConfig().getRunpathFormat()
        jobname_fmt = self._ert.getModelConfig().getJobnameFormat()
        self._run_context = ErtRunContext(EnkfRunType.ENSEMBLE_EXPERIMENT,
                                          sim_fs, None, mask, path_fmt,
                                          jobname_fmt, subst_list, itr)
        self._ert.createRunpath(self._run_context)

    def __len__(self):
        return self._mask.count()

    def addSimulation(self, iens, geo_id):
        if not (0 <= iens < len(self._run_context)):
            raise UserWarning("Realization number out of range: %d >= %d" %
                              (iens, len(self._run_context)))

        if not self._mask[iens]:
            raise UserWarning("Realization number: '%d' is not active" % iens)

        if iens in self._run_args:
            raise UserWarning("Realization number: '%d' already queued" % iens)

        run_arg = self._run_context[iens]
        queue = self._queue_manager.get_job_queue()
        self._run_args[iens] = run_arg
        self._thread_pool.submitJob(ArgPack(self._ert, run_arg, queue, geo_id))

    def isRunning(self):
        return self._queue_manager.isRunning()

    def getNumPending(self):
        return self._queue_manager.getNumPending()

    def getNumRunning(self):
        return self._queue_manager.getNumRunning()

    def getNumSuccess(self):
        return self._queue_manager.getNumSuccess()

    def getNumFailed(self):
        return self._queue_manager.getNumFailed()

    def getNumWaiting(self):
        return self._queue_manager.getNumWaiting()

    def didRealizationSucceed(self, iens):
        queue_index = self._run_args[iens].getQueueIndex()
        return self._queue_manager.didJobSucceed(queue_index)

    def didRealizationFail(self, iens):
        # For the purposes of this class, a failure should be anything (killed job, etc) that is not an explicit success.
        return not self.didRealizationSucceed(iens)

    def isRealizationQueued(self, iens):
        return iens in self._run_args

    def isRealizationFinished(self, iens):
        run_arg = self._run_args[iens]

        if run_arg.isSubmitted():
            queue_index = run_arg.getQueueIndex()
            return self._queue_manager.isJobComplete(queue_index)
        else:
            return False

    def __repr__(self):
        running = 'running' if self.isRunning() else 'not running'
        numRunn = self.getNumRunning()
        numSucc = self.getNumSuccess()
        numFail = self.getNumFailed()
        numWait = self.getNumWaiting()
        fmt = '%s, #running = %d, #success = %d, #failed = %d, #waiting = %d'
        fmt = fmt % (running, numRunn, numSucc, numFail, numWait)
        return 'SimulationContext(%s)' % fmt

    def get_sim_fs(self):
        return self._run_context.get_sim_fs()

    def get_run_context(self):
        return self._run_context
Exemplo n.º 3
0
class SimulationContext(object):
    def __init__(self, ert, size, verbose=False):
        self._ert = ert
        """ :type: res.enkf.EnKFMain """
        self._size = size

        max_runtime = ert.analysisConfig().get_max_runtime()
        raise Exception(
            "Code has lost access to job_queue instance. Refactor required.")

        job_queue = None
        self._queue_manager = JobQueueManager(job_queue)
        self._queue_manager.startQueue(size, verbose=verbose)
        self._run_args = {}
        """ :type: dict[int, RunArg] """

        self._thread_pool = CThreadPool(8)
        self._thread_pool.addTaskFunction("submitJob", ENKF_LIB,
                                          "enkf_main_isubmit_job__")

    def addSimulation(self, iens, target_fs):
        if iens >= self._size:
            raise UserWarning("Realization number out of range: %d >= %d" %
                              (iens, self._size))

        if iens in self._run_args:
            raise UserWarning("Realization number: '%d' already queued" % iens)

        runpath_fmt = self._ert.getModelConfig().getRunpathFormat()
        member = self._ert.getRealisation(iens)
        runpath = ErtRunContext.createRunpath(iens, runpath_fmt,
                                              member.getDataKW())
        run_arg = RunArg.createEnsembleExperimentRunArg(
            target_fs, iens, runpath)

        self._ert.createRunPath(run_arg)

        self._run_args[iens] = run_arg
        self._thread_pool.submitJob(ArgPack(self._ert, run_arg))

    def isRunning(self):
        return self._queue_manager.isRunning()

    def getNumRunning(self):
        return self._queue_manager.getNumRunning()

    def getNumSuccess(self):
        return self._queue_manager.getNumSuccess()

    def getNumFailed(self):
        return self._queue_manager.getNumFailed()

    def getNumWaiting(self):
        return self._queue_manager.getNumWaiting()

    def didRealizationSucceed(self, iens):
        queue_index = self._run_args[iens].getQueueIndex()
        return self._queue_manager.didJobSucceed(queue_index)

    def didRealizationFail(self, iens):
        # For the purposes of this class, a failure should be anything (killed job, etc) that is not an explicit success.
        return not self.didRealizationSucceed(iens)

    def isRealizationQueued(self, iens):
        return iens in self._run_args

    def isRealizationFinished(self, iens):
        run_arg = self._run_args[iens]

        if run_arg.isSubmitted():
            queue_index = run_arg.getQueueIndex()
            return self._queue_manager.isJobComplete(queue_index)
        else:
            return False

    def __repr__(self):
        running = 'running' if self.isRunning() else 'not running'
        numRunn = self.getNumRunning()
        numSucc = self.getNumSuccess()
        numFail = self.getNumFailed()
        numWait = self.getNumWaiting()
        fmt = '%s, #running = %d, #success = %d, #failed = %d, #waiting = %d'
        fmt = fmt % (running, numRunn, numSucc, numFail, numWait)
        return 'SimulationContext(%s)' % fmt
Exemplo n.º 4
0
class SimulationContext:
    def __init__(self, ert, sim_fs, mask, itr, case_data):
        self._ert = ert
        """ :type: res.enkf.EnKFMain """
        max_runtime = ert.analysisConfig().get_max_runtime()
        self._mask = mask

        job_queue = ert.get_queue_config().create_job_queue()
        job_queue.set_max_job_duration(max_runtime)
        self._queue_manager = JobQueueManager(job_queue)

        subst_list = self._ert.getDataKW()
        path_fmt = self._ert.getModelConfig().getRunpathFormat()
        jobname_fmt = self._ert.getModelConfig().getJobnameFormat()

        self._run_context = ErtRunContext(
            EnkfRunType.ENSEMBLE_EXPERIMENT,
            sim_fs,
            None,
            mask,
            path_fmt,
            jobname_fmt,
            subst_list,
            itr,
        )
        # fill in the missing geo_id data
        for sim_id, (geo_id, _) in enumerate(case_data):
            if mask[sim_id]:
                run_arg = self._run_context[sim_id]
                run_arg.geo_id = geo_id

        self._ert.getEnkfSimulationRunner().createRunPath(self._run_context)
        EnkfSimulationRunner.runWorkflows(HookRuntime.PRE_SIMULATION,
                                          self._ert)
        self._sim_thread = self._run_simulations_simple_step()

        # Wait until the queue is active before we finish the creation
        # to ensure sane job status while running
        while self.isRunning() and not self._queue_manager.isRunning():
            sleep(0.1)

    def get_run_args(self, iens):
        """
        raises an  exception if no iens simulation found

        :param iens: realization number
        :return: run_args for the realization
        """
        for run_arg in self._run_context:
            if run_arg is not None and run_arg.iens == iens:
                return run_arg
        raise KeyError("No such simulation: %s" % iens)

    def _run_simulations_simple_step(self):
        sim_thread = Thread(target=lambda: self._ert.getEnkfSimulationRunner(
        ).runSimpleStep(self._queue_manager.queue, self._run_context))
        sim_thread.start()
        return sim_thread

    def __len__(self):
        return self._mask.count()

    def isRunning(self):
        # TODO: Should separate between running jobs and having loaded all data
        return self._sim_thread.is_alive() or self._queue_manager.isRunning()

    def getNumPending(self):
        return self._queue_manager.getNumPending()

    def getNumRunning(self):
        return self._queue_manager.getNumRunning()

    def getNumSuccess(self):
        return self._queue_manager.getNumSuccess()

    def getNumFailed(self):
        return self._queue_manager.getNumFailed()

    def getNumWaiting(self):
        return self._queue_manager.getNumWaiting()

    def didRealizationSucceed(self, iens):
        queue_index = self.get_run_args(iens).getQueueIndex()
        return self._queue_manager.didJobSucceed(queue_index)

    def didRealizationFail(self, iens):
        # For the purposes of this class, a failure should be anything (killed
        # job, etc) that is not an explicit success.
        return not self.didRealizationSucceed(iens)

    def isRealizationQueued(self, iens):
        # an exception will be raised if it's not queued
        self.get_run_args(iens)
        return True

    def isRealizationFinished(self, iens):
        run_arg = self.get_run_args(iens)

        if run_arg.isSubmitted():
            queue_index = run_arg.getQueueIndex()
            return self._queue_manager.isJobComplete(queue_index)
        else:
            return False

    def __repr__(self):
        running = "running" if self.isRunning() else "not running"
        numRunn = self.getNumRunning()
        numSucc = self.getNumSuccess()
        numFail = self.getNumFailed()
        numWait = self.getNumWaiting()
        fmt = "%s, #running = %d, #success = %d, #failed = %d, #waiting = %d"
        fmt = fmt % (running, numRunn, numSucc, numFail, numWait)
        return "SimulationContext(%s)" % fmt

    def get_sim_fs(self):
        return self._run_context.get_sim_fs()

    def get_run_context(self):
        return self._run_context

    def stop(self):
        self._queue_manager.stop_queue()
        self._sim_thread.join()

    def job_progress(self, iens):
        """Will return a detailed progress of the job.

        The progress report is obtained by reading a file from the filesystem,
        that file is typically created by another process running on another
        machine, and reading might fail due to NFS issues, simultanoues write
        and so on. If loading valid json fails the function will sleep 0.10
        seconds and retry - eventually giving up and returning None. Also for
        jobs which have not yet started the method will return None.

        When the method succeeds in reading the progress file from the file
        system the return value will be an object with properties like this:

            progress.start_time
            progress.end_time
            progress.run_id
            progress.jobs = [
                (job1.name, job1.start_time, job1.end_time, job1.status, job1.error_msg),
                (job2.name, job2.start_time, job2.end_time, job2.status, job2.error_msg),
                (jobN.name, jobN.start_time, jobN.end_time, jobN.status, jobN.error_msg)
            ]
        """
        run_arg = self.get_run_args(iens)

        try:
            # will throw if not yet submitted (is in a limbo state)
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return None
        if self._queue_manager.isJobWaiting(queue_index):
            return None

        return ForwardModelStatus.load(run_arg.runpath)

    def run_path(self, iens):
        """
        Will return the path to the simulation.
        """
        return self.get_run_args(iens).runpath

    def job_status(self, iens):
        """Will query the queue system for the status of the job."""
        run_arg = self.get_run_args(iens)
        try:
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return None
        return self._queue_manager.getJobStatus(queue_index)