Example #1
    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running, and
        act accordingly if not.
        """
        if not supervising.is_pid_running(self.pid):
            logging.info('Process %s not running', self.pid)

            # see what status was left in the database by the exited job
            job_status = get_job_status(self.job_id)

            if job_status == 'succeeded':
                signalling.signal_job_outcome(self.job_id, 'succeeded')
            else:
                signalling.signal_job_outcome(self.job_id, 'failed')

                if job_status == 'running':
                    # The job crashed without having a chance to update the
                    # status in the database.  We do it here.
                    update_job_status_and_error_msg(self.job_id, 'failed',
                                                    'crash')

            cleanup_after_job(self.job_id)

            raise StopIteration
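
Both timeout_callback variants and the main() functions below rely on supervising.is_pid_running to tell whether a process is still alive. A minimal sketch of such a check, assuming the conventional os.kill(pid, 0) probe; this is an illustration, not necessarily OpenQuake's actual implementation:

import errno
import os


def is_pid_running(pid):
    """Return True if a process with the given PID currently exists."""
    try:
        # Signal 0 performs existence/permission checks only; nothing is
        # actually delivered to the process.
        os.kill(pid, 0)
    except OSError as exc:
        # ESRCH: no such process.  EPERM: the process exists but belongs
        # to another user.
        return exc.errno == errno.EPERM
    return True
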
Example #2
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if a supervisor is no longer running, start a new
    supervisor process for its job.
    """
    qs = OqJob.objects.filter(status="running").values_list("id", "job_pid", "supervisor_pid")
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            proc = multiprocessing.Process(target=supervise, args=(job_id, job_pid))
            proc.start()
Example #3
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if one is missing, run
    :meth:`openquake.job.spawn_job_supervisor` for it.
    """
    qs = OqJob.objects.filter(status='running') \
                      .values_list('id', 'job_pid', 'supervisor_pid')
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            job.spawn_job_supervisor(job_id, job_pid)
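
Example #3 delegates the actual spawning to openquake.job.spawn_job_supervisor, which is not shown here. Since main() later looks the supervisor up via the supervisor_pid column, that helper presumably records the new supervisor's PID on the job row. A hypothetical sketch under that assumption, reusing the supervise target and OqJob fields that appear in the other examples (this is not the real implementation):

import multiprocessing


def spawn_job_supervisor(job_id, job_pid):
    """Start a supervisor process for a job and remember its PID."""
    # `supervise` and `OqJob` are the names used in the surrounding
    # examples; this sketch assumes they are importable here.
    proc = multiprocessing.Process(target=supervise, args=(job_id, job_pid))
    proc.start()

    # Persist the supervisor PID so a later main() run can tell whether
    # this supervisor is still running.
    OqJob.objects.filter(id=job_id).update(supervisor_pid=proc.pid)
    return proc.pid
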
Example #4
def main():
    """
    Look through all jobs with status "running" and check the status of
    their supervisors: if a supervisor is no longer running, start a new
    supervisor process for its job.
    """
    qs = OqJob.objects.filter(status='running') \
                      .values_list('id', 'job_pid', 'supervisor_pid')
    for job_id, job_pid, supervisor_pid in qs:
        if not supervising.is_pid_running(supervisor_pid):
            proc = multiprocessing.Process(target=supervise,
                                           args=(job_id, job_pid))
            proc.start()
Example #5
    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            # bind failed_nodes so the combined check below never sees
            # an unbound name
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif job_status != 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status_and_error_msg(self.job_id, error_msg=message)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id)
            raise StopIteration()
Example #6
    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            # bind failed_nodes so the combined check below never sees
            # an unbound name
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'succeeded':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.info(message)
            elif job_status == 'running':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status_and_error_msg(self.job_id, 'failed', message)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id)
            raise StopIteration()
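
Both timeout_callback variants raise StopIteration to tell their caller that supervision is over: the job either succeeded or has been marked failed and cleaned up. A minimal driver loop, assuming the callback is simply polled at a fixed interval; the real supervisor presumably invokes it from its own event loop:

import time


def run_supervision(supervisor, interval=5.0):
    """Poll supervisor.timeout_callback() until it signals completion.

    `supervisor` is assumed to be an object exposing one of the
    timeout_callback methods shown above; the name and the polling
    interval are illustrative only.
    """
    while True:
        time.sleep(interval)
        try:
            supervisor.timeout_callback()
        except StopIteration:
            # The job finished or was cleaned up; stop supervising.
            break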