예제 #1
0
    def test_failure_counters_with_valid_area(self):
        # Failure counters are returned for valid computation areas.
        stats.delete_job_counters(123)
        # Endless supply of single-letter counter names: "a", "b", "c", ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # Use the next() builtin instead of the iterator's .next()
            # method: the latter only exists on Python 2, the former works
            # on Python 2.6+ and Python 3 alike.
            stats.incr_counter(123, carea, "%s:failed" % next(fcname))
            # Areas at even positions ("g" and "r") get a second counter.
            if not (cidx % 2):
                stats.incr_counter(123, carea, "%s:failed" % next(fcname))

        # Each area must only report the counters incremented for it.
        self.assertEqual([('oqs/123/g/a:failed/i', 1),
                          ('oqs/123/g/b:failed/i', 1)],
                         sorted(stats.failure_counters(123, "g")))
        self.assertEqual([('oqs/123/h/c:failed/i', 1)],
                         sorted(stats.failure_counters(123, "h")))
        self.assertEqual([('oqs/123/r/d:failed/i', 1),
                          ('oqs/123/r/e:failed/i', 1)],
                         sorted(stats.failure_counters(123, "r")))
예제 #2
0
    def test_failure_counters_with_valid_area(self):
        # Failure counters are returned for valid computation areas.
        stats.delete_job_counters(123)
        # Endless supply of single-letter counter names: "a", "b", "c", ...
        fcname = itertools.cycle(string.ascii_lowercase)
        for cidx, carea in enumerate(["g", "h", "r"]):
            # Use the next() builtin instead of the iterator's .next()
            # method: the latter only exists on Python 2, the former works
            # on Python 2.6+ and Python 3 alike.
            stats.incr_counter(123, carea, "%s:failed" % next(fcname))
            # Areas at even positions ("g" and "r") get a second counter.
            if not (cidx % 2):
                stats.incr_counter(123, carea, "%s:failed" % next(fcname))

        # Each area must only report the counters incremented for it.
        self.assertEqual(
            [('oqs/123/g/a:failed/i', 1), ('oqs/123/g/b:failed/i', 1)],
            sorted(stats.failure_counters(123, "g")))
        self.assertEqual([('oqs/123/h/c:failed/i', 1)],
                         sorted(stats.failure_counters(123, "h")))
        self.assertEqual(
            [('oqs/123/r/d:failed/i', 1), ('oqs/123/r/e:failed/i', 1)],
            sorted(stats.failure_counters(123, "r")))
예제 #3
0
    def timeout_callback(self):
        """
        Periodic health check of the supervised job process.

        Each time the timeout expires, find out whether the job process is
        still alive. If it is, look (every `FCC_DELAY`-th call) for failure
        counters or failed nodes and terminate the process when any are
        found.

        Once the process has stopped or has been terminated, the job status
        is reconciled, the stop time is recorded, cleanup is run and
        `StopIteration` is raised. Otherwise the method simply returns.
        """
        def failure_counters_need_check():
            """Return `True` on every `FCC_DELAY`-th invocation."""
            self.fcc_delay_value += 1
            if self.fcc_delay_value < self.FCC_DELAY:
                return False
            # Threshold reached: restart the countdown.
            self.fcc_delay_value = 0
            return True

        message = None
        job_failed = False
        process_stopped = not supervising.is_pid_running(self.job_pid)

        if process_stopped:
            message = ('job process %s crashed or terminated' % self.job_pid)
        elif failure_counters_need_check():
            # The process is alive; see whether it reported any failures.
            failed_nodes = None
            failures = stats.failure_counters(self.job_id)
            if failures:
                message = "job terminated with failures: %s" % failures
            elif not openquake.engine.no_distribute():
                # Failed nodes are only looked for when distribution is
                # enabled: without it no nodes are expected to be present,
                # so none of them can fail.
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if not (job_failed or process_stopped):
            # Nothing to report; keep supervising.
            return

        job_status = get_job_status(self.job_id)
        if job_status == 'complete':
            if process_stopped:
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
        else:
            # Either the job crashed before it could update its status in
            # the database, or it kept running despite failures. Fix up
            # the job status here.
            self.selflogger.error(message)
            update_job_status(self.job_id)

        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id, self.terminate)
        raise StopIteration()
예제 #4
0
    def timeout_callback(self):
        """
        Periodic health check of the supervised job process.

        Each time the timeout expires, find out whether the job process is
        still alive. If it is, look (every `FCC_DELAY`-th call) for failure
        counters or failed nodes and terminate the process when any are
        found.

        Once the process has stopped or has been terminated, the job status
        is reconciled, the stop time is recorded, cleanup is run and
        `StopIteration` is raised. Otherwise the method simply returns.
        """
        def failure_counters_need_check():
            """Return `True` on every `FCC_DELAY`-th invocation."""
            self.fcc_delay_value += 1
            if self.fcc_delay_value < self.FCC_DELAY:
                return False
            # Threshold reached: restart the countdown.
            self.fcc_delay_value = 0
            return True

        message = None
        job_failed = False
        process_stopped = not supervising.is_pid_running(self.job_pid)

        if process_stopped:
            message = ('job process %s crashed or terminated' % self.job_pid)
        elif failure_counters_need_check():
            # The process is alive; see whether it reported any failures.
            failed_nodes = None
            failures = stats.failure_counters(self.job_id)
            if failures:
                message = "job terminated with failures: %s" % failures
            elif not openquake.engine.no_distribute():
                # Failed nodes are only looked for when distribution is
                # enabled: without it no nodes are expected to be present,
                # so none of them can fail.
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if not (job_failed or process_stopped):
            # Nothing to report; keep supervising.
            return

        job_status = get_job_status(self.job_id)
        if job_status == 'complete':
            if process_stopped:
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
        else:
            # Either the job crashed before it could update its status in
            # the database, or it kept running despite failures. Fix up
            # the job status here.
            self.selflogger.error(message)
            update_job_status(self.job_id)

        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id, self.terminate)
        raise StopIteration()
예제 #5
0
 def test_failure_counters_with_no_failures(self):
     # With every counter of the job wiped, asking for its failure
     # counters must yield an empty list.
     job_id = 123
     stats.delete_job_counters(job_id)
     counters = stats.failure_counters(job_id)
     self.assertEqual([], counters)
예제 #6
0
 def test_failure_counters_with_no_failures(self):
     # With every counter of the job wiped, asking for its failure
     # counters must yield an empty list.
     job_id = 123
     stats.delete_job_counters(job_id)
     counters = stats.failure_counters(job_id)
     self.assertEqual([], counters)