def test_failure_counters_with_valid_area(self):
    # Failure counters are returned for valid computation areas.
    stats.delete_job_counters(123)
    fcname = itertools.cycle(string.ascii_lowercase)
    for cidx, carea in enumerate(["g", "h", "r"]):
        stats.incr_counter(123, carea, "%s:failed" % next(fcname))
        if not (cidx % 2):
            stats.incr_counter(123, carea, "%s:failed" % next(fcname))
    self.assertEqual(
        [('oqs/123/g/a:failed/i', 1), ('oqs/123/g/b:failed/i', 1)],
        sorted(stats.failure_counters(123, "g")))
    self.assertEqual([('oqs/123/h/c:failed/i', 1)],
                     sorted(stats.failure_counters(123, "h")))
    self.assertEqual(
        [('oqs/123/r/d:failed/i', 1), ('oqs/123/r/e:failed/i', 1)],
        sorted(stats.failure_counters(123, "r")))
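# The keys asserted above follow the pattern 'oqs/<job_id>/<area>/<name>:failed/i'.
# Below is a minimal, dict-backed sketch of the three `stats` functions the test
# exercises, written only to illustrate that key layout. The in-memory `_store`
# and these function bodies are assumptions for illustration; the real `stats`
# module presumably sits on top of a key/value store such as Redis.
_store = {}


def incr_counter(job_id, area, counter):
    """Increment the named counter, creating it on first use."""
    key = "oqs/%s/%s/%s/i" % (job_id, area, counter)
    _store[key] = _store.get(key, 0) + 1


def failure_counters(job_id, area=None):
    """Return (key, value) pairs for the job's ':failed' counters.

    If `area` is given, only counters in that computation area are
    returned; otherwise all areas are scanned.
    """
    prefix = ("oqs/%s/%s/" % (job_id, area) if area
              else "oqs/%s/" % job_id)
    return [(k, v) for k, v in _store.items()
            if k.startswith(prefix) and ":failed/" in k]


def delete_job_counters(job_id):
    """Remove all counters belonging to the given job."""
    prefix = "oqs/%s/" % job_id
    for key in [k for k in _store if k.startswith(prefix)]:
        del _store[key]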
def timeout_callback(self):
    """
    On timeout expiration check if the job process is still running
    and whether it experienced any failures. Terminate the job process
    in the latter case.
    """
    def failure_counters_need_check():
        """Return `True` if failure counters should be checked."""
        self.fcc_delay_value += 1
        result = self.fcc_delay_value >= self.FCC_DELAY
        if result:
            self.fcc_delay_value = 0
        return result

    process_stopped = job_failed = False
    message = None

    if not supervising.is_pid_running(self.job_pid):
        message = ('job process %s crashed or terminated' % self.job_pid)
        process_stopped = True
    elif failure_counters_need_check():
        # Job process is still running.
        failures = stats.failure_counters(self.job_id)
        failed_nodes = None
        if failures:
            message = "job terminated with failures: %s" % failures
        else:
            # Don't check for failed nodes if distribution is disabled.
            # In this case, we don't expect any nodes to be present, and
            # thus, there are none that can fail.
            if not openquake.engine.no_distribute():
                failed_nodes = abort_due_to_failed_nodes(self.job_id)
                if failed_nodes:
                    message = ("job terminated due to %s failed nodes" %
                               failed_nodes)
        if failures or failed_nodes:
            terminate_job(self.job_pid)
            job_failed = True

    if job_failed or process_stopped:
        job_status = get_job_status(self.job_id)
        if process_stopped and job_status == 'complete':
            message = 'job process %s succeeded' % self.job_pid
            self.selflogger.debug(message)
        elif job_status != 'complete':
            # The job crashed without having a chance to update the
            # status in the database, or it has been running even though
            # there were failures. We update the job status here.
            self.selflogger.error(message)
            update_job_status(self.job_id)
        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id, self.terminate)
        raise StopIteration()
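# The callback above signals "supervision is over" by raising StopIteration.
# The sketch below shows one way such a callback could be driven: poll on a
# fixed interval until the exception is raised. This driver is an assumption
# for illustration only; `run_supervisor_loop`, its name, and the polling
# interval are not part of the real supervisor API.
import time


def run_supervisor_loop(callback, interval=5.0):
    """Invoke `callback` every `interval` seconds until it raises
    StopIteration, i.e. until the supervised job process has ended
    and its status/stop time have been recorded."""
    while True:
        try:
            callback()
        except StopIteration:
            break
        time.sleep(interval)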
def test_failure_counters_with_no_failures(self):
    # An empty list is returned in the absence of any failure counters.
    stats.delete_job_counters(123)
    self.assertEqual([], stats.failure_counters(123))