def abort_due_to_failed_nodes(job_id):
    """Should the job be aborted due to failed compute nodes?

    The job should be aborted when the following conditions coincide:

        - we observed failed compute nodes
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    logging.debug("> check for failed nodes")
    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)

    if not failed_nodes:
        # Nothing failed: report and bail out with a zero result.
        logging.debug('>> no failures')
        logging.debug("< check for failed nodes")
        return 0

    logging.debug(">> failed_nodes: %s", failed_nodes)
    no_progress_period, timeout = stats.get_progress_timing_data(job)
    logging.debug(">> no_progress_period: %s", no_progress_period)
    logging.debug(">> timeout: %s", timeout)
    # Only abort when the job has stalled for longer than the timeout.
    result = failed_nodes if no_progress_period > timeout else 0
    logging.debug("< check for failed nodes")
    return result
def abort_due_to_failed_nodes(job_id):
    """Should the job be aborted due to failed compute nodes?

    The job should be aborted when the following conditions coincide:

        - we observed failed compute nodes
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    logging.debug("> abort_due_to_failed_nodes")
    result = 0
    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)
    # Pass the value as a lazy %-style argument (not via eager "%"
    # formatting) so the message is only rendered when DEBUG is enabled;
    # this also matches the sibling implementation's logging style.
    logging.debug(">> failed_nodes: %s", failed_nodes)
    if failed_nodes:
        no_progress_period, timeout = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", no_progress_period)
        logging.debug(">> timeout: %s", timeout)
        # Abort only when the job has made no progress for longer than
        # the configured timeout.
        if no_progress_period > timeout:
            result = failed_nodes
    logging.debug("< abort_due_to_failed_nodes")
    return result
def test_get_progress_timing_data_before_first_increment(self):
    # No "progress counter increment" time stamp exists, the time stamp of
    # the *executing* `JobPhaseStats` record is taken instead.
    start = datetime.utcnow() - timedelta(minutes=5)
    record = JobPhaseStats(oq_job=self.job, ctype="hazard",
                           job_status="executing")
    record.start_time = start
    record.save()

    actual, timeout = stats.get_progress_timing_data(self.job)
    # ~300 seconds (5 minutes) of no progress, within a 5-second tolerance.
    self.assertTrue(approx_equal(300, actual, 5))
    self.assertEqual(99, timeout)
def test_get_progress_timing_data_before_first_increment(self):
    # No "progress counter increment" time stamp exists, the time stamp of
    # the *executing* `JobPhaseStats` record is taken instead.
    start = datetime.utcnow() - timedelta(minutes=5)
    record = JobPhaseStats(oq_job=self.job, ctype="hazard",
                           job_status="executing")
    record.start_time = start
    record.save()

    actual, timeout = stats.get_progress_timing_data(self.job)
    # ~300 seconds (5 minutes) of no progress, within a 5-second tolerance.
    self.assertTrue(approx_equal(300, actual, 5))
    self.assertEqual(3601, timeout)
def test_get_progress_timing_data_with_stale_increment_ts(self):
    # The progress counter increment time stamp exists but is not used
    # since the time stamp in the *executing* `JobPhaseStats` record is
    # more recent.
    increment_ts = datetime.utcnow() - timedelta(minutes=9)
    stats.pk_set(self.job.id, "lvr_ts", increment_ts.strftime("%s"))

    exec_ts = datetime.utcnow() - timedelta(minutes=8)
    record = JobPhaseStats(oq_job=self.job, ctype="hazard",
                           job_status="executing")
    record.start_time = exec_ts
    record.save()

    actual, timeout = stats.get_progress_timing_data(self.job)
    # ~480 seconds (8 minutes), i.e. measured from the newer record.
    self.assertTrue(approx_equal(480, actual, 5))
def test_get_progress_timing_data_no_increment_multiple_rows(self):
    # No progress counter increment time stamp exists, the time stamp of
    # the most recent *executing* `JobPhaseStats` record is taken instead.
    older_ts = datetime.utcnow() - timedelta(minutes=5)
    record = JobPhaseStats(oq_job=self.job, ctype="hazard",
                           job_status="executing")
    record.start_time = older_ts
    record.save()

    newer_ts = datetime.utcnow() - timedelta(minutes=2)
    record = JobPhaseStats(oq_job=self.job, ctype="risk",
                           job_status="executing")
    record.start_time = newer_ts
    record.save()

    actual, timeout = stats.get_progress_timing_data(self.job)
    # ~120 seconds (2 minutes), i.e. measured from the most recent row.
    self.assertTrue(approx_equal(120, actual, 5))