示例#1
0
def abort_due_to_failed_nodes(job_id):
    """Decide whether failed compute nodes warrant aborting the job.

    An abort is called for only when both of these hold:
        - at least one failed compute node was observed
        - the job's "no progress" period is longer than its timeout

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes when the job should be
        aborted, zero otherwise.
    """
    logging.debug("> check for failed nodes")

    job = OqJob.objects.get(id=job_id)
    n_failed = monitor.count_failed_nodes(job)
    nodes_to_report = 0

    if not n_failed:
        logging.debug('>> no failures')
    else:
        logging.debug(">> failed_nodes: %s", n_failed)
        timing = stats.get_progress_timing_data(job)
        idle_period, limit = timing
        logging.debug(">> no_progress_period: %s", idle_period)
        logging.debug(">> timeout: %s", limit)
        # Failures alone are not enough: the job must also have stalled
        # for longer than the allowed timeout.
        nodes_to_report = n_failed if idle_period > limit else 0

    logging.debug("< check for failed nodes")
    return nodes_to_report
示例#2
0
def abort_due_to_failed_nodes(job_id):
    """Should the job be aborted due to failed compute nodes?

    The job should be aborted when the following conditions coincide:
        - we observed failed compute nodes
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be aborted
        zero otherwise.
    """
    logging.debug("> abort_due_to_failed_nodes")
    result = 0

    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)
    # Values are handed to the logging call as arguments instead of being
    # interpolated with `%` up front: formatting is then skipped entirely
    # when DEBUG is disabled, and a tuple-valued argument cannot break the
    # format operation.
    logging.debug(">> failed_nodes: %s", failed_nodes)

    if failed_nodes:
        no_progress_period, timeout = stats.get_progress_timing_data(job)
        logging.debug(">> no_progress_period: %s", no_progress_period)
        logging.debug(">> timeout: %s", timeout)
        # Only report the failures once the job has also been stalled for
        # longer than the timeout.
        if no_progress_period > timeout:
            result = failed_nodes

    logging.debug("< abort_due_to_failed_nodes")
    return result
示例#3
0
 def test_get_progress_timing_data_before_first_increment(self):
     # As long as no "progress counter increment" time stamp has been
     # recorded, the start time of the *executing* `JobPhaseStats` record
     # is the reference point for the "no progress" period.
     phase_start = datetime.utcnow() - timedelta(minutes=5)
     phase_stats = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     phase_stats.start_time = phase_start
     phase_stats.save()

     elapsed, timeout = stats.get_progress_timing_data(self.job)

     # Five minutes correspond to 300 seconds.
     is_close = approx_equal(300, elapsed, 5)
     self.assertTrue(is_close)
     self.assertEqual(99, timeout)
示例#4
0
 def test_get_progress_timing_data_before_first_increment(self):
     # There is no "progress counter increment" time stamp yet, so the
     # timing data has to be based on the start time of the *executing*
     # `JobPhaseStats` record.
     started_at = datetime.utcnow() - timedelta(minutes=5)
     executing_phase = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     executing_phase.start_time = started_at
     executing_phase.save()

     timing_data = stats.get_progress_timing_data(self.job)
     no_progress_period, timeout = timing_data

     # The phase started five minutes (300 seconds) ago.
     self.assertTrue(approx_equal(300, no_progress_period, 5))
     self.assertEqual(3601, timeout)
示例#5
0
 def test_get_progress_timing_data_with_stale_increment_ts(self):
     # A progress counter increment time stamp is present, but it is older
     # than the start time of the *executing* `JobPhaseStats` record, so
     # the latter is expected to win.
     # NOTE(review): "%s" is not one of the portable strftime directives;
     # confirm it is supported on every platform the tests run on.
     stale_increment = datetime.utcnow() - timedelta(minutes=9)
     stats.pk_set(self.job.id, "lvr_ts", stale_increment.strftime("%s"))

     phase_start = datetime.utcnow() - timedelta(minutes=8)
     phase_stats = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     phase_stats.start_time = phase_start
     phase_stats.save()

     elapsed, _ = stats.get_progress_timing_data(self.job)

     # Eight minutes (480 seconds) since the phase started.
     self.assertTrue(approx_equal(480, elapsed, 5))
示例#6
0
 def test_get_progress_timing_data_with_stale_increment_ts(self):
     # The stored progress counter increment time stamp must be ignored
     # here: the *executing* `JobPhaseStats` record carries a more recent
     # time stamp.
     # NOTE(review): "%s" is not one of the portable strftime directives;
     # confirm it is supported on every platform the tests run on.
     increment_ts = datetime.utcnow() - timedelta(minutes=9)
     epoch_text = increment_ts.strftime("%s")
     stats.pk_set(self.job.id, "lvr_ts", epoch_text)

     executing_since = datetime.utcnow() - timedelta(minutes=8)
     executing_phase = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     executing_phase.start_time = executing_since
     executing_phase.save()

     timing_data = stats.get_progress_timing_data(self.job)
     no_progress_period, _ = timing_data

     # 480 seconds == the eight minutes since the phase started.
     is_close = approx_equal(480, no_progress_period, 5)
     self.assertTrue(is_close)
示例#7
0
 def test_get_progress_timing_data_no_increment_multiple_rows(self):
     # Without a progress counter increment time stamp, and with several
     # *executing* `JobPhaseStats` records around, the most recent record
     # provides the time stamp.
     older_start = datetime.utcnow() - timedelta(minutes=5)
     hazard_stats = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     hazard_stats.start_time = older_start
     hazard_stats.save()

     newer_start = datetime.utcnow() - timedelta(minutes=2)
     risk_stats = JobPhaseStats(
         oq_job=self.job, ctype="risk", job_status="executing")
     risk_stats.start_time = newer_start
     risk_stats.save()

     elapsed, _ = stats.get_progress_timing_data(self.job)

     # The newer record started two minutes (120 seconds) ago.
     self.assertTrue(approx_equal(120, elapsed, 5))
示例#8
0
 def test_get_progress_timing_data_no_increment_multiple_rows(self):
     # No progress counter increment time stamp has been stored. Of the
     # two *executing* `JobPhaseStats` records saved below, the one with
     # the most recent start time is expected to be used.
     first_start = datetime.utcnow() - timedelta(minutes=5)
     first_phase = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     first_phase.start_time = first_start
     first_phase.save()

     second_start = datetime.utcnow() - timedelta(minutes=2)
     second_phase = JobPhaseStats(
         oq_job=self.job, ctype="risk", job_status="executing")
     second_phase.start_time = second_start
     second_phase.save()

     timing_data = stats.get_progress_timing_data(self.job)
     no_progress_period, _ = timing_data

     # 120 seconds == the two minutes since the later record started.
     is_close = approx_equal(120, no_progress_period, 5)
     self.assertTrue(is_close)