Exemplo n.º 1
0
def abort_due_to_failed_nodes(job_id):
    """Tell whether a job must be aborted because compute nodes failed.

    An abort is warranted only when both of these hold:
        - at least one failed compute node has been observed
        - the "no progress" period is longer than the timeout

    :param int job_id: the id of the job in question
    :returns: the failed compute node count when the job should be aborted,
        zero otherwise.
    """
    job = OqJob.objects.get(id=job_id)
    node_failures = monitor.count_failed_nodes(job)

    # Without failed nodes there is nothing to decide; the timing data is
    # only fetched (and logged) once failures have been seen.
    if not node_failures:
        return 0

    logging.debug(">> failed_nodes: %s", node_failures)
    idle_period, idle_limit = stats.get_progress_timing_data(job)
    logging.debug(">> no_progress_period: %s", idle_period)
    logging.debug(">> timeout: %s", idle_limit)

    # Strict comparison: a period exactly equal to the timeout is tolerated.
    if idle_period > idle_limit:
        return node_failures
    return 0
Exemplo n.º 2
0
def abort_due_to_failed_nodes(job_id):
    """Decide if failed compute nodes justify aborting the given job.

    Both conditions must be met for an abort:
        - failed compute nodes were observed for the job
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the number of failed compute nodes if the job should be
        aborted, zero otherwise.
    """
    the_job = OqJob.objects.get(id=job_id)
    failures = monitor.count_failed_nodes(the_job)
    if not failures:
        # No failed nodes seen: never abort, and skip the timing lookup.
        return 0

    logging.debug(">> failed_nodes: %s", failures)
    stalled_for, allowed = stats.get_progress_timing_data(the_job)
    logging.debug(">> no_progress_period: %s", stalled_for)
    logging.debug(">> timeout: %s", allowed)

    # Report the failure count only once the stall exceeds the timeout.
    return failures if stalled_for > allowed else 0
Exemplo n.º 3
0
 def test_get_progress_timing_data_before_first_increment(self):
     # With no "progress counter increment" time stamp recorded, the start
     # time of the *executing* `JobPhaseStats` record is used instead.
     phase_start = datetime.utcnow() - timedelta(minutes=5)
     phase_stats = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     phase_stats.start_time = phase_start
     phase_stats.save()

     elapsed, limit = stats.get_progress_timing_data(self.job)

     # The record started 5 minutes (300 seconds) ago.
     self.assertTrue(approx_equal(300, elapsed, 5))
     # NOTE(review): 99 is presumably the timeout configured for the test
     # environment -- confirm against the fixture/config.
     self.assertEqual(99, limit)
Exemplo n.º 4
0
 def test_get_progress_timing_data_before_first_increment(self):
     # There is no "progress counter increment" time stamp yet, so the
     # time stamp of the *executing* `JobPhaseStats` record is what counts.
     started_at = datetime.utcnow() - timedelta(minutes=5)
     record = JobPhaseStats(
         oq_job=self.job,
         ctype="hazard",
         job_status="executing",
     )
     record.start_time = started_at
     record.save()

     no_progress_period, timeout_limit = stats.get_progress_timing_data(
         self.job)

     # Five minutes without progress, i.e. 300 seconds.
     self.assertTrue(approx_equal(300, no_progress_period, 5))
     self.assertEqual(99, timeout_limit)
Exemplo n.º 5
0
 def test_get_progress_timing_data_with_stale_increment_ts(self):
     # A progress counter increment time stamp exists, but it is ignored
     # because the *executing* `JobPhaseStats` record carries a more
     # recent time stamp.
     increment_ts = datetime.utcnow() - timedelta(minutes=9)
     stats.pk_set(self.job.id, "lvr_ts", increment_ts.strftime("%s"))

     phase_start = datetime.utcnow() - timedelta(minutes=8)
     phase_stats = JobPhaseStats(
         oq_job=self.job, ctype="hazard", job_status="executing")
     phase_stats.start_time = phase_start
     phase_stats.save()

     # Only the "no progress" period is checked here.
     elapsed, _ = stats.get_progress_timing_data(self.job)

     # 8 minutes (480 seconds) matches the record, not the 9-minute stamp.
     self.assertTrue(approx_equal(480, elapsed, 5))
Exemplo n.º 6
0
 def test_get_progress_timing_data_with_stale_increment_ts(self):
     # The stored progress counter increment time stamp is stale: the
     # *executing* `JobPhaseStats` record has a more recent time stamp, so
     # the latter is the one that gets used.
     stale_stamp = datetime.utcnow() - timedelta(minutes=9)
     stats.pk_set(self.job.id, "lvr_ts", stale_stamp.strftime("%s"))

     fresher_stamp = datetime.utcnow() - timedelta(minutes=8)
     record = JobPhaseStats(
         oq_job=self.job,
         ctype="hazard",
         job_status="executing",
     )
     record.start_time = fresher_stamp
     record.save()

     # The timeout half of the result is not examined by this test.
     no_progress_period, _ = stats.get_progress_timing_data(self.job)

     # Expect roughly 8 minutes, i.e. 480 seconds.
     self.assertTrue(approx_equal(480, no_progress_period, 5))
Exemplo n.º 7
0
 def test_get_progress_timing_data_no_increment_multiple_rows(self):
     # No progress counter increment time stamp exists, so the time stamp
     # of the most recent *executing* `JobPhaseStats` record is used.
     #
     # Two executing records are saved in order: "hazard" started 5 minutes
     # ago, then "risk" started 2 minutes ago.
     for calc_type, minutes_ago in (("hazard", 5), ("risk", 2)):
         started_at = datetime.utcnow() - timedelta(minutes=minutes_ago)
         phase_stats = JobPhaseStats(
             oq_job=self.job, ctype=calc_type, job_status="executing")
         phase_stats.start_time = started_at
         phase_stats.save()

     # Only the "no progress" period is checked here.
     elapsed, _ = stats.get_progress_timing_data(self.job)

     # 2 minutes (120 seconds) corresponds to the newer "risk" record.
     self.assertTrue(approx_equal(120, elapsed, 5))
Exemplo n.º 8
0
 def test_get_progress_timing_data_no_increment_multiple_rows(self):
     # Without a progress counter increment time stamp, the most recent
     # *executing* `JobPhaseStats` record supplies the time stamp.

     # Older record: hazard phase, started 5 minutes ago.
     hazard_start = datetime.utcnow() - timedelta(minutes=5)
     hazard_stats = JobPhaseStats(
         oq_job=self.job,
         ctype="hazard",
         job_status="executing",
     )
     hazard_stats.start_time = hazard_start
     hazard_stats.save()

     # Newer record: risk phase, started 2 minutes ago.
     risk_start = datetime.utcnow() - timedelta(minutes=2)
     risk_stats = JobPhaseStats(
         oq_job=self.job,
         ctype="risk",
         job_status="executing",
     )
     risk_stats.start_time = risk_start
     risk_stats.save()

     # The timeout half of the result is not examined by this test.
     no_progress_period, _ = stats.get_progress_timing_data(self.job)

     # Expect roughly 2 minutes, i.e. 120 seconds.
     self.assertTrue(approx_equal(120, no_progress_period, 5))