示例#1
0
def abort_due_to_failed_nodes(job_id):
    """Decide whether a job must be aborted because compute nodes failed.

    An abort is warranted only when both of these hold:
        - the monitor reports failed compute nodes for the job
        - the job's "no progress" period is longer than its timeout

    :param int job_id: the id of the job in question
    :returns: the failed-node count reported by the monitor if the job
        should be aborted, zero otherwise.
    """
    job = OqJob.objects.get(id=job_id)
    failure_count = monitor.count_failed_nodes(job)

    # Without any failures the progress timing data is irrelevant.
    if not failure_count:
        return 0

    logging.debug(">> failed_nodes: %s", failure_count)
    no_progress_period, timeout = stats.get_progress_timing_data(job)
    logging.debug(">> no_progress_period: %s", no_progress_period)
    logging.debug(">> timeout: %s", timeout)

    # Failures alone are not enough; the job must also have stalled for
    # longer than the timeout.
    return failure_count if no_progress_period > timeout else 0
示例#2
0
 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # db_mock reports node "N1" as "up" while live_mock reports no live
     # nodes at all. Expected result: 1 failed node.
     known_nodes = {
         "N1": models.CNodeStats(
             oq_job=self.job, node="N1", current_status="up"),
     }
     self.db_mock.return_value = known_nodes
     self.live_mock.return_value = set()
     self.assertEqual(1, monitor.count_failed_nodes(self.job))
示例#3
0
def abort_due_to_failed_nodes(job_id):
    """Tell whether failed compute nodes require aborting the job.

    The job is to be aborted only if these two conditions are met together:
        - failed compute nodes were observed for the job
        - the "no progress" timeout has been exceeded

    :param int job_id: the id of the job in question
    :returns: the failed-node count reported by the monitor if the job
        should be aborted, zero otherwise.
    """
    job = OqJob.objects.get(id=job_id)
    node_failures = monitor.count_failed_nodes(job)
    if not node_failures:
        # No failures observed, so there is no reason to abort.
        return 0

    logging.debug(">> failed_nodes: %s", node_failures)
    timing = stats.get_progress_timing_data(job)
    no_progress_period, timeout = timing
    logging.debug(">> no_progress_period: %s", no_progress_period)
    logging.debug(">> timeout: %s", timeout)

    if no_progress_period > timeout:
        # Nodes failed *and* the job has stalled for too long.
        return node_failures
    return 0
示例#4
0
def _switch_to_job_phase(job, ctype, status):
    """Move the given job into a new phase of execution.

    The job's status is saved, a `job_phase_stats` record is created for
    the phase, and the new status is logged. When the "executing" phase is
    entered and the calculation is distributed, the process exits with
    status 1 if the monitor signals that there are no live compute nodes.

    :param job:
        An :class:`~openquake.engine.db.models.OqJob` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job.status = status
    job.save()

    phase_record = dict(oq_job=job, job_status=status, ctype=ctype)
    models.JobPhaseStats.objects.create(**phase_record)
    logs.LOG.progress("%s (%s)" % (status, ctype))

    # Only a distributed "executing" phase needs the compute node check.
    if status != "executing" or openquake.engine.no_distribute():
        return

    # Record the compute nodes that were available at the beginning of the
    # execute phase so we can detect failed nodes later.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)
示例#5
0
 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # Node "N1" is recorded as "up" but is missing from the (empty) set
     # of live nodes. Expected result: 1 failed node.
     offline_node = models.CNodeStats(
         oq_job=self.job, node="N1", current_status="up")
     self.db_mock.return_value = dict(N1=offline_node)
     self.live_mock.return_value = set()
     failures = monitor.count_failed_nodes(self.job)
     self.assertEqual(1, failures)
示例#6
0
 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected result: 1 node failure. Node "N7" stands for a node that
     # was down from the very beginning and never recovered, i.e. it never
     # took on any tasks. Only nodes that were functioning at some time
     # during the calculation and *then* failed are counted.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     up_node.save(using="job_init")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": up_node, "N7": down_node}
     self.live_mock.return_value = set()
     self.assertEqual(1, monitor.count_failed_nodes(self.job))
     # The failed node has been updated to capture that.
     reloaded = models.CNodeStats.objects.get(id=up_node.id)
     self.assertEqual("down", reloaded.current_status)
     self.assertEqual(1, reloaded.failures)
示例#7
0
 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected result: 2 node failures. Note that the function under test
     # counts the total number of node failures that occurred during a
     # calculation and *not* the number of currently failed nodes.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N3", current_status="up")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N4", current_status="down", failures=1)
     self.db_mock.return_value = {"N3": up_node, "N4": down_node}
     self.live_mock.return_value = {"N5"}
     self.assertEqual(2, monitor.count_failed_nodes(self.job))
     # The new node ("N5") must also have been written to the database;
     # the unpacking below fails unless exactly one record matches.
     (new_node,) = models.CNodeStats.objects.filter(
         oq_job=self.job, node="N5")
     self.assertEqual("up", new_node.current_status)
     self.assertEqual(0, new_node.failures)
示例#8
0
def _switch_to_job_phase(job, ctype, status):
    """Move the given job into a new phase of execution.

    The job's status is saved and the new status is logged. When the
    "executing" phase is entered and the calculation is distributed, the
    process exits with status 1 if the monitor signals that there are no
    live compute nodes.

    :param job: An :class:`~openquake.engine.db.models.OqJob` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job.status = status
    job.save()
    logs.LOG.progress("%s (%s)", status, ctype)

    # Only a distributed "executing" phase needs the compute node check.
    if status != "executing" or openquake.engine.no_distribute():
        return

    # Record the compute nodes that were available at the beginning of the
    # execute phase so we can detect failed nodes later.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)
示例#9
0
 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected result: 1 node failure. Node "N7" stands for a node that
     # was down from the very beginning and never recovered, i.e. it never
     # took on any tasks. Only nodes that were functioning at some time
     # during the calculation and *then* failed are counted.
     working = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     working.save(using="job_superv")
     never_up = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": working, "N7": never_up}
     self.live_mock.return_value = set()
     failures = monitor.count_failed_nodes(self.job)
     self.assertEqual(1, failures)
     # The failed node has been updated to capture that.
     stored = models.CNodeStats.objects.get(id=working.id)
     self.assertEqual("down", stored.current_status)
     self.assertEqual(1, stored.failures)
示例#10
0
 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected result: 2 node failures. Note that the function under test
     # counts the total number of node failures that occurred during a
     # calculation and *not* the number of currently failed nodes.
     known_nodes = {
         "N3": models.CNodeStats(
             oq_job=self.job, node="N3", current_status="up"),
         "N4": models.CNodeStats(
             oq_job=self.job, node="N4", current_status="down",
             failures=1),
     }
     self.db_mock.return_value = known_nodes
     self.live_mock.return_value = {"N5"}
     failures = monitor.count_failed_nodes(self.job)
     self.assertEqual(2, failures)
     # The new node ("N5") must also have been written to the database;
     # the unpacking below fails unless exactly one record matches.
     matches = models.CNodeStats.objects.filter(oq_job=self.job, node="N5")
     [fresh_node] = matches
     self.assertEqual("up", fresh_node.current_status)
     self.assertEqual(0, fresh_node.failures)
示例#11
0
    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected result: 1 node failure. The node failed and then
        # recovered; its failures counter is unaffected by the recovery.
        node = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        node.save(using="job_init")
        self.assertEqual(0, node.failures)

        # Flip the node to "down" and re-read it: one failure is expected
        # on the stored record.
        node.current_status = "down"
        node.save(using="job_init")
        failed = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual(1, failed.failures)

        # The node shows up as live again.
        self.db_mock.return_value = {"N8": failed}
        self.live_mock.return_value = {"N8"}
        self.assertEqual(1, monitor.count_failed_nodes(self.job))

        # The stored record is "up" again and still carries the failure.
        recovered = models.CNodeStats.objects.get(id=failed.id)
        self.assertEqual("up", recovered.current_status)
        self.assertEqual(1, recovered.failures)
示例#12
0
    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected result: 1 node failure. The node failed and then
        # recovered; its failures counter is unaffected by the recovery.
        stats_row = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        stats_row.save(using="job_superv")
        self.assertEqual(0, stats_row.failures)

        # Flip the node to "down" and re-read it: one failure is expected
        # on the stored record.
        stats_row.current_status = "down"
        stats_row.save(using="job_superv")
        after_failure = models.CNodeStats.objects.get(id=stats_row.id)
        self.assertEqual(1, after_failure.failures)

        # The node shows up as live again.
        self.db_mock.return_value = dict(N8=after_failure)
        self.live_mock.return_value = {"N8"}
        failures = monitor.count_failed_nodes(self.job)
        self.assertEqual(1, failures)

        # The stored record is "up" again and still carries the failure.
        after_recovery = models.CNodeStats.objects.get(id=after_failure.id)
        self.assertEqual("up", after_recovery.current_status)
        self.assertEqual(1, after_recovery.failures)
示例#13
0
 def test_count_failed_nodes_with_zero_nodes(self):
     # With neither known nodes nor live nodes at the start of the
     # calculation, the function under test is expected to signal -1.
     self.db_mock.return_value = dict()
     self.live_mock.return_value = set()
     self.assertEqual(-1, monitor.count_failed_nodes(self.job))
示例#14
0
 def test_count_failed_nodes_with_zero_nodes(self):
     # Zero live nodes at the start of the calculation (and none known to
     # db_mock either) must be signalled with a -1 result.
     no_known_nodes, no_live_nodes = {}, set()
     self.db_mock.return_value = no_known_nodes
     self.live_mock.return_value = no_live_nodes
     result = monitor.count_failed_nodes(self.job)
     self.assertEqual(-1, result)