Python count_failed_nodes示例，openquake.engine.utils.monitor.count_failed_nodes Python示例

示例#1

0

显示文件

文件： supervisor.py 项目： ryanberrio/oq-engine

def abort_due_to_failed_nodes(job_id):
    """Decide whether failed compute nodes warrant aborting the job.

    An abort is signalled only when both of these hold:
        - ``monitor.count_failed_nodes`` reports a truthy count for the job
        - the job's "no progress" period is greater than its timeout

    :param int job_id: the id of the job in question
    :returns: the failed node count when the job should be aborted,
        zero otherwise.
    """
    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)

    # Nothing failed: the timing data is not even consulted.
    if not failed_nodes:
        return 0

    logging.debug(">> failed_nodes: %s", failed_nodes)
    no_progress_period, timeout = stats.get_progress_timing_data(job)
    logging.debug(">> no_progress_period: %s", no_progress_period)
    logging.debug(">> timeout: %s", timeout)

    # Failures alone are not enough; the job must also have stalled for
    # longer than the allowed timeout.
    return failed_nodes if no_progress_period > timeout else 0

示例#2

0

显示文件

文件： utils_monitor_test.py 项目： luisera/oq-engine

 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # db_mock hands back node "N1" with status "up" while live_mock
     # reports an empty set of nodes: one failed node is expected.
     self.db_mock.return_value = {
         "N1": models.CNodeStats(
             oq_job=self.job, node="N1", current_status="up"),
     }
     self.live_mock.return_value = set()
     self.assertEqual(1, monitor.count_failed_nodes(self.job))

示例#3

0

显示文件

def abort_due_to_failed_nodes(job_id):
    """Decide whether failed compute nodes warrant aborting the job.

    An abort is signalled only when both of these hold:
        - ``monitor.count_failed_nodes`` reports a truthy count for the job
        - the job's "no progress" period is greater than its timeout

    :param int job_id: the id of the job in question
    :returns: the failed node count when the job should be aborted,
        zero otherwise.
    """
    job = OqJob.objects.get(id=job_id)
    failed_nodes = monitor.count_failed_nodes(job)

    # Nothing failed: the timing data is not even consulted.
    if not failed_nodes:
        return 0

    logging.debug(">> failed_nodes: %s", failed_nodes)
    no_progress_period, timeout = stats.get_progress_timing_data(job)
    logging.debug(">> no_progress_period: %s", no_progress_period)
    logging.debug(">> timeout: %s", timeout)

    # Failures alone are not enough; the job must also have stalled for
    # longer than the allowed timeout.
    return failed_nodes if no_progress_period > timeout else 0

示例#4

0

显示文件

def _switch_to_job_phase(job, ctype, status):
    """Move the job into the given phase of execution.

    The status is stored on the job and saved, a `job_phase_stats` record
    is created for the transition, and the new status is reported through
    the progress log.  When the phase is "executing" and
    ``openquake.engine.no_distribute()`` is false, the compute nodes are
    checked; a result of -1 from ``monitor.count_failed_nodes`` logs a
    critical message and exits the process with status 1.

    :param job:
        An :class:`~openquake.engine.db.models.OqJob` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job.status = status
    job.save()
    models.JobPhaseStats.objects.create(
        oq_job=job, job_status=status, ctype=ctype)
    message = "%s (%s)" % (status, ctype)
    logs.LOG.progress(message)

    # The node check below applies only to an "executing" phase for which
    # no_distribute() is false; every other transition is finished here.
    if status != "executing" or openquake.engine.no_distribute():
        return

    # Per the original author's note, this call records the compute nodes
    # available at the beginning of the execute phase so that failed nodes
    # can be detected later.  -1 means there were no live nodes at all.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)

示例#5

0

显示文件

 def test_count_failed_nodes_with_a_node_that_went_offline(self):
     # db_mock hands back node "N1" with status "up" while live_mock
     # reports an empty set of nodes: one failed node is expected.
     self.db_mock.return_value = {
         "N1": models.CNodeStats(
             oq_job=self.job, node="N1", current_status="up"),
     }
     self.live_mock.return_value = set()
     self.assertEqual(1, monitor.count_failed_nodes(self.job))

示例#6

0

显示文件

文件： utils_monitor_test.py 项目： luisera/oq-engine

 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected result: 1 node failure.  "N7" stands for a node that was
     # down from the very beginning and never recovered, i.e. it never
     # took on any tasks.  Only a node that worked at some point during
     # the calculation and failed afterwards ("N6" here) is counted.
     was_up = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     was_up.save(using="job_init")
     always_down = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": was_up, "N7": always_down}
     self.live_mock.return_value = set()

     self.assertEqual(1, monitor.count_failed_nodes(self.job))

     # Re-read "N6": the failure must have been captured on its record.
     reloaded = models.CNodeStats.objects.get(id=was_up.id)
     self.assertEqual("down", reloaded.current_status)
     self.assertEqual(1, reloaded.failures)

示例#7

0

显示文件

文件： utils_monitor_test.py 项目： luisera/oq-engine

 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected result: 2 node failures.  The function under test counts
     # the total number of node failures that occurred during a
     # calculation, *not* the number of nodes that are currently failed.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N3", current_status="up")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N4", current_status="down", failures=1)
     self.db_mock.return_value = {"N3": up_node, "N4": down_node}
     self.live_mock.return_value = set(["N5"])

     self.assertEqual(2, monitor.count_failed_nodes(self.job))

     # The previously unknown node "N5" must have been written to the
     # database; the unpacking requires exactly one matching row.
     [new_node] = models.CNodeStats.objects.filter(
         oq_job=self.job, node="N5")
     self.assertEqual("up", new_node.current_status)
     self.assertEqual(0, new_node.failures)

示例#8

0

显示文件

文件： engine.py 项目： chenliu0831/oq-engine

def _switch_to_job_phase(job, ctype, status):
    """Move the job into the given phase of execution.

    The status is stored on the job and saved, then reported through the
    progress log.  When the phase is "executing" and
    ``openquake.engine.no_distribute()`` is false, the compute nodes are
    checked; a result of -1 from ``monitor.count_failed_nodes`` logs a
    critical message and exits the process with status 1.

    :param job: An :class:`~openquake.engine.db.models.OqJob` instance.
    :param str ctype: calculation type (hazard|risk)
    :param str status: one of the following: pre_executing, executing,
        post_executing, post_processing, export, clean_up, complete
    """
    job.status = status
    job.save()
    logs.LOG.progress("%s (%s)", status, ctype)

    # The node check below applies only to an "executing" phase for which
    # no_distribute() is false; every other transition is finished here.
    if status != "executing" or openquake.engine.no_distribute():
        return

    # Per the original author's note, this call records the compute nodes
    # available at the beginning of the execute phase so that failed nodes
    # can be detected later.  -1 means there were no live nodes at all.
    if monitor.count_failed_nodes(job) == -1:
        logs.LOG.critical("No live compute nodes, aborting calculation")
        sys.exit(1)

示例#9

0

显示文件

 def test_count_failed_nodes_with_failures_before_calculation(self):
     # Expected result: 1 node failure.  "N7" stands for a node that was
     # down from the very beginning and never recovered, i.e. it never
     # took on any tasks.  Only a node that worked at some point during
     # the calculation and failed afterwards ("N6" here) is counted.
     was_up = models.CNodeStats(
         oq_job=self.job, node="N6", current_status="up")
     was_up.save(using="job_superv")
     always_down = models.CNodeStats(
         oq_job=self.job, node="N7", current_status="down")
     self.db_mock.return_value = {"N6": was_up, "N7": always_down}
     self.live_mock.return_value = set()

     self.assertEqual(1, monitor.count_failed_nodes(self.job))

     # Re-read "N6": the failure must have been captured on its record.
     reloaded = models.CNodeStats.objects.get(id=was_up.id)
     self.assertEqual("down", reloaded.current_status)
     self.assertEqual(1, reloaded.failures)

示例#10

0

显示文件

 def test_count_failed_nodes_with_failures_during_calculation(self):
     # Expected result: 2 node failures.  The function under test counts
     # the total number of node failures that occurred during a
     # calculation, *not* the number of nodes that are currently failed.
     up_node = models.CNodeStats(
         oq_job=self.job, node="N3", current_status="up")
     down_node = models.CNodeStats(
         oq_job=self.job, node="N4", current_status="down", failures=1)
     self.db_mock.return_value = {"N3": up_node, "N4": down_node}
     self.live_mock.return_value = set(["N5"])

     self.assertEqual(2, monitor.count_failed_nodes(self.job))

     # The previously unknown node "N5" must have been written to the
     # database; the unpacking requires exactly one matching row.
     [new_node] = models.CNodeStats.objects.filter(
         oq_job=self.job, node="N5")
     self.assertEqual("up", new_node.current_status)
     self.assertEqual(0, new_node.failures)

示例#11

0

显示文件

文件： utils_monitor_test.py 项目： luisera/oq-engine

    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected result: 1 node failure.  The node failed and came back;
        # recovering must leave its failures counter untouched.
        node = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        node.save(using="job_init")
        self.assertEqual(0, node.failures)

        # Take the node down; the re-read record must show one failure.
        node.current_status = "down"
        node.save(using="job_init")
        failed = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual(1, failed.failures)

        # db_mock returns the node as "down" while live_mock lists it
        # again, i.e. it has recovered.
        self.db_mock.return_value = {"N8": failed}
        self.live_mock.return_value = set(["N8"])
        self.assertEqual(1, monitor.count_failed_nodes(self.job))

        # The stored record must be "up" again with the failure retained.
        recovered = models.CNodeStats.objects.get(id=failed.id)
        self.assertEqual("up", recovered.current_status)
        self.assertEqual(1, recovered.failures)

示例#12

0

显示文件

    def test_count_failed_nodes_with_failed_and_recovered_node(self):
        # Expected result: 1 node failure.  The node failed and came back;
        # recovering must leave its failures counter untouched.
        node = models.CNodeStats(
            oq_job=self.job, node="N8", current_status="up")
        node.save(using="job_superv")
        self.assertEqual(0, node.failures)

        # Take the node down; the re-read record must show one failure.
        node.current_status = "down"
        node.save(using="job_superv")
        failed = models.CNodeStats.objects.get(id=node.id)
        self.assertEqual(1, failed.failures)

        # db_mock returns the node as "down" while live_mock lists it
        # again, i.e. it has recovered.
        self.db_mock.return_value = {"N8": failed}
        self.live_mock.return_value = set(["N8"])
        self.assertEqual(1, monitor.count_failed_nodes(self.job))

        # The stored record must be "up" again with the failure retained.
        recovered = models.CNodeStats.objects.get(id=failed.id)
        self.assertEqual("up", recovered.current_status)
        self.assertEqual(1, recovered.failures)

示例#13

0

显示文件

文件： utils_monitor_test.py 项目： luisera/oq-engine

 def test_count_failed_nodes_with_zero_nodes(self):
     # With no nodes returned by db_mock and none by live_mock, i.e. zero
     # live nodes at the start of the calculation, -1 must be signalled.
     self.live_mock.return_value = set()
     self.db_mock.return_value = {}
     self.assertEqual(-1, monitor.count_failed_nodes(self.job))

示例#14

0

显示文件

 def test_count_failed_nodes_with_zero_nodes(self):
     # With no nodes returned by db_mock and none by live_mock, i.e. zero
     # live nodes at the start of the calculation, -1 must be signalled.
     self.live_mock.return_value = set()
     self.db_mock.return_value = {}
     self.assertEqual(-1, monitor.count_failed_nodes(self.job))