Exemplo n.º 1
0
 def test_resets_status_expires(self):
     """Processing a curtin install event resets status_expires."""
     deploying_node = factory.make_Node(
         status=NODE_STATUS.DEPLOYING,
         status_expires=factory.make_date(),
         with_empty_script_sets=True,
     )
     event_type = random.choice(['start', 'finish'])
     event_name = random.choice([
         'cmd-install',
         'cmd-install/stage-early',
         'cmd-install/stage-late',
     ])
     message = {
         'event_type': event_type,
         'origin': 'curtin',
         'name': event_name,
         'description': 'Installing',
         'timestamp': datetime.utcnow(),
     }
     self.processMessage(deploying_node, message)
     deploying_node = reload_object(deploying_node)

     # The code under test reads the clock at a slightly different moment
     # than this test does, so an exact comparison would be flaky. Accept
     # any expiry within one minute either side of the expected deadline.
     reference = now()
     timeout_minutes = get_node_timeout(NODE_STATUS.DEPLOYING)
     expected_time = reference + timedelta(minutes=timeout_minutes)
     tolerance = timedelta(minutes=1)
     self.assertGreaterEqual(
         deploying_node.status_expires, expected_time - tolerance)
     self.assertLessEqual(
         deploying_node.status_expires, expected_time + tolerance)
Exemplo n.º 2
0
 def test__resets_status_expires(self):
     """get_config for a monitored node's boot MAC resets status_expires."""
     rack = factory.make_RackController()
     local_address = factory.make_ip_address()
     remote_address = factory.make_ip_address()
     monitored_status = random.choice(MONITORED_STATUSES)
     node = self.make_node(
         status=monitored_status,
         status_expires=factory.make_date(),
     )
     boot_mac = node.get_boot_interface().mac_address

     get_config(
         rack.system_id, local_address, remote_address, mac=boot_mac)
     node = reload_object(node)

     # The code under test reads the clock at a slightly different moment
     # than this test does, so an exact comparison would be flaky. Accept
     # any expiry within one minute either side of the expected deadline.
     reference = now()
     timeout_minutes = get_node_timeout(monitored_status)
     expected_time = reference + timedelta(minutes=timeout_minutes)
     tolerance = timedelta(minutes=1)
     self.assertGreaterEqual(
         node.status_expires, expected_time - tolerance)
     self.assertLessEqual(
         node.status_expires, expected_time + tolerance)
Exemplo n.º 3
0
def mark_nodes_failed_after_expiring(now, node_timeout):
    """Fail every monitored node whose `status_expires` deadline has passed.

    A node is selected when its status is one of MONITORED_STATUSES and its
    `status_expires` is set to a time at or before `now`. Each selected node
    is logged and marked failed, with its script results marked as aborted.

    :param now: The time `status_expires` is compared against.
    :param node_timeout: Passed to `get_node_timeout` to obtain the number
        of minutes reported in the log and failure messages.
    """
    overdue_nodes = Node.objects.filter(
        status__in=MONITORED_STATUSES,
        status_expires__isnull=False,
        status_expires__lte=now,
    )
    for node in overdue_nodes:
        minutes = get_node_timeout(node.status, node_timeout)
        operation = NODE_STATUS_CHOICES_DICT[node.status]
        log_message = "%s: Operation '%s' timed out after %s minutes." % (
            node.hostname, operation, minutes)
        maaslog.info(log_message)
        failure_comment = (
            "Node operation '%s' timed out after %s minutes." %
            (operation, minutes))
        node.mark_failed(
            comment=failure_comment,
            script_result_status=SCRIPT_STATUS.ABORTED,
        )
Exemplo n.º 4
0
    def test_sets_status_expires_when_flatlined_with_may_reboot_script(self):
        """A flatlined node running a may_reboot script gets a new deadline.

        Instead of being failed, the node is expected to have
        `status_expires` set to the last ping time plus the node timeout.
        """
        node, script_set = self.make_node()
        current_time = now()
        if self.status == NODE_STATUS.COMMISSIONING:
            script_type = SCRIPT_TYPE.COMMISSIONING
        else:
            script_type = SCRIPT_TYPE.TESTING
        script = factory.make_Script(script_type=script_type, may_reboot=True)
        factory.make_ScriptResult(
            script=script, script_set=script_set, status=SCRIPT_STATUS.RUNNING)
        # The last ping is 11 *days* old (the first positional argument of
        # timedelta is days), far beyond the 20 minute timeout used below.
        script_set.last_ping = current_time - timedelta(days=11)
        script_set.save()

        mark_nodes_failed_after_missing_script_timeout(current_time, 20)
        node = reload_object(node)

        # The expected value is written the same way the status monitor
        # computes it: now - (time since last ping) + timeout.
        self.assertEqual(
            current_time - (current_time - script_set.last_ping) + timedelta(
                minutes=get_node_timeout(self.status, 20)),
            node.status_expires)
Exemplo n.º 5
0
def _script_set_prefetch(lookup):
    """Build the Prefetch for the node script set relation named `lookup`.

    The script results are loaded without their output, stdout, stderr and
    result fields, and each result's script is loaded with only its
    script_type, name, may_reboot and timeout fields.
    """
    return Prefetch(
        lookup,
        ScriptSet.objects.prefetch_related(
            Prefetch(
                "scriptresult_set",
                ScriptResult.objects.defer(
                    "output", "stdout", "stderr",
                    "result").prefetch_related(
                        Prefetch(
                            "script",
                            Script.objects.only("script_type", "name",
                                                "may_reboot", "timeout"),
                        )),
            )),
    )


def _script_result_timeout(script_result):
    """Return how long `script_result` may run, or None when unknown.

    Sources are consulted in this order:

    1. The value of the first parameter whose type is "runtime". When that
       value is set it is the only source used: a value that cannot be
       turned into a positive duration yields None.
    2. The "timeout" entry for the result's name in NODE_INFO_SCRIPTS.
    3. The timeout of the associated Script, when it is greater than zero.
    """
    runtime = None
    for param in script_result.parameters.values():
        if param.get("type") == "runtime":
            runtime = param.get("value")
            break
    if runtime is not None:
        if isinstance(runtime, timedelta):
            limit = runtime
        else:
            # NOTE(review): the stored runtime value is assumed to be a
            # number of seconds -- confirm against the code that validates
            # and stores script parameters.
            try:
                limit = timedelta(seconds=int(runtime))
            except (TypeError, ValueError, OverflowError):
                return None
        return limit if limit > timedelta(0) else None
    if (script_result.name in NODE_INFO_SCRIPTS
            and "timeout" in NODE_INFO_SCRIPTS[script_result.name]):
        return NODE_INFO_SCRIPTS[script_result.name]["timeout"]
    script = script_result.script
    # total_seconds() is used rather than .seconds because .seconds is only
    # the seconds component and is 0 for a timeout of whole days.
    if script is not None and script.timeout.total_seconds() > 0:
        return script.timeout
    return None


def _stop_if_ssh_disabled(node):
    """Log and stop `node` when its enable_ssh flag is not set."""
    if not node.enable_ssh:
        maaslog.info("%s: Stopped because SSH is disabled" %
                     node.hostname)
        node.stop(comment="Node stopped because SSH is disabled")


def mark_nodes_failed_after_missing_script_timeout(now, node_timeout):
    """Check on the status of commissioning or testing nodes.

    For any node currently commissioning or testing check that a region is
    still receiving its heartbeat and no running script has gone past its
    run limit. If the node fails either condition it is put into a failed
    status.

    :param now: The time heartbeats and script run limits are measured
        against.
    :param node_timeout: Minutes without a heartbeat after which a node is
        considered to have flatlined.
    """
    # maas-run-remote-scripts sends a heartbeat every two minutes. If we
    # haven't received a heartbeat within node_timeout(20 min by default)
    # it's dead.
    heartbeat_expired = now - timedelta(minutes=node_timeout)
    # Get the list of nodes currently running testing. status_expires is used
    # while the node is booting. Once MAAS receives the signal that testing
    # has begun it resets status_expires and checks for the heartbeat instead.
    qs = Node.objects.filter(
        status__in=[NODE_STATUS.COMMISSIONING, NODE_STATUS.TESTING],
        status_expires=None,
    )
    qs = qs.prefetch_related(
        _script_set_prefetch("current_commissioning_script_set"),
        _script_set_prefetch("current_testing_script_set"),
    )
    for node in qs:
        if node.status == NODE_STATUS.COMMISSIONING:
            script_set = node.current_commissioning_script_set
        elif node.status == NODE_STATUS.TESTING:
            script_set = node.current_testing_script_set
        script_results = [
            script_result for script_result in script_set
            if script_result.status == SCRIPT_STATUS.RUNNING
        ]
        maybe_rebooting = any(
            script_result.script and script_result.script.may_reboot
            for script_result in script_results)
        flatlined = (script_set.last_ping is not None
                     and script_set.last_ping < heartbeat_expired)
        if flatlined and maybe_rebooting:
            # If the script currently running may_reboot and the nodes
            # heartbeat has flatlined assume the node is rebooting. Set the
            # node.status_expires time to the boot timeout minus what has
            # already passed.
            minutes = get_node_timeout(node.status, node_timeout)
            node.status_expires = (now - (now - script_set.last_ping) +
                                   timedelta(minutes=minutes))
            node.save(update_fields=["status_expires"])
            continue
        if flatlined:
            maaslog.info(
                "%s: Has not been heard from for the last %s minutes" %
                (node.hostname, node_timeout))
            node.mark_failed(
                comment=(
                    "Node has not been heard from for the last %s minutes" %
                    node_timeout),
                script_result_status=SCRIPT_STATUS.TIMEDOUT,
            )
            _stop_if_ssh_disabled(node)
            continue

        # Check for scripts which have gone past their timeout.
        for script_result in script_results:
            timeout = _script_result_timeout(script_result)
            if timeout is None:
                # No usable run limit for this script; nothing to enforce.
                continue
            # The node running the scripts checks if the script has run past
            # its time limit. The node will try to kill the script and move on
            # by signaling the region. If after 5 minutes past the timeout the
            # region hasn't received the signal mark_failed and stop the node.
            script_expires = (script_result.started + timeout +
                              timedelta(minutes=5))
            if script_expires < now:
                script_result.status = SCRIPT_STATUS.TIMEDOUT
                script_result.save(update_fields=["status"])
                maaslog.info("%s: %s has run past it's timeout(%s)" %
                             (node.hostname, script_result.name, str(timeout)))
                node.mark_failed(
                    comment="%s has run past it's timeout(%s)" %
                    (script_result.name, str(timeout)),
                    script_result_status=SCRIPT_STATUS.ABORTED,
                )
                _stop_if_ssh_disabled(node)
                # The node has been failed; its remaining scripts need no
                # further checking.
                break