Пример #1
0
    def cleanup_nodes(self) -> bool:
        if self.state == ScalesetState.halt:
            self.halt()
            return True

        nodes = Node.search_states(scaleset_id=self.scaleset_id,
                                   states=NodeState.ready_for_reset())

        outdated = Node.search_outdated(
            scaleset_id=self.scaleset_id,
            states=[NodeState.free],
        )

        if not (nodes or outdated):
            logging.debug("scaleset node gc done (no nodes) %s",
                          self.scaleset_id)
            return False

        to_delete = []
        to_reimage = []

        for node in outdated:
            if node.version == "1.0.0":
                to_reimage.append(node)
            else:
                stop_message = NodeMessage(
                    agent_id=node.machine_id,
                    message=NodeCommand(stop=StopNodeCommand()),
                )
                stop_message.save()

        for node in nodes:
            # delete nodes that are not waiting on the scaleset GC
            if not node.scaleset_node_exists():
                node.delete()
            elif node.state in [NodeState.shutdown, NodeState.halt]:
                to_delete.append(node)
            else:
                to_reimage.append(node)

        # Perform operations until they fail due to scaleset getting locked
        try:
            if to_delete:
                self.delete_nodes(to_delete)
                for node in to_delete:
                    node.state = NodeState.halt
                    node.save()

            if to_reimage:
                self.reimage_nodes(to_reimage)
        except UnableToUpdate:
            logging.info("scaleset update already in progress: %s",
                         self.scaleset_id)
        return True
Пример #2
0
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer a node's "can I schedule this task?" request.

    The request is parsed as a ``CanScheduleRequest``. A parse failure or an
    unknown machine id yields a ``not_ok`` response. Otherwise the reply is a
    ``CanSchedule`` whose ``work_stopped`` is true when the task lookup
    returned an ``Error`` or the task is not in ``TaskState.scheduled``, and
    whose ``allowed`` is true only when the node is not outdated and the work
    has not stopped.

    An outdated node is additionally sent a stop command via a saved
    ``NodeMessage``.
    """
    request = parse_request(CanScheduleRequest, req)
    if isinstance(request, Error):
        return not_ok(request, context="CanScheduleRequest")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        not_found = Error(
            code=ErrorCode.UNABLE_TO_FIND,
            errors=["unable to find node"],
        )
        return not_ok(not_found, context=request.machine_id)

    outdated = node.is_outdated()
    if outdated:
        logging.info(
            "received can_schedule request from outdated node '%s' version '%s'",
            node.machine_id,
            node.version,
        )
        NodeMessage(
            agent_id=node.machine_id,
            message=NodeCommand(stop=StopNodeCommand()),
        ).save()

    # The task is looked up even for outdated nodes so that work_stopped is
    # always reported.
    task = Task.get_by_task_id(request.task_id)
    if isinstance(task, Error):
        work_stopped = True
    else:
        work_stopped = task.state != TaskState.scheduled

    allowed = not (outdated or work_stopped)
    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
Пример #3
0
 def stop(self, done: bool = False) -> None:
     """Call ``to_reimage(done=done)``, then send this node a stop command.

     Args:
         done: Forwarded unchanged to ``self.to_reimage``.
     """
     self.to_reimage(done=done)
     stop_command = NodeCommand(stop=StopNodeCommand())
     self.send_message(stop_command)
Пример #4
0
 def stop(self) -> None:
     """Call ``to_reimage()``, then send this node a stop command."""
     self.to_reimage()
     stop_command = NodeCommand(stop=StopNodeCommand())
     self.send_message(stop_command)