def cleanup_nodes(self) -> bool:
    """Run one garbage-collection pass over this scaleset's nodes.

    A scaleset in the ``halt`` state is halted and nothing else is done.
    Otherwise two node sets are gathered: nodes in a ready-for-reset state
    and outdated nodes that are currently free.  Outdated nodes are either
    queued for reimage (version ``1.0.0``) or sent a stop command; the
    ready-for-reset nodes are deleted, queued for scaleset deletion, or
    queued for reimage.

    Returns:
        ``False`` when neither search found any node, ``True`` in every
        other case (including the early halt path and the case where the
        scaleset update is rejected with ``UnableToUpdate``).
    """
    if self.state == ScalesetState.halt:
        self.halt()
        return True

    resettable = Node.search_states(
        scaleset_id=self.scaleset_id, states=NodeState.ready_for_reset()
    )
    stale = Node.search_outdated(
        scaleset_id=self.scaleset_id,
        states=[NodeState.free],
    )

    if not resettable and not stale:
        logging.debug("scaleset node gc done (no nodes) %s", self.scaleset_id)
        return False

    pending_delete = []
    pending_reimage = []

    for candidate in stale:
        if candidate.version != "1.0.0":
            # Every outdated version other than 1.0.0 is asked to stop via
            # a queued command rather than being reimaged here.
            NodeMessage(
                agent_id=candidate.machine_id,
                message=NodeCommand(stop=StopNodeCommand()),
            ).save()
            continue
        # NOTE(review): presumably 1.0.0 agents cannot handle the stop
        # command, hence the direct reimage - confirm.
        pending_reimage.append(candidate)

    for candidate in resettable:
        # Nodes that no longer exist in the scaleset are not waiting on the
        # scaleset GC, so only their record is removed.
        if not candidate.scaleset_node_exists():
            candidate.delete()
            continue

        if candidate.state in (NodeState.shutdown, NodeState.halt):
            pending_delete.append(candidate)
        else:
            pending_reimage.append(candidate)

    # Apply the scaleset operations in order; the first one rejected because
    # the scaleset is locked aborts the remainder of this pass.
    try:
        if pending_delete:
            self.delete_nodes(pending_delete)
            for removed in pending_delete:
                removed.state = NodeState.halt
                removed.save()

        if pending_reimage:
            self.reimage_nodes(pending_reimage)
    except UnableToUpdate:
        logging.info("scaleset update already in progress: %s", self.scaleset_id)

    return True
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Answer a node's "can I schedule this task?" request.

    The request body is parsed as a ``CanScheduleRequest``.  A parse failure
    or an unknown machine id yields a ``not_ok`` response.  Otherwise the
    reply is a ``CanSchedule`` where:

    * ``work_stopped`` is true when the task lookup returned an ``Error`` or
      the task is not in the ``scheduled`` state;
    * ``allowed`` is true only when the node is not outdated and the work
      has not been stopped.

    An outdated node is additionally sent a stop command.
    """
    parsed = parse_request(CanScheduleRequest, req)
    if isinstance(parsed, Error):
        return not_ok(parsed, context="CanScheduleRequest")

    agent_node = Node.get_by_machine_id(parsed.machine_id)
    if not agent_node:
        missing = Error(code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"])
        return not_ok(missing, context=parsed.machine_id)

    outdated = agent_node.is_outdated()
    if outdated:
        logging.info(
            "received can_schedule request from outdated node '%s' version '%s'",
            agent_node.machine_id,
            agent_node.version,
        )
        # Queue a stop command for the outdated agent.
        NodeMessage(
            agent_id=agent_node.machine_id,
            message=NodeCommand(stop=StopNodeCommand()),
        ).save()

    task = Task.get_by_task_id(parsed.task_id)
    if isinstance(task, Error):
        work_stopped = True
    else:
        work_stopped = task.state != TaskState.scheduled

    # Scheduling is permitted only for an up-to-date node whose task is
    # still waiting to be scheduled.
    allowed = not outdated and not work_stopped

    return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
def stop(self, done: bool = False) -> None:
    """Call ``to_reimage`` and then send this node a stop command.

    Args:
        done: Forwarded unchanged as the ``done`` keyword of ``to_reimage``.
    """
    self.to_reimage(done=done)

    stop_command = NodeCommand(stop=StopNodeCommand())
    self.send_message(stop_command)
def stop(self) -> None:
    """Call ``to_reimage`` and then send this node a stop command."""
    self.to_reimage()

    stop_command = NodeCommand(stop=StopNodeCommand())
    self.send_message(stop_command)