def call(cmd: List[str]) -> None:
    """Run ``cmd`` as a subprocess and log the outcome.

    The shell-quoted command line is written to the trace log and to
    ``_QCMD_LOGGER`` before the command runs.  A non-zero exit status is
    reported as a warning that includes whatever the process wrote to stderr;
    it does not raise.  Any exception raised while running the command is
    logged (to both loggers) and then re-raised unchanged.

    Args:
        cmd: Program and arguments, handed to ``subprocess.run`` as a list.
    """
    shlexed = " ".join(shlex.quote(x) for x in cmd)
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)

    try:
        # capture_output was added in 3.7 and we support as far back as 3.6
        if sys.version_info < (3, 7):
            completed_process = subprocess.run(cmd, stderr=subprocess.PIPE)
        else:
            completed_process = subprocess.run(cmd, capture_output=True)

        if completed_process.returncode != 0:
            stderr = ""
            if completed_process.stderr:
                # stderr is raw bytes from an arbitrary program: replace
                # undecodable bytes instead of letting a UnicodeDecodeError
                # turn an ordinary "command failed" warning into an exception.
                stderr = completed_process.stderr.decode(errors="replace")
            logging.warning(
                "'%s' failed with exit %d: Stderr '%s'",
                shlexed,
                completed_process.returncode,
                stderr,
            )
    except Exception as e:
        logging.error("'%s' failed: %s.", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
def __init__(
    self,
    name: str,
    consumed_core_count: int,
    max_core_count: int,
    consumed_count: Optional[int] = None,
    max_count: Optional[int] = None,
):
    """Record the consumed/maximum figures for the limit called ``name``.

    Negative core counts are raised to 0.  If the consumed core count is
    larger than the maximum, a warning is logged and the consumed value is
    lowered to the maximum.  ``consumed_count`` and ``max_count`` must be
    supplied together or not at all (checked with ``assert``).
    """
    self._name = name

    assert (
        consumed_core_count is not None
    ), "consumed_core_count is None for limit {}".format(name)
    assert max_core_count is not None, "max_core_count is None for limit {}".format(
        name
    )

    consumed_cores = max(0, consumed_core_count)
    self._consumed_core_count = consumed_cores
    core_ceiling = max(0, max_core_count)
    self._max_core_count = core_ceiling

    if consumed_cores > core_ceiling:
        logging.warning(
            "consumed_core_count(%s) > max_core_count(%s) for %s limit. Flooring it.",
            consumed_cores,
            core_ceiling,
            name,
        )
        self._consumed_core_count = core_ceiling

    # Either both instance-count figures are given or neither is.
    assert (consumed_count is None) == (max_count is None)

    self.__max_count = max_count
    self.__consumed_count = consumed_count
def parse_select(self, select_expression: str) -> List[Dict[str, Any]]:
    """Split a select expression into one dict per ``+``-separated chunk.

    Each chunk is a ``:``-separated list of terms.  A term without ``=`` is
    taken as the chunk count and stored under ``"select"`` as an int.  A
    ``key=value`` term is parsed with the matching entry of
    ``self.resource_definitions`` when one exists; otherwise a warning is
    logged and the raw string is kept.
    """
    # Need to detect when slot_type is specified with `-l select=1:slot_type`
    assert isinstance(select_expression, str)

    parsed_chunks: List[Dict[str, Any]] = []
    for chunk_text in select_expression.split("+"):
        # give a default of 1 in case the user assumes 1 with their select
        # i.e. -l select=1:mem=16gb == -l select=mem=16gb
        # if they picked a number it will be overridden below
        parsed: Dict[str, Any] = {"select": "1", "schedselect": "1"}

        for term in chunk_text.split(":"):
            name, separator, raw = term.partition("=")
            if not separator:
                # bare number: the chunk count
                parsed["select"] = int(term)
                continue

            if name in self.resource_definitions:
                parsed[name] = self.resource_definitions[name].type.parse(raw)
            else:
                logging.warning("Unknown resource %s: treating as a string.", name)
                parsed[name] = raw

        parsed_chunks.append(parsed)

    return parsed_chunks
def read_parallel_environments(
    autoscale_config: Dict,
    qbin: QBin,
) -> Dict[str, "ParallelEnvironment"]:
    """Build a ``ParallelEnvironment`` for every PE listed by ``qconf -spl``.

    Each PE's definition is read with ``qconf -sp <name>`` and parsed by
    ``parse_ge_config``.  When the autoscale config contains
    ``gridengine.pes.<name>.requires_placement_groups`` that value replaces
    the parsed one, and a warning is logged.

    Returns:
        Mapping of (stripped) PE name to its ``ParallelEnvironment``.
    """
    override_key = "requires_placement_groups"
    overrides_by_pe = autoscale_config.get("gridengine", {}).get("pes", {})

    result = {}
    for raw_name in qbin.qconf(["-spl"]).splitlines():
        pe_name = raw_name.strip()
        definition = parse_ge_config(qbin.qconf(["-sp", pe_name]).splitlines(False))

        if override_key in overrides_by_pe.get(pe_name, {}):
            logging.warning(
                "Overriding placement group behavior for PE %s with %s",
                pe_name,
                overrides_by_pe[pe_name][override_key],
            )
            definition[override_key] = overrides_by_pe[pe_name][override_key]

        result[pe_name] = ParallelEnvironment(definition)

    return result
def is_valid_hostname(config: Dict, node: "Node") -> bool:
    """Return True when ``node.hostname`` matches an accepted pattern.

    A node without a hostname is never valid.  Patterns come from
    ``config["valid_hostnames"]``; when that is missing or empty, nodes for
    which ``is_standalone_dns`` is true are checked against
    ``^ip-[0-9A-Za-z]{8}$`` and all other nodes are accepted.  Patterns are
    applied with ``re.match``.  A rejection is logged as a warning.
    """
    # delayed import, as logging will import this module
    from hpc.autoscale import hpclogging as logging

    hostname = node.hostname
    if not hostname:
        return False

    patterns: Optional[List[str]] = config.get("valid_hostnames")
    if not patterns:
        if not is_standalone_dns(node):
            return True
        patterns = ["^ip-[0-9A-Za-z]{8}$"]

    if any(re.match(pattern, hostname) for pattern in patterns):
        return True

    logging.warning(
        "Rejecting invalid hostname '%s': Did not match any of the following patterns: %s",
        hostname,
        patterns,
    )
    return False
def hpcwrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Forward the call to ``typechecked_func``, warning once for whitelisted functions.

    ``function``, ``typechecked_func`` and ``WHITELIST_FUNCTIONS_TYPES`` are
    not defined in this block; they are resolved from an enclosing or module
    scope that is not visible here.
    """
    if function.__name__ in WHITELIST_FUNCTIONS_TYPES:
        # The "hpcwarned" attribute stored on the wrapper itself is a
        # log-once flag: the warning is emitted only on the first call.
        if not hasattr(hpcwrapper, "hpcwarned"):
            setattr(hpcwrapper, "hpcwarned", True)
            logging.warning(
                "Runtime type checking is disabled for %s", function.__name__
            )
    # NOTE(review): the warning says type checking is disabled, yet the only
    # call visible here still goes through typechecked_func - confirm intent.
    return typechecked_func(*args, **kwargs)
def decorate(self, nodes: typing.List[Node]) -> None:
    """Copy stored create/match/delete times onto the existing ``nodes``.

    Nodes whose ``exists`` flag is false are ignored.  For every remaining
    node with a row in the ``nodes`` table, the three ``*_time_unix``
    attributes are set from that row.  ``create_time_remaining`` is set when
    ``self.create_timeout`` is truthy, and ``idle_time_remaining`` when
    ``self.last_match_timeout`` is truthy (``-1`` for keep-alive nodes).
    Remaining times are never negative.
    """
    existing = [n for n in (nodes or []) if n.exists]
    if not existing:
        return

    where_clause = " OR ".join(
        " (node_id == '{}') ".format(n.delayed_node_id.node_id) for n in existing
    )
    stmt = "select node_id, last_match_time, create_time, delete_time from nodes where {}".format(
        where_clause
    )
    rows_by_id = partition_single(list(self._execute(stmt)), lambda r: r[0])
    now = self.now()

    for node in existing:
        node_id = node.delayed_node_id.node_id

        # should be impossible because we already filtered by exists
        if not node_id:
            logging.warning(
                "Null node_id for %s. Leaving create/last_match/delete times as null.",
                node,
            )
            continue

        if node_id not in rows_by_id:
            continue

        _, matched_at, created_at, deleted_at = rows_by_id[node_id]
        node.create_time_unix = created_at
        node.last_match_time_unix = matched_at
        node.delete_time_unix = deleted_at

        if self.create_timeout:
            since_create = max(0, now - created_at)
            node.create_time_remaining = max(0, self.create_timeout - since_create)

        if not self.last_match_timeout:
            continue

        if node.keep_alive:
            node.idle_time_remaining = -1
        else:
            since_match = max(0, now - matched_at)
            node.idle_time_remaining = max(
                0, self.last_match_timeout - since_match
            )
def get_pbspro_parser() -> PBSProParser:
    """Return the module-level parser, building it on first use.

    When no parser has been set, a warning is logged and one is created: a
    temporary parser with no resource definitions is installed first, used
    (through a ``PBSCMD``) to read the resource definitions, and then
    replaced by a parser built from those definitions.
    """
    global _PARSER

    if _PARSER is not None:
        return _PARSER

    # avoid circular import
    from pbspro.pbscmd import PBSCMD
    from pbspro.resource import read_resource_definitions

    # chicken / egg issue: we want the resource definitions
    # as a member of the parser, but we need the parser to parse
    # the definitions...
    # So create temp parser with no resource definitions
    _PARSER = PBSProParser({})
    bootstrap_cmd = PBSCMD(_PARSER)
    logging.warning(
        "Using uninitialized PBSProParser: please call set_pbspro_parser before calling get_pbspro_parser"
    )
    _PARSER = PBSProParser(read_resource_definitions(bootstrap_cmd, {}))
    return _PARSER
def read_resource_definitions(
    pbscmd: PBSCMD, config: Dict
) -> Dict[str, "PBSProResourceDefinition"]:
    """Collect resource definitions from qmgr and the scheduler's sched_config.

    Resources are listed via ``qmgr list resource``.  Names that
    ``parse_resources_from_sched_priv`` reports for ``<sched_priv>/sched_config``
    but that were not in that listing are queried one by one; a failing
    lookup is logged and the resource is skipped.  Resources named in
    ``pbspro.read_only_resources`` (default ``host`` and ``vnode``) are
    flagged ``read_only``.

    Returns:
        Mapping of resource name to its ``PBSProResourceDefinition``.
    """
    listed = pbscmd.qmgr_parsed("list", "resource")
    listed_names = {entry["name"] for entry in listed}

    # TODO I believe this is the only one, but leaving a config option
    # as a backup plan
    read_only = config.get("pbspro", {}).get("read_only_resources", ["host", "vnode"])

    default_sched = pbscmd.qmgr_parsed("list", "sched", "default")
    sched_config = os.path.join(default_sched[0]["sched_priv"], "sched_config")

    from pbspro.parser import PBSProParser

    sched_resources = PBSProParser(config).parse_resources_from_sched_priv(
        sched_config
    )

    extra = []
    for res_name in sched_resources - listed_names:
        try:
            extra.extend(pbscmd.qmgr_parsed("list", "resource", res_name))
        except CalledProcessError as e:
            logging.warning(
                "Could not find resource %s that was defined in %s, Ignoring",
                res_name,
                sched_config,
            )
            logging.fine(e)

    definitions: Dict[str, PBSProResourceDefinition] = {}
    for entry in listed + extra:
        name = entry["name"]
        res_type = RESOURCE_TYPES[entry["type"]]
        flag: ResourceFlag = entry.get("flag", "")  # type: ignore
        definition = PBSProResourceDefinition(name, res_type, flag)
        definitions[name] = definition
        if name in read_only:
            definition.read_only = True

    return definitions
def __init__(
    self,
    hostname: str,
    resources: typing.Optional[dict] = None,
    bucket_id: typing.Optional[ht.BucketId] = None,
) -> None:
    """Create a node known only through the scheduler.

    The private IP is resolved from ``hostname`` with
    ``socket.gethostbyname`` unless ``SchedulerNode.ignore_hostnames`` is
    set; a failed lookup is logged as a warning and leaves the IP as
    ``None``.  Every other ``Node`` field gets a fixed placeholder (e.g.
    "unknown" nodearray/VM size/location, "running" states, 1 vcpu, 0 bytes
    of memory).  A random UUID is used when no ``bucket_id`` is given.
    """
    resources = resources or ht.ResourceDict({})

    private_ip: typing.Optional[ht.IpAddress] = None
    if not SchedulerNode.ignore_hostnames:
        try:
            private_ip = ht.IpAddress(socket.gethostbyname(hostname))
        except Exception as e:
            logging.warning("Could not find private ip for %s: %s", hostname, e)
            private_ip = None

    node_kwargs = dict(
        node_id=DelayedNodeId(ht.NodeName(hostname)),
        name=ht.NodeName(hostname),
        nodearray=ht.NodeArrayName("unknown"),
        bucket_id=bucket_id or ht.BucketId(str(uuid4())),
        hostname=ht.Hostname(hostname),
        private_ip=private_ip,
        instance_id=None,
        vm_size=ht.VMSize("unknown"),
        location=ht.Location("unknown"),
        spot=False,
        vcpu_count=1,
        memory=ht.Memory(0, "b"),
        infiniband=False,
        state=ht.NodeStatus("running"),
        target_state=ht.NodeStatus("running"),
        power_state=ht.NodeStatus("running"),
        exists=True,
        placement_group=None,
        managed=False,
        resources=ht.ResourceDict(resources),
        software_configuration=ImmutableOrderedDict({}),
        keep_alive=False,
    )
    Node.__init__(self, **node_kwargs)
def update_scheduler_nodes(self, scheduler_nodes: List[SchedulerNode]) -> None:
    """Merge scheduler-reported nodes into the nodes already queued.

    A reported node whose hostname is not yet known is pushed onto the
    queue and registered with the node manager as unmanaged.  For a known
    node, the existing entry is fully updated from the reported one unless
    the reported node's metadata sets ``override_resources`` to a false
    value, in which case only the metadata is copied across.
    """
    # NOTE(review): the index is keyed by hostname_or_uuid while membership
    # below is tested with hostname - confirm the two always agree.
    known: Dict[str, Node] = partition_single(
        self.__scheduler_nodes_queue, lambda n: n.hostname_or_uuid  # type: ignore
    )

    for incoming in scheduler_nodes:
        if incoming.hostname not in known:
            known[incoming.hostname] = incoming
            self.__scheduler_nodes_queue.push(incoming)
            self.node_mgr.add_unmanaged_nodes([incoming])
            if incoming.resources.get("ccnodeid"):
                logging.warning(
                    "%s has ccnodeid defined, but no longer exists in CycleCloud",
                    incoming,
                )
            else:
                logging.debug(
                    "Found new node[hostname=%s] that does not exist in CycleCloud",
                    incoming.hostname,
                )
            # TODO inform bucket catalog?
            continue

        if incoming.metadata.get("override_resources", True):
            current = known[incoming.hostname_or_uuid]
            logging.fine(
                "Found existing CycleCloud node[hostname=%s]",
                incoming.hostname,
            )
            current.update(incoming)
            continue

        logging.fine(
            "Found existing CycleCloud node[hostname=%s], but node.metadata.override_resources=false so ignoring the reported resources and only copying metadata",
            incoming.hostname,
        )
        current = known[incoming.hostname_or_uuid]
        current.metadata.update(incoming.metadata)
def satisfied_by_node(self, node: "Node") -> SatisfiedResult:
    """Check that ``node`` has at least ``self.value`` of ``self.attr`` available.

    Returns an "UndefinedResource" result when the node does not define the
    attribute, "success" when the available amount is ``>= self.value``, and
    "InsufficientResource" otherwise.  Values that cannot be compared
    (``TypeError``) are logged and treated as insufficient.
    """
    attr = self.attr

    if attr not in node.available:
        # TODO log
        undefined_msg = "Resource[name={}] is not defined for Node[name={}]".format(
            attr, node.name
        )
        return SatisfiedResult("UndefinedResource", self, node, [undefined_msg])

    try:
        if node.available[attr] >= self.value:
            return SatisfiedResult("success", self, node)
    except TypeError as e:
        logging.warning(
            "For attribute %s: Could not evaluate %s >= %s because they are different types: %s",
            attr,
            node.available[attr],
            self.value,
            e,
        )

    shortfall_msg = "Resource[name={} value={}] < Node[name={} value={}]".format(
        attr, self.value, node.name, node.available[attr]
    )
    return SatisfiedResult(
        "InsufficientResource", self, node, reasons=[shortfall_msg]
    )
def get_node_hostgroups(config: Dict, node: Node) -> List[str]:
    """Return the hostgroup names that apply to ``node``.

    The ``gridengine_hostgroups`` expression is looked up in the node's
    metadata, then in its software configuration.  If neither has it, each
    entry of ``gridengine.default_hostgroups`` whose ``select`` constraints
    are all satisfied by the node supplies its ``hostgroups``; the last
    matching entry wins and is also written back to the node's metadata.
    The expression is split on commas and runs of spaces.
    """
    key = "gridengine_hostgroups"
    expr = node.metadata.get(key) or node.software_configuration.get(key)

    if not expr:
        defaults = config.get("gridengine", {}).get("default_hostgroups", [])
        for entry in defaults:
            if "select" not in entry:
                logging.warning(
                    "Missing key 'select' in gridengine.default_hostgroups %s", entry
                )
                continue
            if "hostgroups" not in entry:
                logging.warning(
                    "Missing key 'hostgroups' in gridengine.default_hostgroups %s",
                    entry,
                )
                continue

            required = constraints.get_constraints(entry["select"])
            if not all(c.satisfied_by_node(node) for c in required):
                continue

            names = entry["hostgroups"]
            if isinstance(names, str):
                names = [names]
            expr = " ".join(names)
            # set it in metadata so we can output it in the cli
            node.metadata[key] = expr

    return re.split(",| +", expr) if expr else []
def parse(self, value: str) -> Optional[Any]:
    """Convert the textual ``value`` according to ``self.complex_type``.

    ``"NONE"`` (any case) yields ``None`` and ``"infinity"`` (any case)
    yields ``float("inf")`` regardless of type.  INT/RSMAP give an int,
    DOUBLE a float, BOOL a bool (numeric or true/false), CSTRING the
    lower-cased string, MEMORY the result of converting an ``ht.Memory`` to
    "g", and RESTRING/TIME/STRING/HOST the string unchanged.  Unknown types
    and values that fail to parse are returned unchanged; each of those two
    situations is logged at most once per instance.
    """
    try:
        if value.upper() == "NONE":
            return None
        if value.lower() == "infinity":
            return float("inf")

        kind = self.complex_type

        if kind in ("INT", "RSMAP"):
            return int(value)

        if kind == "BOOL":
            try:
                return bool(float(value))
            except ValueError:
                lowered = value.lower()
                if lowered == "true":
                    return True
                if lowered == "false":
                    return False
                logging.warning(
                    "Could not parse '%s' for complex type %s - treating as string.",
                    value,
                    self.complex_type,
                )
                return value

        if kind == "DOUBLE":
            return float(value)

        if kind in ("RESTRING", "TIME", "STRING", "HOST"):
            return value

        if kind == "CSTRING":
            # TODO test
            return value.lower()  # case insensitve - we will just always lc

        if kind == "MEMORY":
            suffix = value[-1]
            # a trailing digit means no unit suffix was given: plain bytes
            if suffix.isdigit():
                mem = ht.Memory(float(value), "b")
            else:
                mem = ht.Memory(float(value[:-1]), suffix)
            return mem.convert_to("g")

        if not self.__logged_type_warning:
            logging.warning(
                "Unknown complex type %s - treating as string.",
                self.complex_type,
            )
            self.__logged_type_warning = True
        return value
    except Exception:
        if not self.__logged_parse_warning:
            logging.warning(
                "Could not parse complex %s with value '%s'. Treating as string",
                self,
                value,
            )
            self.__logged_parse_warning = True
        return value
def _pack_job(self, job: Job) -> Result:
    """Try to allocate nodes for ``job``.

    Buckets are pre-filtered so that only those whose placement-group
    setting matches ``job.colocated`` are offered to
    ``job.bucket_candidates``.  If that yields no candidates, the reasons
    are logged and the (falsy) candidates result is returned.  Otherwise
    allocation is delegated to ``self._handle_allocate`` with
    ``all_or_nothing=False``; the result is "success" as soon as at least
    one node was allocated, else "Failed" with the collected reasons.
    """
    # TODO break non-exclusive
    allocated_nodes: List[Node] = []
    slots_to_allocate = job.iterations_remaining
    assert job.iterations_remaining > 0

    # I don't want to fill up the log with rejecting placement groups
    # so just filter them here
    matching_buckets = [
        bucket
        for bucket in self.node_mgr.get_buckets()
        if bool(bucket.placement_group) == job.colocated
    ]

    candidates_result = job.bucket_candidates(matching_buckets)
    if not candidates_result:
        # TODO log or something
        logging.warning("There are no resources to scale up for job %s", job)
        logging.warning("See below:")
        for child in candidates_result.child_results or []:
            logging.warning(" %s", child.message)
        return candidates_result

    failure_reasons = self._handle_allocate(
        job, allocated_nodes, all_or_nothing=False
    )

    if not allocated_nodes:
        return AllocationResult("Failed", reasons=failure_reasons)

    # we have allocated at least some tasks
    assert allocated_nodes
    return AllocationResult(
        "success", nodes=allocated_nodes, slots_allocated=slots_to_allocate
    )
def autoscale_hpcpack(
    config: Dict[str, Any],
    ctx_handler: Optional[DefaultContextHandler] = None,
    hpcpack_rest_client: Optional[HpcRestClient] = None,
    dry_run: bool = False,
) -> None:
    """Run one autoscale pass between CycleCloud and HPC Pack.

    The pass has three phases, each tagged on ``ctx_handler`` when given:

    1. ``[Sync-Status]`` - load the node history, link it with the CycleCloud
       nodes and the active HPC Pack compute nodes, create missing HPC node
       groups, tag nodes into groups, remove HPC nodes whose CycleCloud node
       is gone, and assign the default template to unapproved nodes.
    2. ``[scale-up]`` - turn the HPC Pack grow decisions into
       ``node_mgr.allocate`` calls (per node array, then for the default
       groups) and boot any new nodes.
    3. ``[scale-down]`` - unless a grow request for the default groups could
       not be met from existing nodes, find idle nodes, start or check their
       idle timers, and shut down / terminate the CycleCloud nodes that timed
       out.

    Args:
        config: Autoscale configuration; the ``"autoscale"`` section supplies
            ``idle_timeout``, ``boot_timeout``, ``statefile``,
            ``archivefile`` and ``vm_retention_days``.
        ctx_handler: Optional logging-context handler.
        hpcpack_rest_client: REST client to use; one is created from
            ``config`` when omitted.
        dry_run: When true, actions are logged but the calls that remove
            nodes, assign templates, boot nodes, change node online state,
            shut down or terminate nodes, and save the history are skipped.
    """
    if not hpcpack_rest_client:
        hpcpack_rest_client = new_rest_client(config)

    if ctx_handler:
        ctx_handler.set_context("[Sync-Status]")
    autoscale_config = config.get("autoscale") or {}
    # Load history info
    idle_timeout_seconds: int = autoscale_config.get("idle_timeout") or 600
    provisioning_timeout_seconds = autoscale_config.get("boot_timeout") or 1500
    statefile = autoscale_config.get(
        "statefile") or "C:\\cycle\\jetpack\\config\\autoscaler_state.txt"
    archivefile = autoscale_config.get(
        "archivefile") or "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt"
    node_history = HpcNodeHistory(
        statefile=statefile,
        archivefile=archivefile,
        provisioning_timeout=provisioning_timeout_seconds,
        idle_timeout=idle_timeout_seconds)

    logging.info("Synchronizing the nodes between Cycle cloud and HPC Pack")

    # Initialize data of History info, cc nodes, HPC Pack nodes, HPC grow decisions
    # Get node list from Cycle Cloud
    def nodes_state_key(n: Node) -> Tuple[int, str, int]:
        # Sort key: state priority first, then "<name>-<index>" split into
        # its name and numeric index. Names that do not follow that shape
        # fall back to index 0.
        state_pri = 1
        try:
            if n.state == 'Deallocated':
                state_pri = 2
            elif n.state == 'Stopping':
                state_pri = 3
            elif n.state == 'Terminating':
                state_pri = 4
            name, index = n.name.rsplit("-", 1)
            return (state_pri, name, int(index))
        except Exception:
            return (state_pri, n.name, 0)

    node_mgr: NodeManager = new_node_manager(config)
    for b in node_mgr.get_buckets():
        b.nodes.sort(key=nodes_state_key)
    cc_nodes: List[Node] = node_mgr.get_nodes()
    cc_nodes_by_id = partition_single(cc_nodes, func=lambda n: n.delayed_node_id.node_id)
    # Get compute node list and grow decision from HPC Pack
    hpc_node_groups = hpcpack_rest_client.list_node_groups()
    grow_decisions = hpcpack_rest_client.get_grow_decision()
    logging.info("grow decision: {}".format(grow_decisions))
    hpc_cn_nodes: List[HpcNode] = hpcpack_rest_client.list_computenodes()
    hpc_cn_nodes = [n for n in hpc_cn_nodes if n.active]

    # This function will link node history items, cc nodes and hpc nodes
    node_history.synchronize(cc_nodes, hpc_cn_nodes)

    cc_nodearrays = set([b.nodearray for b in node_mgr.get_buckets()])
    logging.info("Current node arrays in cyclecloud: {}".format(cc_nodearrays))

    # Create HPC node groups for CC node arrays
    cc_map_hpc_groups = ["CycleCloudNodes"] + list(cc_nodearrays)
    for cc_grp in cc_map_hpc_groups:
        if ci_notin(cc_grp, hpc_node_groups):
            logging.info("Create HPC node group: {}".format(cc_grp))
            hpcpack_rest_client.add_node_group(cc_grp, "Cycle Cloud Node group")

    # Add HPC nodes into corresponding node groups
    add_cc_tag_nodes = [
        n.name for n in hpc_cn_nodes if n.shall_addcyclecloudtag
    ]
    if len(add_cc_tag_nodes) > 0:
        logging.info(
            "Adding HPC nodes to node group CycleCloudNodes: {}".format(
                add_cc_tag_nodes))
        hpcpack_rest_client.add_node_to_node_group("CycleCloudNodes", add_cc_tag_nodes)
    for cc_grp in list(cc_nodearrays):
        add_array_tag_nodes = [
            n.name for n in hpc_cn_nodes
            if n.shall_addnodearraytag and ci_equals(n.cc_nodearray, cc_grp)
        ]
        if len(add_array_tag_nodes) > 0:
            logging.info("Adding HPC nodes to node group {}: {}".format(
                cc_grp, add_array_tag_nodes))
            hpcpack_rest_client.add_node_to_node_group(cc_grp, add_array_tag_nodes)

    # Possible values for HPC NodeState (states marked with * shall not occur for CC nodes):
    # Unknown, Provisioning, Offline, Starting, Online, Draining, Rejected(*), Removing, NotDeployed(*), Stopping(*)
    # Remove the following HPC Pack nodes:
    # 1. The corresponding CC node already removed
    # 2. The corresponding CC node is stopped and HPC node is not assigned a node template
    # Take offline the following HPC Pack nodes:
    # 1. The corresponding CC node is stopped or is going to stop
    hpc_nodes_to_remove = [
        n.name for n in hpc_cn_nodes
        if n.removed_cc_node or (n.stopped_cc_node and not n.template_assigned)
    ]
    hpc_nodes_to_take_offline = [
        n.name for n in hpc_cn_nodes
        if n.stopped_cc_node and ci_equals(n.state, "Online")
    ]
    if len(hpc_nodes_to_remove) > 0:
        logging.info("Removing the HPC nodes: {}".format(hpc_nodes_to_remove))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.remove_nodes(hpc_nodes_to_remove)
    hpc_cn_nodes = [
        n for n in hpc_cn_nodes
        if not (n.stopped_cc_node or n.removed_cc_node)
    ]

    # Assign default node template for unapproved CC node
    hpc_nodes_to_assign_template = [
        n.name for n in hpc_cn_nodes
        if n.bound_cc_node and not n.template_assigned
    ]
    if len(hpc_nodes_to_assign_template) > 0:
        logging.info(
            "Assigning default node template for the HPC nodes: {}".format(
                hpc_nodes_to_assign_template))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.assign_default_compute_node_template(
                hpc_nodes_to_assign_template)

    ### Start scale up checking:
    logging.info("Start scale up checking ...")
    if ctx_handler:
        ctx_handler.set_context("[scale-up]")

    hpc_nodes_with_active_cc = [
        n for n in hpc_cn_nodes if n.template_assigned and n.bound_cc_node
    ]
    # Exclude the already online healthy HPC nodes before calling node_mgr.allocate
    for hpc_node in hpc_nodes_with_active_cc:
        if hpc_node.ready_for_job:
            hpc_node.bound_cc_node.closed = True

    # Terminate the provisioning timeout CC nodes
    cc_node_to_terminate: List[Node] = []
    for cc_node in cc_nodes:
        if ci_equals(cc_node.target_state, 'Deallocated') or ci_equals(
                cc_node.target_state, 'Terminated') or cc_node.create_time_remaining:
            continue
        nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
        if not nhi.hpc_id:
            cc_node.closed = True
            cc_node_to_terminate.append(cc_node)
        else:
            hpc_node = ci_find_one(hpc_nodes_with_active_cc, nhi.hpc_id, lambda n: n.id)
            if hpc_node and hpc_node.error:
                cc_node.closed = True
                cc_node_to_terminate.append(cc_node)

    # "ComputeNodes", "CycleCloudNodes", "AzureIaaSNodes" are all treated as default
    # grow_by_socket not supported yet, treat as grow_by_node
    defaultGroups = [
        "Default", "ComputeNodes", "AzureIaaSNodes", "CycleCloudNodes"
    ]
    default_cores_to_grow = default_nodes_to_grow = 0.0

    # If the current CC nodes in the node array cannot satisfy the grow decision, the group is hungry
    # For a hungry group, no idle check is required if the node health is OK
    group_hungry: Dict[str, bool] = {}
    nbrNewNodes: int = 0
    grow_groups = list(grow_decisions.keys())
    for grp in grow_groups:
        # The decision is removed from grow_decisions here, so from this
        # point on it is only reachable through ``tmp``.
        tmp = grow_decisions.pop(grp)
        if not (tmp.cores_to_grow + tmp.nodes_to_grow + tmp.sockets_to_grow):
            continue
        if ci_in(grp, defaultGroups):
            default_cores_to_grow += tmp.cores_to_grow
            default_nodes_to_grow += tmp.nodes_to_grow + tmp.sockets_to_grow
            continue
        if ci_notin(grp, cc_nodearrays):
            # Log the already-popped decision; looking it up in (or popping
            # it from) grow_decisions again would raise KeyError.
            logging.warning(
                "No mapping node array for the grow requirement {}:{}".format(
                    grp, tmp))
            continue
        group_hungry[grp] = False
        array = ci_lookup(grp, cc_nodearrays)
        selector = {'ncpus': 1, 'node.nodearray': [array]}
        target_cores = math.ceil(tmp.cores_to_grow)
        target_nodes = math.ceil(tmp.nodes_to_grow + tmp.sockets_to_grow)
        if target_nodes:
            logging.info("Allocate: {} Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            logging.info(result)
            if not result or result.total_slots < target_nodes:
                group_hungry[grp] = True
        if target_cores:
            logging.info("Allocate: {} Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            logging.info(result)
            if not result or result.total_slots < target_cores:
                group_hungry[grp] = True
        if len(node_mgr.new_nodes) > nbrNewNodes:
            group_hungry[grp] = True
        nbrNewNodes = len(node_mgr.new_nodes)

    # We then check the grow decision for the default node groups:
    checkShrinkNeeded = True
    growForDefaultGroup = True if default_nodes_to_grow or default_cores_to_grow else False
    if growForDefaultGroup:
        selector = {'ncpus': 1}
        if default_nodes_to_grow:
            target_nodes = math.ceil(default_nodes_to_grow)
            logging.info("Allocate: {} Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate({'ncpus': 1}, node_count=target_nodes)
            if not result or result.total_slots < target_nodes:
                checkShrinkNeeded = False
        if default_cores_to_grow:
            target_cores = math.ceil(default_cores_to_grow)
            logging.info("Allocate: {} Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate({'ncpus': 1}, slot_count=target_cores)
            if not result or result.total_slots < target_cores:
                checkShrinkNeeded = False
        if len(node_mgr.new_nodes) > nbrNewNodes:
            checkShrinkNeeded = False
        nbrNewNodes = len(node_mgr.new_nodes)

    if nbrNewNodes > 0:
        logging.info("Need to Allocate {} nodes in total".format(nbrNewNodes))
        if dry_run:
            logging.info("Dry-run: skipping node bootup...")
        else:
            logging.info("Allocating {} nodes in total".format(
                len(node_mgr.new_nodes)))
            bootup_result: BootupResult = node_mgr.bootup()
            logging.info(bootup_result)
            if bootup_result and bootup_result.nodes:
                for cc_node in bootup_result.nodes:
                    nhi = node_history.find(
                        cc_id=cc_node.delayed_node_id.node_id)
                    if nhi is None:
                        nhi = node_history.insert(
                            NodeHistoryItem(cc_node.delayed_node_id.node_id))
                    else:
                        nhi.restart()
    else:
        logging.info("No need to allocate new nodes ...")

    ### Start the shrink checking
    if ctx_handler:
        ctx_handler.set_context("[scale-down]")

    cc_node_to_shutdown: List[Node] = []
    if not checkShrinkNeeded:
        logging.info("No shrink check at this round ...")
        if not dry_run:
            # Reset the idle timers of nodes that are not stopped and are
            # linked to an HPC node.
            for nhi in node_history.items:
                if not nhi.stopped and nhi.hpc_id:
                    nhi.idle_from = None
    else:
        logging.info("Start scale down checking ...")
        # By default, we check idle for active CC nodes in HPC Pack with 'Offline', 'Starting', 'Online', 'Draining' state
        candidate_idle_check_nodes = [
            n for n in hpc_nodes_with_active_cc
            if (not n.bound_cc_node.keep_alive)
            and ci_in(n.state, ["Offline", "Starting", "Online", "Draining"])
        ]

        # We can exclude some nodes from idle checking:
        # 1. If HPC Pack ask for grow in default node group(s), all healthy ONLINE nodes are considered as busy
        # 2. If HPC Pack ask for grow in certain node group, all healthy ONLINE nodes in that node group are considered as busy
        # 3. If a node group is hungry (new CC required or grow request not satisfied), no idle check needed for all nodes in that node array
        if growForDefaultGroup:
            candidate_idle_check_nodes = [
                n for n in candidate_idle_check_nodes if not n.ready_for_job
            ]
        for grp, hungry in group_hungry.items():
            if hungry:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not ci_equals(grp, n.cc_nodearray)
                ]
            elif not growForDefaultGroup:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not (ci_equals(grp, n.cc_nodearray) and n.ready_for_job)
                ]

        curtime = datetime.utcnow()
        # Offline node must be idle
        idle_node_names = [
            n.name for n in candidate_idle_check_nodes
            if ci_equals(n.state, 'Offline')
        ]
        if len(candidate_idle_check_nodes) > len(idle_node_names):
            idle_nodes = hpcpack_rest_client.check_nodes_idle([
                n.name for n in candidate_idle_check_nodes
                if not ci_equals(n.state, 'Offline')
            ])
            if len(idle_nodes) > 0:
                idle_node_names.extend([n.node_name for n in idle_nodes])

        if len(idle_node_names) > 0:
            logging.info(
                "The following node is idle: {}".format(idle_node_names))
        else:
            logging.info("No idle node found in this round.")

        retention_days = autoscale_config.get("vm_retention_days") or 7
        for nhi in node_history.items:
            if nhi.stopped:
                # Stopped nodes are terminated once the retention period has
                # elapsed; they take no part in the idle check.
                if nhi.stop_time + timedelta(
                        days=retention_days) < datetime.utcnow():
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_terminate.append(cc_node)
                continue
            if ci_in(nhi.hostname, idle_node_names):
                if nhi.idle_from is None:
                    nhi.idle_from = curtime
                elif nhi.idle_timeout(idle_timeout_seconds):
                    nhi.stop_time = curtime
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_shutdown.append(cc_node)
            else:
                nhi.idle_from = None

    shrinking_cc_node_ids = [
        n.delayed_node_id.node_id for n in cc_node_to_terminate
    ]
    shrinking_cc_node_ids.extend(
        [n.delayed_node_id.node_id for n in cc_node_to_shutdown])
    hpc_nodes_to_bring_online = [
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Offline') and not n.error
        and ci_notin(n.cc_node_id, shrinking_cc_node_ids)
    ]
    hpc_nodes_to_take_offline.extend([
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Online')
        and ci_in(n.cc_node_id, shrinking_cc_node_ids)
    ])
    if len(hpc_nodes_to_bring_online) > 0:
        logging.info("Bringing the HPC nodes online: {}".format(
            hpc_nodes_to_bring_online))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.bring_nodes_online(hpc_nodes_to_bring_online)

    if len(hpc_nodes_to_take_offline) > 0:
        logging.info("Taking the HPC nodes offline: {}".format(
            hpc_nodes_to_take_offline))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.take_nodes_offline(hpc_nodes_to_take_offline)

    if len(cc_node_to_shutdown) > 0:
        logging.info("Shut down the following Cycle cloud node: {}".format(
            [cn.name for cn in cc_node_to_shutdown]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.shutdown_nodes(cc_node_to_shutdown)

    if len(cc_node_to_terminate) > 0:
        logging.info(
            "Terminating the following provisioning-timeout Cycle cloud nodes: {}"
            .format([cn.name for cn in cc_node_to_terminate]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.terminate_nodes(cc_node_to_terminate)

    if not dry_run:
        logging.info("Save node history: {}".format(node_history))
        node_history.save()
def __init__(
    self,
    queue_config: Dict,
    scheduler: "GridEngineScheduler",
    pes: Dict[str, "ParallelEnvironment"],
    unbound_hostgroups: Dict[str, Hostgroup],
    complex_values: Dict[str, Dict],
    autoscale_enabled: bool = True,
) -> None:
    """Build the queue's hostgroup / parallel-environment bookkeeping.

    Reads ``hostlist``, ``slots``, ``seq_no``, ``pe_list``, ``user_lists``,
    ``xuser_lists``, ``projects`` and ``xprojects`` from ``queue_config``.

    Args:
        queue_config: Queue settings; ``hostlist`` must be a string and
            ``pe_list`` must be present.
        scheduler: Only ``sort_by_seqno`` is read here, to pick the default
            hostgroup.
        pes: Known parallel environments by name; PE names in ``pe_list``
            that are missing from it are skipped with a warning.
        unbound_hostgroups: Hostgroups by name; must contain every name in
            ``self.hostlist_groups``.
        complex_values: Per-hostgroup complex values. Mutated in place: the
            parsed ``slots`` value is stored under each hostgroup.
        autoscale_enabled: Stored as-is.
    """
    self.queue_config = queue_config
    self.complex_values = complex_values
    self.autoscale_enabled = autoscale_enabled
    assert isinstance(self.queue_config["hostlist"], str), self.queue_config[
        "hostlist"
    ]

    # hostlist entries are separated by commas and/or runs of spaces
    self.__hostlist = re.split(",| +", self.queue_config["hostlist"])
    self.__pe_to_hostgroups: Dict[str, List[str]] = {}
    self._pe_keys_cache: Dict[str, List[str]] = {}
    self.__parallel_environments: Dict[str, "ParallelEnvironment"] = {}

    # Copy the per-hostgroup slot counts into complex_values; the entry keyed
    # by None is skipped.
    self.__slots = parse_slots(self.queue_config.get("slots", ""))
    for hg, slots in self.__slots.items():
        if hg is None:
            continue
        if hg not in self.complex_values:
            self.complex_values[hg] = {}
        self.complex_values[hg]["slots"] = slots

    self.__seq_no = parse_seq_no(self.queue_config.get("seq_no", "0"))
    pe_list = parse_hostgroup_mapping(queue_config["pe_list"])

    def _get_seqno(hg_name: str) -> int:
        # Sequence number for hg_name, falling back to the entry keyed by
        # None and finally to 0.
        return self.__seq_no.get(hg_name, self.__seq_no.get(None, 0))  # type: ignore

    # Default hostgroup: lowest sequence number when the scheduler sorts by
    # seq_no, otherwise simply the first hostlist entry.
    if scheduler.sort_by_seqno:
        potential_defaults = self.__hostlist + list(self.seq_no.keys())
        self.default_hg = sorted(potential_defaults, key=_get_seqno)[0]
    else:
        self.default_hg = self.__hostlist[0]

    # Record, for every known PE in pe_list, the PE itself and the
    # hostgroups it is mapped to (without duplicates, in first-seen order).
    for hostgroup, pes_for_hg in pe_list.items():
        for pe_name in pes_for_hg:
            if not pe_name:
                continue
            if pe_name not in pes:
                logging.warning(
                    'Unknown parallel environment %s defined in {"gridengine": {"pes": {"%s": {}}}} - %s. Skipping',
                    pe_name,
                    pe_name,
                    list(pes.keys()),
                )
                continue
            self.__parallel_environments[pe_name] = pes[pe_name]
            # common case, and let's avoid nlogn insertion
            if pe_name not in self.__pe_to_hostgroups:
                self.__pe_to_hostgroups[pe_name] = [hostgroup]
            else:
                all_hostgroups = self.__pe_to_hostgroups[pe_name]
                if hostgroup not in all_hostgroups:
                    all_hostgroups.append(hostgroup)

    # A pe_list other than "none" must have produced at least one known PE.
    if queue_config["pe_list"] and queue_config["pe_list"].lower() != "none":
        assert self.__parallel_environments, queue_config["pe_list"]

    self.user_lists = parse_hostgroup_mapping(
        queue_config.get("user_lists") or "", self.hostlist_groups, filter_none=True
    )
    self.xuser_lists = parse_hostgroup_mapping(
        queue_config.get("xuser_lists") or "",
        self.hostlist_groups,
        filter_none=True,
    )
    self.projects = parse_hostgroup_mapping(
        queue_config.get("projects") or "", self.hostlist_groups, filter_none=True
    )
    self.xprojects = parse_hostgroup_mapping(
        queue_config.get("xprojects") or "", self.hostlist_groups, filter_none=True
    )

    # Any hostgroup mentioned by one of the per-hostgroup settings is added
    # to the hostlist if it is not already there.
    hostgroup_mappings = (
        [list(self.complex_values.keys())]
        + list(self.__pe_to_hostgroups.values())
        + [list(self.seq_no.keys())]
        + [list(self.user_lists.keys())]
        + [list(self.xuser_lists.keys())]
        + [list(self.projects.keys())]
        + [list(self.xprojects.keys())]
    )
    for hg_names in hostgroup_mappings:
        for hg_name in hg_names:
            if hg_name and hg_name not in self.__hostlist:
                self.__hostlist.append(hg_name)

    # Hostgroups mapped to a PE that requires placement groups are excluded
    # from __ht_hostgroups; of the rest only "@"-prefixed names are kept.
    all_host_groups = set(self.hostlist)
    for pe in self.__parallel_environments.values():
        if pe.requires_placement_groups:
            all_host_groups = all_host_groups - set(
                self.__pe_to_hostgroups.get(pe.name) or []
            )
    self.__ht_hostgroups = [x for x in list(all_host_groups) if x.startswith("@")]

    # Bind every hostgroup of this queue together with its sequence number.
    self.__bound_hostgroups: Dict[str, BoundHostgroup] = {}
    for hg_name in self.hostlist_groups:
        hg_seq_no = _get_seqno(hg_name)
        self.__bound_hostgroups[hg_name] = BoundHostgroup(
            self, unbound_hostgroups[hg_name], hg_seq_no
        )
def parse_scheduler_node(
    ndict: Dict[str, Any],
    resource_definitions: Dict[str, PBSProResourceDefinition],
) -> SchedulerNode:
    """Build a ``SchedulerNode`` from one node dictionary.

    The node is created from ``ndict["name"]`` and its parsed available
    resources.  Its ``pbs_state`` metadata is the reported state, except
    that a "free" node with jobs listed becomes "partially-free".  Nodes
    whose state contains "down" are marked for deletion and nodes whose
    state contains "exclusive" are closed.  Every job in ``ndict["jobs"]``
    is assigned to the node, and assigned amounts of consumable host-level
    resources are subtracted from what is available.
    """
    parser = get_pbspro_parser()

    res_avail = parser.parse_resources_available(ndict, filter_is_host=True)
    res_assigned = parser.parse_resources_assigned(ndict, filter_is_host=True)
    node = SchedulerNode(ndict["name"], res_avail)

    jobs_expr = ndict.get("jobs", "")
    pbs_state = ndict.get("state") or ""
    if pbs_state == "free" and jobs_expr.strip():
        pbs_state = "partially-free"
    node.metadata["pbs_state"] = pbs_state

    if "down" in pbs_state:
        node.marked_for_deletion = True

    node.metadata["last_state_change_time"] = ndict.get(
        "last_state_change_time", ""
    )

    for raw_tok in jobs_expr.split(","):
        tok = raw_tok.strip()
        if not tok:
            continue

        # tokens look like "<full job id>/<sub job id>"
        long_id, _ = tok.rsplit("/", 1)
        # the short id is everything before the first "."
        node.assign(long_id.split(".", 1)[0])

        if "job_ids_long" not in node.metadata:
            node.metadata["job_ids_long"] = [long_id]
        elif long_id not in node.metadata["job_ids_long"]:
            node.metadata["job_ids_long"].append(long_id)

    for res_name, assigned in res_assigned.items():
        definition = resource_definitions.get(res_name)
        if not definition or not definition.is_host:
            continue
        if not definition.is_consumable:
            continue

        if res_name in node.available:
            node.available[res_name] -= assigned
        else:
            logging.warning(
                "%s was not defined under resources_available, but was defined under resources_assigned for %s. Setting available to assigned.",
                res_name,
                node,
            )
            node.available[res_name] = assigned

    if "exclusive" in node.metadata["pbs_state"]:
        node.closed = True

    return node
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects

    Only jobs in the Queued state whose queue is known, enabled and started
    are converted. One Job is created per chunk of the job's schedselect;
    when there is more than one chunk the job name becomes "<job id>+<n>".

    :param pbscmd: used to run `qstat -f -t` with json output.
    :param resource_definitions: resource definitions keyed by name; only
        host-level resources become node constraints.
    :param queues: queues keyed by name.
    :param resources_for_scheduling: resource names that may be used for
        scheduling. Anything else a job requests is ignored with a warning.
    :returns: the list of Job objects, in qstat order.
    """
    parser = get_pbspro_parser()
    ret: List[Job] = []

    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        # drop the server suffix. From here on job_id never contains a "."
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping", job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring", job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id, qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id, qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(jdict["array_indices_remaining"])
        elif "[" in job_id:
            # a bracketed id that is not flagged as an array is skipped
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)
        place = rdict["place"]

        pack = (
            PackingStrategy.PACK
            if place["arrangement"] in ["free", "pack"]
            else PackingStrategy.SCATTER
        )

        # SMP style jobs
        is_smp = place.get("grouping") == "host" or place["arrangement"] == "pack"

        # pack jobs do not need to define node_count
        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1
        if is_smp:
            # fold everything into a single, larger request
            smp_multiplier = max(1, iterations) * max(1, node_count)
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (
            not is_smp
            and queue.uses_placement
            and rdict.get("ungrouped", "false").lower() == "false"
        )

        sharing = place.get("sharing")
        select_chunks = rdict["schedselect"]

        for n, chunk_base in enumerate(select_chunks):
            chunk: Dict[str, Any] = {}
            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    # bool is a subclass of int: a True/False resource must
                    # stay a boolean rather than turn into the multiplier
                    if isinstance(value, bool):
                        continue
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier

            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2
            chunk.update(chunk_base)

            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            # one Job per chunk, so multi-chunk jobs need distinct names
            my_job_id = job_id
            if len(select_chunks) > 1:
                my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all resource requests here. By that, I mean
                # non resource requests, like exclusive, should be ignored
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working cons
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
def _validate_reverse_dns(self, node: Node) -> bool:
    """
    Check that the node's private ip reverse-resolves to the node's hostname.

    Returns True when the node has no private ip or when the reverse lookup
    agrees with the node. Every other outcome is logged and returns False.
    """
    # let's make sure the hostname is valid and reverse
    # dns compatible before adding to GE

    # if there is no private ip, then the hostname was removed, most likely
    # by azure DNS
    if not node.private_ip:
        return True

    try:
        lookup = socket.gethostbyaddr(node.private_ip)
    except Exception as e:
        logging.error(
            "Could not convert private_ip(%s) to hostname using gethostbyaddr() for %s: %s",
            node.private_ip,
            node,
            str(e),
        )
        return False

    # the last element holds the address(es); normalize a bare string to a list
    resolved_ips = lookup[-1]
    if isinstance(resolved_ips, str):
        resolved_ips = [resolved_ips]

    if node.private_ip not in resolved_ips:
        logging.warning(
            "%s has a hostname that does not match the"
            " private_ip (%s) reported by cyclecloud (%s)! Skipping",
            node,
            resolved_ips,
            node.private_ip,
        )
        return False

    cyclecloud_cfg = node.software_configuration.get("cyclecloud", {})
    standalone_dns = cyclecloud_cfg.get("hosts", {}).get("standalone_dns", {})
    expect_multiple_entries = standalone_dns.get("enabled", True)

    # only the short (unqualified) name is compared, case-insensitively
    resolved_short_name = lookup[0].split(".")[0]
    if resolved_short_name.lower() == node.hostname.lower():
        return True

    mismatch_prefix = (
        "%s has a hostname that can not be queried via reverse"
        " dns (private_ip=%s cyclecloud hostname=%s reverse dns hostname=%s)."
    )
    if expect_multiple_entries:
        report = logging.warning
        mismatch_hint = " This is common and usually repairs itself. Skipping"
    else:
        report = logging.error
        mismatch_hint = (
            " If you have an entry for this address in your /etc/hosts file, please remove it."
        )

    report(
        mismatch_prefix + mismatch_hint,
        node,
        node.private_ip,
        node.hostname,
        resolved_short_name,
    )
    return False
def _parse_complexes(
    autoscale_config: Dict, complex_lines: List[str]
) -> Dict[str, "Complex"]:
    """
    Parse the lines of a complex listing into Complex objects.

    The first line is the header row (a leading "#" is ignored) and must name
    at least the name, type and consumable columns. Every following line that
    is not blank or a comment is one whitespace separated complex definition.

    :param autoscale_config: optional config. When
        gridengine.relevant_complexes is set, only complexes whose name or
        shortcut is listed there (plus ccnodeid) are kept.
    :param complex_lines: header line followed by the definitions.
    :returns: complexes keyed by name and also by shortcut. Empty when the
        input is empty or a required column is missing.
    """
    relevant_complexes = None
    if autoscale_config:
        relevant_complexes = autoscale_config.get("gridengine", {}).get(
            "relevant_complexes"
        )
        if relevant_complexes:
            # special handling of ccnodeid, since it is something we
            # create for the user
            relevant_complexes = relevant_complexes + ["ccnodeid"]

    if relevant_complexes:
        logging.info(
            "Restricting complexes for autoscaling to %s", relevant_complexes
        )

    if not complex_lines:
        # without a header row there is nothing to parse against
        logging.error(
            "Could not parse complex file as it is empty."
            + " Autoscale likely will not work."
        )
        return {}

    complexes: List[Complex] = []
    headers = complex_lines[0].lower().replace("#", "").split()

    missing = {"name", "type", "consumable"} - set(headers)
    if missing:
        logging.error(
            "Could not parse complex file as it is missing expected columns: %s."
            + " Autoscale likely will not work.",
            sorted(missing),
        )
        return {}

    # the header is line 1, so the first definition is line 2
    for line_no, line in enumerate(complex_lines[1:], start=2):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        toks = stripped.split()
        if len(toks) != len(headers):
            logging.warning(
                "Could not parse complex at line %s - ignoring: '%s'", line_no, line
            )
            continue

        c = dict(zip(headers, toks))
        # shortcut is an optional column: fall back to the name, which is
        # also what the Complex itself is given below
        shortcut = c.get("shortcut", c["name"])

        if (
            relevant_complexes
            and c["name"] not in relevant_complexes
            and shortcut not in relevant_complexes
        ):
            logging.trace(
                "Ignoring complex %s because it was not defined in gridengine.relevant_complexes",
                c["name"],
            )
            continue

        try:
            parsed = Complex(
                name=c["name"],
                shortcut=shortcut,
                complex_type=c["type"],
                relop=c.get("relop", "=="),
                requestable=c.get("requestable", "YES").lower() == "yes",
                consumable=c.get("consumable", "YES").lower() == "yes",
                default=c.get("default"),
                urgency=int(c.get("urgency", 0)),
            )
        except Exception:
            # one bad definition should not prevent the rest from loading
            logging.exception("Could not parse complex %s - %s", line, c)
            continue

        complexes.append(parsed)

    # TODO test RDH
    ret = partition_single(complexes, lambda x: x.name)
    shortcut_dict = partition_single(complexes, lambda x: x.shortcut)
    ret.update(shortcut_dict)
    return ret
def autoscale_pbspro(
    config: Dict[str, Any],
    pbs_env: Optional[PBSProEnvironment] = None,
    pbs_driver: Optional[PBSProDriver] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    """
    Run one autoscale iteration against PBSPro and return the demand result.

    Failed nodes are handed to the driver, existing matched nodes are joined
    to the cluster, new nodes are booted, and nodes that timed out booting or
    stayed unmatched past the idle timeout are deleted unless they still have
    jobs assigned. With dry_run=True the config is switched to read only and
    neither the bootup nor the history update is performed.
    """
    global _exit_code

    assert not config.get("read_only", False)

    if dry_run:
        logging.warning("Running pbs autoscaler in dry run mode")
        # no lock file, so multiple instances may run side by side
        config["lock_file"] = None
        # and nothing may be modified
        config["read_only"] = True

    # the driver is our interface to PBSPro (generally the cli); tests may
    # pass in a mock instead
    if pbs_driver is None:
        pbs_driver = PBSProDriver(config)

    if pbs_env is None:
        pbs_env = envlib.from_driver(config, pbs_driver)

    pbs_driver.initialize()
    config = pbs_driver.preprocess_config(config)
    logging.debug("Driver = %s", pbs_driver)

    demand_calculator = calculate_demand(config, pbs_env, ctx_handler, node_history)

    # nodes that PBS itself reports as down are treated as failed too
    failed_nodes = demand_calculator.node_mgr.get_failed_nodes()
    failed_nodes.extend(
        sched_node
        for sched_node in pbs_env.scheduler_nodes
        if "down" in sched_node.metadata.get("pbs_state", "")
    )
    pbs_driver.handle_failed_nodes(failed_nodes)

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # hand over every matched node that exists; the driver works out which
    # of them are new and still need to be added
    existing_nodes = [x for x in demand_result.compute_nodes if x.exists]
    joined = pbs_driver.add_nodes_to_cluster(existing_nodes)
    pbs_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # boot everything that was requested
    if demand_result.new_nodes and not dry_run:
        demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # the driver is also told about unmatched nodes. It filters them and
    # returns the ones that can be deleted.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    idle_nodes = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    # nodes with keep_alive=true are never considered timed out
    slow_booting = [
        n
        for n in demand_calculator.find_booting(at_least=boot_timeout)
        if not n.keep_alive
    ]

    boot_timeout_deletes = []
    if slow_booting:
        logging.info(
            "The following nodes have timed out while booting: %s", slow_booting
        )
        boot_timeout_deletes = pbs_driver.handle_boot_timeout(slow_booting) or []

    idle_deletes = []
    if idle_nodes:
        logging.info("unmatched_for_5_mins %s", idle_nodes)
        idle_deletes = pbs_driver.handle_draining(idle_nodes) or []

    nodes_to_delete = []
    for candidate in boot_timeout_deletes + idle_deletes:
        if not candidate.assignments:
            nodes_to_delete.append(candidate)
        else:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", candidate
            )

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)
            if delete_result:
                # let the driver clean up after the deletion (usually just
                # removing the node from the cluster)
                pbs_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
# Compatibility shim: keeps the SchedulerNode name importable from this
# module while pointing users at hpc.autoscale.job.schedulernode.
from hpc.autoscale import hpclogging as logging
from hpc.autoscale.job.schedulernode import SchedulerNode as _SchedulerNode

# Logged every time these statements run, so whoever still uses this module
# is told where the class lives now.
logging.warning("hpc.autoscale.job.computenode is deprecated.")
logging.warning("Please use hpc.autoscale.job.schedulernode")

# Re-export the class under its public name.
SchedulerNode = _SchedulerNode
def new_demand_calculator(
    config: Dict,
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    """
    Create a DemandCalculator for the given Grid Engine environment.

    Anything not passed in (environment, driver, node history) is created
    from config. Requestable complexes that define a default are registered
    as default resources, and defaults for slots (per nodearray) and
    ccnodeid (once, for all nodes) are added when a bucket lacks them.
    """
    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    if node_history is None:
        history_path = config.get("nodehistorydb")
        if not history_path:
            # prefer the bootstrap directory, else the working directory
            bootstrap_dir = "/opt/cycle/jetpack/system/bootstrap/gridengine"
            history_dir = (
                bootstrap_dir if os.path.exists(bootstrap_dir) else os.getcwd()
            )
            history_path = os.path.join(history_dir, "nodehistory.db")

        node_history = SQLiteNodeHistory(
            history_path, config.get("read_only", False)
        )
        node_history.create_timeout = config.get("boot_timeout", 3600)
        node_history.last_match_timeout = config.get("idle_timeout", 300)

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        existing_nodes=ge_env.nodes,
        node_history=node_history,
        node_queue=ge_driver.new_node_queue(),
        # a singleton_lock of None is handled by the callee
        singleton_lock=singleton_lock,
    )
    node_mgr = demand_calculator.node_mgr

    for name, default_complex in ge_env.complexes.items():
        # slots gets its own default below; otherwise only requestable
        # complexes that actually define a default are registered
        if (
            name == "slots"
            or default_complex.default is None
            or not default_complex.requestable
        ):
            continue

        logging.trace("Adding default resource %s=%s", name, default_complex.default)
        node_mgr.add_default_resource({}, name, default_complex.default)

    ccnode_id_added = False
    slots_added: Set[str] = set()

    for bucket in node_mgr.get_buckets():
        nodearray = bucket.nodearray

        if "slots" not in bucket.resources and nodearray not in slots_added:
            # the equivalent config snippet, shown to the user in the warning
            default = (
                '"default_resources": [{"select": {"node.nodearray": "%s"}, "name": "slots", "value": "node.vcpu_count"}]'
                % (nodearray)
            )
            node_mgr.add_default_resource(
                selection={"node.nodearray": nodearray},
                resource_name="slots",
                default_value="node.vcpu_count",
            )
            logging.warning(
                "slots is not defined for bucket {}. Using the default, which you can add to your config: {}".format(
                    bucket, default
                )
            )
            slots_added.add(nodearray)

        # ccnodeid will almost certainly not be defined. It just needs
        # to be defined once, so a default for all nodes is added
        # the first time it is seen to be missing
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            node_mgr.add_default_resource(
                selection={},  # applies to all nodes
                resource_name="ccnodeid",
                default_value=lambda n: n.delayed_node_id.node_id,
            )
            ccnode_id_added = True

    return demand_calculator
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    """
    Run one autoscale iteration against Grid Engine and return the demand result.

    Scheduler nodes whose state contains both "a" and "u" are cleaned up
    first, unless the matching cluster node is still Preparing or Acquiring.
    After that, failed nodes are handed to the driver, existing matched nodes
    are joined, new nodes are booted, and nodes that timed out booting or
    stayed unmatched past the idle timeout are deleted unless they still have
    jobs assigned. With dry_run=True the config is switched to read only and
    neither the bootup nor the history update is performed.
    """
    global _exit_code

    assert not config.get("read_only", False)

    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # no lock file, so multiple instances may run side by side
        config["lock_file"] = None
        # and nothing may be modified
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # the driver is our interface to GE (generally the cli); tests may pass
    # in a mock instead
    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()
    config = ge_driver.preprocess_config(config)
    logging.fine("Driver = %s", ge_driver)

    # a node manager without any scheduler nodes is needed here, so the
    # existing nodes are deliberately not passed in
    tmp_node_mgr = new_node_manager(config)
    by_hostname = partition_single(
        tmp_node_mgr.get_nodes(), lambda n: n.hostname_or_uuid
    )

    invalid_nodes = []
    for sched_node in ge_env.nodes:
        # many combinations of a, u and other states exist. However,
        # as long as a and u are both in there the node is down
        state = sched_node.metadata.get("state", "")
        cc_node = by_hostname.get(sched_node.hostname)
        ccnodeid = sched_node.resources.get("ccnodeid")

        # leave nodes alone while the same cluster node is still coming up
        still_starting = (
            cc_node
            and (not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id)
            and cc_node.state in ["Preparing", "Acquiring"]
        )
        if still_starting:
            continue

        if "a" in state and "u" in state:
            invalid_nodes.append(sched_node)

    # nodes in error state must also be deleted
    for cleaned_node in ge_driver.clean_hosts(invalid_nodes):
        ge_env.delete_node(cleaned_node)

    demand_calculator = calculate_demand(
        config, ge_env, ge_driver, ctx_handler, node_history
    )

    ge_driver.handle_failed_nodes(demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # hand over every matched node that exists; the driver works out which
    # of them are new and still need to be added via qconf
    existing_nodes = [x for x in demand_result.compute_nodes if x.exists]
    joined = ge_driver.handle_join_cluster(existing_nodes)
    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # boot everything that was requested
    if demand_result.new_nodes and not dry_run:
        demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # the driver is also told about unmatched nodes. It filters them and
    # returns the ones that can be deleted.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    idle_nodes = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    # nodes with keep_alive=true are never considered timed out
    slow_booting = [
        n
        for n in demand_calculator.find_booting(at_least=boot_timeout)
        if not n.keep_alive
    ]

    boot_timeout_deletes = []
    if slow_booting:
        logging.info(
            "The following nodes have timed out while booting: %s", slow_booting
        )
        boot_timeout_deletes = ge_driver.handle_boot_timeout(slow_booting) or []

    idle_deletes = []
    if idle_nodes:
        node_expr = ", ".join([str(x) for x in idle_nodes])
        logging.info(
            "Unmatched for at least %s seconds: %s", idle_timeout, node_expr
        )
        idle_deletes = ge_driver.handle_draining(idle_nodes) or []

    nodes_to_delete = []
    for candidate in boot_timeout_deletes + idle_deletes:
        if not candidate.assignments:
            nodes_to_delete.append(candidate)
        else:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", candidate
            )

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)
            if delete_result:
                # let the driver clean up after the deletion (usually just
                # removing the node from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result