def _setup_shell_locals(self, config: Dict) -> Dict:
    """Assemble the variables exposed inside the read-only interactive shell.

    Args:
        config: autoscale configuration dict; handed to the driver and the
            demand calculator factories and also exposed as ``config``.

    Returns:
        Mapping of shell variable name -> object. ``nodes`` is keyed by node
        name and, for nodes that have one, by hostname as well.
    """
    ctx = DefaultContextHandler("[interactive-readonly]")
    driver = self._driver(config)
    dcalc, jobs_list = self._demand_calc(config, driver)

    all_nodes = dcalc.node_mgr.get_nodes()
    for shell_node in all_nodes:
        shell_node.shellify()

    # index every node by name, then layer hostname keys on top
    nodes = partition_single(all_nodes, lambda n: n.name)
    named_hosts = [x for x in all_nodes if x.hostname]
    nodes.update(partition_single(named_hosts, lambda n: n.hostname))

    jobs: Dict[str, Any]
    try:
        jobs = partition_single(jobs_list, lambda j: j.name)
    except Exception:
        # one-job-per-name indexing failed; fall back to name -> list of jobs
        jobs = partition(jobs_list, lambda j: j.name)

    return dict(
        config=config,
        cli=self,
        ctx=ctx,
        demand_calc=dcalc,
        node_mgr=dcalc.node_mgr,
        jobs=ShellDict(jobs),
        nodes=ShellDict(nodes),
    )
def update(self, nodes: typing.Iterable[Node]) -> None:
    """Synchronize the ``nodes`` history table with the current node list.

    Live rows (``delete_time IS NULL``) are loaded, rows are added for nodes
    seen for the first time, ``last_match_time`` is refreshed for every node
    whose ``required`` flag is set, and rows whose node id is no longer
    present get their ``delete_time`` stamped. Does nothing in read-only mode.

    Args:
        nodes: current nodes. Nodes without a node id are ignored, since
            there is no key to store them under.
    """
    if self.read_only:
        return

    # ``nodes`` is only promised to be an Iterable and is walked more than
    # once below; materialize it so a generator is not silently exhausted.
    nodes = list(nodes)
    now = self.now()

    rows = list(
        self._execute(
            """SELECT node_id, hostname, last_match_time, create_time from nodes where delete_time IS NULL"""
        )
    )
    rows_by_id = partition_single(rows, lambda r: r[0])

    nodes_with_ids = [n for n in nodes if n.delayed_node_id.node_id]
    nodes_by_id: typing.Dict[typing.Optional[NodeId], Node] = partition_single(
        nodes_with_ids, lambda n: n.delayed_node_id.node_id
    )

    # anything recorded as live that is no longer reported has been deleted
    to_delete = set(rows_by_id) - set(nodes_by_id)

    # Only nodes that have an id may be recorded; an id-less node would
    # otherwise be written under the literal node_id 'None'.
    for node in nodes_with_ids:
        node_id = node.delayed_node_id.node_id

        if node_id not in rows_by_id:
            # first time we see it, just put an entry
            rows_by_id[node_id] = (node_id, node.hostname, now, now)

        if node.required:
            rec = list(rows_by_id[node_id])
            rec[-2] = now  # last_match_time
            rows_by_id[node_id] = tuple(rec)

    # NOTE(review): the statements below are built by string formatting.
    # Whether _execute accepts bound parameters is not visible here; if it
    # does, prefer placeholders over interpolating hostnames / ids.
    if rows_by_id:
        exprs = [
            "('{}', '{}', {}, {}, NULL)".format(
                node_id, hostname, match_time, create_time
            )
            for node_id, hostname, match_time, create_time in rows_by_id.values()
        ]
        stmt = "INSERT OR REPLACE INTO nodes (node_id, hostname, last_match_time, create_time, delete_time) VALUES {}".format(
            ",".join(exprs)
        )
        self._execute(stmt)

    if to_delete:
        to_delete_expr = " OR ".join(
            ['node_id="{}"'.format(node_id) for node_id in to_delete]
        )
        # reuse the clock used for create/match times rather than a naive
        # utcnow().timestamp(), which is interpreted as local time
        self._execute(
            "UPDATE nodes set delete_time={} where {}".format(now, to_delete_expr)
        )

    self.retire_records(commit=True)
def test_partition_single() -> None:
    """partition_single indexes unique keys and rejects colliding ones."""
    records = [{"id": ident} for ident in (1, 2, 3)]

    indexed = partition_single(records, lambda rec: rec["id"])
    assert {1, 2, 3} == set(indexed)
    for key in indexed:
        assert indexed[key] == {"id": key}

    # every record maps to the same key, which must be reported as an error
    try:
        partition_single(records, lambda rec: None)
    except RuntimeError as err:
        message = "Could not partition list into single values - key=None values=[{'id': 1}, {'id': 2}, {'id': 3}]"
        assert str(err) == message
    else:
        assert False
def find_booting(
    self,
    at_least: float = 1800,
    booting_nodes: Optional[List[Node]] = None,
) -> List[Node]:
    """Return nodes the node history reports as booting for too long.

    Args:
        at_least: forwarded to the node history as ``for_at_least``.
        booting_nodes: candidate nodes. When omitted (``None``) all nodes
            from the node manager are considered. An explicitly empty list
            means "no candidates" and yields an empty result.

    Returns:
        The candidates that have not converged and that the node history
        lists, in the order the history returns them.
    """
    # Distinguish "not supplied" from "supplied but empty": an empty list
    # must not silently widen the search to every node.
    if booting_nodes is None:
        booting_nodes = self.node_mgr.get_nodes()

    # filter out nodes that have converged.
    candidates = [
        n
        for n in booting_nodes
        if n.target_state == "Started"
        and n.state not in ["Ready", "Started"]
        and n.delayed_node_id.node_id
    ]
    by_id = partition_single(candidates, lambda n: n.delayed_node_id.node_id)

    ret = []
    # only the node id of each history record is needed here
    for node_id, _hostname, _create_time in self.node_history.find_booting(
        for_at_least=at_least
    ):
        if node_id and node_id in by_id:
            ret.append(by_id[node_id])
    return ret
def update_scheduler_nodes(self, scheduler_nodes: List[SchedulerNode]) -> None:
    """Merge nodes reported by the scheduler into the scheduler-node queue.

    Nodes whose hostname is not yet known are pushed onto the queue and
    registered with the node manager as unmanaged nodes; known nodes are
    updated in place from the reported node.

    Args:
        scheduler_nodes: nodes as currently reported by the scheduler.
    """
    by_hostname: Dict[str, Node] = partition_single(
        self.__scheduler_nodes_queue, lambda n: n.hostname_or_uuid  # type: ignore
    )

    for new_snode in scheduler_nodes:
        # Use the same key for the membership test and the lookup; testing
        # with `hostname` but indexing with `hostname_or_uuid` raises
        # KeyError whenever the two differ.
        hostname = new_snode.hostname

        if hostname not in by_hostname:
            logging.debug(
                "Found new node[hostname=%s] that does not exist in CycleCloud",
                hostname,
            )
            by_hostname[hostname] = new_snode
            self.__scheduler_nodes_queue.push(new_snode)
            self.node_mgr.add_unmanaged_nodes([new_snode])
            # TODO inform bucket catalog?
            continue

        old_snode = by_hostname[hostname]
        logging.fine(
            "Found existing CycleCloud node[hostname=%s]",
            hostname,
        )
        old_snode.update(new_snode)
def read_queues(
    autoscale_config: Dict,
    scheduler: "GridEngineScheduler",
    pes: Dict[str, "ParallelEnvironment"],
    hostgroups: List[Hostgroup],
    complexes: Dict[str, "Complex"],
    qbin: QBin,
) -> Dict[str, GridEngineQueue]:
    """Build a GridEngineQueue for every queue listed by ``qconf -sql``.

    Each queue definition is read with ``qconf -sq <name>`` and parsed. The
    per-queue ``autoscale_enabled`` flag comes from
    ``autoscale_config["gridengine"]["queues"][<qname>]`` and defaults to True.

    Returns:
        Mapping of queue name (as listed by ``-sql``) to its queue object.
    """
    queue_names = qbin.qconf(["-sql"]).split()
    logging.debug("Found %d queues: %s", len(queue_names), " ".join(queue_names))

    per_queue_settings = autoscale_config.get("gridengine", {}).get("queues", {})
    unbound_hostgroups = partition_single(hostgroups, lambda h: h.name)

    queues = {}
    for qname in queue_names:
        definition_lines = qbin.qconf(["-sq", qname]).splitlines()
        queue_config = parse_ge_config(definition_lines)

        settings = per_queue_settings.get(queue_config["qname"], {})
        autoscale_enabled = settings.get("autoscale_enabled", True)

        complex_values = parse_queue_complex_values(
            queue_config.get("complex_values", "NONE"), complexes, qname
        )

        queues[qname] = GridEngineQueue(
            queue_config,
            scheduler,
            pes,
            unbound_hostgroups,
            complex_values,
            autoscale_enabled,
        )
    return queues
def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    """Create a PBSProScheduler for each scheduler reported by qmgr.

    Every scheduler record is completed with the attributes of the server on
    the same host; attributes the scheduler already defines are kept.

    Returns:
        Mapping of scheduler hostname to scheduler object.
    """
    parser = get_pbspro_parser()
    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")
    servers_by_host = partition_single(server_dicts, lambda s: s["server_host"])

    schedulers: Dict[str, PBSProScheduler] = {}
    for sched_dict in sched_dicts:
        server_dict = servers_by_host[sched_dict["sched_host"]]

        # server attributes only fill gaps, they never override the scheduler
        for key, value in server_dict.items():
            sched_dict.setdefault(key, value)

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None
        )
        scheduler = PBSProScheduler(sched_dict, resource_state)
        schedulers[scheduler.hostname] = scheduler
    return schedulers
def common_cluster_test(qsub_commands: List[str],
                        pg_counts: Optional[Dict[str, int]] = None,
                        previous_dcalc: Optional[DemandCalculator] = None,
                        **array_counts: int) -> DemandCalculator:
    """Run the common cluster against qsub commands and check the new nodes.

    Args:
        qsub_commands: command lines handed to ``common_cluster``.
        pg_counts: expected number of new nodes per placement group name.
        previous_dcalc: forwarded to ``common_cluster``.
        **array_counts: expected number of new nodes per nodearray name.

    Returns:
        The demand calculator produced by ``common_cluster``.

    NOTE(review): the nesting of the assertion groups below was reconstructed
    from a whitespace-collapsed source - confirm against the original file.
    """
    pg_counts = pg_counts or {}
    dcalc = common_cluster(qsub_commands, previous_dcalc)
    demand = dcalc.get_demand()
    demandprinter.print_demand(["name", "job_ids", "placement_group"], demand)
    # sanity check that we don't recreate the same node
    partition_single(demand.new_nodes, lambda n: n.name)
    by_array = partition(demand.new_nodes, lambda n: n.nodearray)
    by_pg = partition(demand.new_nodes, lambda n: n.placement_group)
    # the placement-group key sets are only compared when something other
    # than "no placement group" (None) showed up
    if set(by_pg.keys()) != set([None]):
        if set(by_pg.keys()) != set(pg_counts.keys()):
            # fail with the actual vs. expected counts in the message
            assert False, "\n%s\n%s" % (
                [(x, len(y)) for x, y in by_pg.items()],
                pg_counts,
            )
        assert set(by_pg.keys()) == set(pg_counts.keys())
        assert not (bool(by_pg) ^ bool(pg_counts))
    if pg_counts:
        # every expected placement group exists with the expected size ...
        for pg_name, count in pg_counts.items():
            assert pg_name in by_pg
            assert (
                len(by_pg[pg_name]) == count
            ), "Expected pg {} to have {} nodes. Found {}. Full {}".format(
                pg_name,
                count,
                len(by_pg[pg_name]),
                [(x, len(y)) for x, y in by_pg.items()],
            )
        # ... and no unexpected placement group was created
        for pg_name in by_pg:
            assert pg_name in pg_counts
    # same two-way check for nodearrays
    for nodearray_name, count in array_counts.items():
        assert nodearray_name in by_array
        assert len(by_array[nodearray_name]) == count, [
            n.name for n in by_array[nodearray_name]
        ]
    for nodearray_name, node_list in by_array.items():
        assert nodearray_name in array_counts
    return dcalc
def test_mock_bindings2() -> None:
    """Allocating one exclusive E4 node updates the counts of sibling buckets."""
    # quota settings shared by both E-series buckets in the "w" nodearray
    e_family_quota = dict(
        family_consumed_core_count=72 * 2,
        family_quota_core_count=160,
        family_quota_count=80,
    )

    bindings = MockClusterBinding()
    bindings.add_nodearray("w", {}, location="westus2", max_count=8)
    bindings.add_bucket(
        "w", "Standard_E2_v3", max_count=80, available_count=8, **e_family_quota
    )
    bindings.add_bucket(
        "w", "Standard_E4_v3", max_count=40, available_count=4, **e_family_quota
    )
    bindings.add_bucket("w", "Standard_D8s_v3", max_count=80, available_count=8)
    bindings.add_nodearray("e", {}, location="eastus")
    bindings.add_bucket("e", "Standard_E2_v3", max_count=20, available_count=4)

    node_mgr = _node_mgr(bindings)
    by_size = partition_single(
        node_mgr.get_buckets(), lambda b: (b.location, b.vm_size)
    )
    west_e2 = ("westus2", "Standard_E2_v3")
    west_e4 = ("westus2", "Standard_E4_v3")
    west_d8 = ("westus2", "Standard_D8s_v3")
    east_e2 = ("eastus", "Standard_E2_v3")

    # before allocation
    assert by_size[west_e2].available_count == 8
    assert by_size[west_e2].limits.nodearray_available_count == 8
    assert by_size[west_e2].limits.family_max_count == 80
    assert by_size[west_e4].available_count == 4
    assert by_size[west_d8].available_count == 8
    assert by_size[east_e2].available_count == 4

    constraints = {
        "node.vm_size": "Standard_E4_v3",
        "exclusive": True,
        "node.location": "westus2",
    }
    result = node_mgr.allocate(constraints, node_count=1)
    assert result, "\n".join(result.reasons)

    # after allocation
    assert by_size[west_e2].limits.nodearray_available_count == 7
    assert by_size[west_e2].available_count == 6
    assert by_size[west_e4].available_count == 3
    assert by_size[west_d8].available_count == 7
    assert by_size[east_e2].available_count == 4
def decorate(self, nodes: typing.List[Node]) -> None:
    """Copy recorded history timestamps onto the nodes that exist.

    For each existing node with a history row this sets ``create_time_unix``,
    ``last_match_time_unix`` and ``delete_time_unix``. When the matching
    timeout is configured it also sets ``create_time_remaining`` and
    ``idle_time_remaining`` (``-1`` for keep-alive nodes).

    Args:
        nodes: nodes to decorate; ``None`` or empty is a no-op.
    """
    existing = [n for n in (nodes or []) if n.exists]
    if not existing:
        return

    where_clause = " OR ".join(
        " (node_id == '{}') ".format(n.delayed_node_id.node_id) for n in existing
    )
    stmt = "select node_id, last_match_time, create_time, delete_time from nodes where {}".format(
        where_clause
    )
    rows_by_id = partition_single(list(self._execute(stmt)), lambda r: r[0])
    now = self.now()

    for node in existing:
        node_id = node.delayed_node_id.node_id

        # should be impossible because we already filtered by exists
        if not node_id:
            logging.warning(
                "Null node_id for %s. Leaving create/last_match/delete times as null.",
                node,
            )
            continue

        if node_id not in rows_by_id:
            continue

        _, last_match_time, create_time, delete_time = rows_by_id[node_id]
        node.create_time_unix = create_time
        node.last_match_time_unix = last_match_time
        node.delete_time_unix = delete_time

        if self.create_timeout:
            elapsed_since_create = max(0, now - create_time)
            node.create_time_remaining = max(
                0, self.create_timeout - elapsed_since_create
            )

        if not self.last_match_timeout:
            continue

        if node.keep_alive:
            node.idle_time_remaining = -1
        else:
            elapsed_since_match = max(0, now - last_match_time)
            node.idle_time_remaining = max(
                0, self.last_match_timeout - elapsed_since_match
            )
def _find_nodes(
    config: Dict, hostnames: List[str], node_names: List[str]
) -> Tuple[GridEngineDriver, DemandCalculator, List[Node]]:
    """Resolve hostnames and node names (case-insensitively) to node objects.

    A hostname that matches no compute node is represented by a fresh
    ``SchedulerNode``; a node name that matches nothing is reported through
    ``error``.

    Returns:
        ``(driver, demand_calculator, found_nodes)`` with hostname matches
        first, then node-name matches, each in input order.
    """
    hostnames = hostnames or []
    node_names = node_names or []

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver)
    compute_nodes = demand_calc.finish().compute_nodes

    by_hostname = partition_single(
        compute_nodes, lambda n: n.hostname_or_uuid.lower()
    )
    by_node_name = partition_single(compute_nodes, lambda n: n.name.lower())

    found_nodes = []

    for hostname in hostnames:
        if not hostname:
            error("Please specify a hostname")
        host_key = hostname.lower()
        if host_key not in by_hostname:
            # it doesn't exist in CC, but we still want to delete it
            # from the cluster
            by_hostname[host_key] = SchedulerNode(hostname, {})
        found_nodes.append(by_hostname[host_key])

    for node_name in node_names:
        if not node_name:
            error("Please specify a node_name")
        name_key = node_name.lower()
        if name_key not in by_node_name:
            error(
                "Could not find a CycleCloud node that has node_name %s."
                + " Run 'nodes' to see available nodes.",
                node_name,
            )
        found_nodes.append(by_node_name[name_key])

    return ge_driver, demand_calc, found_nodes
def __init__(
    self,
    scheduler: GridEngineScheduler,
    jobs: Optional[List[Job]] = None,
    nodes: Optional[List[Node]] = None,
    queues: Optional[Dict[str, GridEngineQueue]] = None,
    hostgroups: Optional[List[hglib.Hostgroup]] = None,
    pes: Optional[Dict[str, ParallelEnvironment]] = None,
    complexes: Optional[Dict[str, Complex]] = None,
    unfiltered_complexes: Optional[Dict[str, Complex]] = None,
    qbin: Optional[QBin] = None,
) -> None:
    """Store the environment's collections, defaulting omitted ones to empty.

    Nodes are indexed by lower-cased ``hostname_or_uuid`` and hostgroups by
    name. ``unfiltered_complexes`` falls back to a deep copy of ``complexes``
    and ``qbin`` to a new ``QBinImpl``. A host -> hostgroup-names index is
    derived from the hostgroup members.
    """
    self.__scheduler = scheduler
    self.__jobs: List[Job] = jobs or []
    self.__nodes: Dict[str, Node] = partition_single(
        nodes or [], lambda n: n.hostname_or_uuid.lower()
    )
    self.__queues: Dict[str, GridEngineQueue] = queues or {}
    self.__pes: Dict[str, ParallelEnvironment] = pes or {}
    self.__complexes: Dict[str, Complex] = complexes or {}
    self.__unfiltered_complexes = (
        unfiltered_complexes if unfiltered_complexes else deepcopy(self.__complexes)
    )
    self.__qbin = qbin or QBinImpl()
    self.__hostgroups = partition_single(hostgroups or [], lambda h: h.name)

    # reverse index: host -> names of the hostgroups that contain it
    self.__host_memberships: Dict[str, List[str]] = {}
    for group in self.__hostgroups.values():
        for member in group.members:
            self.__host_memberships.setdefault(member, []).append(group.name)
def validate_hg_intersections(ge_env: GridEngineEnvironment,
                              node_mgr: NodeManager,
                              warn_function: WarnFunction) -> bool:
    """Warn about buckets matched by no hostgroup or by more than one.

    A bucket matches a hostgroup when every constraint of that hostgroup is
    satisfied by the bucket. Only hostgroups bound to autoscale-enabled
    queues are considered. The "more than one" warning is suppressed when the
    scheduler sorts by sequence number, since seq_no then breaks the tie.

    Returns:
        The ``failure`` flag. No check in this function sets it, so the
        result is always ``False``; findings are only reported as warnings.
    """
    # fetch once; the bucket list is reused for seeding, indexing and matching
    buckets = node_mgr.get_buckets()

    # seed every bucket with an empty match set, keyed the same way
    # (str(bucket)) that the matching loop and the report loop use
    bucket_to_hgs: Dict[str, Set[str]] = {str(bucket): set() for bucket in buckets}
    by_str = partition_single(buckets, str)

    for queue in ge_env.queues.values():
        if not queue.autoscale_enabled:
            continue

        for hostgroup in queue.bound_hostgroups.values():
            for bucket in buckets:
                # all() stops at the first unsatisfied constraint
                if all(
                    constraint.satisfied_by_bucket(bucket)
                    for constraint in hostgroup.constraints
                ):
                    bucket_to_hgs[str(bucket)].add(hostgroup.name)

    failure = False
    for bkey, matches in bucket_to_hgs.items():
        bucket = by_str[bkey]
        if not matches:
            warn_function(
                "%s is not matched by any hostgroup. This is not an error.",
                bucket,
            )
        elif len(matches) > 1:
            # seq_no will be used to determine ties
            if not ge_env.scheduler.sort_by_seqno:
                warn_function(
                    "%s is matched by more than one hostgroup %s. This is not an error.",
                    bucket,
                    # sorted so the message is stable between runs
                    ",".join(sorted(matches)),
                )
    return failure
def update_scheduler_nodes(self, scheduler_nodes: List[SchedulerNode]) -> None:
    """Merge nodes reported by the scheduler into the scheduler-node queue.

    Unknown hostnames are pushed onto the queue and registered with the node
    manager as unmanaged nodes. Known nodes are fully updated from the
    reported node, unless the reported node's metadata sets
    ``override_resources`` to a false value, in which case only the metadata
    is copied.

    Args:
        scheduler_nodes: nodes as currently reported by the scheduler.
    """
    by_hostname: Dict[str, Node] = partition_single(
        self.__scheduler_nodes_queue, lambda n: n.hostname_or_uuid  # type: ignore
    )

    for new_snode in scheduler_nodes:
        # Use the same key for the membership test and the lookups; testing
        # with `hostname` but indexing with `hostname_or_uuid` raises
        # KeyError whenever the two differ.
        hostname = new_snode.hostname

        if hostname not in by_hostname:
            by_hostname[hostname] = new_snode
            self.__scheduler_nodes_queue.push(new_snode)
            self.node_mgr.add_unmanaged_nodes([new_snode])
            if new_snode.resources.get("ccnodeid"):
                logging.warning(
                    "%s has ccnodeid defined, but no longer exists in CycleCloud",
                    new_snode,
                )
            else:
                logging.debug(
                    "Found new node[hostname=%s] that does not exist in CycleCloud",
                    hostname,
                )
            # TODO inform bucket catalog?
            continue

        old_snode = by_hostname[hostname]
        if new_snode.metadata.get("override_resources", True):
            logging.fine(
                "Found existing CycleCloud node[hostname=%s]",
                hostname,
            )
            old_snode.update(new_snode)
        else:
            logging.fine(
                "Found existing CycleCloud node[hostname=%s], but node.metadata.override_resources=false"
                + " so ignoring the reported resources and only copying metadata",
                hostname,
            )
            old_snode.metadata.update(new_snode.metadata)
def test_complex_shortcut() -> None:
    """Mixing a complex's long name and its shortcut is handled uniformly."""
    # three jobs request memory by long name, three by shortcut
    long_form = ["-l m_mem_free=2g -q htc.q sleep.sh"] * 3
    short_form = ["-l mfree=2g -q htc.q sleep.sh"] * 3
    dcalc = common_cluster_test(long_form + short_form, htc=2)

    eg = dcalc.node_mgr.example_node("westus", "Standard_F4")
    new_nodes = dcalc.get_demand().new_nodes
    by_name = partition_single(new_nodes, lambda n: n.name)

    total = Memory.value_of("8g")
    assert eg.memory == total
    assert by_name["htc-1"].memory == total

    # both spellings must report the same totals and the same remainder
    expected_available = (
        ("htc-1", Memory.value_of("0g")),
        ("htc-2", Memory.value_of("4g")),
    )
    for node_name, remaining in expected_available:
        node = by_name[node_name]
        for resource in ("m_mem_free", "mfree"):
            assert node.resources[resource] == total
        for resource in ("m_mem_free", "mfree"):
            assert node.available[resource] == remaining
def create_vm_sizes(cache_path: Optional[str] = None) -> None:
    """Regenerate the VM size catalog from ``az vm list-skus``.

    The SKU listing is read from ``cache_path`` when that file exists,
    otherwise it is downloaded with the ``az`` CLI (and cached when
    ``cache_path`` is given). Virtual machine SKUs are reduced to a minimal
    record, grouped by location, optionally compared with CycleServer's
    ``Azure.MachineType`` records, and written sorted to
    ``new_vm_sizes.json``. A summary of differences against the existing
    catalog is printed at the end.

    Args:
        cache_path: optional file holding / receiving the raw ``az`` output.

    NOTE(review): the existing catalog is looked up under both
    ``src/hpc/...`` and ``../src/hpc/...``; which working directory is
    intended is not visible here - confirm.
    """
    if cache_path and os.path.exists(cache_path):
        with open(cache_path) as fr:
            raw = fr.read()
    else:
        az_path = which("az")
        if az_path:
            raw = check_output([
                az_path,
                "vm",
                "list-skus",
                "--all",
            ]).decode()
        else:
            print("You need az cli installed.", file=sys.stderr)
            sys.exit(1)

        # only a fresh download needs to be written to the cache
        if cache_path:
            with open(cache_path, "w") as fw:
                fw.write(raw)

    print("Parsing list-skus...")
    try:
        skus = json.loads(raw)
    except Exception as e:
        # show the offending line; the decoder message contains "line <n>"
        toks = str(e).split()
        line_no = int(toks[toks.index("line") + 1])
        print("{}: '{}'".format(e, raw.splitlines()[line_no - 1]))
        return
    print("done")

    skus = [
        s for s in skus
        if s.get("family") and s.get("resourceType") == "virtualMachines"
    ]

    min_skus = []
    for sku in skus:
        min_sku = {key: sku[key] for key in ["name", "family", "size", "tier"]}
        assert min_sku["family"], sku

        if not sku["locationInfo"]:
            print("WARNING: Missing location info. See", min_sku)
            continue
        min_sku["location"] = sku["locationInfo"][0]["location"]

        # capability values arrive as strings; convert the recognizable ones
        cap_dict = {}
        for entry in sku["capabilities"]:
            value = entry["value"]
            if value.isdigit():
                value = int(value)
            elif value in ["True", "False"]:
                value = value == "True"
            elif "," in value:
                value = value.split(",")
            else:
                try:
                    value = float(value)
                except ValueError:
                    pass
            cap_dict[entry["name"]] = value
        min_sku["capabilities"] = cap_dict
        min_skus.append(min_sku)

    by_location = partition(min_skus, lambda s: s["location"])

    if os.path.exists("src/hpc/autoscale/node/vm_sizes.json"):
        print("reload")
        with open("src/hpc/autoscale/node/vm_sizes.json") as fr:
            vm_sizes = json.load(fr)
    else:
        vm_sizes = {}

    for loc, loc_skus in by_location.items():
        vm_sizes[loc] = partition_single(loc_skus, lambda s: s["name"])

    if which("cycle_server"):
        cs_mts = json.loads(
            check_output([
                "cycle_server",
                "execute",
                "--format",
                "json",
                "select * from Azure.MachineType",
            ]).decode())
    else:
        print(
            "Warning: cycle_server not found! Skipping validation",
            file=sys.stderr,
        )
        cs_mts = []

    for row in cs_mts:
        try:
            aux_info = AuxVMSizeInfo(vm_sizes[row["Location"]][row["Name"]])
            if aux_info.vcpu_count != row["CoreCount"]:
                print(
                    row,
                    aux_info.vcpu_count,
                    json.dumps(getattr(aux_info, "_AuxVMSizeInfo__record"),
                               indent=2),
                )
                if row["Location"] not in vm_sizes:
                    vm_sizes[row["Location"]] = {}
                # NOTE(review): "windows_price" reads the "Linux" entry -
                # looks like a copy/paste slip, confirm the row schema.
                rec = {
                    "name": row.pop("Name"),
                    "family": row.pop("Family"),
                    "size": row.pop("SKU"),
                    "tier": row.pop("Tier"),
                    "location": row.pop("Location"),
                    "linux_price": row.get("Linux", {}).get("Regular", 0.0),
                    "windows_price": row.get("Linux", {}).get("Regular", 0.0),
                    "capabilities": row,
                }
                # NOTE(review): "Location" and "Name" were popped just above,
                # so this lookup raises KeyError before sys.exit(1) is
                # reached - confirm the intended behavior.
                vm_sizes[row["Location"]][row["Name"]] = rec
                sys.exit(1)
            continue
        except KeyError:
            pass

        if row["Location"] not in vm_sizes:
            vm_sizes[row["Location"]] = {}

    # write locations and sizes in sorted order for a stable output file
    final_vm_sizes: Dict = {}
    for loc in sorted(vm_sizes):
        final_vm_sizes[loc] = loc_dict = {}
        for vm_size in sorted(vm_sizes[loc]):
            loc_dict[vm_size] = vm_sizes[loc][vm_size]

    with open("new_vm_sizes.json", "w") as fw:
        json.dump(final_vm_sizes, fw, indent=2)

    with open("../src/hpc/autoscale/node/vm_sizes.json") as fr:
        old_data = json.load(fr)

    missing_locations = set(old_data.keys()) - set(final_vm_sizes.keys())
    new_locations = set(final_vm_sizes.keys()) - set(old_data.keys())
    if missing_locations:
        print("WARNING: Missing locations:", ",".join(missing_locations))
    # report new locations whenever there are any, independent of whether
    # some locations went missing
    if new_locations:
        print("INFO: New locations:", ",".join(new_locations))

    all_locations = list(old_data.keys()) + list(new_locations)
    for location in all_locations:
        old_loc_data = old_data.get(location, {})
        new_loc_data = final_vm_sizes.get(location, {})
        missing_skus = set(old_loc_data.keys()) - set(new_loc_data.keys())
        new_skus = set(new_loc_data.keys()) - set(old_loc_data.keys())
        # whole-location changes were already reported above
        if missing_skus and location not in missing_locations:
            print(
                "WARNING: Missing SKUs for location",
                location,
                ":",
                ",".join(missing_skus),
            )
        if new_skus and location not in new_locations:
            print("INFO: New SKUs for location", location, ":",
                  ",".join(new_skus))

    print(
        "Copy ./new_vm_sizes.json to ./src/hpc/autoscale/node/vm_sizes.json to complete the creation."
    )
def _parse_complexes( autoscale_config: Dict, complex_lines: List[str] ) -> Dict[str, "Complex"]: relevant_complexes = None if autoscale_config: relevant_complexes = autoscale_config.get("gridengine", {}).get( "relevant_complexes" ) if relevant_complexes: # special handling of ccnodeid, since it is something we # create for the user relevant_complexes = relevant_complexes + ["ccnodeid"] if relevant_complexes: logging.info( "Restricting complexes for autoscaling to %s", relevant_complexes ) complexes: List[Complex] = [] headers = complex_lines[0].lower().replace("#", "").split() required = set(["name", "type", "consumable"]) missing = required - set(headers) if missing: logging.error( "Could not parse complex file as it is missing expected columns: %s." + " Autoscale likely will not work.", list(missing), ) return {} for n, line in enumerate(complex_lines[1:]): if line.startswith("#"): continue toks = line.split() if len(toks) != len(headers): logging.warning( "Could not parse complex at line {} - ignoring: '{}'".format(n, line) ) continue c = dict(zip(headers, toks)) try: if ( relevant_complexes and c["name"] not in relevant_complexes and c["shortcut"] not in relevant_complexes ): logging.trace( "Ignoring complex %s because it was not defined in gridengine.relevant_complexes", c["name"], ) continue complex = Complex( name=c["name"], shortcut=c.get("shortcut", c["name"]), complex_type=c["type"], relop=c.get("relop", "=="), requestable=c.get("requestable", "YES").lower() == "yes", consumable=c.get("consumable", "YES").lower() == "yes", default=c.get("default"), urgency=int(c.get("urgency", 0)), ) complexes.append(complex) except Exception: logging.exception("Could not parse complex %s - %s", line, c) # TODO test RDH ret = partition_single(complexes, lambda x: x.name) shortcut_dict = partition_single(complexes, lambda x: x.shortcut) ret.update(shortcut_dict) return ret
def _setup_shell_locals(self, config: Dict) -> Dict:
    """
    Build the variables for the read-only interactive shell.
    Type pbsprohelp() in the shell for a description of each of them.
    """
    ctx = DefaultContextHandler("[interactive-readonly]")
    pbs_driver = PBSProDriver(config)
    pbs_env = self._pbs_env(pbs_driver)

    def pbsprohelp() -> None:
        # one entry per shell variable, printed in this order
        help_lines = (
            "config - dict representing autoscale configuration.",
            "cli - object representing the CLI commands",
            "pbs_env - object that contains data structures for queues, resources etc",
            "queues - dict of queue name -> PBSProQueue object",
            "jobs - dict of job id -> Autoscale Job",
            "scheduler_nodes - dict of hostname -> node objects. These represent purely what"
            " the scheduler sees without additional booting nodes / information from CycleCloud",
            "resource_definitions - dict of resource name -> PBSProResourceDefinition objects.",
            "default_scheduler - PBSProScheduler object representing the default scheduler.",
            "pbs_driver - PBSProDriver object that interacts directly with PBS and implements"
            " PBS specific behavior for scalelib.",
            "demand_calc - ScaleLib DemandCalculator - pseudo-scheduler that determines the what nodes are unnecessary",
            "node_mgr - ScaleLib NodeManager - interacts with CycleCloud for all node related"
            + " activities - creation, deletion, limits, buckets etc.",
            "pbsprohelp - This help function",
        )
        for help_line in help_lines:
            print(help_line)

    # try to make the key "15" instead of "15.hostname" if only
    # a single submitter was in use
    submitters = {job.name.split(".", 1)[-1] for job in pbs_env.jobs}
    if len(submitters) == 1:
        jobs_dict = partition_single(pbs_env.jobs, lambda j: j.name.split(".")[0])
    else:
        jobs_dict = partition_single(pbs_env.jobs, lambda j: j.name)

    sched_nodes_dict = partition_single(
        pbs_env.scheduler_nodes, lambda n: n.hostname
    )

    pbs_env.queues = clilib.ShellDict(pbs_env.queues)
    for sched_node in pbs_env.scheduler_nodes:
        sched_node.shellify()
    pbs_env.resource_definitions = clilib.ShellDict(pbs_env.resource_definitions)

    demand_calc, _ = self._demand_calc(config, pbs_driver)

    return {
        "config": config,
        "cli": self,
        "ctx": ctx,
        "pbs_env": pbs_env,
        "queues": pbs_env.queues,
        "jobs": clilib.ShellDict(jobs_dict, "j"),
        "scheduler_nodes": clilib.ShellDict(sched_nodes_dict),
        "resource_definitions": pbs_env.resource_definitions,
        "default_scheduler": pbs_env.default_scheduler,
        "pbs_driver": pbs_driver,
        "demand_calc": demand_calc,
        "node_mgr": demand_calc.node_mgr,
        "pbsprohelp": pbsprohelp,
    }
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    """Run one autoscale iteration for Grid Engine.

    Steps, in order: clean up hosts in an invalid state, calculate demand,
    join existing nodes to the cluster, boot new nodes, then delete nodes
    that timed out while booting or stayed unmatched past the idle timeout.

    Args:
        config: autoscale configuration. Must not already have ``read_only``
            set. With ``dry_run`` this dict is modified in place
            (``lock_file`` cleared, ``read_only`` set).
        ge_env: environment to use; read via qconf when omitted.
        ge_driver: driver to use; created from ``config`` when omitted.
        ctx_handler: optional handler whose context label is updated for the
            joining and scaling phases.
        node_history: forwarded to ``calculate_demand``.
        dry_run: when true, nodes are not booted and history is not updated.

    Returns:
        The demand result produced by the demand calculator.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm against the original file.
    """
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    # the driver may return an adjusted config; use it from here on
    config = ge_driver.preprocess_config(config)

    logging.fine("Driver = %s", ge_driver)

    invalid_nodes = []

    # we need an instance without any scheduler nodes, so don't
    # pass in the existing nodes.
    tmp_node_mgr = new_node_manager(config)

    by_hostname = partition_single(tmp_node_mgr.get_nodes(),
                                   lambda n: n.hostname_or_uuid)

    for node in ge_env.nodes:
        # many combinations of a u and other states. However,
        # as long as a and u are in there it is down
        state = node.metadata.get("state", "")
        cc_node = by_hostname.get(node.hostname)
        ccnodeid = node.resources.get("ccnodeid")
        if cc_node:
            # same node (or no ccnodeid to compare) that is still in
            # Preparing/Acquiring state: skip it for now
            if not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id:
                if cc_node.state in ["Preparing", "Acquiring"]:
                    continue
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(config, ge_env, ge_driver,
                                         ctx_handler, node_history)

    ge_driver.handle_failed_nodes(
        demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists])

    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    unmatched_for_5_mins = demand_calculator.find_unmatched_for(
        at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s",
                     timed_out_booting)
        # the driver may return None; normalize to a list
        timed_out_to_deleted = ge_driver.handle_boot_timeout(
            timed_out_booting) or []

    if unmatched_for_5_mins:
        node_expr = ", ".join([str(x) for x in unmatched_for_5_mins])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout,
                     node_expr)
        unmatched_nodes_to_delete = (
            ge_driver.handle_draining(unmatched_for_5_mins) or [])

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        # never delete a node that still has jobs assigned
        if node.assignments:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            # record the failure in the module-level exit code and carry on;
            # the log message states deletion is retried next iteration
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
def autoscale_hpcpack(
    config: Dict[str, Any],
    ctx_handler: DefaultContextHandler = None,
    hpcpack_rest_client: Optional[HpcRestClient] = None,
    dry_run: bool = False,
) -> None:
    """Run one autoscale iteration between CycleCloud and HPC Pack.

    The pass is made of four phases:

    1. Sync-Status: load the node history, list CycleCloud nodes and active
       HPC Pack compute nodes, link them via ``node_history.synchronize``,
       create/populate HPC node groups for each CycleCloud node array,
       remove HPC nodes whose CycleCloud node is gone, and assign the
       default node template where it is missing.
    2. Scale-up: translate the HPC Pack grow decisions into
       ``node_mgr.allocate`` calls (per node array and for the default
       groups) and boot the newly allocated nodes.
    3. Scale-down: unless a default-group grow request made a shrink check
       pointless, find idle nodes and mark those idle longer than the
       configured timeout for shutdown; stopped nodes older than the
       retention period are marked for termination.
    4. Apply: bring nodes online / take them offline in HPC Pack, shut down
       or terminate CycleCloud nodes, and persist the node history.

    Args:
        config: Autoscaler configuration. The ``"autoscale"`` sub-dict is
            read for ``idle_timeout`` (default 600), ``boot_timeout``
            (default 1500), ``statefile``, ``archivefile`` and
            ``vm_retention_days`` (default 7).
        ctx_handler: Optional context handler; when given, its context is
            set to ``[Sync-Status]``, ``[scale-up]`` and ``[scale-down]``
            as the phases progress.
        hpcpack_rest_client: REST client for HPC Pack. When omitted, one is
            created with ``new_rest_client(config)``.
        dry_run: When true, node removal, template assignment, bootup,
            online/offline transitions, shutdown, termination and saving the
            node history are skipped (only logged). Node group creation and
            node group membership updates are still performed.
    """
    if not hpcpack_rest_client:
        hpcpack_rest_client = new_rest_client(config)

    if ctx_handler:
        ctx_handler.set_context("[Sync-Status]")
    autoscale_config = config.get("autoscale") or {}

    # Load history info
    idle_timeout_seconds: int = autoscale_config.get("idle_timeout") or 600
    provisioning_timeout_seconds = autoscale_config.get("boot_timeout") or 1500
    statefile = autoscale_config.get(
        "statefile") or "C:\\cycle\\jetpack\\config\\autoscaler_state.txt"
    archivefile = autoscale_config.get(
        "archivefile") or "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt"
    node_history = HpcNodeHistory(
        statefile=statefile,
        archivefile=archivefile,
        provisioning_timeout=provisioning_timeout_seconds,
        idle_timeout=idle_timeout_seconds)

    logging.info("Synchronizing the nodes between Cycle cloud and HPC Pack")

    # Initialize data of History info, cc nodes, HPC Pack nodes, HPC grow decisions
    # Get node list from Cycle Cloud
    def nodes_state_key(n: Node) -> Tuple[int, str, int]:
        """Sort key: by state priority, then by name prefix and numeric suffix.

        'Deallocated', 'Stopping' and 'Terminating' nodes sort after all
        other states (priority 1). Names that cannot be split into
        ``<prefix>-<int>`` fall back to ``(priority, full name, 0)``.
        """
        state_pri = 1
        try:
            if n.state == 'Deallocated':
                state_pri = 2
            elif n.state == 'Stopping':
                state_pri = 3
            elif n.state == 'Terminating':
                state_pri = 4
            name, index = n.name.rsplit("-", 1)
            return (state_pri, name, int(index))
        except Exception:
            # Deliberately broad: a sort key must never abort the pass.
            return (state_pri, n.name, 0)

    node_mgr: NodeManager = new_node_manager(config)
    for b in node_mgr.get_buckets():
        b.nodes.sort(key=nodes_state_key)
    cc_nodes: List[Node] = node_mgr.get_nodes()
    cc_nodes_by_id = partition_single(
        cc_nodes, func=lambda n: n.delayed_node_id.node_id)

    # Get compute node list and grow decision from HPC Pack
    hpc_node_groups = hpcpack_rest_client.list_node_groups()
    grow_decisions = hpcpack_rest_client.get_grow_decision()
    logging.info("grow decision: {}".format(grow_decisions))
    hpc_cn_nodes: List[HpcNode] = hpcpack_rest_client.list_computenodes()
    hpc_cn_nodes = [n for n in hpc_cn_nodes if n.active]

    # This function will link node history items, cc nodes and hpc nodes
    node_history.synchronize(cc_nodes, hpc_cn_nodes)

    cc_nodearrays = {b.nodearray for b in node_mgr.get_buckets()}
    logging.info("Current node arrays in cyclecloud: {}".format(cc_nodearrays))

    # Create HPC node groups for CC node arrays
    cc_map_hpc_groups = ["CycleCloudNodes"] + list(cc_nodearrays)
    for cc_grp in cc_map_hpc_groups:
        if ci_notin(cc_grp, hpc_node_groups):
            logging.info("Create HPC node group: {}".format(cc_grp))
            hpcpack_rest_client.add_node_group(cc_grp, "Cycle Cloud Node group")

    # Add HPC nodes into corresponding node groups
    add_cc_tag_nodes = [
        n.name for n in hpc_cn_nodes if n.shall_addcyclecloudtag
    ]
    if add_cc_tag_nodes:
        logging.info(
            "Adding HPC nodes to node group CycleCloudNodes: {}".format(
                add_cc_tag_nodes))
        hpcpack_rest_client.add_node_to_node_group("CycleCloudNodes",
                                                   add_cc_tag_nodes)
    for cc_grp in list(cc_nodearrays):
        add_array_tag_nodes = [
            n.name for n in hpc_cn_nodes
            if n.shall_addnodearraytag and ci_equals(n.cc_nodearray, cc_grp)
        ]
        if add_array_tag_nodes:
            logging.info("Adding HPC nodes to node group {}: {}".format(
                cc_grp, add_array_tag_nodes))
            hpcpack_rest_client.add_node_to_node_group(cc_grp,
                                                       add_array_tag_nodes)

    # Possible values for HPC NodeState (states marked with * shall not occur for CC nodes):
    #   Unknown, Provisioning, Offline, Starting, Online, Draining, Rejected(*), Removing, NotDeployed(*), Stopping(*)
    # Remove the following HPC Pack nodes:
    #   1. The corresponding CC node already removed
    #   2. The corresponding CC node is stopped and HPC node is not assigned a node template
    # Take offline the following HPC Pack nodes:
    #   1. The corresponding CC node is stopped or is going to stop
    hpc_nodes_to_remove = [
        n.name for n in hpc_cn_nodes
        if n.removed_cc_node or (n.stopped_cc_node and not n.template_assigned)
    ]
    hpc_nodes_to_take_offline = [
        n.name for n in hpc_cn_nodes
        if n.stopped_cc_node and ci_equals(n.state, "Online")
    ]
    if hpc_nodes_to_remove:
        logging.info("Removing the HPC nodes: {}".format(hpc_nodes_to_remove))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.remove_nodes(hpc_nodes_to_remove)
    # From here on only consider HPC nodes whose CC node is neither stopped nor removed.
    hpc_cn_nodes = [
        n for n in hpc_cn_nodes
        if not (n.stopped_cc_node or n.removed_cc_node)
    ]

    # Assign default node template for unapproved CC node
    hpc_nodes_to_assign_template = [
        n.name for n in hpc_cn_nodes
        if n.bound_cc_node and not n.template_assigned
    ]
    if hpc_nodes_to_assign_template:
        logging.info(
            "Assigning default node template for the HPC nodes: {}".format(
                hpc_nodes_to_assign_template))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.assign_default_compute_node_template(
                hpc_nodes_to_assign_template)

    ### Start scale up checking:
    logging.info("Start scale up checking ...")
    if ctx_handler:
        ctx_handler.set_context("[scale-up]")

    hpc_nodes_with_active_cc = [
        n for n in hpc_cn_nodes if n.template_assigned and n.bound_cc_node
    ]
    # Exclude the already online healthy HPC nodes before calling node_mgr.allocate
    for hpc_node in hpc_nodes_with_active_cc:
        if hpc_node.ready_for_job:
            hpc_node.bound_cc_node.closed = True

    # Terminate the provisioning timeout CC nodes
    cc_node_to_terminate: List[Node] = []
    for cc_node in cc_nodes:
        if ci_equals(cc_node.target_state, 'Deallocated') or ci_equals(
                cc_node.target_state,
                'Terminated') or cc_node.create_time_remaining:
            continue
        # NOTE(review): find() can return None (the bootup path below checks
        # for it). This relies on synchronize() having created an item for
        # every CC node -- confirm.
        nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
        if not nhi.hpc_id:
            # Never registered in HPC Pack within the provisioning window.
            cc_node.closed = True
            cc_node_to_terminate.append(cc_node)
        else:
            hpc_node = ci_find_one(hpc_nodes_with_active_cc, nhi.hpc_id,
                                   lambda n: n.id)
            if hpc_node and hpc_node.error:
                cc_node.closed = True
                cc_node_to_terminate.append(cc_node)

    # "ComputeNodes", "CycleCloudNodes", "AzureIaaSNodes" are all treated as default
    # grow_by_socket not supported yet, treat as grow_by_node
    defaultGroups = [
        "Default", "ComputeNodes", "AzureIaaSNodes", "CycleCloudNodes"
    ]
    default_cores_to_grow = default_nodes_to_grow = 0.0

    # If the current CC nodes in the node array cannot satisfy the grow decision, the group is hungry
    # For a hungry group, no idle check is required if the node health is OK
    group_hungry: Dict[str, bool] = {}
    nbrNewNodes: int = 0
    grow_groups = list(grow_decisions.keys())
    for grp in grow_groups:
        # Each decision is removed from the dict as it is consumed.
        tmp = grow_decisions.pop(grp)
        if not (tmp.cores_to_grow + tmp.nodes_to_grow + tmp.sockets_to_grow):
            continue
        if ci_in(grp, defaultGroups):
            default_cores_to_grow += tmp.cores_to_grow
            default_nodes_to_grow += tmp.nodes_to_grow + tmp.sockets_to_grow
            continue
        if ci_notin(grp, cc_nodearrays):
            # The entry was already popped above, so log the popped value;
            # indexing or popping grow_decisions again would raise KeyError.
            logging.warning(
                "No mapping node array for the grow requirement {}:{}".format(
                    grp, tmp))
            continue
        group_hungry[grp] = False
        array = ci_lookup(grp, cc_nodearrays)
        selector = {'ncpus': 1, 'node.nodearray': [array]}
        target_cores = math.ceil(tmp.cores_to_grow)
        target_nodes = math.ceil(tmp.nodes_to_grow + tmp.sockets_to_grow)
        if target_nodes:
            logging.info("Allocate: {} Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            logging.info(result)
            if not result or result.total_slots < target_nodes:
                group_hungry[grp] = True
        if target_cores:
            logging.info("Allocate: {} Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            logging.info(result)
            if not result or result.total_slots < target_cores:
                group_hungry[grp] = True
        # Any newly created node for this group also means it is hungry.
        if len(node_mgr.new_nodes) > nbrNewNodes:
            group_hungry[grp] = True
        nbrNewNodes = len(node_mgr.new_nodes)

    # We then check the grow decision for the default node groups:
    checkShrinkNeeded = True
    growForDefaultGroup = bool(default_nodes_to_grow or default_cores_to_grow)
    if growForDefaultGroup:
        selector = {'ncpus': 1}
        if default_nodes_to_grow:
            target_nodes = math.ceil(default_nodes_to_grow)
            logging.info("Allocate: {} Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            if not result or result.total_slots < target_nodes:
                checkShrinkNeeded = False
        if default_cores_to_grow:
            target_cores = math.ceil(default_cores_to_grow)
            logging.info("Allocate: {} Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            if not result or result.total_slots < target_cores:
                checkShrinkNeeded = False
        if len(node_mgr.new_nodes) > nbrNewNodes:
            checkShrinkNeeded = False
        nbrNewNodes = len(node_mgr.new_nodes)

    if nbrNewNodes > 0:
        logging.info("Need to Allocate {} nodes in total".format(nbrNewNodes))
        if dry_run:
            logging.info("Dry-run: skipping node bootup...")
        else:
            logging.info("Allocating {} nodes in total".format(
                len(node_mgr.new_nodes)))
            bootup_result: BootupResult = node_mgr.bootup()
            logging.info(bootup_result)
            if bootup_result and bootup_result.nodes:
                for cc_node in bootup_result.nodes:
                    nhi = node_history.find(
                        cc_id=cc_node.delayed_node_id.node_id)
                    if nhi is None:
                        nhi = node_history.insert(
                            NodeHistoryItem(cc_node.delayed_node_id.node_id))
                    else:
                        nhi.restart()
    else:
        logging.info("No need to allocate new nodes ...")

    ### Start the shrink checking
    if ctx_handler:
        ctx_handler.set_context("[scale-down]")

    cc_node_to_shutdown: List[Node] = []
    if not checkShrinkNeeded:
        logging.info("No shrink check at this round ...")
        if not dry_run:
            # Reset the idle clock of every running node known to HPC Pack.
            for nhi in node_history.items:
                if not nhi.stopped and nhi.hpc_id:
                    nhi.idle_from = None
    else:
        logging.info("Start scale down checking ...")
        # By default, we check idle for active CC nodes in HPC Pack with 'Offline', 'Starting', 'Online', 'Draining' state
        candidate_idle_check_nodes = [
            n for n in hpc_nodes_with_active_cc
            if (not n.bound_cc_node.keep_alive)
            and ci_in(n.state, ["Offline", "Starting", "Online", "Draining"])
        ]

        # We can exclude some nodes from idle checking:
        # 1. If HPC Pack ask for grow in default node group(s), all healthy ONLINE nodes are considered as busy
        # 2. If HPC Pack ask for grow in certain node group, all healthy ONLINE nodes in that node group are considered as busy
        # 3. If a node group is hungry (new CC required or grow request not satisfied), no idle check needed for all nodes in that node array
        if growForDefaultGroup:
            candidate_idle_check_nodes = [
                n for n in candidate_idle_check_nodes if not n.ready_for_job
            ]
        for grp, hungry in group_hungry.items():
            if hungry:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not ci_equals(grp, n.cc_nodearray)
                ]
            elif not growForDefaultGroup:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not (ci_equals(grp, n.cc_nodearray) and n.ready_for_job)
                ]

        curtime = datetime.utcnow()
        # Offline node must be idle
        idle_node_names = [
            n.name for n in candidate_idle_check_nodes
            if ci_equals(n.state, 'Offline')
        ]
        if len(candidate_idle_check_nodes) > len(idle_node_names):
            # Only non-offline candidates need to be checked through HPC Pack.
            idle_nodes = hpcpack_rest_client.check_nodes_idle([
                n.name for n in candidate_idle_check_nodes
                if not ci_equals(n.state, 'Offline')
            ])
            if idle_nodes:
                idle_node_names.extend([n.node_name for n in idle_nodes])

        if idle_node_names:
            logging.info(
                "The following node is idle: {}".format(idle_node_names))
        else:
            logging.info("No idle node found in this round.")

        retention_days = autoscale_config.get("vm_retention_days") or 7
        for nhi in node_history.items:
            if nhi.stopped:
                # Stopped nodes are terminated once the retention period expires.
                if nhi.stop_time + timedelta(
                        days=retention_days) < datetime.utcnow():
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_terminate.append(cc_node)
                continue
            if ci_in(nhi.hostname, idle_node_names):
                if nhi.idle_from is None:
                    # First time seen idle: start the idle clock.
                    nhi.idle_from = curtime
                elif nhi.idle_timeout(idle_timeout_seconds):
                    nhi.stop_time = curtime
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_shutdown.append(cc_node)
            else:
                nhi.idle_from = None

    shrinking_cc_node_ids = [
        n.delayed_node_id.node_id for n in cc_node_to_terminate
    ]
    shrinking_cc_node_ids.extend(
        [n.delayed_node_id.node_id for n in cc_node_to_shutdown])
    hpc_nodes_to_bring_online = [
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Offline') and not n.error
        and ci_notin(n.cc_node_id, shrinking_cc_node_ids)
    ]
    hpc_nodes_to_take_offline.extend([
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Online')
        and ci_in(n.cc_node_id, shrinking_cc_node_ids)
    ])
    if hpc_nodes_to_bring_online:
        logging.info("Bringing the HPC nodes online: {}".format(
            hpc_nodes_to_bring_online))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.bring_nodes_online(hpc_nodes_to_bring_online)

    if hpc_nodes_to_take_offline:
        logging.info("Taking the HPC nodes offline: {}".format(
            hpc_nodes_to_take_offline))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.take_nodes_offline(hpc_nodes_to_take_offline)

    if cc_node_to_shutdown:
        logging.info("Shut down the following Cycle cloud node: {}".format(
            [cn.name for cn in cc_node_to_shutdown]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.shutdown_nodes(cc_node_to_shutdown)

    if cc_node_to_terminate:
        logging.info(
            "Terminating the following provisioning-timeout Cycle cloud nodes: {}"
            .format([cn.name for cn in cc_node_to_terminate]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.terminate_nodes(cc_node_to_terminate)

    if not dry_run:
        logging.info("Save node history: {}".format(node_history))
        node_history.save()