def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll ``monitor`` until its reported resource usage looks as expected.

    A 42-CPU resource request is issued first. Each poll refreshes the
    monitor's load metrics, resource requests and event summary, reads
    ``monitor.load_metrics._get_resource_usage()`` and removes the
    ``memory``, ``object_store_memory`` and ``node:*`` entries from the
    first two usage dicts so that only the remaining keys are compared.

    Args:
        monitor: Object exposing ``update_load_metrics()``,
            ``update_resource_requests()``, ``update_event_summary()``,
            ``load_metrics`` and ``event_summarizer``.
        expected_resource_usage: Sequence compared element-wise (via
            ``zip``) with the filtered usage. When ``None``, polling stops
            as soon as every element of the usage is truthy.
        timeout: Number of one-second retries allowed before giving up.

    Returns:
        The filtered resource usage obtained on the final poll.

    Raises:
        ValueError: If the stop condition is not met within ``timeout``
            retries.
        AssertionError: If the resource request is not visible in the load
            metrics, or no "Resized to" event was recorded.
    """
    request_resources(num_cpus=42)

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    ignored_keys = ("memory", "object_store_memory")

    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        # Filter each dict against itself. The previous code tested index 1
        # but deleted from index 0 for "object_store_memory", which could
        # raise KeyError or leave the key behind.
        for usage in (resource_usage[0], resource_usage[1]):
            for key in list(usage):
                if key in ignored_keys or key.startswith("node:"):
                    del usage[key]

        if expected_resource_usage is None:
            matched = all(resource_usage)
        else:
            matched = all(
                x == y
                for x, y in zip(resource_usage, expected_resource_usage))
        if matched:
            break

        # Retry bookkeeping applies to both modes, so the default
        # (expected_resource_usage=None) mode can no longer spin forever
        # without sleeping or honouring ``timeout``.
        timeout -= 1
        time.sleep(1)
        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    return resource_usage
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll ``monitor`` until its reported resource usage looks as expected.

    A 42-CPU resource request is issued first. Each poll refreshes the
    monitor's load metrics and resource requests, reads
    ``monitor.load_metrics._get_resource_usage()`` and removes the
    ``memory``, ``object_store_memory`` and ``node:*`` entries from the
    first two usage dicts so that only the remaining keys are compared.

    Args:
        monitor: Object exposing ``update_load_metrics()``,
            ``update_resource_requests()`` and ``load_metrics``.
        expected_resource_usage: Sequence compared element-wise (via
            ``zip``) with the filtered usage. When ``None``, polling stops
            as soon as every element of the usage is truthy.
        timeout: Number of one-second retries allowed before giving up.

    Returns:
        The filtered resource usage obtained on the final poll.

    Raises:
        ValueError: If the stop condition is not met within ``timeout``
            retries.
        AssertionError: If the resource request is not visible in the load
            metrics.
    """
    request_resources(num_cpus=42)

    ignored_keys = ("memory", "object_store_memory")

    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        # Filter each dict against itself. The previous code tested index 1
        # but deleted from index 0 for "object_store_memory", which could
        # raise KeyError or leave the key behind.
        for usage in (resource_usage[0], resource_usage[1]):
            for key in list(usage):
                if key in ignored_keys or key.startswith("node:"):
                    del usage[key]

        if expected_resource_usage is None:
            matched = all(resource_usage)
        else:
            matched = all(
                x == y
                for x, y in zip(resource_usage, expected_resource_usage))
        if matched:
            break

        # Retry bookkeeping applies to both modes, so the default
        # (expected_resource_usage=None) mode can no longer spin forever
        # without sleeping or honouring ``timeout``.
        timeout -= 1
        time.sleep(1)
        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    return resource_usage
async def expand_cluster(self, current_workers, allocation):
    """Request resources for ``allocation`` and wait for the cluster to grow.

    Workers whose IP is not among the addresses returned by
    ``self.get_nodes()`` are treated as invalid. One resource bundle is
    requested per allocation entry plus one per invalid worker. The
    coroutine then polls ``self._cluster_ready`` once per second until the
    cluster is ready, the rescale timeout elapses, or
    ``self._force_immediate_allocation`` is set.

    Args:
        current_workers: Mapping of worker id to worker IP.
        allocation: Sized collection of node identifiers to allocate.

    Returns:
        ``allocation`` unchanged when the cluster is ready. Otherwise a
        list with the entries not containing ``"adaptdl_virtual"`` first,
        truncated to the number of available nodes reported by
        ``self._cluster_ready``.
    """
    target_size = len(allocation)
    logging.info(f"Attempting to expand cluster to {target_size} nodes")

    live_ips = {
        entry["NodeManagerAddress"] for entry in self.get_nodes()
    }
    invalid_workers = {}
    for worker_id, ip in current_workers.items():
        if ip not in live_ips:
            invalid_workers[worker_id] = ip

    if len(invalid_workers) != len(current_workers):
        rescale_timeout = self._rescale_timeout
    else:
        # Every known worker is gone (or there were none to begin with).
        rescale_timeout = FULL_RESCALE_TIMEOUT
        logging.info("No live workers found. "
                     "Waiting longer than specified for rescaling.")

    bundle_count = target_size + len(invalid_workers)
    worker_resources = []
    for _ in range(bundle_count):
        worker_resources.append(copy.deepcopy(self._worker_resources))
    for bundle in worker_resources:
        # NOTE(review): the reason for the extra 0.1 CPU is not visible in
        # this block — confirm against the scheduler's expectations.
        bundle["CPU"] += 0.1
    sdk.request_resources(bundles=worker_resources)

    elapsed = 0.0
    logging.info(f"Waiting for up to {rescale_timeout} seconds for "
                 "nodes to be ready")
    while elapsed < rescale_timeout:
        if self._cluster_ready(allocation)[0]:
            break
        if self._force_immediate_allocation.is_set():
            break
        await asyncio.sleep(1.0)
        elapsed += 1.0

    ready, available = self._cluster_ready(allocation)
    logging.info(f"Found {available} available nodes")
    if ready:
        return allocation

    # Stable sort on a boolean key keeps relative order inside each group
    # while moving "adaptdl_virtual" entries to the end.
    prioritized = sorted(
        allocation, key=lambda node: "adaptdl_virtual" in node)
    return prioritized[:available]
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll ``monitor`` until its reported resource usage looks as expected.

    A 42-CPU resource request is issued and a ``STRICT_PACK`` placement
    group is created first. Each poll refreshes the monitor's load metrics,
    resource requests and event summary, checks the pending placement
    groups with ``assert_correct_pg``, reads
    ``monitor.load_metrics._get_resource_usage()`` and removes the
    ``memory``, ``object_store_memory`` and ``node:*`` entries from the
    first two usage dicts so that only the remaining keys are compared.
    The function also records which of those entries were seen in each of
    the two dicts and asserts at the end that all three were seen in both.

    Args:
        monitor: Object exposing ``update_load_metrics()``,
            ``update_resource_requests()``, ``update_event_summary()``,
            ``load_metrics`` and ``event_summarizer``.
        expected_resource_usage: Sequence compared element-wise (via
            ``zip``) with the filtered usage. When ``None``, polling stops
            as soon as every element of the usage is truthy.
        timeout: Number of one-second retries allowed before giving up.

    Returns:
        The filtered resource usage obtained on the final poll.

    Raises:
        ValueError: If the stop condition is not met within ``timeout``
            retries.
        AssertionError: If the resource request is not visible in the load
            metrics, no "Resized to" event was recorded, or the expected
            entries were not seen in both usage dicts. ``assert_correct_pg``
            may raise as well; its behaviour is defined outside this block.
    """
    request_resources(num_cpus=42)

    # Add placement groups.
    pg_demands = [{"GPU": 2}, {"extra_resource": 2}]
    strategy = "STRICT_PACK"
    pg = placement_group(pg_demands, strategy=strategy)
    # The value returned by ready() is not waited on here; the fixed sleep
    # below is what gives the placement group time to show up.
    pg.ready()
    time.sleep(2)  # Wait for placement groups to propagate.

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    ignored_keys = ("memory", "object_store_memory")
    visited_atleast_once = [set(), set()]
    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        pg_response_data = monitor.load_metrics.pending_placement_groups
        assert_correct_pg(pg_response_data, pg_demands, strategy)

        # Strip the ignored entries from the first two usage dicts and
        # remember, per dict, which kinds of entries were encountered.
        for usage, seen in zip((resource_usage[0], resource_usage[1]),
                               visited_atleast_once):
            for key in list(usage):
                if key in ignored_keys:
                    del usage[key]
                    seen.add(key)
                elif key.startswith("node:"):
                    del usage[key]
                    seen.add("node:")

        if expected_resource_usage is None:
            matched = all(resource_usage)
        else:
            matched = all(
                x == y
                for x, y in zip(resource_usage, expected_resource_usage))
        if matched:
            break

        # Retry bookkeeping applies to both modes, so the default
        # (expected_resource_usage=None) mode can no longer spin forever
        # without sleeping or honouring ``timeout``.
        timeout -= 1
        time.sleep(1)
        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    assert visited_atleast_once[0] == {
        "memory", "object_store_memory", "node:"
    }
    assert visited_atleast_once[0] == visited_atleast_once[1]

    remove_placement_group(pg)

    return resource_usage