def test_allocate_job():
    nodes = {
        "0": NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        "1": NodeInfo({"gpu": 2, "cpu": 2000, "pods": 32}, preemptible=False),
        "2": NodeInfo({"gpu": 2, "cpu": 3000, "pods": 32}, preemptible=True),
    }
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    job_1 = JobInfo({"gpu": 1, "cpu": 500, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=0), min_replicas, max_replicas=1)
    job_2 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), min_replicas, max_replicas=1)
    job_3 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    job_4 = JobInfo({"gpu": 1, "cpu": 2000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    policy = PolluxPolicy()
    # job_1 fits on the smallest node.
    assert policy.allocate_job(job_1, nodes) == ["0"]
    # job_2 needs 1000 CPU, more than node "0" offers.
    assert policy.allocate_job(job_2, nodes) == ["1"]
    # Both replicas of job_3 fit together on node "1".
    assert policy.allocate_job(job_3, nodes) == ["1", "1"]
    # job_4 needs two replicas of 2000 CPU each and cannot be allocated.
    assert policy.allocate_job(job_4, nodes) == []
def test_unusable_node():
    # Test where one of the nodes can't be used due to one resource type.
    nodes = {
        0: NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        1: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
        2: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
    }
    template = NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=True)
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    jobs = {
        0: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=0), min_replicas, max_replicas=1),
        1: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=1), min_replicas, max_replicas=1),
        2: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=2), min_replicas, max_replicas=1),
    }
    policy = PolluxPolicy()
    allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)
    # Check that more nodes are asked for.
    assert desired_nodes > 3
    # Check no job was allocated more than 1 replica.
    assert max(len(alloc) for alloc in allocations.values()) == 1
    # Check two jobs were allocated.
    assert sum(len(alloc) for alloc in allocations.values()) == 2
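# A minimal sketch (not part of the original tests), assuming a speedup_fn
# built as in the fixtures above: SpeedupFunction instances are called as
# speedup_fn(num_nodes, num_replicas) -- matching the lambda fallback used by
# the allocator further below -- and return the predicted speedup of that
# placement relative to a single replica.
def _speedup_example(speedup_fn):
    single = speedup_fn(1, 1)  # one replica on one node (baseline)
    pair = speedup_fn(1, 2)    # two replicas sharing one node
    return single, pair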
def test_optimize(num_nodes, total_devices=16):
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print("{}x{} nodes:".format(num_nodes, num_devices))
    # Make up a realistic speedup function.
    params = Params(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317, 1.14)
    grad_params = {"norm": 0.00136, "var": 0.000502}
    speedup_fn = SpeedupFunction(params, grad_params, init_batch_size=128,
                                 max_batch_size=1280,
                                 local_bsz_bounds=(64, 256),
                                 elastic_bsz=True)
    now = datetime.now()
    jobs = {}
    # Add a few jobs.
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    for i in range(16):
        # No trailing comma here: it would turn the timestamp into a tuple.
        creation_timestamp = now + timedelta(minutes=len(jobs))
        max_replicas = 8
        key = len(jobs)
        jobs[key] = JobInfo(job_resources, speedup_fn,
                            creation_timestamp, max_replicas)
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {i: NodeInfo(node_resources, preemptible=False)
             for i in range(num_nodes)}
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    policy = PolluxPolicy()
    prev_allocs = {}
    for i in range(3):
        start = time.time()
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print("optimize {}x ({:.3f} sec):".format(i + 1, duration))
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
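# Hypothetical driver (not in the original file): exercise a few cluster
# shapes whose per-node device counts divide total_devices (16) evenly.
if __name__ == "__main__":
    for n in (1, 2, 4, 8, 16):
        test_optimize(n)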
class AdaptDLAllocator:
    def __init__(self, nodes: List = None):
        nodes = nodes if nodes is not None else config.nodes()
        self._node_infos = {
            node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                 preemptible=False)
            for node in nodes
        }
        self._default_node = cycle(list(self._node_infos))
        # Add a node template.
        self._node_template = NodeInfo(
            list(self._node_infos.values())[0].resources, preemptible=False)
        self._policy = PolluxPolicy()

    def default_allocation(self, num_devices=1) -> List[str]:
        """Cycle through nodes for default trial allocation."""
        return [f"{next(self._default_node)}"] * num_devices

    def allocate(self, jobs: List[AdaptDLJobMixin],
                 nodes: List = None) -> (Dict, int):
        """Use Pollux to distribute available resources between jobs."""
        if nodes is not None:
            node_infos = {
                node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                     preemptible=False)
                for node in nodes
            }
        else:
            node_infos = self._node_infos
        assert len(jobs) > 0
        # Gather JobInfos.
        job_infos = {job.job_id: job.job_info for job in jobs}
        # Gather previous allocations.
        prev_allocs = {job.job_id: job.allocation for job in jobs}
        allocations, desired_nodes = self._policy.optimize(
            job_infos, node_infos, prev_allocs, self._node_template)
        # Fill in empty allocations for jobs which didn't get any.
        for job_id in job_infos:
            allocations[job_id] = allocations.get(job_id, [])
        assert not all(v == [] for v in allocations.values())
        return allocations, desired_nodes
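# Hedged usage sketch (illustrative, not part of the module): an
# AdaptDLJobMixin is assumed to expose .job_id, .job_info, and .allocation,
# which is everything allocate() above reads from each job.
def _example_round(allocator, trial_jobs):
    # Round-robin placement for a trial before any profiling data exists.
    default = allocator.default_allocation(num_devices=2)
    # Once jobs carry JobInfo, let Pollux rebalance the whole set.
    allocations, desired_nodes = allocator.allocate(trial_jobs)
    return default, allocations, desired_nodes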
class AdaptDLAllocator(object):
    def __init__(self, expander):
        self._core_api = kubernetes.client.CoreV1Api()
        self._objs_api = kubernetes.client.CustomObjectsApi()
        self._cluster_expander = expander
        self._policy = PolluxPolicy()

    async def run(self):
        while True:
            LOG.info("Running allocator loop")
            nodes, node_template = await self._find_nodes()
            LOG.info("Node resources: %s",
                     {k: v.resources for k, v in nodes.items()})
            jobs = await self._find_jobs()
            LOG.info("Job resources: %s",
                     {k: v.resources for k, v in jobs.items()})
            prev_allocations = await self._get_allocations()
            start = time.time()
            allocations = self._allocate(jobs, nodes, prev_allocations,
                                         node_template)
            duration = time.time() - start
            LOG.info("Allocations (in %.3f sec): %s", duration, allocations)
            await self._update_allocations(allocations)
            LOG.info("Sleep for 60 seconds")
            await asyncio.sleep(60)

    async def _get_allocations(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        ret = {}
        for job in job_list["items"]:
            if "allocation" in job.get("status", {}):
                namespace = job["metadata"]["namespace"]
                name = job["metadata"]["name"]
                ret[namespace, name] = list(job["status"]["allocation"])
        return ret

    async def _update_allocations(self, allocations):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        for job in job_list["items"]:
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_allocation = job.get("status", {}).get("allocation", [])
            new_allocation = list(allocations.get((namespace, name), []))
            if list(job_allocation) != new_allocation:
                patch = {"status": {"allocation": new_allocation}}
                LOG.info("Patch AdaptDLJob %s/%s: %s", namespace, name, patch)
                await patch_job_status(self._objs_api, namespace, name, patch)

    async def _find_nodes(self):
        node_infos = {}
        node_list = await self._core_api.list_node()
        # Find all non-AdaptDL pods which are taking up resources and subtract
        # those resources from the available pool. Apparently there's no more
        # efficient way to get currently available resources in k8s. We also
        # check if we have reached the pod limit on the node. This number
        # denotes (allocatable pods - non-terminated pods) on that node.
        pod_list = await self._core_api.list_pod_for_all_namespaces(
            label_selector="!adaptdl/job")
        for node in node_list.items:
            if allowed_taints(node.spec.taints):
                resources = get_node_unrequested(node, pod_list.items)
                if not resources.get("pods"):
                    LOG.warning(f"node {node.metadata.name} "
                                "has no free pods available.")
                node_infos[node.metadata.name] = NodeInfo(resources, False)
        # For cluster autoscaling: to determine if additional nodes would be
        # helpful, add a few "virtual" nodes which only become available in
        # "eta" seconds. Currently, we only consider as many virtual nodes as
        # there are real nodes. We infer each resource to be the maximum
        # amount observed in any real node.
        max_resources = {}
        for node_name in node_infos:
            for key, val in node_infos[node_name].resources.items():
                if key not in max_resources or val > max_resources[key]:
                    max_resources[key] = val
        node_template = NodeInfo(max_resources, True)
        return node_infos, node_template

    async def _find_jobs(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        job_infos = {}
        for job in job_list["items"]:
            if job.get("status", {}).get("phase") \
                    not in ["Pending", "Running", "Starting", "Stopping"]:
                continue
            job["spec"]["template"]["spec"] = \
                set_default_resources(job["spec"]["template"]["spec"])
            resources = get_pod_requests(job["spec"]["template"]["spec"])
            hints = job.get("status", {}).get("train", {})
            max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
            if job["spec"].get("maxReplicas"):
                max_replicas = min(max_replicas, job["spec"]["maxReplicas"])
            if hints:
                max_batch_size = hints.get("maxBatchSize") or \
                    hints.get("initBatchSize")
                if hints.get("localBszBounds"):
                    min_local_bsz = hints["localBszBounds"][0] or 1
                    # Make sure max_batch_size / replicas >= min_local_bsz.
                    max_replicas = min(max_replicas,
                                       int(max_batch_size / min_local_bsz))
                # Derive whether the job can use an elastic batch size.
                elastic = hints.get("maxBatchSize", 0) > hints["initBatchSize"]
                speedup_fn = SpeedupFunction(
                    [hints["perfParams"][k] for k in PERF_PARAMS.keys()]
                    if hints.get("perfParams") else None,
                    hints.get("gradParams", {"var": None, "norm": None}),
                    hints.get("initBatchSize"),
                    hints.get("maxBatchSize"),
                    hints.get("localBszBounds", [None, None]),
                    elastic_bsz=elastic)
            else:
                speedup_fn = lambda n, r: r  # noqa: E731
            creation_ts = dateutil.parser.isoparse(
                job["metadata"]["creationTimestamp"])
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_infos[(namespace, name)] = JobInfo(resources, speedup_fn,
                                                   creation_ts, max_replicas)
        return job_infos

    def _allocate(self, jobs, nodes, prev_allocations, node_template):
        for job_key in list(jobs):
            job_resources = jobs[job_key].resources
            for node in nodes.values():
                if all(val <= node.resources.get(key, 0)
                       for key, val in job_resources.items()):
                    # Found a node which can fit a replica of this job.
                    break
            else:
                # No node can fit a replica of this job.
                # TODO: propagate this to the controller so the job is Failed.
                LOG.warning("Job %s cannot be scheduled!", job_key)
                jobs.pop(job_key)
        allocations = {}
        if not jobs:
            # There are no jobs, let the expander shrink the cluster.
            self._cluster_expander.fit([])
        elif jobs and nodes:
            allocations, desired_nodes = self._policy.optimize(
                jobs, nodes, prev_allocations, node_template)
            if desired_nodes < len(nodes):
                active_nodes = list(set.union(*map(set,
                                                   allocations.values())))
            else:
                active_nodes = list(nodes)
            while len(active_nodes) < desired_nodes:
                active_nodes.append(f"~{desired_nodes - len(active_nodes)}")
            self._cluster_expander.fit(active_nodes)
            LOG.info("Active nodes: %s", active_nodes)
        elif jobs:
            # Expand the job ASG from zero nodes. The assumption is that
            # AdaptDL itself is running on a different ASG.
            self._cluster_expander.fit(['~1'])
        return allocations
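# A minimal stand-in for the cluster expander (illustrative only, not from the
# module): _allocate() above only ever calls fit() on the expander, so a stub
# that records the requested node set is enough for local testing of the
# allocator logic.
class _RecordingExpander:
    def __init__(self):
        self.requested = []

    def fit(self, nodes):
        # Remember the most recent set of nodes the allocator asked for.
        self.requested = list(nodes)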
def test_optimize(num_nodes, total_devices=16):
    # Globals
    N_JOBS = 10
    JOBS = list(range(N_JOBS))
    random.shuffle(JOBS)
    PREEMPTIBLE_IDXS = JOBS[:len(JOBS) // 2]
    NON_PREEMPTIBLE_IDXS = JOBS[len(JOBS) // 2:]
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print(f"{num_nodes}x{num_devices} nodes:")
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    policy = PolluxPolicy()
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {i: NodeInfo(node_resources, preemptible=False)
             for i in range(num_nodes)}
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    # Empty allocations.
    prev_allocs = {i: [] for i in JOBS}
    for cycle in range(3):
        # Start allocation cycle.
        jobs = {}
        for i in PREEMPTIBLE_IDXS:
            # No trailing comma: it would turn the timestamp into a tuple.
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                              min_replicas=0, max_replicas=8)
        for i in NON_PREEMPTIBLE_IDXS:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                              min_replicas=2, max_replicas=4,
                              preemptible=False)
        start = time.time()
        assert len(jobs) > 0
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print(f"optimize {cycle + 1}x ({duration:.3f} sec)")
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            if placement:
                assert len(placement) >= jobs[job_key].min_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
        # Check that allocations are maintained for non-preemptible jobs.
        for i in NON_PREEMPTIBLE_IDXS:
            if (i in allocations) and prev_allocs[i]:
                assert allocations[i] == prev_allocs[i]
        prev_allocs = copy.deepcopy(allocations)
        # Remove one random job (random.choice needs a sequence, not a view).
        remove = random.choice(list(allocations))
        if remove in NON_PREEMPTIBLE_IDXS:
            NON_PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting non-preemptible job {remove}")
        else:
            PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting preemptible job {remove}")
        prev_allocs.pop(remove)
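# Hypothetical driver (not in the original file): seed the RNG so the shuffle
# and random job-removal sequence above is reproducible, then run one shape.
if __name__ == "__main__":
    random.seed(0)
    test_optimize(num_nodes=4)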
class AdaptDLAllocator(object):
    def __init__(self, expander):
        self._core_api = kubernetes.client.CoreV1Api()
        self._objs_api = kubernetes.client.CustomObjectsApi()
        self._custom_resource = ("adaptdl.petuum.com", "v1", "",
                                 "adaptdljobs")
        self._cluster_expander = expander
        self._policy = PolluxPolicy()
        # Lock shared by the two coroutines started in run().
        self._lock = asyncio.Lock()

    async def run(self):
        # Two responsibilities: (1) watch for newly added jobs and allocate
        # them immediately if possible; (2) periodically re-optimize the
        # allocations of all existing jobs.
        await asyncio.gather(self._allocate_one_loop(),
                             self._optimize_all_loop())

    async def _allocate_one_loop(self):
        async with kubernetes.watch.Watch() as watch:
            while True:
                async for event in watch.stream(
                        self._objs_api.list_namespaced_custom_object,
                        *self._custom_resource, timeout_seconds=60):
                    if event["type"] == "ADDED":
                        # A new job has arrived.
                        async with self._lock:
                            await self._allocate_one(event)

    async def _allocate_one(self, event):
        # Re-read the job, since it may have changed since the watch event.
        job = event["object"]
        namespace = job["metadata"]["namespace"]
        name = job["metadata"]["name"]
        try:
            job = await self._objs_api.get_namespaced_custom_object(
                "adaptdl.petuum.com", "v1", namespace, "adaptdljobs", name)
        except kubernetes.client.rest.ApiException as exc:
            if exc.status == 404:
                return
            raise  # Unexpected error.
        # Some other coroutine has already handled this job, or it is a
        # restarted job -- skip it.
        if job.get("status", {}).get("allocation") is not None or \
                job.get("status", {}).get("group") is not None:
            return
        namespace = job["metadata"]["namespace"]
        name = job["metadata"]["name"]
        LOG.info("detected an added job %s/%s.", namespace, name)
        # Parse the job information.
        job_info = self._get_job_info(job)
        # Find available nodes.
        node_infos, _ = await self._find_nodes()
        # Get the nodes to allocate.
        new_allocation = self._policy.allocate_job(job_info, node_infos)
        patch = {"status": {"allocation": new_allocation}}
        LOG.info("Patch AdaptDLJob %s/%s: %s", namespace, name, patch)
        await patch_job_status(self._objs_api, namespace, name, patch)

    async def _optimize_all_loop(self):
        while True:
            # Try to acquire the shared lock.
            async with self._lock:
                await self._optimize_all()
            LOG.info("Sleep for 60 seconds")
            await asyncio.sleep(60)

    async def _optimize_all(self):
        LOG.info("Running allocator loop")
        nodes, node_template = await self._find_nodes(
            pod_label_selector="!adaptdl/job")
        LOG.info("Node resources: %s",
                 {k: v.resources for k, v in nodes.items()})
        jobs, prev_allocations = await self._find_jobs_and_allocations()
        LOG.info("Job resources: %s",
                 {k: v.resources for k, v in jobs.items()})
        start = time.time()
        allocations = self._allocate(jobs, nodes, prev_allocations,
                                     node_template)
        duration = time.time() - start
        LOG.info("Allocations (in %.3f sec): %s", duration, allocations)
        await self._update_allocations(allocations)

    async def _update_allocations(self, allocations):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        for job in job_list["items"]:
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_allocation = job.get("status", {}).get("allocation", [])
            new_allocation = list(allocations.get((namespace, name), []))
            if list(job_allocation) != new_allocation:
                patch = {"status": {"allocation": new_allocation}}
                LOG.info("Patch AdaptDLJob %s/%s: %s", namespace, name, patch)
                await patch_job_status(self._objs_api, namespace, name, patch)

    async def _find_nodes(self, pod_label_selector=None):
        node_infos = {}
        node_list = await self._core_api.list_node()
        # Find all non-AdaptDL pods which are taking up resources and subtract
        # those resources from the available pool. Apparently there's no more
        # efficient way to get currently available resources in k8s. We also
        # check if we have reached the pod limit on the node. This number
        # denotes (allocatable pods - non-terminated pods) on that node.
        pod_list = await self._core_api.list_pod_for_all_namespaces(
            label_selector=pod_label_selector)
        for node in node_list.items:
            if allowed_taints(node.spec.taints):
                resources = get_node_unrequested(node, pod_list.items)
                if not resources.get("pods"):
                    LOG.warning(f"node {node.metadata.name} "
                                "has no free pods available.")
                node_infos[node.metadata.name] = NodeInfo(resources, False)
        # For cluster autoscaling: to determine if additional nodes would be
        # helpful, add a few "virtual" nodes which only become available in
        # "eta" seconds. Currently, we only consider as many virtual nodes as
        # there are real nodes. We infer each resource to be the maximum
        # amount observed in any real node.
        max_resources = {}
        for node_name in node_infos:
            for key, val in node_infos[node_name].resources.items():
                if key not in max_resources or val > max_resources[key]:
                    max_resources[key] = val
        node_template = NodeInfo(max_resources, True)
        return node_infos, node_template

    def _get_job_info(self, job):
        job["spec"]["template"]["spec"] = \
            set_default_resources(job["spec"]["template"]["spec"])
        resources = get_pod_requests(job["spec"]["template"]["spec"])
        hints = job.get("status", {}).get("train", {})
        max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
        if job["spec"].get("maxReplicas"):
            max_replicas = min(max_replicas, job["spec"]["maxReplicas"])
        min_replicas = job["spec"].get("minReplicas", 0)
        # max_replicas should be greater than or equal to min_replicas.
        max_replicas = max(max_replicas, min_replicas)
        preemptible = job["spec"].get("preemptible", True)
        if {"perfParams", "initBatchSize"} <= hints.keys() and preemptible:
            max_batch_size = (hints.get("maxBatchSize")
                              or hints["initBatchSize"])
            if hints.get("localBszBounds"):
                min_local_bsz = hints["localBszBounds"][0] or 1
                # Make sure max_batch_size / replicas >= min_local_bsz.
                if max_batch_size < min_local_bsz * max_replicas:
                    max_replicas = int(max_batch_size / min_local_bsz)
            perf_params = PerfParams(*[hints["perfParams"][k]
                                       for k in PERF_PARAMS.keys()])
            if "gradParams" in hints:
                grad_params = GradParams(hints["gradParams"]["norm"],
                                         hints["gradParams"]["var"])
            else:
                grad_params = GradParams(0.0, 1.0)
            goodput_fn = GoodputFunction(perf_params, grad_params,
                                         hints["initBatchSize"])
            speedup_fn = SpeedupFunction(
                goodput_fn, hints.get("maxBatchSize"),
                hints.get("localBszBounds"),
                hints.get("gradientAccumulation", False))
        else:
            speedup_fn = lambda n, r: r  # noqa: E731
        creation_ts = dateutil.parser.isoparse(
            job["metadata"]["creationTimestamp"])
        return JobInfo(resources, speedup_fn, creation_ts, min_replicas,
                       max_replicas, preemptible)

    async def _find_jobs_and_allocations(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        job_infos = {}
        allocations = {}
        for job in job_list["items"]:
            if job.get("status", {}).get("phase") \
                    not in ["Pending", "Running", "Starting", "Stopping"]:
                continue
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            if "allocation" in job.get("status", {}):
                allocations[namespace, name] = \
                    list(job["status"]["allocation"])
            job_info = self._get_job_info(job)
            job_infos[(namespace, name)] = job_info
        return job_infos, allocations

    def _allocate(self, jobs, nodes, prev_allocations, node_template):
        for job_key in list(jobs):
            job_resources = jobs[job_key].resources
            for node in nodes.values():
                if all(val <= node.resources.get(key, 0)
                       for key, val in job_resources.items()):
                    # Found a node which can fit a replica of this job.
                    break
            else:
                # No node can fit a replica of this job.
                # TODO: propagate this to the controller so the job is Failed.
                LOG.warning("Job %s cannot be scheduled!", job_key)
                jobs.pop(job_key)
        allocations = {}
        if not jobs:
            # There are no jobs, let the expander shrink the cluster.
            self._cluster_expander.fit([])
        elif jobs and nodes:
            allocations, desired_nodes = self._policy.optimize(
                jobs, nodes, prev_allocations, node_template)
            if desired_nodes < len(nodes):
                active_nodes = list(set.union(*map(set,
                                                   allocations.values())))
            else:
                active_nodes = list(nodes)
            while len(active_nodes) < desired_nodes:
                active_nodes.append(f"~{desired_nodes - len(active_nodes)}")
            self._cluster_expander.fit(active_nodes)
            LOG.info("Active nodes: %s", active_nodes)
        elif jobs:
            # Expand the job ASG from zero nodes. The assumption is that
            # AdaptDL itself is running on a different ASG.
            self._cluster_expander.fit(['~1'])
        return allocations
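# Hedged wiring sketch (not from the module): start the allocator under an
# asyncio event loop. _RecordingExpander is the illustrative stub defined
# earlier; a real deployment would pass the actual cluster expander. This
# assumes kubernetes here is the async client used by the awaited calls above
# and that the scheduler runs in-cluster, hence load_incluster_config().
async def _main():
    kubernetes.config.load_incluster_config()
    allocator = AdaptDLAllocator(_RecordingExpander())
    await allocator.run()

# asyncio.run(_main())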