def test_optimize_local_bounds(perf_params, grad_params):
    """Check ``optimize`` when only ``atomic_bsz_range`` is supplied.

    The same assertions are applied to a single-node layout (all replicas
    on one node) and a multi-node layout (one replica per node).
    """
    bsz_range = (64, 256)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)

    # Scalar arguments: one node, one replica.
    goodput, bsz, steps = goodput_fn.optimize(1, 1,
                                              atomic_bsz_range=bsz_range)
    assert bsz == 128, "expected bsz = 128, got {}".format(bsz)
    assert isinstance(goodput, float)

    replicas = np.asarray(range(1, 100))
    # First the single-node layout, then the multi-node layout.
    for nodes in (np.ones_like(replicas), replicas):
        goodput, bsz, steps = goodput_fn.optimize(
            nodes, replicas, atomic_bsz_range=bsz_range)
        smallest_bsz = np.ceil(128 / replicas).astype(int)
        assert np.all(bsz >= smallest_bsz)
        assert np.all(np.logical_or(bsz >= 64, goodput == 0.0))
        assert np.all(bsz <= 256)
        assert np.all(bsz * replicas <= 100 * 128)
        assert bsz[0] == 128
        # Accumulation was not requested, so no accumulation steps.
        assert np.all(steps == 0)
def test_evaluate(perf_params, grad_params):
    """Check invariants of goodput, throughput and efficiency on a grid.

    The goodput function is evaluated on the Cartesian product of a few
    candidate values for each argument (after filtering), and the results
    are checked for sign, range, monotonicity and diminishing returns.
    """
    init_batch_size = 16
    goodput_fn = GoodputFunction(perf_params, grad_params, init_batch_size)

    # Candidate values for each goodput function argument.
    candidates = (
        np.array([1, 2, 3, 4]),         # num_nodes
        np.array([1, 2, 4, 8]),         # num_replicas
        np.array([8, 12, 16, 20, 24]),  # atomic_bsz
        np.array([0, 1, 2, 3, 4]),      # accum_steps
    )
    # Cartesian product, transposed into one array per argument.
    num_nodes, num_replicas, atomic_bsz, accum_steps = (
        np.array(column) for column in zip(*itertools.product(*candidates)))

    # Only keep valid arguments.
    # NOTE(review): this filter multiplies by accum_steps while the batch
    # size below uses (accum_steps + 1) -- confirm which one is intended.
    keep = np.logical_and(
        num_nodes <= num_replicas,
        init_batch_size <= num_replicas * atomic_bsz * accum_steps)
    num_nodes, num_replicas, atomic_bsz, accum_steps = (
        column[keep]
        for column in (num_nodes, num_replicas, atomic_bsz, accum_steps))

    # Evaluate goodput and its two factors.
    batch_size = num_replicas * atomic_bsz * (accum_steps + 1)
    goodput = goodput_fn(num_nodes, num_replicas, atomic_bsz, accum_steps)
    throughput = goodput_fn.throughput(num_nodes, num_replicas,
                                       atomic_bsz, accum_steps)
    efficiency = goodput_fn.efficiency(batch_size)

    # Basic invariants: non-negative throughput, efficiency within [0, 1],
    # and goodput equal to their product.
    assert np.all(throughput >= 0)
    assert np.all(efficiency >= 0)
    assert np.all(efficiency <= 1)
    assert np.allclose(goodput, throughput * efficiency)

    # Increasing batch size should decrease efficiency.
    by_batch_size = np.argsort(batch_size)
    assert np.all(np.diff(efficiency[by_batch_size]) <= 0)

    # All else equal, increasing atomic_bsz should increase throughput.
    for indices in groupby_indices(num_nodes, num_replicas, accum_steps):
        order = np.argsort(atomic_bsz[indices])
        tput_steps = np.diff(throughput[indices][order])
        assert np.all(tput_steps >= 0)
        # Increasing throughput should experience diminishing returns.
        if len(indices) > 1:
            bsz_steps = np.diff(atomic_bsz[indices][order])
            # Cross-multiplied comparison of consecutive slopes.
            slope_change = (bsz_steps[:-1] * tput_steps[1:]
                            - bsz_steps[1:] * tput_steps[:-1])
            assert np.all(slope_change <= 0)

    # All else equal, scalability is sublinear with respect to num_replicas.
    scalability = throughput / num_replicas
    for indices in groupby_indices(num_nodes, atomic_bsz, accum_steps):
        order = np.argsort(num_replicas[indices])
        assert np.all(np.diff(scalability[indices][order]) <= 0)
def test_allocate_job():
    """Check ``PolluxPolicy.allocate_job`` placements for four jobs.

    Each job is allocated independently against the same three nodes and
    the returned list of node keys is compared with the expected one.
    """
    nodes = {
        "0": NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        "1": NodeInfo({"gpu": 2, "cpu": 2000, "pods": 32},
                      preemptible=False),
        "2": NodeInfo({"gpu": 2, "cpu": 3000, "pods": 32}, preemptible=True),
    }
    goodput_fn = GoodputFunction(
        PerfParams(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317, 1.14),
        GradParams(sqr=0.00136, var=0.000502),
        128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()

    def make_job(cpu, delay_minutes, min_replicas, max_replicas):
        # One GPU and one pod per replica; only the CPU request varies.
        return JobInfo({"gpu": 1, "cpu": cpu, "pods": 1}, speedup_fn,
                       now + timedelta(minutes=delay_minutes),
                       min_replicas, max_replicas=max_replicas)

    # (job, expected node keys returned by allocate_job)
    cases = [
        (make_job(500, 0, 0, 1), ["0"]),
        (make_job(1000, 1, 0, 1), ["1"]),
        (make_job(1000, 1, 2, 2), ["1", "1"]),
        (make_job(2000, 1, 2, 2), []),
    ]

    policy = PolluxPolicy()
    for job, expected in cases:
        assert policy.allocate_job(job, nodes) == expected
def test_unusable_node():
    """Check ``optimize`` when one node cannot host any of the jobs.

    Node 0 offers 500 cpu while every job requests 1000, so only the other
    two nodes can host a replica.
    """
    job_request = {"gpu": 1, "cpu": 1000, "pods": 1}
    large_node = {"gpu": 1, "cpu": 8000, "pods": 32}

    nodes = {
        0: NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        1: NodeInfo(dict(large_node), preemptible=False),
        2: NodeInfo(dict(large_node), preemptible=False),
    }
    template = NodeInfo(dict(large_node), preemptible=True)

    goodput_fn = GoodputFunction(
        PerfParams(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317, 1.14),
        GradParams(sqr=0.00136, var=0.000502),
        128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))

    now = datetime.now()
    min_replicas = 0
    # Three identical jobs created one minute apart.
    jobs = {
        key: JobInfo(dict(job_request), speedup_fn,
                     now + timedelta(minutes=key), min_replicas,
                     max_replicas=1)
        for key in range(3)
    }

    policy = PolluxPolicy()
    allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)

    # Check that more nodes are asked for.
    assert desired_nodes > 3
    replica_counts = [len(alloc) for alloc in allocations.values()]
    # Check no job was allocated more than 1 replica.
    assert max(replica_counts) == 1
    # Check two jobs were allocated.
    assert sum(replica_counts) == 2
def test_optimize_accumulation(perf_params, grad_params):
    """Check ``optimize`` bounds when gradient accumulation is enabled.

    The bound checks shared by both layouts live in a local helper; each
    layout then adds its own final assertion.
    """
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    options = dict(max_batch_size=1280, atomic_bsz_range=(64, 256),
                   accumulation=True)

    # Scalar arguments: one node, one replica.
    goodput, bsz, steps = goodput_fn.optimize(1, 1, **options)
    assert isinstance(goodput, float)

    replicas = np.asarray(range(1, 20))

    def check_bounds(goodput, bsz, steps):
        # Entries with zero goodput are exempt from the lower bounds and
        # from the maximum batch size check.
        zero_goodput = goodput == 0.0
        smallest_bsz = np.ceil(128 / replicas).astype(int)
        assert np.all(np.logical_or(bsz >= smallest_bsz, zero_goodput))
        assert np.all(np.logical_or(bsz >= 64, zero_goodput))
        assert np.all(bsz <= 256)
        within_max = (bsz * replicas * (steps + 1)
                      < 1280 + replicas * (steps + 1))
        assert np.all(np.logical_or(within_max, zero_goodput))
        assert np.all(steps <= 15)
        assert np.all(steps >= 0)

    # single-node
    goodput, bsz, steps = goodput_fn.optimize(np.ones_like(replicas),
                                              replicas, **options)
    check_bounds(goodput, bsz, steps)
    unchanged_or_accumulating = np.logical_or(bsz == 128, steps > 0)
    assert np.all(np.logical_or(replicas > 1, unchanged_or_accumulating))

    # multi-node
    goodput, bsz, steps = goodput_fn.optimize(replicas, replicas, **options)
    check_bounds(goodput, bsz, steps)
    assert np.all(np.logical_or(np.multiply(steps, bsz) >= 256, steps == 0))
def test_optimize_no_bounds(perf_params, grad_params):
    """Check ``optimize`` when no batch size bounds are supplied.

    The same assertions are applied to a single-node layout and to a
    multi-node layout with one replica per node.
    """
    fun = GoodputFunction(perf_params, grad_params, 128)

    # Scalar arguments: one node, three replicas.
    goodput, bsz, steps = fun.optimize(1, 3)
    assert bsz == 128 // 3 + 1, "expected bsz = 43, got {}".format(bsz)
    assert isinstance(goodput, float)

    replicas = np.asarray([1, 2, 3, 4, 5])
    expected_bsz = np.ceil(128 / replicas).astype(int)
    # First the single-node layout, then the multi-node layout.
    for nodes in (np.ones_like(replicas), replicas):
        goodput, bsz, steps = fun.optimize(nodes, replicas)
        assert bsz.shape == (5, )
        assert np.all(bsz == expected_bsz)
        assert goodput.shape == (5, )
        assert bsz[0] == 128
        assert np.all(steps == 0)
def test_optimize_max_bounds(perf_params, grad_params):
    """Check ``optimize`` when only ``max_batch_size`` is supplied.

    The same assertions are applied to a single-node layout and to a
    multi-node layout with one replica per node.
    """
    limit = 1280
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)

    # Scalar arguments: one node, one replica.
    goodput, bsz, steps = goodput_fn.optimize(1, 1, max_batch_size=limit)
    assert bsz == 128, "expected bsz = 128, got {}".format(bsz)
    assert isinstance(goodput, float)

    replicas = np.asarray(range(1, 100))
    # First the single-node layout, then the multi-node layout.
    for nodes in (np.ones_like(replicas), replicas):
        goodput, bsz, steps = goodput_fn.optimize(nodes, replicas,
                                                  max_batch_size=limit)
        assert np.all(bsz >= np.ceil(128 / replicas).astype(int))
        # The total may exceed the limit by less than one sample per replica.
        assert np.all(bsz * replicas <= limit + replicas)
        assert bsz[0] == 128
        assert np.all(steps == 0)
def test_one_replica_accumulation(perf_params, grad_params):
    """Check accumulation with one replica over a range of batch size limits.

    For every candidate ``max_batch_size`` the optimizer is run with that
    limit and the result is checked against the same limit. The limit must
    be the loop variable: passing a fixed value would make the optimizer
    ignore the bound that the assertion below verifies.
    """
    fun = GoodputFunction(perf_params, grad_params, 128)
    replicas = np.asarray([1])
    max_batch_sizes = np.asarray(range(128, 128 * 20, 128))
    # single-node
    for max_batch_size in max_batch_sizes:
        goodput, bsz, steps = fun.optimize(np.ones_like(replicas), replicas,
                                           max_batch_size=max_batch_size,
                                           atomic_bsz_range=(64, 256),
                                           accumulation=True)
        # Entries with zero goodput are exempt from the bound checks.
        zero_goodput = goodput == 0.0
        assert np.all(np.logical_or(bsz >= 64, zero_goodput))
        assert np.all(bsz <= 256)
        assert np.all(
            np.logical_or(bsz * (steps + 1) <= max_batch_size, zero_goodput))
        assert np.all(
            np.logical_or(bsz >= np.ceil(128 / replicas).astype(int),
                          zero_goodput))
        # A total batch size equal to the initial 128 needs no accumulation.
        assert np.all(np.logical_or(bsz * (steps + 1) != 128, steps == 0))
def test_optimize(num_nodes, total_devices=16):
    """Run ``PolluxPolicy.optimize`` three times and check placement limits.

    ``total_devices`` GPUs are split evenly across ``num_nodes`` nodes and
    16 jobs are submitted. After each call, no job may exceed its
    ``max_replicas`` and no node may host more replicas than it has GPUs
    or pods.

    Args:
        num_nodes: Number of nodes; must divide ``total_devices``.
        total_devices: Total number of GPUs across all nodes.
    """
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print("{}x{} nodes:".format(num_nodes, num_devices))
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    jobs = {}
    # Add a few jobs.
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    for _ in range(16):
        # Must be a plain datetime: a trailing comma here would silently
        # turn the timestamp into a 1-tuple.
        creation_timestamp = now + timedelta(minutes=len(jobs))
        max_replicas = 8
        min_replicas = 0
        key = len(jobs)
        jobs[key] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                            min_replicas, max_replicas)
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {i: NodeInfo(node_resources, preemptible=False)
             for i in range(num_nodes)}
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    policy = PolluxPolicy()
    prev_allocs = {}
    for i in range(3):
        start = time.time()
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print("optimize {}x ({}s sec):".format(i + 1, duration))
        # Count how many replicas were placed on each node.
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
def job_info(self) -> JobInfo:
    """Build the ``JobInfo`` for this job from its fetched metrics.

    When metrics are available the speedup function is derived from them,
    substituting ``GradParams(0.0, 1.0)`` if no gradient parameters were
    reported. Without metrics, a fallback that returns its second argument
    unchanged is used.
    """
    metrics = self._fetch_metrics()
    if metrics is None:
        speedup_fn = lambda n, r: r  # noqa: E731
    else:
        grad_params = metrics.grad_params
        if grad_params is None:
            grad_params = GradParams(0.0, 1.0)
        goodput_fn = GoodputFunction(metrics.perf_params, grad_params,
                                     metrics.init_batch_size)
        speedup_fn = SpeedupFunction(goodput_fn,
                                     metrics.max_batch_size,
                                     metrics.local_bsz_bounds,
                                     metrics.gradient_accumulation)
    return JobInfo(config.job_resources(),
                   speedup_fn,
                   self.creation_timestamp,
                   config._JOB_MIN_REPLICAS,
                   config._JOB_MAX_REPLICAS)
def _get_job_info(self, job):
    """Translate a job object into a ``JobInfo``.

    The pod spec inside ``job`` is replaced in place by the result of
    ``set_default_resources``. Replica limits come from the job spec and
    from the training hints under ``job["status"]["train"]``. A
    hint-derived speedup function is used only for preemptible jobs that
    report both ``perfParams`` and ``initBatchSize``; otherwise the
    fallback returns its second argument unchanged.
    """
    job_spec = job["spec"]
    pod_spec = set_default_resources(job_spec["template"]["spec"])
    job_spec["template"]["spec"] = pod_spec
    resources = get_pod_requests(pod_spec)

    hints = job.get("status", {}).get("train", {})
    max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
    replica_cap = job_spec.get("maxReplicas")
    if replica_cap:
        max_replicas = min(max_replicas, replica_cap)
    min_replicas = job_spec.get("minReplicas", 0)
    # max_replicas should be greater or equal to min_replicas
    max_replicas = max(max_replicas, min_replicas)
    preemptible = job_spec.get("preemptible", True)

    if {"perfParams", "initBatchSize"} <= hints.keys() and preemptible:
        init_batch_size = hints["initBatchSize"]
        max_batch_size = hints.get("maxBatchSize") or init_batch_size
        local_bsz_bounds = hints.get("localBszBounds")
        if local_bsz_bounds:
            min_local_bsz = local_bsz_bounds[0] or 1
            # Make sure max_batch_size / replicas >= min_local_bsz
            if max_batch_size < min_local_bsz * max_replicas:
                max_replicas = int(max_batch_size / min_local_bsz)
        perf_hints = hints["perfParams"]
        perf_params = PerfParams(
            *[perf_hints[name] for name in PERF_PARAMS.keys()])
        if "gradParams" not in hints:
            grad_params = GradParams(0.0, 1.0)
        else:
            grad_hints = hints["gradParams"]
            grad_params = GradParams(grad_hints["norm"], grad_hints["var"])
        goodput_fn = GoodputFunction(perf_params, grad_params,
                                     init_batch_size, self._metrics_options)
        # The raw maxBatchSize hint (possibly None) is passed here, not the
        # max_batch_size fallback computed above.
        speedup_fn = SpeedupFunction(
            goodput_fn,
            hints.get("maxBatchSize"),
            local_bsz_bounds,
            hints.get("gradientAccumulation", False))
    else:
        speedup_fn = lambda n, r: r  # noqa: E731

    creation_ts = dateutil.parser.isoparse(
        job["metadata"]["creationTimestamp"])
    return JobInfo(resources, speedup_fn, creation_ts,
                   min_replicas, max_replicas, preemptible)
def get_goodput_fn():
    """Return a ``GoodputFunction`` built from the current metrics state.

    Returns ``None`` when either the gradient parameters or the
    performance parameters of the state are missing.
    """
    state = _metrics_state()
    if state.grad_params is not None and state.perf_params is not None:
        return GoodputFunction(state.perf_params, state.grad_params,
                               state.init_batch_size)
    return None
def test_optimize(num_nodes, total_devices=16):
    """Check ``optimize`` over three cycles with non-preemptible jobs.

    Ten jobs are randomly split into a preemptible half and a
    non-preemptible half. In each cycle the jobs are rebuilt, the policy
    is run, and the placements are checked against replica and node
    limits. A non-preemptible job that had a previous allocation must keep
    it. One randomly chosen job is deleted after every cycle.

    Args:
        num_nodes: Number of nodes; must divide ``total_devices``.
        total_devices: Total number of GPUs across all nodes.
    """
    n_jobs = 10
    job_keys = list(range(n_jobs))
    random.shuffle(job_keys)
    preemptible_idxs = job_keys[:len(job_keys) // 2]
    non_preemptible_idxs = job_keys[len(job_keys) // 2:]

    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print(f"{num_nodes}x{num_devices} nodes:")
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    policy = PolluxPolicy()
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    # Empty allocations
    prev_allocs = {i: [] for i in job_keys}
    for cycle in range(3):
        # Start allocation cycle
        jobs = {}
        for i in preemptible_idxs:
            # Must be a plain datetime: a trailing comma here would
            # silently turn the timestamp into a 1-tuple.
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                              min_replicas=0, max_replicas=8)
        for i in non_preemptible_idxs:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                              min_replicas=2, max_replicas=4,
                              preemptible=False)
        start = time.time()
        assert len(jobs) > 0
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print(f"optimize {cycle + 1}x ({duration}s sec)")
        # Count how many replicas were placed on each node.
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            if placement:
                assert len(placement) >= jobs[job_key].min_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
        # Check if we are maintaining allocations for non-preemptible jobs
        for i in non_preemptible_idxs:
            if (i in allocations) and prev_allocs[i]:
                assert allocations[i] == prev_allocs[i]
        prev_allocs = copy.deepcopy(allocations)
        # Remove one random job. The keys are copied into a list because
        # the random module's selection helpers need a sequence and a dict
        # view is not one.
        remove = random.choice(list(allocations))
        if remove in non_preemptible_idxs:
            non_preemptible_idxs.remove(remove)
            print(f"Deleting non-preemptible job {remove}")
        else:
            preemptible_idxs.remove(remove)
            print(f"Deleting preemptible job {remove}")
        prev_allocs.pop(remove)