Example #1
def test_allocate_job():
    nodes = {
        "0": NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        "1": NodeInfo({"gpu": 2, "cpu": 2000, "pods": 32}, preemptible=False),
        "2": NodeInfo({"gpu": 2, "cpu": 3000, "pods": 32}, preemptible=True),
    }
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    job_1 = JobInfo({"gpu": 1, "cpu": 500, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=0), min_replicas, max_replicas=1)
    job_2 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), min_replicas, max_replicas=1)
    job_3 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    job_4 = JobInfo({"gpu": 1, "cpu": 2000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    policy = PolluxPolicy()

    assert policy.allocate_job(job_1, nodes) == ["0"]
    assert policy.allocate_job(job_2, nodes) == ["1"]
    assert policy.allocate_job(job_3, nodes) == ["1", "1"]
    assert policy.allocate_job(job_4, nodes) == []
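PolluxPolicy does not strictly require the fitted SpeedupFunction used above; judging from the fallback in Examples #8 and #10 (speedup_fn = lambda n, r: r), it only needs a callable that maps a node count and a replica count to a relative speedup. A minimal hand-written sketch under that assumption; the 20% cross-node penalty is an arbitrary illustrative value, not something taken from AdaptDL:

def toy_speedup_fn(num_nodes, num_replicas):
    # Linear scaling in the number of replicas, reduced by 20% whenever the
    # replicas span more than one node (illustrative values only).
    return num_replicas * (1.0 - 0.2 * (num_nodes > 1))

# It could be passed to JobInfo in place of a fitted SpeedupFunction, e.g.:
# JobInfo({"gpu": 1, "cpu": 500, "pods": 1}, toy_speedup_fn,
#         datetime.now(), min_replicas=0, max_replicas=4)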
Example #2
def test_unusable_node():
    # Test where one of the nodes can't be used due to one resource type.
    nodes = {
        0: NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        1: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
        2: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
    }
    template = NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=True)
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    jobs = {
        0: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=0), min_replicas, max_replicas=1),
        1: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=1), min_replicas, max_replicas=1),
        2: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=2), min_replicas, max_replicas=1),
    }
    policy = PolluxPolicy()
    allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)
    # Check that more nodes are asked for.
    assert desired_nodes > 3
    # Check no job was allocated more than 1 replica.
    assert max(len(alloc) for alloc in allocations.values()) == 1
    # Check two jobs were allocated.
    assert sum(len(alloc) for alloc in allocations.values()) == 2
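Reading the asserts above, optimize() returns a dict mapping each job key to the list of node keys its replicas should be placed on (one entry per replica; an empty list means no allocation), plus the total number of nodes the policy would like the cluster to have. A short sketch of inspecting that output, reusing the objects built in the test; the printed placements are illustrative, not guaranteed:

allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)
for job_key, placement in allocations.items():
    print(f"job {job_key}: {len(placement)} replica(s) on nodes {placement}")
print(f"the policy would like {desired_nodes} nodes in total")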
Example #3
 def __init__(self, expander):
     self._core_api = kubernetes.client.CoreV1Api()
     self._objs_api = kubernetes.client.CustomObjectsApi()
     self._custom_resource = ("adaptdl.petuum.com", "v1", "", "adaptdljobs")
     self._cluster_expander = expander
     self._policy = PolluxPolicy()
     # lock for the two coroutines in run()
     self._lock = asyncio.Lock()
Example #4
 def __init__(self, nodes: List = None):
     nodes = nodes if nodes is not None else config.nodes()
     self._node_infos = {
         node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                              preemptible=False)
         for node in nodes
     }
     self._default_node = cycle(list(self._node_infos))
     # Add a node template.
     self._node_template = NodeInfo(list(
         self._node_infos.values())[0].resources,
                                    preemptible=False)
     self._policy = PolluxPolicy()
Example #5
def test_optimize(num_nodes, total_devices=16):
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print("{}x{} nodes:".format(num_nodes, num_devices))
    # Make up a realistic speedup function.
    params = Params(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317, 1.14)
    grad_params = {"norm": 0.00136, "var": 0.000502}
    speedup_fn = SpeedupFunction(params,
                                 grad_params,
                                 init_batch_size=128,
                                 max_batch_size=1280,
                                 local_bsz_bounds=(64, 256),
                                 elastic_bsz=True)
    now = datetime.now()
    jobs = {}
    # Add a few jobs.
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    for i in range(16):
        creation_timestamp = now + timedelta(minutes=len(jobs))
        max_replicas = 8
        key = len(jobs)
        jobs[key] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                            max_replicas)
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    policy = PolluxPolicy()
    prev_allocs = {}
    for i in range(3):
        start = time.time()
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print("optimize {}x ({}s sec):".format(i + 1, duration))
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
Example #6
class AdaptDLAllocator:
    def __init__(self, nodes: List = None):
        nodes = nodes if nodes is not None else config.nodes()
        self._node_infos = {
            node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                 preemptible=False)
            for node in nodes
        }
        self._default_node = cycle(list(self._node_infos))
        # Add a node template.
        self._node_template = NodeInfo(list(
            self._node_infos.values())[0].resources,
                                       preemptible=False)
        self._policy = PolluxPolicy()

    def default_allocation(self, num_devices=1) -> List[str]:
        """ Cycle through nodes for default trial allocation."""
        return [f"{next(self._default_node)}"] * num_devices

    def allocate(self,
                 jobs: List[AdaptDLJobMixin],
                 nodes: List = None) -> (Dict, int):
        """ Use Pollux to distribute available resources between jobs."""
        if nodes is not None:
            node_infos = {
                node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                     preemptible=False)
                for node in nodes
            }
        else:
            node_infos = self._node_infos

        assert len(jobs) > 0
        # gather JobInfos
        job_infos = {job.job_id: job.job_info for job in jobs}
        # gather previous allocations
        prev_allocs = {job.job_id: job.allocation for job in jobs}

        allocations, desired_nodes = \
            self._policy.optimize(job_infos,
                                  node_infos,
                                  prev_allocs,
                                  self._node_template)
        # Fill empty allocations for jobs which didn't get any
        for job_id in job_infos:
            allocations[job_id] = allocations.get(job_id, [])

        assert not all(v == [] for v in allocations.values())
        return allocations, desired_nodes
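allocate() only reads job_id, job_info, and allocation from each job object, so it can be exercised with simple stand-ins. A minimal sketch under that assumption; the SimpleNamespace stand-in and the node dict are hypothetical, and the JobInfo signature follows Examples #1 and #9:

from datetime import datetime
from types import SimpleNamespace

# Hypothetical stand-in for an AdaptDLJobMixin: only the three attributes
# that allocate() actually reads are provided.
fake_job = SimpleNamespace(
    job_id="job-0",
    job_info=JobInfo({"nvidia.com/gpu": 1, "pods": 1}, lambda n, r: r,
                     datetime.now(), min_replicas=0, max_replicas=2),
    allocation=[])

allocator = AdaptDLAllocator(nodes=[
    {"NodeManagerAddress": "10.0.0.1",
     "Resources": {"nvidia.com/gpu": 2, "pods": 32}},
])
allocations, desired_nodes = allocator.allocate([fake_job])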
Example #7
 def __init__(self, expander):
     self._core_api = kubernetes.client.CoreV1Api()
     self._objs_api = kubernetes.client.CustomObjectsApi()
     self._cluster_expander = expander
     self._policy = PolluxPolicy()
Example #8
class AdaptDLAllocator(object):
    def __init__(self, expander):
        self._core_api = kubernetes.client.CoreV1Api()
        self._objs_api = kubernetes.client.CustomObjectsApi()
        self._cluster_expander = expander
        self._policy = PolluxPolicy()

    async def run(self):
        while True:
            LOG.info("Running allocator loop")
            nodes, node_template = await self._find_nodes()
            LOG.info("Node resources: %s",
                     {k: v.resources
                      for k, v in nodes.items()})
            jobs = await self._find_jobs()
            LOG.info("Job resources: %s",
                     {k: v.resources
                      for k, v in jobs.items()})
            prev_allocations = await self._get_allocations()
            start = time.time()
            allocations = self._allocate(jobs, nodes, prev_allocations,
                                         node_template)
            duration = time.time() - start
            LOG.info("Allocations (in %.3f sec): %s", duration, allocations)
            await self._update_allocations(allocations)
            LOG.info("Sleep for 60 seconds")
            await asyncio.sleep(60)

    async def _get_allocations(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        ret = {}
        for job in job_list["items"]:
            if "allocation" in job.get("status", {}):
                namespace = job["metadata"]["namespace"]
                name = job["metadata"]["name"]
                ret[namespace, name] = list(job["status"]["allocation"])
        return ret

    async def _update_allocations(self, allocations):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        for job in job_list["items"]:
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_allocation = job.get("status", {}).get("allocation", [])
            new_allocation = list(allocations.get((namespace, name), []))
            if list(job_allocation) != new_allocation:
                patch = {"status": {"allocation": new_allocation}}
                LOG.info("Patch AdaptDLJob %s/%s: %s", namespace, name, patch)
                await patch_job_status(self._objs_api, namespace, name, patch)

    async def _find_nodes(self):
        node_infos = {}
        node_list = await self._core_api.list_node()
        # Find all non-AdaptDL pods which are taking up resources and subtract
        # those resources from the available pool; apparently there is no more
        # efficient way to get the currently available resources in k8s. We
        # also check whether the pod limit on each node has been reached: the
        # "pods" resource denotes (allocatable pods - non-terminated pods).
        pod_list = await self._core_api.list_pod_for_all_namespaces(
            label_selector="!adaptdl/job")
        for node in node_list.items:
            if allowed_taints(node.spec.taints):
                resources = get_node_unrequested(node, pod_list.items)
                if not resources.get("pods"):
                    LOG.warning(f"node {node.metadata.name} "
                                "has no free pods available.")
                node_infos[node.metadata.name] = NodeInfo(resources, False)
        # For cluster autoscaling: to determine if additional nodes would be
        # helpful, add a few "virtual" nodes which only become available in
        # "eta" seconds. Currently, we only consider as many virtual nodes as
        # there are real nodes. We infer each resource to be the maximum amount
        # observed in any real node.
        max_resources = {}
        for node_name in node_infos:
            for key, val in node_infos[node_name].resources.items():
                if key not in max_resources or val > max_resources[key]:
                    max_resources[key] = val
        node_template = NodeInfo(max_resources, True)
        return node_infos, node_template

    async def _find_jobs(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        job_infos = {}
        for job in job_list["items"]:
            if job.get("status", {}).get("phase") \
                   not in ["Pending", "Running", "Starting", "Stopping"]:
                continue
            job["spec"]["template"]["spec"] = \
                set_default_resources(job["spec"]["template"]["spec"])
            resources = get_pod_requests(job["spec"]["template"]["spec"])
            hints = job.get("status", {}).get("train", {})
            max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
            if job["spec"].get("maxReplicas"):
                max_replicas = min(max_replicas, job["spec"]["maxReplicas"])
            if hints:
                max_batch_size = hints.get("maxBatchSize") or \
                                                 hints.get("initBatchSize")
                if hints.get("localBszBounds"):
                    min_local_bsz = hints["localBszBounds"][0] or 1
                    # Make sure max_batch_size / replicas >= min_local_bsz
                    max_replicas = min(max_replicas,
                                       int(max_batch_size / min_local_bsz))
                # derive if job is elastic
                elastic = hints.get("maxBatchSize", 0) > hints["initBatchSize"]
                speedup_fn = SpeedupFunction(
                    [hints["perfParams"][k] for k in PERF_PARAMS.keys()]
                    if hints.get("perfParams") else None,
                    hints.get("gradParams", {
                        "var": None,
                        "norm": None
                    }),
                    hints.get("initBatchSize"),
                    hints.get("maxBatchSize"),
                    hints.get("localBszBounds", [None, None]),
                    elastic_bsz=elastic)
            else:
                speedup_fn = lambda n, r: r  # noqa: E731
            creation_ts = dateutil.parser.isoparse(
                job["metadata"]["creationTimestamp"])
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_infos[(namespace, name)] = JobInfo(resources, speedup_fn,
                                                   creation_ts, max_replicas)
        return job_infos

    def _allocate(self, jobs, nodes, prev_allocations, node_template):
        for job_key in list(jobs):
            job_resources = jobs[job_key].resources
            for node in nodes.values():
                if all(val <= node.resources.get(key, 0)
                       for key, val in job_resources.items()):
                    # Found a node which can fit a replica of this job.
                    break
            else:
                # No node can fit a replica of this job.
                # TODO: propagate this to the controller so the job is Failed.
                LOG.warning("Job %s cannot be scheduled!", job_key)
                jobs.pop(job_key)
        allocations = {}
        if not jobs:
            # There are no jobs, let the expander shrink the cluster.
            self._cluster_expander.fit([])
        elif jobs and nodes:
            allocations, desired_nodes = self._policy.optimize(
                jobs, nodes, prev_allocations, node_template)
            if desired_nodes < len(nodes):
                active_nodes = list(set.union(*map(set, allocations.values())))
            else:
                active_nodes = list(nodes)
                while len(active_nodes) < desired_nodes:
                    active_nodes.append(f"~{desired_nodes-len(active_nodes)}")
            self._cluster_expander.fit(active_nodes)
            LOG.info("Active nodes: %s", active_nodes)
        elif jobs:
            # Expand job ASG from zero nodes.
            # Assumption is AdaptDL is running on a different ASG
            self._cluster_expander.fit(['~1'])
        return allocations
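The only call _allocate() makes on the injected expander is fit(nodes), where nodes lists the node names the cluster should keep; entries such as "~1" stand for additional nodes the allocator would like added. A minimal sketch of a recording expander under that assumption (the class name is hypothetical):

class RecordingExpander:
    # Hypothetical stand-in for the cluster expander: it only records the
    # node list the allocator asked for instead of resizing anything.
    def __init__(self):
        self.requested = []

    def fit(self, nodes):
        # `nodes` holds real node names to keep plus "~N" placeholders for
        # extra nodes the policy would like to obtain.
        self.requested = list(nodes)

allocator = AdaptDLAllocator(RecordingExpander())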
Example #9
def test_optimize(num_nodes, total_devices=16):
    # Globals
    N_JOBS = 10
    JOBS = list(range(N_JOBS))
    random.shuffle(JOBS)

    PREEMPTIBLE_IDXS = JOBS[:len(JOBS) // 2]
    NON_PREEMPTIBLE_IDXS = JOBS[len(JOBS) // 2:]

    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print(f"{num_nodes}x{num_devices} nodes:")
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317,
                             1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn,
                                 max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    # Add a node template.
    policy = PolluxPolicy()
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    node_template = NodeInfo(node_resources, preemptible=True)

    # Empty allocations
    prev_allocs = {i: [] for i in JOBS}
    for cycle in range(3):
        # Start allocation cycle
        jobs = {}
        for i in PREEMPTIBLE_IDXS:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=0,
                              max_replicas=8)
        for i in NON_PREEMPTIBLE_IDXS:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=2,
                              max_replicas=4,
                              preemptible=False)
        start = time.time()
        assert len(jobs) > 0
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print(f"optimize {cycle + 1}x ({duration}s sec)")
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            if placement:
                assert len(placement) >= jobs[job_key].min_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]

        # Check if we are maintaining allocations for non-preemptible jobs
        for i in NON_PREEMPTIBLE_IDXS:
            if (i in allocations) and prev_allocs[i]:
                assert allocations[i] == prev_allocs[i]

        prev_allocs = copy.deepcopy(allocations)
        # Remove one random job
        remove = random.choice(list(allocations))
        if remove in NON_PREEMPTIBLE_IDXS:
            NON_PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting non-preemptible job {remove}")
        else:
            PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting preemptible job {remove}")
        prev_allocs.pop(remove)
Example #10
class AdaptDLAllocator(object):
    def __init__(self, expander):
        self._core_api = kubernetes.client.CoreV1Api()
        self._objs_api = kubernetes.client.CustomObjectsApi()
        self._custom_resource = ("adaptdl.petuum.com", "v1", "", "adaptdljobs")
        self._cluster_expander = expander
        self._policy = PolluxPolicy()
        # lock for the two coroutines in run()
        self._lock = asyncio.Lock()

    async def run(self):
        # Two responsibilities: (1) watch for newly added jobs and allocate
        # them immediately if possible; (2) periodically re-optimize the
        # allocations of all existing jobs.
        await asyncio.gather(self._allocate_one_loop(),
                             self._optimize_all_loop())

    async def _allocate_one_loop(self):
        async with kubernetes.watch.Watch() as watch:
            while True:
                async for event in watch.stream(
                        self._objs_api.list_namespaced_custom_object,
                        *self._custom_resource,
                        timeout_seconds=60):
                    if event["type"] == "ADDED":  # there is a n arriving job
                        async with self._lock:
                            await self._allocate_one(event)

    async def _allocate_one(self, event):
        # re-read the job instead of relying on the copy from the watch event
        job = event["object"]
        namespace = job["metadata"]["namespace"]
        name = job["metadata"]["name"]

        try:
            job = await self._objs_api.get_namespaced_custom_object(
                "adaptdl.petuum.com", "v1", namespace, "adaptdljobs", name)
        except kubernetes.client.rest.ApiException as exc:
            if exc.status == 404:
                return
            raise  # unexpected

        # some other coroutine has handled this
        if job.get("status", {}).get("allocation") is not None or\
                job.get("status", {}).get("group") is not None:
            return

        namespace = job["metadata"]["namespace"]
        name = job["metadata"]["name"]
        # if this is a restarted job, skip it
        LOG.info("detected an added job %s/%s.", namespace, name)
        # parse the job information
        job_info = self._get_job_info(job)

        # find available nodes.
        node_infos, _ = await self._find_nodes()

        # get the node to allocate
        new_allocation = self._policy.allocate_job(job_info, node_infos)
        patch = {"status": {"allocation": new_allocation}}
        LOG.info("Patch AdaptdlJob %s/%s: %s ", namespace, name, patch)
        await patch_job_status(self._objs_api, namespace, name, patch)

    async def _optimize_all_loop(self):
        while True:
            # acquire the lock shared with _allocate_one_loop()
            async with self._lock:
                await self._optimize_all()

            LOG.info("Sleep for 60 seconds")
            await asyncio.sleep(60)

    async def _optimize_all(self):
        LOG.info("Running allocator loop")
        nodes, node_template = await self._find_nodes(
            pod_label_selector="!adaptdl/job")
        LOG.info("Node resources: %s",
                 {k: v.resources
                  for k, v in nodes.items()})
        jobs, prev_allocations = \
            await self._find_jobs_and_allocations()
        LOG.info("Job resources: %s",
                 {k: v.resources
                  for k, v in jobs.items()})
        start = time.time()
        allocations = self._allocate(jobs, nodes, prev_allocations,
                                     node_template)
        duration = time.time() - start
        LOG.info("Allocations (in %.3f sec): %s", duration, allocations)
        await self._update_allocations(allocations)

    async def _update_allocations(self, allocations):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        for job in job_list["items"]:
            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]
            job_allocation = job.get("status", {}).get("allocation", [])
            new_allocation = list(allocations.get((namespace, name), []))
            if list(job_allocation) != new_allocation:
                patch = {"status": {"allocation": new_allocation}}
                LOG.info("Patch AdaptDLJob %s/%s: %s", namespace, name, patch)
                await patch_job_status(self._objs_api, namespace, name, patch)

    async def _find_nodes(self, pod_label_selector=None):
        node_infos = {}
        node_list = await self._core_api.list_node()
        # Find all non-AdaptDL pods which are taking up resources and subtract
        # those resources from the available pool; apparently there is no more
        # efficient way to get the currently available resources in k8s. We
        # also check whether the pod limit on each node has been reached: the
        # "pods" resource denotes (allocatable pods - non-terminated pods).
        pod_list = await self._core_api.list_pod_for_all_namespaces(
            label_selector=pod_label_selector)
        for node in node_list.items:
            if allowed_taints(node.spec.taints):
                resources = get_node_unrequested(node, pod_list.items)
                if not resources.get("pods"):
                    LOG.warning(f"node {node.metadata.name} "
                                "has no free pods available.")
                node_infos[node.metadata.name] = NodeInfo(resources, False)
        # For cluster autoscaling: to determine if additional nodes would be
        # helpful, add a few "virtual" nodes which only become available in
        # "eta" seconds. Currently, we only consider as many virtual nodes as
        # there are real nodes. We infer each resource to be the maximum amount
        # observed in any real node.
        max_resources = {}
        for node_name in node_infos:
            for key, val in node_infos[node_name].resources.items():
                if key not in max_resources or val > max_resources[key]:
                    max_resources[key] = val
        node_template = NodeInfo(max_resources, True)
        return node_infos, node_template

    def _get_job_info(self, job):
        job["spec"]["template"]["spec"] = \
            set_default_resources(job["spec"]["template"]["spec"])
        resources = get_pod_requests(job["spec"]["template"]["spec"])
        hints = job.get("status", {}).get("train", {})
        max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
        if job["spec"].get("maxReplicas"):
            max_replicas = min(max_replicas, job["spec"]["maxReplicas"])
        min_replicas = job["spec"].get("minReplicas", 0)
        # max_replicas should be greater than or equal to min_replicas
        max_replicas = max(max_replicas, min_replicas)
        preemptible = job["spec"].get("preemptible", True)
        if {"perfParams", "initBatchSize"} <= hints.keys() and preemptible:
            max_batch_size = (hints.get("maxBatchSize")
                              or hints["initBatchSize"])
            if hints.get("localBszBounds"):
                min_local_bsz = hints["localBszBounds"][0] or 1
                # Make sure max_batch_size / replicas >= min_local_bsz
                if max_batch_size < min_local_bsz * max_replicas:
                    max_replicas = int(max_batch_size / min_local_bsz)
            perf_params = PerfParams(
                *[hints["perfParams"][k] for k in PERF_PARAMS.keys()])
            if "gradParams" in hints:
                grad_params = GradParams(hints["gradParams"]["norm"],
                                         hints["gradParams"]["var"])
            else:
                grad_params = GradParams(0.0, 1.0)
            goodput_fn = GoodputFunction(perf_params, grad_params,
                                         hints["initBatchSize"])
            speedup_fn = SpeedupFunction(
                goodput_fn, hints.get("maxBatchSize"),
                hints.get("localBszBounds"),
                hints.get("gradientAccumulation", False))
        else:
            speedup_fn = lambda n, r: r  # noqa: E731
        creation_ts = dateutil.parser.isoparse(
            job["metadata"]["creationTimestamp"])
        return JobInfo(resources, speedup_fn, creation_ts, min_replicas,
                       max_replicas, preemptible)

    async def _find_jobs_and_allocations(self):
        job_list = await self._objs_api.list_namespaced_custom_object(
            "adaptdl.petuum.com", "v1", "", "adaptdljobs")
        job_infos = {}
        allocations = {}

        for job in job_list["items"]:
            if job.get("status", {}).get("phase") \
                    not in ["Pending", "Running", "Starting", "Stopping"]:
                continue

            namespace = job["metadata"]["namespace"]
            name = job["metadata"]["name"]

            if "allocation" in job.get("status", {}):
                allocations[namespace, name] = \
                    list(job["status"]["allocation"])

            job_info = self._get_job_info(job)
            job_infos[(namespace, name)] = job_info

        return job_infos, allocations

    def _allocate(self, jobs, nodes, prev_allocations, node_template):
        for job_key in list(jobs):
            job_resources = jobs[job_key].resources
            for node in nodes.values():
                if all(val <= node.resources.get(key, 0)
                       for key, val in job_resources.items()):
                    # Found a node which can fit a replica of this job.
                    break
            else:
                # No node can fit a replica of this job.
                # TODO: propagate this to the controller so the job is Failed.
                LOG.warning("Job %s cannot be scheduled!", job_key)
                jobs.pop(job_key)
        allocations = {}
        if not jobs:
            # There are no jobs, let the expander shrink the cluster.
            self._cluster_expander.fit([])
        elif jobs and nodes:
            allocations, desired_nodes = self._policy.optimize(
                jobs, nodes, prev_allocations, node_template)
            if desired_nodes < len(nodes):
                active_nodes = list(set.union(*map(set, allocations.values())))
            else:
                active_nodes = list(nodes)
                while len(active_nodes) < desired_nodes:
                    active_nodes.append(f"~{desired_nodes-len(active_nodes)}")
            self._cluster_expander.fit(active_nodes)
            LOG.info("Active nodes: %s", active_nodes)
        elif jobs:
            # Expand job ASG from zero nodes.
            # Assumption is AdaptDL is running on a different ASG
            self._cluster_expander.fit(['~1'])
        return allocations
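The class above is driven entirely by the long-running coroutine run(), which fans out into the watch loop and the periodic optimization loop. A minimal sketch of starting it, assuming an expander stub with a fit() method as in the earlier examples and a Kubernetes client that has already been configured for the target cluster:

import asyncio

class _NoopExpander:
    # Hypothetical stub: the allocator only ever calls fit() with the list
    # of node names it wants the cluster to have.
    def fit(self, nodes):
        pass

async def main():
    allocator = AdaptDLAllocator(_NoopExpander())
    await allocator.run()  # runs _allocate_one_loop() and _optimize_all_loop()

# Once the Kubernetes client is configured:
# asyncio.run(main())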