def testResourceNumericalError(self):
    resource = Resources(cpu=0.99, gpu=0.99, custom_resources={"a": 0.99})
    small_resource = Resources(
        cpu=0.33, gpu=0.33, custom_resources={"a": 0.33})
    for i in range(3):
        resource = Resources.subtract(resource, small_resource)
    self.assertTrue(resource.is_nonnegative())
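# The test above guards against floating-point drift: repeated subtraction of
# values like 0.33 is inexact, so the remainder is only near zero. A minimal
# sketch (plain Python, not the Resources implementation) of the drift, plus a
# hypothetical helper showing one way rounding can absorb it:
remaining = 0.99
for _ in range(3):
    remaining -= 0.33
assert abs(remaining) < 1e-9  # tiny residue on either side of zero


def subtract_component(a: float, b: float, ndigits: int = 10) -> float:
    # Hypothetical guard, not the actual implementation: round before any
    # nonnegativity check so drift of ~1e-16 cannot flip the sign.
    return round(a - b, ndigits)


assert subtract_component(0.99, 3 * 0.33) >= 0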
def testSubtraction(self):
    resource_1 = Resources(
        1,
        0,
        0,
        1,
        custom_resources={"a": 1, "b": 2},
        extra_custom_resources={"a": 1, "b": 1})
    resource_2 = Resources(
        1,
        0,
        0,
        1,
        custom_resources={"a": 1, "b": 2},
        extra_custom_resources={"a": 1, "b": 1})
    new_res = Resources.subtract(resource_1, resource_2)
    self.assertTrue(new_res.cpu == 0)
    self.assertTrue(new_res.gpu == 0)
    self.assertTrue(new_res.extra_cpu == 0)
    self.assertTrue(new_res.extra_gpu == 0)
    self.assertTrue(all(k == 0 for k in new_res.custom_resources.values()))
    self.assertTrue(
        all(k == 0 for k in new_res.extra_custom_resources.values()))
def has_resources(self, resources: Resources) -> bool:
    """Returns whether this runner has at least the specified resources.

    This refreshes the Ray cluster resources if the time since last
    update has exceeded self._refresh_period. This also assumes that
    the cluster is not resizing very frequently.
    """
    if resources.has_placement_group:
        return self._pg_manager.can_stage()

    self._update_avail_resources()
    currently_available = Resources.subtract(self._avail_resources,
                                             self._committed_resources)

    have_space = (
        resources.cpu_total() <= currently_available.cpu
        and resources.gpu_total() <= currently_available.gpu
        and resources.memory_total() <= currently_available.memory
        and resources.object_store_memory_total() <=
        currently_available.object_store_memory and all(
            resources.get_res_total(res) <= currently_available.get(res)
            for res in resources.custom_resources))

    if have_space:
        # The assumption right now is that we block all trials if one
        # trial is queued.
        return True

    return False
def _fluid(self, meta: TrialGroupMeta):
    """Run fluid on a specific group"""
    self._dump_groups()
    # set of trials to consider
    A = {trial.trial_id for trial in self._trial_group(meta.grp)}
    logger.debug(
        f"_fluid: meta.perf.trials_missing_info={meta.perf.trials_missing_info}"
        f" meta.trials={meta.trials}, meta.grp={meta.grp},"
        f" trial_groups={self.trial_groups}, A={A}")
    # assignment of resources
    W: Dict[str, Resources] = {}
    # compute the new idle resources if every trial in this group were stopped
    M = resources_add(self.idle_resources,
                      self._committed_resources_in_group(meta.grp))
    if meta.perf.trials_missing_info:
        # there are still trials that need perf data,
        # so restrict A to only these trials
        others = A.difference(meta.perf.trials_missing_info)
        A = meta.perf.trials_missing_info
        # set the others to use 0 resources
        for tid in others:
            W[tid] = Resources(cpu=0, gpu=0)
        # use 1 GPU per trial to get reference perf
        for tid in A:
            r = Resources(cpu=1, gpu=1)
            Mp = Resources.subtract(M, r)
            if not Mp.is_nonnegative():
                break
            M = Mp
            W[tid] = r
    else:
        # convert A to an array for sorting
        A = np.array(list(A))
        # reference height (width 1)
        H1 = np.array([meta.perf.get_height(tid, 1) for tid in A])
        # sort by H1 in non-increasing order
        order = np.argsort(H1)[::-1]
        A = A[order]
        H1 = H1[order]
        # $$w_i = \min\left(
        #     \max\left(
        #         \left\lfloor \frac{h_{i,1}}{\sum_j h_{j,1}} n \right\rfloor,
        #         \frac{1}{c}\right),
        #     d\right)$$
        c = 1 / 2
        d = 4
        w = np.minimum(
            np.maximum(np.floor(H1 * np.size(H1) / np.sum(H1)), 1 / c), d)
        # assign resources based on w, normalized to the cluster's GPUs
        w = w / w.sum() * self._avail_resources.gpu_total()
        resW = [Resources(cpu=1, gpu=g) for g in w]
        # write to W
        W = dict(zip(A, resW))
    self._ensure_W(W, meta)
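# A hedged worked example of the weight formula above, using made-up reference
# heights H1; only the expression and the bounds c, d come from _fluid, the
# numbers are illustrative.
import numpy as np

H1 = np.array([8.0, 3.0, 1.0])  # hypothetical h_{i,1} for three trials
n = np.size(H1)
c, d = 1 / 2, 4

# w_i = min(max(floor(h_{i,1} / sum_j h_{j,1} * n), 1/c), d)
w = np.minimum(np.maximum(np.floor(H1 * n / np.sum(H1)), 1 / c), d)
print(w)  # [2. 2. 2.]

# As in _fluid, the weights are then normalized against the cluster's total
# GPU count; with e.g. 8 GPUs each trial here would get 8 / 3 GPUs.
print(w / w.sum() * 8)  # [2.66666667 2.66666667 2.66666667]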
def _ensure_W(self, W: Dict[str, Resources], meta: TrialGroupMeta):
    """Adjust group resources given in W"""
    logger.debug(f"ensure_W: W={W} meta.trials={meta.trials}")
    # stop any trials with 0 resources;
    # this has to be done first to free up resources for others to use
    for trial_id, res in W.items():
        trial = self.trial_groups[trial_id].trial
        if res.cpu_total() + res.gpu_total() == 0:
            # add to paused, then ensure_stop; we do not change the trial's
            # status, which is visible outside
            running = self._find_running(trial)
            if running is not None:
                # don't call pause_trial, which will trigger another
                # fluid reschedule
                self.jobs_paused[running.in_flight_future] = running
                self._ensure_stop(running.trial)
            trial.resources = res
            # add to pending
            self.start_trial(trial)
    # adjust any trials with different resources, including any not
    # already running
    for trial_id, res in W.items():
        # use the trial group to map trial_id to trial
        trial = self.trial_groups[trial_id].trial
        if res.cpu_total() + res.gpu_total() == 0:
            # already handled in the loop above
            continue
        if (  # current_res != res
                Resources.subtract(trial.resources, res).is_nonnegative() !=
                Resources.subtract(res, trial.resources).is_nonnegative()):
            running = self._find_running(trial)
            if running is not None:
                # don't call pause_trial, which will trigger another
                # fluid reschedule
                self.jobs_paused[running.in_flight_future] = running
                self._ensure_stop(trial)
            # at this point, the job is always stopped but not in the
            # pending queue, because fluid clears the pending queue
            trial.resources = res
            self._kickoff(PendingJob(trial, None, True), res)
def testDifferentResources(self):
    resource_1 = Resources(1, 0, 0, 1, custom_resources={"a": 1, "b": 2})
    resource_2 = Resources(1, 0, 0, 1, custom_resources={"a": 1, "c": 2})
    new_res = Resources.subtract(resource_1, resource_2)
    assert "c" in new_res.custom_resources
    assert "b" in new_res.custom_resources
    self.assertTrue(new_res.cpu == 0)
    self.assertTrue(new_res.gpu == 0)
    self.assertTrue(new_res.extra_cpu == 0)
    self.assertTrue(new_res.extra_gpu == 0)
    self.assertTrue(new_res.get("a") == 0)
def has_resources(self, resources):
    """Returns whether this runner has at least the specified resources.

    This refreshes the Ray cluster resources if the time since last
    update has exceeded self._refresh_period. This also assumes that
    the cluster is not resizing very frequently.
    """
    if time.time() - self._last_resource_refresh > self._refresh_period:
        self._update_avail_resources()

    currently_available = Resources.subtract(self._avail_resources,
                                             self._committed_resources)

    have_space = (
        resources.cpu_total() <= currently_available.cpu
        and resources.gpu_total() <= currently_available.gpu
        and resources.memory_total() <= currently_available.memory
        and resources.object_store_memory_total() <=
        currently_available.object_store_memory and all(
            resources.get_res_total(res) <= currently_available.get(res)
            for res in resources.custom_resources))

    if have_space:
        return True

    can_overcommit = self._queue_trials

    if ((resources.cpu_total() > 0 and currently_available.cpu <= 0)
            or (resources.gpu_total() > 0 and currently_available.gpu <= 0)
            or (resources.memory_total() > 0
                and currently_available.memory <= 0)
            or (resources.object_store_memory_total() > 0
                and currently_available.object_store_memory <= 0)
            or any(
                (resources.get_res_total(res_name) > 0
                 and currently_available.get(res_name) <= 0)
                for res_name in resources.custom_resources)):
        # requested resource is already saturated
        can_overcommit = False

    if can_overcommit:
        logger.warning(
            "Allowing trial to start even though the "
            "cluster does not have enough free resources. Trial actors "
            "may appear to hang until enough resources are added to the "
            "cluster (e.g., via autoscaling). You can disable this "
            "behavior by specifying `queue_trials=False` in "
            "ray.tune.run().")
        return True

    return False
def has_resources(self, resources):
    """Returns whether this runner has at least the specified resources.

    This refreshes the Ray cluster resources if the time since last
    update has exceeded self._refresh_period. This also assumes that
    the cluster is not resizing very frequently.
    """
    if resources.has_placement_group:
        return self._pg_manager.can_stage()

    self._update_avail_resources()
    currently_available = Resources.subtract(self._avail_resources,
                                             self._committed_resources)

    have_space = (
        resources.cpu_total() <= currently_available.cpu
        and resources.gpu_total() <= currently_available.gpu
        and resources.memory_total() <= currently_available.memory
        and resources.object_store_memory_total() <=
        currently_available.object_store_memory and all(
            resources.get_res_total(res) <= currently_available.get(res)
            for res in resources.custom_resources))

    if have_space:
        # The assumption right now is that we block all trials if one
        # trial is queued.
        self._trial_queued = False
        return True

    can_overcommit = self._queue_trials and not self._trial_queued

    if can_overcommit:
        self._trial_queued = True
        logger.warning(
            "Allowing trial to start even though the "
            "cluster does not have enough free resources. Trial actors "
            "may appear to hang until enough resources are added to the "
            "cluster (e.g., via autoscaling). You can disable this "
            "behavior by specifying `queue_trials=False` in "
            "ray.tune.run().")
        return True

    return False
def resources_add(a: Resources, b: Resources) -> Resources:
    zero = Resources(cpu=0, gpu=0)
    nb = Resources.subtract(zero, b)
    return Resources.subtract(a, nb)
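# Usage sketch for resources_add with hypothetical values: since Resources only
# exposes subtract, addition is expressed as a - (0 - b), which assumes the
# intermediate 0 - b may hold negative components.
a = Resources(cpu=2, gpu=1)
b = Resources(cpu=1, gpu=3)
total = resources_add(a, b)
assert total.cpu == 3 and total.gpu == 4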
@property
def idle_resources(self) -> Resources:
    return Resources.subtract(self._avail_resources,
                              self._committed_resources)