Example #1
 def testResourceNumericalError(self):
     resource = Resources(cpu=0.99, gpu=0.99, custom_resources={"a": 0.99})
     small_resource = Resources(
         cpu=0.33, gpu=0.33, custom_resources={"a": 0.33})
     for i in range(3):
         resource = Resources.subtract(resource, small_resource)
     self.assertTrue(resource.is_nonnegative())
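
Note: with plain IEEE-754 floats the same arithmetic drifts just below zero, which is presumably what is_nonnegative() has to tolerate in this test. A minimal illustration in plain Python (not Ray code):

    remaining = 0.99
    for _ in range(3):
        remaining -= 0.33
    print(remaining)  # about -1.1e-16 on IEEE-754 doubles, not 0.0
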
Example #2
 def testSubtraction(self):
     resource_1 = Resources(
         1,
         0,
         0,
         1,
         custom_resources={
             "a": 1,
             "b": 2
         },
         extra_custom_resources={
             "a": 1,
             "b": 1
         })
     resource_2 = Resources(
         1,
         0,
         0,
         1,
         custom_resources={
             "a": 1,
             "b": 2
         },
         extra_custom_resources={
             "a": 1,
             "b": 1
         })
     new_res = Resources.subtract(resource_1, resource_2)
     self.assertTrue(new_res.cpu == 0)
     self.assertTrue(new_res.gpu == 0)
     self.assertTrue(new_res.extra_cpu == 0)
     self.assertTrue(new_res.extra_gpu == 0)
     self.assertTrue(all(k == 0 for k in new_res.custom_resources.values()))
     self.assertTrue(
         all(k == 0 for k in new_res.extra_custom_resources.values()))
Example #3
    def has_resources(self, resources: Resources) -> bool:
        """Returns whether this runner has at least the specified resources.

        This refreshes the Ray cluster resources if the time since last update
        has exceeded self._refresh_period. This also assumes that the
        cluster is not resizing very frequently.
        """
        if resources.has_placement_group:
            return self._pg_manager.can_stage()

        self._update_avail_resources()
        currently_available = Resources.subtract(self._avail_resources,
                                                 self._committed_resources)
        have_space = (
            resources.cpu_total() <= currently_available.cpu
            and resources.gpu_total() <= currently_available.gpu
            and resources.memory_total() <= currently_available.memory
            and resources.object_store_memory_total() <=
            currently_available.object_store_memory and all(
                resources.get_res_total(res) <= currently_available.get(res)
                for res in resources.custom_resources))

        if have_space:
            # The assumption right now is that we block all trials if one
            # trial is queued.
            return True

        return False
Example #4
    def _fluid(self, meta: TrialGroupMeta):
        """Run fluid on a specific group"""
        self._dump_groups()
        # set of trials to consider
        A = {trial.trial_id for trial in self._trial_group(meta.grp)}
        logger.debug(
            f"_fluid: meta.perf.trials_missing_info={meta.perf.trials_missing_info} meta.trials={meta.trials}, meta.grp={meta.grp}, trial_groups={self.trial_groups}, A={A}"
        )
        # assignment of resources
        W: Dict[str, Resources] = {}
        # compute new idle resources if every trial in this group were stopped
        M = resources_add(self.idle_resources,
                          self._committed_resources_in_group(meta.grp))

        if meta.perf.trials_missing_info:
            # there are still trials that need perf data,
            # restrict A to only these trials
            others = A.difference(meta.perf.trials_missing_info)
            A = meta.perf.trials_missing_info
            # set others to use 0 resource
            for tid in others:
                W[tid] = Resources(cpu=0, gpu=0)
            # use 1 gpu per trial to get reference perf
            for tid in A:
                r = Resources(cpu=1, gpu=1)
                Mp = Resources.subtract(M, r)
                if not Mp.is_nonnegative():
                    break
                M = Mp
                W[tid] = r
        else:
            # convert A to array for sorting
            A = np.array(list(A))
            # reference height (1 width)
            H1 = np.array([meta.perf.get_height(tid, 1) for tid in A])
            # sort by H1 in non-increasing order
            ord = np.argsort(H1)[::-1]
            A = A[ord]
            H1 = H1[ord]
            # $$ w_i = \min\left(\max\left(\left\lfloor \frac{h_{i,1}}{\sum_j h_{j,1}} n \right\rfloor, \frac{1}{c}\right), d\right) $$
            c = 1 / 2
            d = 4
            w = np.minimum(
                np.maximum(np.floor(H1 * np.size(H1) / np.sum(H1)), 1 / c), d)
            # assign resources based on w
            w = w / w.sum() * self._avail_resources.gpu_total()
            resW = [Resources(cpu=1, gpu=g) for g in w]
            # write to W
            W = dict(zip(A, resW))

        self._ensure_W(W, meta)
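
Note: a standalone sketch of the width formula commented above, using made-up reference heights; c and d take the same values as in _fluid. Everything below is illustrative and not part of the scheduler.

    import numpy as np

    H1 = np.array([12.0, 2.0, 1.0, 1.0])  # hypothetical reference heights h_{i,1}
    n = H1.size
    c, d = 1 / 2, 4

    # w_i = min(max(floor(h_{i,1} / sum_j h_{j,1} * n), 1/c), d)
    w = np.minimum(np.maximum(np.floor(H1 / H1.sum() * n), 1 / c), d)
    print(w)  # [3. 2. 2. 2.]; _fluid then rescales w so the widths sum to the free GPUs
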
Example #5
    def _ensure_W(self, W: Dict[str, Resources], meta: TrialGroupMeta):
        """Adjust group resources given in W"""
        logger.debug(f"ensure_W: W={W} meta.trials={meta.trials}")
        # stop any trials with 0 res
        # this has to be done first to free up resources for others to use
        for trial_id, res in W.items():
            trial = self.trial_groups[trial_id].trial
            if res.cpu_total() + res.gpu_total() == 0:
                # add to paused, then ensure_stop; we do not change the trial's
                # status, which is visible outside
                running = self._find_running(trial)
                if running is not None:
                    # don't call pause_trial, which will trigger another fluid reschedule
                    self.jobs_paused[running.in_flight_future] = running
                self._ensure_stop(trial)
                trial.resources = res
                # add to pending
                self.start_trial(trial)
        # adjust any trials with different res, including any not already running
        for trial_id, res in W.items():
            # use trial group to map trial_id to trial
            trial = self.trial_groups[trial_id].trial

            if res.cpu_total() + res.gpu_total() == 0:
                # already handled in the loop above
                continue

            if (
                    # current_res != res
                    Resources.subtract(trial.resources, res).is_nonnegative()
                    != Resources.subtract(res,
                                          trial.resources).is_nonnegative()):
                running = self._find_running(trial)
                if running is not None:
                    # don't call pause_trial, which will trigger another fluid reschedule
                    self.jobs_paused[running.in_flight_future] = running

                self._ensure_stop(trial)

            # at this point, the job is always stopped but not in the pending queue,
            # because fluid clears the pending queue.
            trial.resources = res
            self._kickoff(PendingJob(trial, None, True), res)
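
Note: the pair of Resources.subtract calls above acts as an inequality test. A toy stand-in (a hypothetical two-field Res tuple, not Ray's Resources) shows when it fires:

    from collections import namedtuple

    Res = namedtuple("Res", ["cpu", "gpu"])

    def subtract(a, b):
        return Res(a.cpu - b.cpu, a.gpu - b.gpu)

    def is_nonnegative(r):
        return r.cpu >= 0 and r.gpu >= 0

    old, new = Res(cpu=1, gpu=2), Res(cpu=1, gpu=4)
    changed = is_nonnegative(subtract(old, new)) != is_nonnegative(subtract(new, old))
    print(changed)  # True: one allocation dominates the other, so the trial is restarted
    # Equal allocations give False; allocations where neither side dominates
    # (e.g. more CPU but fewer GPUs) are also not flagged by this check.
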
Example #6
 def testDifferentResources(self):
     resource_1 = Resources(1, 0, 0, 1, custom_resources={"a": 1, "b": 2})
     resource_2 = Resources(1, 0, 0, 1, custom_resources={"a": 1, "c": 2})
     new_res = Resources.subtract(resource_1, resource_2)
     assert "c" in new_res.custom_resources
     assert "b" in new_res.custom_resources
     self.assertTrue(new_res.cpu == 0)
     self.assertTrue(new_res.gpu == 0)
     self.assertTrue(new_res.extra_cpu == 0)
     self.assertTrue(new_res.extra_gpu == 0)
     self.assertTrue(new_res.get("a") == 0)
Example #7
    def has_resources(self, resources):
        """Returns whether this runner has at least the specified resources.

        This refreshes the Ray cluster resources if the time since last update
        has exceeded self._refresh_period. This also assumes that the
        cluster is not resizing very frequently.
        """
        if time.time() - self._last_resource_refresh > self._refresh_period:
            self._update_avail_resources()

        currently_available = Resources.subtract(self._avail_resources,
                                                 self._committed_resources)

        have_space = (
            resources.cpu_total() <= currently_available.cpu
            and resources.gpu_total() <= currently_available.gpu
            and resources.memory_total() <= currently_available.memory
            and resources.object_store_memory_total() <=
            currently_available.object_store_memory and all(
                resources.get_res_total(res) <= currently_available.get(res)
                for res in resources.custom_resources))

        if have_space:
            return True

        can_overcommit = self._queue_trials

        if ((resources.cpu_total() > 0 and currently_available.cpu <= 0)
                or (resources.gpu_total() > 0 and currently_available.gpu <= 0)
                or
            (resources.memory_total() > 0 and currently_available.memory <= 0)
                or (resources.object_store_memory_total() > 0
                    and currently_available.object_store_memory <= 0) or any(
                        (resources.get_res_total(res_name) > 0
                         and currently_available.get(res_name) <= 0)
                        for res_name in resources.custom_resources)):
            can_overcommit = False  # requested resource is already saturated

        if can_overcommit:
            logger.warning(
                "Allowing trial to start even though the "
                "cluster does not have enough free resources. Trial actors "
                "may appear to hang until enough resources are added to the "
                "cluster (e.g., via autoscaling). You can disable this "
                "behavior by specifying `queue_trials=False` in "
                "ray.tune.run().")
            return True

        return False
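
Note: the control flow above reduces to a small predicate. A sketch with hypothetical booleans standing in for the quantities computed in has_resources:

    def can_start(have_space, queue_trials, saturated):
        # start if the request fits; otherwise overcommit only when trial
        # queueing is enabled and no requested resource type is already
        # exhausted on the cluster
        if have_space:
            return True
        return queue_trials and not saturated
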
Example #8
    def has_resources(self, resources):
        """Returns whether this runner has at least the specified resources.

        This refreshes the Ray cluster resources if the time since last update
        has exceeded self._refresh_period. This also assumes that the
        cluster is not resizing very frequently.
        """
        if resources.has_placement_group:
            return self._pg_manager.can_stage()

        self._update_avail_resources()
        currently_available = Resources.subtract(self._avail_resources,
                                                 self._committed_resources)

        have_space = (
            resources.cpu_total() <= currently_available.cpu
            and resources.gpu_total() <= currently_available.gpu
            and resources.memory_total() <= currently_available.memory
            and resources.object_store_memory_total() <=
            currently_available.object_store_memory and all(
                resources.get_res_total(res) <= currently_available.get(res)
                for res in resources.custom_resources))

        if have_space:
            # The assumption right now is that we block all trials if one
            # trial is queued.
            self._trial_queued = False
            return True

        can_overcommit = self._queue_trials and not self._trial_queued
        if can_overcommit:
            self._trial_queued = True
            logger.warning(
                "Allowing trial to start even though the "
                "cluster does not have enough free resources. Trial actors "
                "may appear to hang until enough resources are added to the "
                "cluster (e.g., via autoscaling). You can disable this "
                "behavior by specifying `queue_trials=False` in "
                "ray.tune.run().")
            return True

        return False
Example #9
def resources_add(a: Resources, b: Resources) -> Resources:
    zero = Resources(cpu=0, gpu=0)
    nb = Resources.subtract(zero, b)
    return Resources.subtract(a, nb)
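
Note: a quick check of the identity used here, a + b == a - (0 - b), assuming Resources.subtract is a plain componentwise difference as the tests above suggest (values are illustrative):

    a = Resources(cpu=2, gpu=1)
    b = Resources(cpu=1, gpu=3)
    total = resources_add(a, b)
    assert total.cpu == 3 and total.gpu == 4
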
Example #10
 def idle_resources(self) -> Resources:
     return Resources.subtract(self._avail_resources,
                               self._committed_resources)