def _update_avail_resources(self, num_retries=5):
    """Refresh the cached snapshot of cluster-wide resources.

    Polls ``ray.cluster_resources()`` up to ``num_retries`` times
    (sleeping 0.5s between attempts), falling back to the local
    machine's ``ResourceSpec`` when the call raises (local mode).
    Updates ``self._avail_resources``, ``self._last_resource_refresh``,
    ``self._resources_initialized`` and ``self._fake_gpus``.

    Args:
        num_retries (int): Maximum number of polling attempts before
            giving up and treating the cluster as having 0 CPU/GPU.
    """
    resources = None
    for attempt in range(num_retries):
        if attempt > 0:
            logger.warning(
                "Cluster resources not detected or are 0. Attempt #"
                "%s...", attempt + 1)
            time.sleep(0.5)
        try:
            resources = ray.cluster_resources()
        except Exception:
            # TODO(rliaw): Remove this when local mode is fixed.
            # https://github.com/ray-project/ray/issues/4147
            logger.debug("Using resources for local machine.")
            resources = ResourceSpec().resolve(True).to_resource_dict()
        if resources:
            break

    if not resources:
        # NOTE: This hides the possibility that Ray may be waiting for
        # clients to connect.
        # Fix: `resources` is still None when num_retries <= 0 (the loop
        # body never ran), so guard before calling setdefault on it --
        # previously this path raised AttributeError on None.
        resources = resources or {}
        resources.setdefault("CPU", 0)
        resources.setdefault("GPU", 0)
        logger.warning("Cluster resources cannot be detected or are 0. "
                       "You can resume this experiment by passing in "
                       "`resume=True` to `run`.")

    # Work on a copy so the popped keys don't mutate the dict returned by
    # ray.cluster_resources(); everything left over is a custom resource.
    resources = resources.copy()
    num_cpus = resources.pop("CPU", 0)
    num_gpus = resources.pop("GPU", 0)
    memory = ray_constants.from_memory_units(resources.pop("memory", 0))
    object_store_memory = ray_constants.from_memory_units(
        resources.pop("object_store_memory", 0))
    custom_resources = resources

    if num_gpus == 0:
        warnings.warn(
            "No GPU resources found, assuming local test, using CPU resources instead"
        )
        # Local test: treat each CPU as a stand-in GPU so trials that
        # request GPUs can still be scheduled.
        num_gpus = num_cpus
        self._fake_gpus = True
    else:
        self._fake_gpus = False

    avail_resources = Resources(
        int(num_cpus),
        int(num_gpus),
        memory=int(memory),
        object_store_memory=int(object_store_memory),
        custom_resources=custom_resources,
    )

    # NOTE(review): `assert` is stripped under `python -O`; left as-is so
    # callers catching AssertionError keep working.
    assert (self.idle_resources.is_nonnegative()
            ), "Cluster removed resources from running trials!"
    self._avail_resources = avail_resources
    self._last_resource_refresh = time.time()
    self._resources_initialized = True
def get_resource_spec(self):
    """Resolve and return the current resource spec for the node."""

    def merge_resources(env_dict, params_dict):
        """Combine both dicts, with env_dict winning any key conflict.

        Emits a warning for every conflicting key before merging.
        """
        for conflict in env_dict.keys() & params_dict.keys():
            logger.warning("Autoscaler is overriding your resource:"
                           "{}: {} with {}.".format(
                               conflict, params_dict[conflict],
                               env_dict[conflict]))
        merged = dict(params_dict)
        merged.update(env_dict)
        return merged

    # Resources injected by the autoscaler arrive JSON-encoded in an
    # environment variable; absent/empty means no overrides.
    env_resources = {}
    raw_env = os.getenv(ray_constants.RESOURCES_ENVIRONMENT_VARIABLE)
    if raw_env:
        env_resources = json.loads(raw_env)

    # Lazily resolve the spec once and cache it on the instance.
    if not self._resource_spec:
        combined = merge_resources(env_resources,
                                   self._ray_params.resources)
        self._resource_spec = ResourceSpec(
            self._ray_params.num_cpus, self._ray_params.num_gpus,
            self._ray_params.memory, self._ray_params.object_store_memory,
            combined, self._ray_params.redis_max_memory).resolve(
                is_head=self.head, node_ip_address=self.node_ip_address)
    return self._resource_spec