import numpy as np
import ray
from ray.util.queue import Queue


class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple,
              inference_time_sec: float, page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use this like:
            labels_all, probs_all, filename, original_shape, \
                inference_time_sec, page_number = ungroup(d)

        :param dictionary: a dictionary created with the group() method.
        :return: the grouped values, in the order listed above.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push dictionary of params to post-process. Blocks if queue is full
        for flow-control and proceeds when queue has enough space.

        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # Put the payload in the object store...
        ref = self._queue.put(ray.put(dictionary))
        return ref

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup()
            to unpack, or look up keys individually.
        """
        return self._queue.get()
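# Usage sketch for WorkQueue (illustrative; `postprocess_worker`, the dummy
# arrays, and the filenames are hypothetical stand-ins, and Ray is assumed
# to be initialized). The producer pushes grouped results while a consumer
# task pulls from the shared Ray Queue handle returned by get_queue().

@ray.remote
def postprocess_worker(queue):
    # Consumer side: blocks until a grouped result is available.
    d = queue.get()
    return d["filename"], d["page_number"]


def demo_work_queue():
    work = WorkQueue(max_depth=8)
    # Producer side: group one dummy result and push it.
    # push() blocks when the queue is full, providing flow control.
    d = work.group(labels_all=np.zeros((2, 2)),
                   probs_all=np.zeros((2, 2)),
                   filename="page_001.png",
                   original_shape=(1024, 768),
                   inference_time_sec=0.42,
                   page_number=1)
    work.push(d)
    # Hand the underlying Ray Queue to the consumer task.
    print(ray.get(postprocess_worker.remote(work.get_queue())))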
import pytest
import ray
from queue import Empty, Full
from ray.exceptions import GetTimeoutError
from ray.util.queue import Queue


def test_simple_usage(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))

    for item in items:
        q.put(item)

    for item in items:
        assert item == q.get()
def test_put(ray_start_regular_shared):
    q = Queue(1)

    item = 0
    q.put(item, block=False)
    assert q.get() == item

    item = 1
    q.put(item, timeout=0.2)
    assert q.get() == item

    with pytest.raises(ValueError):
        q.put(0, timeout=-1)

    q.put(0)
    with pytest.raises(Full):
        q.put_nowait(1)

    with pytest.raises(Full):
        q.put(1, timeout=0.2)
def test_async_put(ray_start_regular_shared):
    q = Queue(1)
    q.put(1)
    future = async_put.remote(q, 2)

    with pytest.raises(Full):
        q.put_nowait(3)

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    assert q.get() == 1
    assert q.get() == 2
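# The test above references an `async_put` remote function that is not part
# of this excerpt; a minimal sketch of what it might look like:

@ray.remote
def async_put(queue, item):
    # Blocks inside the remote task until the queue has room, which is why
    # `ray.get(future, timeout=0.1)` above times out while the queue is
    # still full.
    queue.put(item, block=True)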
def test_qsize(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
def test_get(ray_start_regular_shared):
    q = Queue()

    item = 0
    q.put(item)
    assert q.get(block=False) == item

    item = 1
    q.put(item)
    assert q.get(timeout=0.2) == item

    with pytest.raises(ValueError):
        q.get(timeout=-1)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(Empty):
        q.get(timeout=0.2)
class DistributedPool(DistributedPoolAPI):
    """Distributed implementation of the abstract DistributedPoolAPI.

    A resource pool object which controls a pool of resources (CPU, GPU,
    ...) to which jobs can be submitted. It supports asynchronous results
    with timeouts and callbacks and has a parallel map implementation.
    """

    def __init__(
        self,
        n_worker: int,
        n_cpu_per_worker: int,
        memory_limit_per_worker: float = 0,
        n_gpu_per_worker: float = 0,
        max_pending_task: int = 10000,
        local_pool_class: Type[LocalPoolAPI] = LocalPool,
    ) -> None:
        """DistributedPool constructor."""
        super().__init__(
            n_worker=n_worker,
            n_cpu_per_worker=n_cpu_per_worker,
            memory_limit_per_worker=memory_limit_per_worker,
            n_gpu_per_worker=n_gpu_per_worker,
            local_pool_class=local_pool_class,
        )
        self.max_pending_task = max_pending_task

        # Create task and result queues.
        self.task_queue = Queue(max_pending_task)
        self.result_queue = Queue(max_pending_task)

        # Map of task_id -> pending async result; must exist before the
        # result consumer thread starts reading it.
        self.processed_results: Dict[uuid.UUID, _RayAsyncResult] = {}

        # Consume processed results from result_queue in a background thread.
        def consume_result_queue():
            self.started = True
            while self.started:
                try:
                    result = self.result_queue.get(timeout=1, block=True)
                    if isinstance(result, str):
                        continue
                    if result and result.task_id in self.processed_results:
                        self.processed_results[result.task_id].result = result
                        del self.processed_results[result.task_id]
                except Empty:
                    continue
                except (RayActorError, AttributeError):
                    break

        self.result_consumer_thread = threading.Thread(
            target=consume_result_queue)
        self.result_consumer_thread.start()

        # Start actors.
        opt = {
            "num_cpus": n_cpu_per_worker,
            "num_gpus": n_gpu_per_worker,
        }
        self.actor_pool = [
            _RayExecutorActor.options(**opt).remote(  # type: ignore
                self.task_queue,
                self.result_queue,
                self.create_local_pool(n_cpu=n_cpu_per_worker,
                                       memory_limit=0,
                                       n_visible_gpu=[]),
            ) for _ in range(n_worker)
        ]
        for a in self.actor_pool:
            a.start.remote()

        # wait agent ready
        # self.result_queue.get(block=True, timeout=30)

    def apply_async(
        self,
        func: Callable[..., _OutputType],
        args: Optional[Iterable[Any]] = None,
        kwds: Optional[Mapping[str, Any]] = None,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> AsyncResult[_OutputType]:
        __doc__ = super().apply_async.__doc__  # noqa: F841
        task = _RayTask(
            task_id=uuid4(),
            func=func,
            args=args,
            kwds=kwds,
            callback=callback,
            error_callback=error_callback,
        )
        self.task_queue.put(task)
        async_res = _RayAsyncResult[_OutputType](task.task_id)
        self.processed_results[task.task_id] = async_res
        return async_res

    def map_async(
        self,
        func: Callable[[_InputType], _OutputType],
        iterable: Iterable[_InputType],
        chunksize: Optional[int] = 1,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> MapResult[_OutputType]:
        __doc__ = super().map_async.__doc__  # noqa: F841
        chunks_async_results: List[AsyncResult[List[_OutputType]]] = []
        for c in mitertools.divide(self.n_worker, iterable=iterable):
            task = _RayMapTask(
                task_id=uuid4(),
                func=func,
                args=c,
                callback=callback,
                error_callback=error_callback,
            )
            chunk_async_result = _RayAsyncResult[List[_OutputType]](
                task.task_id)
            chunks_async_results.append(chunk_async_result)
            self.processed_results[task.task_id] = chunk_async_result
            self.task_queue.put(task)
        async_res: MapResult[_OutputType] = _RayAsyncMapResult[_OutputType](
            async_results=chunks_async_results)
        return async_res

    def terminate(self) -> None:
        __doc__ = super().terminate.__doc__  # noqa: F841
        self.close()
        for a in self.actor_pool:
            ray.kill(a)

    def close(self) -> None:
        __doc__ = super().close.__doc__  # noqa: F841
        self.started = False
        for a in self.actor_pool:
            a.stop.remote()
        ray.kill(self.result_queue.actor)
        ray.kill(self.task_queue.actor)
        sleep(1)

    def create_local_pool(
            self,
            n_cpu: int = 0,
            memory_limit: float = 0,
            n_visible_gpu: Optional[List[int]] = None) -> LocalPoolAPI:
        # Avoid a mutable default argument for the GPU list.
        if n_visible_gpu is None:
            n_visible_gpu = []
        if memory_limit == 0:
            memory_limit = self.memory_limit_per_worker
        return self.local_pool_class(n_cpu, memory_limit, n_visible_gpu,
                                     lazy=True)
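# Usage sketch for DistributedPool (illustrative; assumes Ray is already
# initialized and that the async-result objects mirror the `get(timeout)`
# API of multiprocessing.pool's AsyncResult, as the class docstring
# suggests).

def demo_distributed_pool():
    pool = DistributedPool(n_worker=2, n_cpu_per_worker=1)
    try:
        # Single asynchronous call.
        res = pool.apply_async(pow, args=(2, 10))
        assert res.get(timeout=30) == 1024

        # Parallel map: the iterable is divided across the workers.
        squares = pool.map_async(lambda x: x * x, range(8))
        print(squares.get(timeout=30))
    finally:
        pool.terminate()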
class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self._base_config_file = os.path.join(
            os.path.dirname(__file__), "example_docker.yaml")
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(
            os.path.dirname(__file__), "docker_monitor.py")
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        return self._config_file

    @property
    def cluster_config(self):
        return self._cluster_config

    @property
    def cluster_dir(self):
        return self._tempdir

    @property
    def gcs_port(self):
        return self._cluster_config.get("provider", {}).get(
            "host_gcs_port", FAKE_DOCKER_DEFAULT_GCS_PORT)

    @property
    def client_port(self):
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT)

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the cluster.
                If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.
        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue
                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()
        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available.

        Args:
            resources: Minimum resources needed before this function returns.
            timeout: Timeout in seconds.
        """
        timeout_at = time.monotonic() + timeout
        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > timeout_at:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}")
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current config with.
""" assert self._tempdir, "Call setup() first" config = config or {} if config: self._partial_config = config if not config.get("provider", {}).get("image"): # No image specified, trying to parse from buildkite docker_image = os.environ.get("RAY_DOCKER_IMAGE", None) if not docker_image: # If still no docker image, use one according to Python version mj = sys.version_info.major mi = sys.version_info.minor docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi) self._docker_image = docker_image with open(self._base_config_file, "rt") as f: cluster_config = yaml.safe_load(f) if self._partial_config: deep_update(cluster_config, self._partial_config, new_keys_allowed=True) if self._docker_image: cluster_config["provider"]["image"] = self._docker_image cluster_config["provider"]["shared_volume_dir"] = self._tempdir self._cluster_config = cluster_config with open(self._config_file, "wt") as f: yaml.safe_dump(self._cluster_config, f) logging.info(f"Updated cluster config to: {self._cluster_config}") def maybe_pull_image(self): if self._docker_image: try: images_str = subprocess.check_output( f"docker image inspect {self._docker_image}", shell=True) images = json.loads(images_str) except Exception as e: logger.error( f"Error inspecting image {self._docker_image}: {e}") return if not images: try: subprocess.check_output( f"docker pull {self._docker_image}", shell=True) except Exception as e: logger.error( f"Error pulling image {self._docker_image}: {e}") def setup(self): """Setup docker compose cluster environment. Creates the temporary directory, writes the initial config file, and pulls the docker image, if required. """ self._tempdir = tempfile.mkdtemp( dir=os.environ.get("RAY_TEMPDIR", None)) os.chmod(self._tempdir, 0o777) self._config_file = os.path.join(self._tempdir, "cluster.yaml") self._nodes_file = os.path.join(self._tempdir, "nodes.json") self._status_file = os.path.join(self._tempdir, "status.json") self.update_config() self.maybe_pull_image() def teardown(self, keep_dir: bool = False): """Tear down docker compose cluster environment. Args: keep_dir: If True, cluster directory will not be removed after termination. """ if not keep_dir: shutil.rmtree(self._tempdir) self._tempdir = None self._config_file = None def _start_monitor(self): self._monitor_process = subprocess.Popen( ["python", self._monitor_script, self.config_file]) time.sleep(2) def _stop_monitor(self): if self._monitor_process: self._monitor_process.wait(timeout=30) if self._monitor_process.poll() is None: self._monitor_process.terminate() self._monitor_process = None def start(self): """Start docker compose cluster. Starts the monitor process and runs ``ray up``. """ self._start_monitor() subprocess.check_output( f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True) def stop(self): """Stop docker compose cluster. Runs ``ray down`` and stops the monitor process. """ if ray.is_initialized: ray.shutdown() subprocess.check_output( f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True) self._stop_monitor() self._execution_event.set() def _update_nodes(self): with open(self._nodes_file, "rt") as f: self._nodes = json.load(f) def _update_status(self): with open(self._status_file, "rt") as f: self._status = json.load(f) def _get_node( self, node_id: Optional[str] = None, num: Optional[int] = None, rand: Optional[str] = None, ) -> str: self._update_nodes() if node_id: assert (not num and not rand ), "Only provide either `node_id`, `num`, or `random`." 
        elif num:
            assert (not node_id and not rand
                    ), "Only provide either `node_id`, `num`, or `rand`."
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (not node_id and not num
                    ), "Only provide either `node_id`, `num`, or `rand`."
            assert rand in [
                "worker",
                "any",
            ], "`rand` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                # Exclude the head node so only workers can be chosen.
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000")
            # Else: any node.
            node_id = random.choice(choices)

        assert node_id in self._nodes, (
            f"Node with ID {node_id} is not in active nodes.")
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None
        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill a node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct the node_id from this number
        and kill that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a
        random node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_output(f"docker kill {container}", shell=True)
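# Usage sketch for DockerCluster in a test (illustrative; assumes docker
# and docker-compose are available and that `example_docker.yaml` ships
# next to this module).

def demo_docker_cluster():
    cluster = DockerCluster()
    cluster.setup()                        # temp dir, cluster.yaml, image pull
    try:
        cluster.start()                    # `ray up` on the fake cluster
        cluster.connect(client=True, timeout=120)
        cluster.kill_node(rand="worker")   # fault injection on a random worker
    finally:
        cluster.stop()                     # `ray down`, stop monitor
        cluster.teardown()                 # remove the temp dir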
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith(
                "gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(
            int(_get_max_node_cpus() or 1), int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Create queue for communication from the workers to the caller.
    # The queue is always created, even if no callbacks are used.
    queue = Queue()

    # Create remote actors.
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers.
    wait_load = []
    for actor in actors:
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start the Rabit tracker for distributed communication.
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train.
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready.
        ray.get(fut)
    # The inner loop should catch all exceptions.
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]

    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
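# Illustrative invocation of the training routine above (a sketch; it
# assumes an xgboost_ray-style RayDMatrix and that extra keyword arguments
# such as `num_boost_round` are forwarded through **kwargs to the actors'
# train() calls).

def demo_train():
    import numpy as np

    # Tiny synthetic binary classification dataset.
    x = np.random.rand(128, 4)
    y = (x[:, 0] > 0.5).astype(int)
    dtrain = RayDMatrix(x, y)

    params = {"objective": "binary:logistic", "eval_metric": "logloss"}
    bst, evals_result, extra = _train(
        params, dtrain, num_actors=2, cpus_per_actor=1, num_boost_round=10)
    bst.save_model("model.xgb")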