def connect_ray_pdb(host=None, port=None, patch_stdstreams=False, quiet=None):
    """Open a remote PDB on the first available port and advertise it.

    The breakpoint is advertised through the internal KV store under a
    ``RAY_PDB_<uuid>`` key so tooling can discover it; the key is removed
    once the debugger session finishes, even if ``listen()`` raises.

    Args:
        host: Interface to bind. Defaults to $REMOTE_PDB_HOST or 127.0.0.1.
        port: Port to bind. Defaults to $REMOTE_PDB_PORT or 0 (ephemeral).
        patch_stdstreams: Whether to redirect the std streams into the
            debugger socket.
        quiet: Suppress banner output. Defaults from $REMOTE_PDB_QUIET.

    Returns:
        The RemotePdb instance after the debugger session has finished.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        # NOTE: any non-empty env value (even "0") enables quiet mode.
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    rdb = RemotePdb(
        host=host, port=port, patch_stdstreams=patch_stdstreams, quiet=quiet)
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(sockname[0], sockname[1])
    # Frame index 2 is the caller's caller, i.e. the user's breakpoint site.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info()))
    }
    breakpoint_uuid = uuid.uuid4()
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(kv_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Always delete the advertisement — previously an exception from
        # listen() would leave a stale breakpoint entry in the KV store.
        _internal_kv_del(kv_key)
    return rdb
def connect_ray_pdb(
    host=None,
    port=None,
    patch_stdstreams=False,
    quiet=None,
    breakpoint_uuid=None,
    debugger_external=False,
):
    """Open a remote PDB on the first available port and advertise it.

    The breakpoint is advertised through the internal KV store (PDB
    namespace) under a ``RAY_PDB_<uuid>`` key; the key is removed once the
    debugger session finishes, even if ``listen()`` raises.

    Args:
        host: Interface to bind. Defaults to $REMOTE_PDB_HOST or 127.0.0.1;
            forced to 0.0.0.0 when ``debugger_external`` is set.
        port: Port to bind. Defaults to $REMOTE_PDB_PORT or 0 (ephemeral).
        patch_stdstreams: Whether to redirect the std streams into the
            debugger socket.
        quiet: Suppress banner output. Defaults from $REMOTE_PDB_QUIET.
        breakpoint_uuid: Reuse an existing breakpoint id; a fresh uuid4 hex
            is generated when falsy.
        debugger_external: If True, bind on all interfaces and advertise the
            node's IP so clients outside the node can attach.

    Returns:
        The RemotePdb instance after the debugger session has finished.
    """
    if debugger_external:
        assert not host, "Cannot specify both host and debugger_external"
        # Bind on all interfaces so external clients can attach.
        host = "0.0.0.0"
    elif host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        # NOTE: any non-empty env value (even "0") enables quiet mode.
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex
    if debugger_external:
        ip_address = ray.worker.global_worker.node_ip_address
    else:
        ip_address = "localhost"
    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        ip_address=ip_address,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet,
    )
    sockname = rdb._listen_socket.getsockname()
    # Advertise the reachable address (not the bind address, which may be
    # the 0.0.0.0 wildcard).
    pdb_address = "{}:{}".format(ip_address, sockname[1])
    # Frame index 2 is the caller's caller, i.e. the user's breakpoint site.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(
        kv_key,
        json.dumps(data),
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_PDB,
    )
    try:
        rdb.listen()
    finally:
        # Always delete the advertisement — previously an exception from
        # listen() would leave a stale breakpoint entry in the KV store.
        _internal_kv_del(kv_key, namespace=ray_constants.KV_NAMESPACE_PDB)
    return rdb
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    """SPREAD-scheduled tasks should land on distinct worker nodes."""
    cluster = ray_start_cluster
    # Head node: no CPUs, aggressive spread threshold.
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.worker.global_worker.current_node_id

        # Discover each worker node's id via its unique custom resource.
        worker_node_ids = set()
        for i in range(2):
            ref = get_node_id.options(resources={f"foo:{i}": 1}).remote()
            worker_node_ids.add(ray.get(ref))
        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.worker.global_worker.current_node_id

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.worker.global_worker.current_node_id

        # task1 blocks until its KV marker is removed, pinning one node.
        locations = [task1.remote()]
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for updating driver raylet's resource view.
        time.sleep(5)
        locations.append(
            task2.options(
                scheduling_strategy=SPREAD_SCHEDULING_STRATEGY).remote())
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        # Release both tasks and check they ran on different workers.
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids
def run(self):
    """Entry point: install termination handlers, then start the monitor.

    Clears any stale autoscaling error from the KV store before
    initializing; failures are recorded via ``_handle_failure`` and
    re-raised.
    """
    # Register signal handlers for autoscaler termination.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, self._signal_handler)
    try:
        if _internal_kv_initialized():
            # Delete any previous autoscaling errors.
            _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
        self._initialize_autoscaler()
        self._run()
    except Exception:
        self._handle_failure(traceback.format_exc())
        raise
def test_create_upload_once(self, tmp_path, random_dir, ray_start_regular):
    """A package uploads once; deleting its KV entry forces a re-upload."""
    uri = get_uri_for_directory(random_dir)

    # First call: the package is new, so it is uploaded.
    assert upload_package_if_needed(uri, tmp_path, random_dir)
    assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

    # Second call: already present, so nothing is uploaded.
    assert not upload_package_if_needed(uri, tmp_path, random_dir)
    assert _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

    # Delete the URI from the internal_kv. This should trigger re-upload.
    _internal_kv_del(uri, namespace=KV_NAMESPACE_PACKAGE)
    assert not _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)
    assert upload_package_if_needed(uri, tmp_path, random_dir)
def test_create_upload_once(self, empty_dir, random_dir, ray_start_regular):
    """A package uploads once; deleting its KV entry forces a re-upload."""
    uri = get_uri_for_directory(random_dir)

    # First call: the package is new, so it is uploaded.
    assert upload_package_if_needed(uri, empty_dir, random_dir)
    assert _internal_kv_exists(uri)

    # Second call: already present, so nothing is uploaded.
    assert not upload_package_if_needed(uri, empty_dir, random_dir)
    assert _internal_kv_exists(uri)

    # Delete the URI from the internal_kv. This should trigger re-upload.
    _internal_kv_del(uri)
    assert not _internal_kv_exists(uri)
    assert upload_package_if_needed(uri, empty_dir, random_dir)
def delete(self, key):
    """Delete the value associated with the given key from the store.

    Args:
        key (str): Key to delete.

    Raises:
        TypeError: If ``key`` is not a string.
    """
    if isinstance(key, str):
        return ray_kv._internal_kv_del(self._format_key(key))
    raise TypeError("key must be a string, got: {}.".format(type(key)))
def delete(self, key: str):
    """Delete the value associated with the given key from the store.

    The key is resolved through ``get_storage_key`` and removed from the
    Serve KV namespace.

    Args:
        key (str): Key to delete.

    Raises:
        TypeError: If ``key`` is not a string.
    """
    if isinstance(key, str):
        return ray_kv._internal_kv_del(
            self.get_storage_key(key),
            namespace=ray_constants.KV_NAMESPACE_SERVE)
    raise TypeError("key must be a string, got: {}.".format(type(key)))
def test_internal_kv(ray_start_regular):
    """Exercise the internal KV API: get/put/del/list, with namespaces."""
    import ray.experimental.internal_kv as kv

    ns = "n"

    # Default namespace: put returns False on insert, True if key existed.
    assert kv._internal_kv_get("k1") is None
    assert kv._internal_kv_put("k1", "v1") is False
    assert kv._internal_kv_put("k1", "v1") is True
    assert kv._internal_kv_get("k1") == b"v1"

    # Namespaced keys are independent of the default namespace.
    assert kv._internal_kv_get("k1", namespace=ns) is None
    assert kv._internal_kv_put("k1", "v1", namespace=ns) is False
    assert kv._internal_kv_put("k1", "v1", namespace=ns) is True
    assert kv._internal_kv_put("k1", "v2", True, namespace=ns) is True
    assert kv._internal_kv_get("k1", namespace=ns) == b"v2"

    # del returns the number of keys removed.
    assert kv._internal_kv_del("k1") == 1
    assert kv._internal_kv_del("k1") == 0
    assert kv._internal_kv_get("k1") is None

    # Prefix listing and prefix deletion inside a namespace.
    assert kv._internal_kv_put("k2", "v2", namespace=ns) is False
    assert kv._internal_kv_put("k3", "v3", namespace=ns) is False
    assert set(kv._internal_kv_list("k", namespace=ns)) == \
        {b"k1", b"k2", b"k3"}
    assert kv._internal_kv_del("k", del_by_prefix=True, namespace=ns) == 3
    assert kv._internal_kv_del("x", del_by_prefix=True, namespace=ns) == 0
    for key in ("k1", "k2", "k3"):
        assert kv._internal_kv_get(key, namespace=ns) is None

    # Keys with the reserved "@namespace_" prefix are rejected.
    with pytest.raises(RuntimeError):
        kv._internal_kv_put("@namespace_", "x", True)
    with pytest.raises(RuntimeError):
        kv._internal_kv_get("@namespace_", namespace=ns)
    with pytest.raises(RuntimeError):
        kv._internal_kv_del("@namespace_def", namespace=ns)
    with pytest.raises(RuntimeError):
        kv._internal_kv_list("@namespace_abc", namespace=ns)
def del_keys(self, keys: list):
    """Delete every key in *keys* from the internal KV store.

    Short-circuits on the first failed deletion, leaving later keys
    untouched (matching the original early-return behavior).

    Args:
        keys (list): Keys to delete.

    Returns:
        bool: True iff all deletions succeeded (True for an empty list).
    """
    return all(internal_kv._internal_kv_del(key) for key in keys)
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    """"SPREAD" tasks and actors should be distributed across worker nodes."""
    cluster = ray_start_cluster
    # Head node: no CPUs, aggressive spread threshold.
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 1,
        },
    )
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        @ray.remote
        def get_node_id():
            return ray.get_runtime_context().node_id.hex()

        # Discover each worker node's id via its unique custom resource.
        worker_node_ids = set()
        for i in range(2):
            ref = get_node_id.options(resources={f"foo:{i}": 1}).remote()
            worker_node_ids.add(ray.get(ref))
        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        @ray.remote(scheduling_strategy="SPREAD")
        def task1():
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.get_runtime_context().node_id.hex()

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.get_runtime_context().node_id.hex()

        # task1 blocks until its KV marker is removed, pinning one node.
        locations = [task1.remote()]
        while not internal_kv._internal_kv_exists("test_task1"):
            time.sleep(0.1)
        # Wait for updating driver raylet's resource view.
        time.sleep(5)
        locations.append(task2.options(scheduling_strategy="SPREAD").remote())
        while not internal_kv._internal_kv_exists("test_task2"):
            time.sleep(0.1)
        # Release both tasks and check they ran on different workers.
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")
        assert set(ray.get(locations)) == worker_node_ids

        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        # Make sure actors can be spreaded as well.
        @ray.remote(num_cpus=1)
        class Actor:
            def ping(self):
                return ray.get_runtime_context().node_id.hex()

        actors = []
        actor_locations = []
        for _ in range(8):
            actor = Actor.options(scheduling_strategy="SPREAD").remote()
            actors.append(actor)
            actor_locations.append(ray.get(actor.ping.remote()))
        assert sorted(actor_locations) == sorted(list(worker_node_ids) * 4)