def _create():
    """Create the conda environment for ``uri`` and return its size in bytes.

    ``self``, ``uri``, ``runtime_env`` and ``logger`` are read from the
    enclosing scope. The conda spec is written to a temporary
    ``environment.yml`` inside ``self._resources_dir``, used to create the
    environment, and removed again whether or not creation succeeded.

    Returns:
        The value of ``get_directory_size_bytes`` for the environment path.
    """
    logger.debug("Setting up conda for runtime_env: "
                 f"{runtime_env.serialize()}")
    _, env_hash = parse_uri(uri)
    conda_env_name = self._get_path_from_hash(env_hash)

    conda_dict = _get_conda_dict_with_ray_inserted(runtime_env, logger=logger)

    logger.info(f"Setting up conda environment with {runtime_env}")
    with FileLock(self._installs_and_deletions_file_lock):
        # Bind the path before entering ``try`` so the cleanup in ``finally``
        # can never hit an unbound name.
        conda_yaml_file = os.path.join(self._resources_dir, "environment.yml")
        try:
            with open(conda_yaml_file, "w") as file:
                yaml.dump(conda_dict, file)
            create_conda_env_if_needed(
                conda_yaml_file, prefix=conda_env_name, logger=logger)
        finally:
            # If ``open`` itself failed the file was never created; removing
            # it would raise FileNotFoundError and hide the real error.
            try:
                os.remove(conda_yaml_file)
            except FileNotFoundError:
                pass

        # The extension value is compared against the string "True".
        if runtime_env.get_extension("_inject_current_ray") == "True":
            _inject_ray_to_conda_site(conda_path=conda_env_name, logger=logger)
    logger.info(
        f"Finished creating conda environment at {conda_env_name}")
    return get_directory_size_bytes(conda_env_name)
def delete_uri(self,
               uri: str,
               logger: Optional[logging.Logger] = default_logger) -> int:
    """Remove the pip environment behind ``uri``.

    Any create task still registered for the URI's hash is cancelled first,
    and the URI's entry in ``self._create_locks`` is dropped.

    Returns:
        The size in bytes measured before deletion, or 0 when removing the
        directory failed with ``OSError``.

    Raises:
        ValueError: If the URI's protocol is not ``Protocol.PIP``.
    """
    logger.info("Got request to delete pip URI %s", uri)
    protocol, uri_hash = parse_uri(uri)
    if protocol != Protocol.PIP:
        raise ValueError("PipManager can only delete URIs with protocol "
                         f"pip. Received protocol {protocol}, URI {uri}")

    # Stop an in-flight create for this hash, if there is one.
    pending = self._creating_task.pop(uri_hash, None)
    if pending is not None:
        pending.cancel()

    pip_env_path = self._get_path_from_hash(uri_hash)
    # Measure first: after rmtree there is nothing left to measure.
    size_before_delete = get_directory_size_bytes(pip_env_path)
    del self._create_locks[uri]

    try:
        shutil.rmtree(pip_env_path)
    except OSError as e:
        logger.warning(
            f"Error when deleting pip env {pip_env_path}: {str(e)}")
        return 0
    else:
        return size_before_delete
def test_hit_cache_size_limit(self, start_cluster, URI_cache_10_MB):
    """Test eviction happens when we exceed a nonzero (10MB) cache size."""
    NUM_NODES = 3
    MIB = 1024 * 1024
    cluster, address = start_cluster

    # The head node already exists, so only the remaining nodes are added.
    for i in range(NUM_NODES - 1):
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")

    with tempfile.TemporaryDirectory() as tmp_dir, chdir(tmp_dir):
        # First working_dir payload: 8 MiB of random bytes.
        with open("test_file_1", "wb") as first_file:
            first_file.write(os.urandom(8 * MIB))

        ray.init(address, runtime_env={"working_dir": tmp_dir})

        @ray.remote
        def f():
            pass

        ray.get(f.remote())
        ray.shutdown()

        # Second working_dir payload: swap the 8 MiB file for a 4 MiB one.
        with open("test_file_2", "wb") as second_file:
            second_file.write(os.urandom(4 * MIB))
        os.remove("test_file_1")

        ray.init(address, runtime_env={"working_dir": tmp_dir})

        # Without the cache size limit, we would expect the local dir to be
        # 12 MB. Since we do have a size limit, the first package must be
        # GC'ed, leaving us with 4 MB. Sleep to give time for deletion.
        time.sleep(5)
        for node in cluster.list_all_nodes():
            local_dir = os.path.join(node.get_runtime_env_dir_path(),
                                     "working_dir_files")
            size_in_mib = get_directory_size_bytes(local_dir) / MIB
            assert 3 < size_in_mib < 5
def create(
    self,
    uri: str,
    runtime_env: RuntimeEnv,
    context: RuntimeEnvContext,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Install the runtime env's pip packages into the directory for ``uri``.

    Args:
        uri: URI whose hash selects the target directory.
        runtime_env: Supplies the list of pip packages to install.
        context: Unused by this method.
        logger: Logger used for progress messages.

    Returns:
        The value of ``get_directory_size_bytes`` for the target directory.
    """
    logger.debug("Setting up pip for runtime_env: "
                 f"{runtime_env.serialize()}")
    _, uri_hash = parse_uri(uri)
    target_dir = self._get_path_from_hash(uri_hash)

    pip_packages: List[str] = runtime_env.pip_packages()
    with FileLock(self._installs_and_deletions_file_lock):
        _install_pip_list_to_dir(pip_packages, target_dir, logger=logger)

        # Despite Ray being removed from the input pip list during
        # validation, other packages in the pip list (for example,
        # xgboost_ray) may themselves include Ray as a dependency. In this
        # case, we will have inadvertently installed the latest Ray version
        # in the target_dir, which may cause Ray version mismatch issues.
        # Uninstall it here, if it exists, to make the workers use the Ray
        # that is already installed in the cluster.
        #
        # In the case where the user explicitly wants to include Ray in
        # their pip list (and signals this by setting the environment
        # variable below) then we don't want this deletion logic, so we
        # skip it.
        #
        # Environment values are always strings (or None when unset), so the
        # flag must be compared with "1"; comparing with the integer 1 is
        # always unequal and would make the opt-out impossible.
        if os.environ.get(RAY_RUNTIME_ENV_ALLOW_RAY_IN_PIP) != "1":
            ray_path = Path(target_dir) / "ray"
            # is_dir() is already False for a path that does not exist.
            if ray_path.is_dir():
                shutil.rmtree(ray_path)
    return get_directory_size_bytes(target_dir)
def test_hit_cache_size_limit(self, start_cluster, URI_cache_10_MB,
                              disable_temporary_uri_pinning):
    """Test eviction happens when we exceed a nonzero (10MB) cache size."""
    NUM_NODES = 3
    MIB = 1024 * 1024
    cluster, address = start_cluster

    # The head node already exists, so only the remaining nodes are added.
    for i in range(NUM_NODES - 1):
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")
        print(
            f'Added node with runtime_env_dir_name "node_{i}_runtime_resources".'
        )
    print(f"Added all {NUM_NODES} nodes.")

    with tempfile.TemporaryDirectory() as tmp_dir, chdir(tmp_dir):
        print("Entered tempfile context manager.")

        # First working_dir payload: 8 MiB of random bytes.
        with open("test_file_1", "wb") as first_file:
            first_file.write(os.urandom(8 * MIB))
        print('Wrote random bytes to "test_file_1" file.')

        ray.init(address, runtime_env={"working_dir": tmp_dir})
        print(f'Initialized Ray at "{address}" with working_dir.')

        @ray.remote
        def f():
            pass

        ray.get(f.remote())
        print('Created and received response from task "f".')

        ray.shutdown()
        print("Ray has been shut down.")

        # Second working_dir payload: swap the 8 MiB file for a 4 MiB one.
        with open("test_file_2", "wb") as second_file:
            second_file.write(os.urandom(4 * MIB))
        print('Wrote random bytes to "test_file_2".')

        os.remove("test_file_1")
        print('Removed "test_file_1".')

        ray.init(address, runtime_env={"working_dir": tmp_dir})
        print(
            f'Reinitialized Ray at address "{address}" with working_dir.')

        # Without the cache size limit, we would expect the local dir to be
        # 12 MB. Since we do have a size limit, the first package must be
        # GC'ed, leaving us with 4 MB. Sleep to give time for deletion.
        time.sleep(5)
        print("Slept for 5 seconds.")

        for idx, node in enumerate(cluster.list_all_nodes()):
            local_dir = os.path.join(node.get_runtime_env_dir_path(),
                                     "working_dir_files")
            print("Created local_dir path.")
            size_in_mib = get_directory_size_bytes(local_dir) / MIB
            assert 3 < size_in_mib < 5
            print(f"get_directory_size_bytes assertion {idx} passed.")
def _create():
    """Fetch the module for ``uri`` and return the size of its directory.

    ``self``, ``uri`` and ``logger`` are read from the enclosing scope. Jar
    URIs go through ``self._download_jars``; everything else is downloaded
    and unpacked into ``self._resources_dir``.
    """
    if not is_jar_uri(uri):
        unpacked_dir = download_and_unpack_package(
            uri, self._resources_dir, logger=logger
        )
        return get_directory_size_bytes(unpacked_dir)

    jar_dir = self._download_jars(uri=uri, logger=logger)
    return get_directory_size_bytes(jar_dir)
async def create(
    self,
    uri: str,
    runtime_env: dict,
    context: RuntimeEnvContext,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Download and unpack the package for ``uri``; return its size in bytes.

    ``runtime_env`` and ``context`` are accepted but not used here.
    """
    # The package is unpacked under this manager's resources directory.
    return get_directory_size_bytes(
        download_and_unpack_package(uri, self._resources_dir, logger=logger)
    )
def delete_uri(self,
               uri: str,
               logger: Optional[logging.Logger] = default_logger) -> int:
    """Delete the local package for ``uri``.

    Returns:
        The directory size in bytes measured before deletion, or 0 when
        ``delete_package`` reports that nothing was deleted.
    """
    package_dir = get_local_dir_from_uri(uri, self._resources_dir)
    # Measure before deleting; the directory is gone afterwards.
    size_before_delete = get_directory_size_bytes(package_dir)

    if delete_package(uri, self._resources_dir):
        return size_before_delete

    logger.warning(f"Tried to delete nonexistent URI: {uri}.")
    return 0
async def create(
    self,
    uri: str,
    runtime_env: "RuntimeEnv",  # noqa: F821
    context: RuntimeEnvContext,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Fetch the module for ``uri`` and return the size of its directory.

    Jar URIs are handled by ``self._download_jars``; any other URI is
    downloaded and unpacked into ``self._resources_dir``. ``runtime_env``
    and ``context`` are accepted but not used here.
    """
    if not is_jar_uri(uri):
        unpacked_dir = await download_and_unpack_package(
            uri, self._resources_dir, self._gcs_aio_client, logger=logger)
        return get_directory_size_bytes(unpacked_dir)

    jar_dir = await self._download_jars(uri=uri, logger=logger)
    return get_directory_size_bytes(jar_dir)
async def create(
    self,
    uri: str,
    runtime_env: RuntimeEnv,
    context: RuntimeEnvContext,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Run the pip setup for ``uri`` and return the target directory's size.

    Returns 0 immediately when ``runtime_env.has_pip()`` is false.
    ``context`` is accepted but not used here.
    """
    if runtime_env.has_pip():
        _, uri_hash = parse_uri(uri)
        target_dir = self._get_path_from_hash(uri_hash)

        # Run the processor while holding the installs/deletions file lock.
        with FileLock(self._installs_and_deletions_file_lock):
            PipProcessor(target_dir, runtime_env, logger).run()

        return get_directory_size_bytes(target_dir)

    return 0
def delete_uri(self,
               uri: str,
               logger: Optional[logging.Logger] = default_logger) -> int:
    """Delete the conda environment behind ``uri``.

    Returns:
        The directory size in bytes measured before deletion, or 0 when
        ``delete_conda_env`` reports failure.

    Raises:
        ValueError: If the URI's protocol is not ``Protocol.CONDA``.
    """
    logger.info(f"Got request to delete URI {uri}")
    protocol, uri_hash = parse_uri(uri)
    if protocol != Protocol.CONDA:
        raise ValueError(
            "CondaManager can only delete URIs with protocol "
            f"conda. Received protocol {protocol}, URI {uri}")

    conda_env_path = self._get_path_from_hash(uri_hash)
    # Measure before deleting; the directory is gone afterwards.
    size_before_delete = get_directory_size_bytes(conda_env_path)

    with FileLock(self._installs_and_deletions_file_lock):
        deleted = delete_conda_env(prefix=conda_env_path, logger=logger)

    if deleted:
        return size_before_delete

    logger.warning(f"Error when deleting conda env {conda_env_path}. ")
    return 0
def delete_uri(self,
               uri: str,
               logger: Optional[logging.Logger] = default_logger) -> int:
    """Delete the pip environment behind ``uri``.

    Returns:
        The directory size in bytes measured before deletion, or 0 when an
        ``OSError`` was raised while removing it.

    Raises:
        ValueError: If the URI's protocol is not ``Protocol.PIP``.
    """
    logger.info(f"Got request to delete pip URI {uri}")
    protocol, uri_hash = parse_uri(uri)
    if protocol != Protocol.PIP:
        raise ValueError("PipManager can only delete URIs with protocol "
                         f"pip. Received protocol {protocol}, URI {uri}")

    pip_env_path = self._get_path_from_hash(uri_hash)
    # Measure before deleting; the directory is gone afterwards.
    size_before_delete = get_directory_size_bytes(pip_env_path)

    try:
        # Removal happens while holding the installs/deletions file lock.
        with FileLock(self._installs_and_deletions_file_lock):
            shutil.rmtree(pip_env_path)
    except OSError as e:
        logger.warning(
            f"Error when deleting pip env {pip_env_path}: {str(e)}")
        return 0
    else:
        return size_before_delete
async def test_create_delete_size_equal(tmpdir, ray_start_regular):
    """Tests that `create` and `delete_uri` return the same size for a URI."""
    # Build a small, nonempty directory that can be uploaded as a package.
    dir_to_upload = Path(tmpdir) / "dir_to_upload"
    dir_to_upload.mkdir(parents=True)
    with (dir_to_upload / "file").open("w") as payload:
        payload.write("F" * 100)

    uri = get_uri_for_directory(dir_to_upload)
    assert get_directory_size_bytes(dir_to_upload) > 0

    assert upload_package_if_needed(uri, tmpdir, dir_to_upload)

    manager = WorkingDirManager(tmpdir)
    size_on_create = await manager.create(uri, {}, RuntimeEnvContext())
    size_on_delete = manager.delete_uri(uri)
    assert size_on_create == size_on_delete
def _create():
    """Download and unpack ``uri``; return the unpacked directory's size.

    ``self``, ``uri`` and ``logger`` are read from the enclosing scope.
    """
    return get_directory_size_bytes(
        download_and_unpack_package(uri, self._resources_dir, logger=logger)
    )
def local_dir_size_near_4mb():
    """Return True when ``local_dir`` is strictly between 3 and 5 MiB.

    ``local_dir`` is read from the enclosing scope.
    """
    size_in_mib = get_directory_size_bytes(local_dir) / (1024 ** 2)
    return 3 < size_in_mib < 5