def start_specific_server(self, client_id: str,
                          job_config: JobConfig) -> bool:
    """Launch the RayClient server reserved for ``client_id``.

    The server's runtime environment context (env vars and working_dir)
    is prepared in this process before the server is spawned.  On
    non-Windows platforms the call then polls until the spawned process
    either exits or its command line matches the real client server.

    Returns:
        True if the spawned process is still alive when this returns.
    """
    specific_server = self._get_server_for_client(client_id)
    assert specific_server, f"Server has not been created for: {client_id}"

    log_name = f"ray_client_server_{specific_server.port}"
    output, error = self.node.get_log_file_handles(log_name, unique=True)

    serialized_runtime_env = job_config.get_serialized_runtime_env()
    runtime_env = json.loads(serialized_runtime_env)

    # Prepare the working_dir for the server here.
    # TODO(edoakes): this should go be unified with the worker setup code
    # by going through the runtime_env agent.
    context = RuntimeEnvContext(
        env_vars=runtime_env.get("env_vars"),
        resources_dir=self.node.get_runtime_env_dir_path())
    working_dir_pkg.setup_working_dir(runtime_env, context)

    proc = start_ray_client_server(
        self.redis_address,
        specific_server.port,
        stdout_file=output,
        stderr_file=error,
        fate_share=self.fate_share,
        server_type="specific-server",
        serialized_runtime_env=serialized_runtime_env,
        serialized_runtime_env_context=context.serialize(),
        redis_password=self._redis_password)

    # The launched process starts out as a shim; wait until it has turned
    # into the actual RayClient Server (or has died).
    pid = proc.process.pid
    # `psutil` is deliberately not used on Win32, so no waiting happens there.
    watched = psutil.Process(pid) if sys.platform != "win32" else None
    while watched is not None:
        if proc.process.poll() is not None:
            logger.error(
                f"SpecificServer startup failed for client: {client_id}")
            break
        if _match_running_client_server(watched.cmdline()):
            break
        logger.debug(
            "Waiting for Process to reach the actual client server.")
        time.sleep(0.5)

    specific_server.set_result(proc)
    logger.info(f"SpecificServer started on port: {specific_server.port} "
                f"with PID: {pid} for client: {client_id}")
    still_running = proc.process.poll() is None
    return still_running
def setup_worker(input_args):
    """Build the shell command for a worker and replace this process with it.

    Parses the runtime-env arguments out of ``input_args``, optionally sets
    up conda/pip, assembles a ``bash -c`` command (``cd`` into the working
    dir, conda activation, then ``exec <python> <remaining args>``), exports
    the runtime env's ``env_vars`` into ``os.environ`` and finally calls
    ``os.execvp``; on success this function therefore does not return.

    Args:
        input_args: Argument list handed to ``parser.parse_known_args``.
    """
    # remaining_args contains the arguments to the original worker command,
    # minus the python executable, e.g. default_worker.py --node-ip-address=...
    args, remaining_args = parser.parse_known_args(args=input_args)

    commands = []
    py_executable: str = sys.executable
    runtime_env: dict = json.loads(args.serialized_runtime_env or "{}")
    runtime_env_context: RuntimeEnvContext = None
    if args.serialized_runtime_env_context:
        runtime_env_context = RuntimeEnvContext.deserialize(
            args.serialized_runtime_env_context)

    # Ray client server setups runtime env by itself instead of agent.
    if runtime_env.get("conda") or runtime_env.get("pip"):
        if not args.serialized_runtime_env_context:
            runtime_env_context = RuntimeEnvContext(args.session_dir)
            setup_conda_or_pip(runtime_env, runtime_env_context, logger=logger)

    if runtime_env_context and runtime_env_context.working_dir is not None:
        commands += [f"cd {runtime_env_context.working_dir}"]

        # Insert the working_dir as the first entry in PYTHONPATH. This is
        # compatible with users providing their own PYTHONPATH in env_vars.
        env_vars = runtime_env.get("env_vars", None) or {}
        python_path = runtime_env_context.working_dir
        if "PYTHONPATH" in env_vars:
            # The user's PYTHONPATH lives in env_vars, not at the top level
            # of runtime_env; reading runtime_env["PYTHONPATH"] would raise
            # KeyError.
            python_path += os.pathsep + env_vars["PYTHONPATH"]
        env_vars["PYTHONPATH"] = python_path
        runtime_env["env_vars"] = env_vars

    # Add a conda activate command prefix if using a conda env.
    if runtime_env_context and runtime_env_context.conda_env_name is not None:
        py_executable = "python"
        conda_activate_commands = get_conda_activate_commands(
            runtime_env_context.conda_env_name)
        if conda_activate_commands:
            commands += conda_activate_commands
    elif runtime_env.get("conda"):
        logger.warning(
            "Conda env name is not found in context, "
            "but conda exists in runtime env. The runtime env %s, "
            "the context %s.", args.serialized_runtime_env,
            args.serialized_runtime_env_context)

    commands += [" ".join([f"exec {py_executable}"] + remaining_args)]
    command_str = " && ".join(commands)

    # update env vars
    if runtime_env.get("env_vars"):
        env_vars = runtime_env["env_vars"]
        os.environ.update(env_vars)
    os.execvp("bash", ["bash", "-c", command_str])
def setup_working_dir(runtime_env: dict,
                      context: RuntimeEnvContext,
                      logger: Optional[logging.Logger] = None):
    """Materialize the working_dir for ``runtime_env`` and record it in ``context``.

    Does nothing when ``runtime_env`` has no (truthy) ``"uris"`` entry.
    Otherwise the URIs are passed to ``ensure_runtime_env_setup`` and the
    resulting directory is (a) appended to ``context.command_prefix`` as a
    ``cd`` command and (b) put at the front of ``PYTHONPATH`` in
    ``context.env_vars``.

    Args:
        runtime_env: Runtime environment dict; only ``"uris"`` is read here.
        context: Context that is mutated in place; ``context.resources_dir``
            must not be None.
        logger: Optional logger that replaces the module-wide ``_logger``
            for the duration of the call.
    """
    if not runtime_env.get("uris"):
        return

    # Overwrite the module-wide logger and PKG_DIR temporarily.
    # TODO(edoakes): we should be able to remove this by refactoring the
    # working_dir setup code into a class instead of using global vars.
    global _logger, PKG_DIR
    # Validate before touching any global so a failed assert leaves the
    # module state untouched.
    assert context.resources_dir is not None
    if logger:
        prev_logger = _logger
        _logger = logger
    prev_pkg_dir = PKG_DIR
    PKG_DIR = context.resources_dir

    try:
        working_dir = ensure_runtime_env_setup(runtime_env["uris"])
        context.command_prefix += [f"cd {working_dir}"]

        # Insert the working_dir as the first entry in PYTHONPATH. This is
        # compatible with users providing their own PYTHONPATH in env_vars.
        python_path = working_dir
        if "PYTHONPATH" in context.env_vars:
            python_path += os.pathsep + context.env_vars["PYTHONPATH"]
        context.env_vars["PYTHONPATH"] = python_path
    finally:
        # Always restore the globals, even if setup raised; otherwise later
        # callers would keep using this call's resources_dir and logger.
        PKG_DIR = prev_pkg_dir
        if logger:
            _logger = prev_logger
def run_setup_with_logger():
    """Set up the runtime env for one request and return its context.

    Reads ``serialized_runtime_env``,
    ``serialized_allocated_resource_instances``, ``request`` and ``self``
    from the enclosing scope.  Runs the conda and working_dir managers with
    a per-job logger and records which serialized env each URI belongs to.

    Returns:
        The ``RuntimeEnvContext`` populated by the managers.
    """
    runtime_env: dict = json.loads(serialized_runtime_env or "{}")
    allocated_resource: dict = json.loads(
        serialized_allocated_resource_instances or "{}")

    # Use a separate logger for each job.
    per_job_logger = self.get_or_create_logger(request.job_id)
    # TODO(chenk008): Add log about allocated_resource to
    # avoid lint error. That will be moved to cgroup plugin.
    per_job_logger.debug(f"Worker has resource :"
                         f"{allocated_resource}")
    context = RuntimeEnvContext(
        env_vars=runtime_env.get("env_vars"))
    self._conda_manager.setup(runtime_env, context, logger=per_job_logger)
    self._working_dir_manager.setup(runtime_env, context,
                                    logger=per_job_logger)

    # Add the mapping of URIs -> the serialized environment to be
    # used for cache invalidation.
    # `or []` (rather than a .get default) also covers "uris": null, which
    # would otherwise make the loop raise TypeError.
    for uri in runtime_env.get("uris") or []:
        self._working_dir_uri_to_envs[uri].add(
            serialized_runtime_env)

    return context
def run_setup_with_logger():
    """Prepare the runtime env (conda, working_dir, plugins) for a request.

    Uses ``serialized_runtime_env``, ``request`` and ``self`` from the
    enclosing scope and returns the resulting ``RuntimeEnvContext``.
    """
    env_spec: dict = json.loads(serialized_runtime_env or "{}")

    # Each job logs through its own logger.
    job_logger = self.get_or_create_logger(request.job_id)

    ctx = RuntimeEnvContext(env_vars=env_spec.get("env_vars"))
    self._conda_manager.setup(env_spec, ctx, logger=job_logger)
    self._working_dir_manager.setup(env_spec, ctx, logger=job_logger)

    # Remember which serialized environment each URI belongs to; this
    # mapping is used for cache invalidation.
    uris = env_spec.get("uris") or []
    for uri in uris:
        self._working_dir_uri_to_envs[uri].add(serialized_runtime_env)

    # Run setup function from all the plugins
    # TODO(simon): implement uri support
    uri_placeholder = "uri not implemented"
    for plugin_class_path in env_spec.get("plugins", {}).keys():
        plugin_class = import_attr(plugin_class_path)
        plugin_class.create(uri_placeholder, env_spec, ctx)
        plugin_class.modify_context(uri_placeholder, env_spec, ctx)

    return ctx
def run_setup_with_logger():
    """Run conda/pip and working_dir setup for a request; return the context.

    Uses ``serialized_runtime_env``, ``request`` and ``self`` from the
    enclosing scope.
    """
    env_spec: dict = json.loads(serialized_runtime_env or "{}")

    # Each job logs through its own logger.
    job_logger = self.get_or_create_logger(request.job_id)

    ctx = RuntimeEnvContext(self._runtime_env_dir)
    for setup_step in (setup_conda_or_pip, setup_working_dir):
        setup_step(env_spec, ctx, logger=job_logger)
    return ctx
def start_specific_server(self, client_id: str,
                          job_config: JobConfig) -> bool:
    """Launch the RayClient server reserved for ``client_id``.

    An empty runtime env (``"{}"``) gets a default serialized context;
    anything else is delegated to ``self._create_runtime_env``.  On
    non-Windows platforms the call then polls until the spawned process
    either exits or its command line matches the real client server.

    Returns:
        True if the spawned process is still alive when this returns.
    """
    specific_server = self._get_server_for_client(client_id)
    assert specific_server, f"Server has not been created for: {client_id}"

    log_name = f"ray_client_server_{specific_server.port}"
    output, error = self.node.get_log_file_handles(log_name, unique=True)

    serialized_runtime_env = job_config.get_serialized_runtime_env()
    if serialized_runtime_env != "{}":
        serialized_runtime_env_context = self._create_runtime_env(
            serialized_runtime_env=serialized_runtime_env,
            specific_server=specific_server,
        )
    else:
        serialized_runtime_env_context = RuntimeEnvContext().serialize()

    proc = start_ray_client_server(
        self.redis_address,
        specific_server.port,
        stdout_file=output,
        stderr_file=error,
        fate_share=self.fate_share,
        server_type="specific-server",
        serialized_runtime_env_context=serialized_runtime_env_context,
        redis_password=self._redis_password)

    # The launched process starts out as a shim; wait until it has turned
    # into the actual RayClient Server (or has died).
    pid = proc.process.pid
    # `psutil` is deliberately not used on Win32, so no waiting happens there.
    watched = psutil.Process(pid) if sys.platform != "win32" else None
    while watched is not None:
        if proc.process.poll() is not None:
            logger.error(
                f"SpecificServer startup failed for client: {client_id}")
            break
        if _match_running_client_server(watched.cmdline()):
            break
        logger.debug(
            "Waiting for Process to reach the actual client server.")
        time.sleep(0.5)

    specific_server.set_result(proc)
    logger.info(f"SpecificServer started on port: {specific_server.port} "
                f"with PID: {pid} for client: {client_id}")
    still_running = proc.process.poll() is None
    return still_running
def setup_worker(input_args):
    """Resolve the runtime-env context for a worker and exec the worker.

    The context is deserialized from the parsed arguments when one was
    passed; otherwise a fresh one is built from the runtime env's
    ``env_vars``.  When started from the Ray client, conda/pip setup is
    performed here before handing off to ``exec_worker``.

    Args:
        input_args: Argument list handed to ``parser.parse_known_args``.
    """
    # remaining_args holds the original worker command without the python
    # executable, e.g. default_worker.py --node-ip-address=...
    args, remaining_args = parser.parse_known_args(args=input_args)

    env_spec: dict = json.loads(args.serialized_runtime_env or "{}")

    serialized_ctx = args.serialized_runtime_env_context
    if not serialized_ctx:
        ctx = RuntimeEnvContext(env_vars=env_spec.get("env_vars"))
    else:
        ctx = RuntimeEnvContext.deserialize(serialized_ctx)

    # Ray client server setups runtime env by itself instead of agent.
    if args.from_ray_client and (env_spec.get("conda")
                                 or env_spec.get("pip")):
        setup_conda_or_pip(env_spec, ctx, logger=logger)

    ctx.exec_worker(remaining_args)
def setup_conda_or_pip(runtime_env: dict,
                       context: RuntimeEnvContext,
                       logger: Optional[logging.Logger] = None):
    """Resolve or create the conda env for ``runtime_env`` and update ``context``.

    Returns immediately when neither ``"conda"`` nor ``"pip"`` is set in
    ``runtime_env``.  If ``runtime_env["conda"]`` is a string it is used
    directly as the env name; otherwise an ``environment.yml`` is written
    under ``<context.resources_dir>/conda`` and the env name comes from
    ``get_or_create_conda_env``.  On success ``context.py_executable`` is
    set to ``"python"`` and the conda activate commands are appended to
    ``context.command_prefix``.

    Args:
        runtime_env: Runtime environment dict; the keys read here are
            ``"conda"``, ``"pip"`` and ``"_inject_current_ray"``.
        context: Context mutated in place; its ``resources_dir`` is used
            for the lock file and the conda directory.
        logger: Logger to use; defaults to ``logging.getLogger(__name__)``.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # Nothing to do unless conda or pip was requested.
    if not runtime_env.get("conda") and not runtime_env.get("pip"):
        return

    logger.debug(f"Setting up conda or pip for runtime_env: {runtime_env}")
    conda_dict = get_conda_dict(runtime_env, context.resources_dir)
    if isinstance(runtime_env.get("conda"), str):
        # A string is taken as the name of an env to activate as-is.
        conda_env_name = runtime_env["conda"]
    else:
        assert conda_dict is not None
        # Choose which extra pip dependencies get injected into the env.
        ray_pip = current_ray_pip_specifier(logger)
        if ray_pip:
            extra_pip_dependencies = [ray_pip, "ray[default]"]
        elif runtime_env.get("_inject_current_ray"):
            extra_pip_dependencies = (
                _resolve_install_from_source_ray_dependencies())
        else:
            extra_pip_dependencies = []
        conda_dict = inject_dependencies(conda_dict, _current_py_version(),
                                         extra_pip_dependencies)
        logger.info(f"Setting up conda environment with {runtime_env}")
        # It is not safe for multiple processes to install conda envs
        # concurrently, even if the envs are different, so use a global
        # lock for all conda installs.
        # See https://github.com/ray-project/ray/issues/17086
        file_lock_name = "ray-conda-install.lock"
        with FileLock(os.path.join(context.resources_dir, file_lock_name)):
            conda_dir = os.path.join(context.resources_dir, "conda")
            try_to_create_directory(conda_dir)
            conda_yaml_path = os.path.join(conda_dir, "environment.yml")
            with open(conda_yaml_path, "w") as file:
                # Sort keys because we hash based on the file contents,
                # and we don't want the hash to depend on the order
                # of the dependencies.
                yaml.dump(conda_dict, file, sort_keys=True)
            conda_env_name = get_or_create_conda_env(
                conda_yaml_path, conda_dir, logger=logger)

        # This stays inside the `else` branch: `conda_dir` is only
        # assigned on this path.
        if runtime_env.get("_inject_current_ray"):
            conda_path = os.path.join(conda_dir, conda_env_name)
            _inject_ray_to_conda_site(conda_path, logger)

    # Record on the context how the worker should be launched.
    context.py_executable = "python"
    context.command_prefix += get_conda_activate_commands(conda_env_name)
    logger.info(f"Finished setting up runtime environment at {conda_env_name}")
def setup(self,
          runtime_env: dict,
          context: RuntimeEnvContext,
          logger: Optional[logging.Logger] = default_logger):
    """Materialize the working_dir for ``runtime_env`` and record it in ``context``.

    Does nothing when ``runtime_env`` has no (truthy) ``"uris"`` entry.
    Otherwise ``context.command_prefix`` gains a ``cd`` into the working
    dir and the working dir is placed first in ``PYTHONPATH`` inside
    ``context.env_vars``.
    """
    uris = runtime_env.get("uris")
    if not uris:
        return

    working_dir = self.ensure_runtime_env_setup(uris, logger=logger)
    context.command_prefix += [f"cd {working_dir}"]

    # The working_dir goes first in PYTHONPATH; a PYTHONPATH supplied by
    # the user through env_vars is kept and appended after it.
    env_vars = context.env_vars
    if "PYTHONPATH" in env_vars:
        python_path = working_dir + os.pathsep + env_vars["PYTHONPATH"]
    else:
        python_path = working_dir
    env_vars["PYTHONPATH"] = python_path
def run_setup_with_logger():
    """Prepare the runtime env (conda, working_dir) for a request.

    Uses ``serialized_runtime_env``, ``request`` and ``self`` from the
    enclosing scope and returns the resulting ``RuntimeEnvContext``.
    """
    env_spec: dict = json.loads(serialized_runtime_env or "{}")

    # Each job logs through its own logger.
    job_logger = self.get_or_create_logger(request.job_id)

    ctx = RuntimeEnvContext(env_vars=env_spec.get("env_vars"))
    for manager in (self._conda_manager, self._working_dir_manager):
        manager.setup(env_spec, ctx, logger=job_logger)

    # Remember which serialized environment each URI belongs to; this
    # mapping is used for cache invalidation.
    uris = env_spec.get("uris") or []
    for uri in uris:
        self._working_dir_uri_to_envs[uri].add(serialized_runtime_env)

    return ctx
] container_command.append("--env") container_command.append("RAY_RAYLET_PID=" + str(os.getppid())) if container_option.get("run_options"): container_command.extend(container_option.get("run_options")) container_command.extend( parse_allocated_resource(args.allocated_instances_serialized_json)) container_command.append("--entrypoint") container_command.append("python") container_command.append(container_option.get("image")) container_command.extend(entrypoint_args) logger.warning("start worker in container: {}".format(container_command)) os.execvp(container_driver, container_command) if __name__ == "__main__": args, remaining_args = parser.parse_known_args() runtime_env: dict = json.loads(args.serialized_runtime_env or "{}") container_option = runtime_env.get("container") if container_option and container_option.get("image"): start_worker_in_container(container_option, args, remaining_args) else: # NOTE(edoakes): args.serialized_runtime_env_context is only None when # we're starting the main Ray client proxy server. That case should # probably not even go through this codepath. runtime_env_context = RuntimeEnvContext.deserialize( args.serialized_runtime_env_context or "{}") runtime_env_context.exec_worker(remaining_args)