async def CreateRuntimeEnv(self, request, context):
    async def _setup_runtime_env(serialized_runtime_env, session_dir):
        loop = asyncio.get_event_loop()
        runtime_env: dict = json.loads(serialized_runtime_env or "{}")
        return await loop.run_in_executor(None, self._setup, runtime_env,
                                          session_dir)

    serialized_env = request.serialized_runtime_env
    if serialized_env in self._created_env_cache:
        serialized_context = self._created_env_cache[serialized_env]
        logger.info("Runtime env already created. Env: %s, context: %s",
                    serialized_env, serialized_context)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context)

    logger.info("Creating runtime env: %s.", request.serialized_runtime_env)
    runtime_env_dict = json.loads(request.serialized_runtime_env or "{}")
    uris = runtime_env_dict.get("uris")
    runtime_env_context: RuntimeEnvContext = None
    error_message = None
    for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
        try:
            if uris:
                # TODO(guyang.sgy): Try `ensure_runtime_env_setup(uris)`
                # to download packages. But we don't initialize the
                # internal KV in the agent yet.
                pass
            runtime_env_context = await _setup_runtime_env(
                request.serialized_runtime_env, self._session_dir)
            break
        except Exception as ex:
            logger.exception("Runtime env creation failed.")
            error_message = str(ex)
            await asyncio.sleep(
                runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000)
    if error_message:
        logger.error(
            "Runtime env creation failed %d times; not retrying.",
            runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
            error_message=error_message)

    serialized_context = runtime_env_context.serialize()
    self._created_env_cache[serialized_env] = serialized_context
    logger.info("Successfully created runtime env: %s, the context: %s",
                request.serialized_runtime_env, serialized_context)
    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
        serialized_runtime_env_context=serialized_context)
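# A minimal sketch (assumption, not from the source) of the agent state the
# version above relies on. The attribute names `_created_env_cache` and
# `_session_dir` come from the snippet; the constructor and the
# `dashboard_agent.session_dir` attribute are illustrative only.
class RuntimeEnvAgentStateSketch:
    def __init__(self, dashboard_agent):
        # Maps serialized runtime env -> serialized RuntimeEnvContext, so
        # repeated CreateRuntimeEnv calls for the same env return immediately.
        self._created_env_cache = {}
        # Session directory under which environments are materialized
        # (hypothetical attribute on the dashboard agent).
        self._session_dir = dashboard_agent.session_dir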
async def CreateRuntimeEnv(self, request, context):
    runtime_env_dict = json.loads(request.serialized_runtime_env or "{}")
    uris = runtime_env_dict.get("uris")
    if uris:
        logger.info("Creating runtime env with uris %s", repr(uris))
        # TODO(guyang.sgy): Try `ensure_runtime_env_setup(uris)`
        # to download packages. But we don't initialize the
        # internal KV in the agent yet.
    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
        status=agent_manager_pb2.AGENT_RPC_STATUS_OK)
async def CreateRuntimeEnv(self, request, context):
    async def _setup_runtime_env(serialized_runtime_env):
        # This function will be run inside a thread.
        def run_setup_with_logger():
            runtime_env: dict = json.loads(serialized_runtime_env or "{}")
            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            context = RuntimeEnvContext(
                env_vars=runtime_env.get("env_vars"),
                resources_dir=self._runtime_env_dir)
            setup_conda_or_pip(runtime_env, context, logger=per_job_logger)
            setup_working_dir(runtime_env, context, logger=per_job_logger)
            return context

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, run_setup_with_logger)

    serialized_env = request.serialized_runtime_env

    if serialized_env not in self._env_locks:
        # Async lock to prevent the same env from being installed
        # concurrently.
        self._env_locks[serialized_env] = asyncio.Lock()

    async with self._env_locks[serialized_env]:
        if serialized_env in self._env_cache:
            result = self._env_cache[serialized_env]
            if result.success:
                context = result.result
                logger.info("Runtime env already created successfully. "
                            f"Env: {serialized_env}, context: {context}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                    serialized_runtime_env_context=context)
            else:
                error_message = result.result
                logger.info("Runtime env already failed. "
                            f"Env: {serialized_env}, err: {error_message}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

        if SLEEP_FOR_TESTING_S:
            logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
            time.sleep(int(SLEEP_FOR_TESTING_S))

        logger.info(f"Creating runtime env: {serialized_env}")
        runtime_env_context: RuntimeEnvContext = None
        error_message = None
        for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
            try:
                runtime_env_context = await _setup_runtime_env(serialized_env)
                break
            except Exception as ex:
                logger.exception("Runtime env creation failed.")
                error_message = str(ex)
                await asyncio.sleep(
                    runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000)
        if error_message:
            logger.error(
                "Runtime env creation failed %d times; not retrying.",
                runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
            self._env_cache[serialized_env] = CreatedEnvResult(
                False, error_message)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message)

        serialized_context = runtime_env_context.serialize()
        self._env_cache[serialized_env] = CreatedEnvResult(
            True, serialized_context)
        logger.info("Successfully created runtime env: %s, the context: %s",
                    serialized_env, serialized_context)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context)
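# A minimal sketch (assumption, not from the source) of the per-env bookkeeping
# the version above introduces: a `CreatedEnvResult` that caches either a
# serialized context on success or an error message on failure, plus the
# `_env_cache` / `_env_locks` dictionaries keyed by the serialized runtime env.
# The field names mirror how the snippet uses the object; the concrete type is
# illustrative only.
from collections import namedtuple

# success: bool; result: serialized context on success, error message otherwise.
CreatedEnvResult = namedtuple("CreatedEnvResult", ["success", "result"])

def _init_env_bookkeeping(agent):
    # Hypothetical helper: in practice these would be set in __init__.
    agent._env_cache = {}   # serialized env -> CreatedEnvResult
    agent._env_locks = {}   # serialized env -> asyncio.Lock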
async def CreateRuntimeEnv(self, request, context):
    async def _setup_runtime_env(
        serialized_runtime_env, serialized_allocated_resource_instances
    ):
        # This function will be run inside a thread.
        def run_setup_with_logger():
            runtime_env = RuntimeEnv(
                serialized_runtime_env=serialized_runtime_env)
            allocated_resource: dict = json.loads(
                serialized_allocated_resource_instances or "{}"
            )

            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            # TODO(chenk008): Add log about allocated_resource to
            # avoid lint error. That will be moved to cgroup plugin.
            per_job_logger.debug(f"Worker has resource: {allocated_resource}")
            context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
            self._container_manager.setup(
                runtime_env, context, logger=per_job_logger
            )

            for (manager, uri_cache) in [
                (self._working_dir_manager, self._working_dir_uri_cache),
                (self._conda_manager, self._conda_uri_cache),
                (self._pip_manager, self._pip_uri_cache),
            ]:
                uri = manager.get_uri(runtime_env)
                if uri is not None:
                    if uri not in uri_cache:
                        per_job_logger.debug(f"Cache miss for URI {uri}.")
                        size_bytes = manager.create(
                            uri, runtime_env, context, logger=per_job_logger
                        )
                        uri_cache.add(uri, size_bytes, logger=per_job_logger)
                    else:
                        per_job_logger.debug(f"Cache hit for URI {uri}.")
                        uri_cache.mark_used(uri, logger=per_job_logger)
                manager.modify_context(uri, runtime_env, context)

            # Set up py_modules. For now, py_modules uses multiple URIs so
            # the logic is slightly different from working_dir, conda, and
            # pip above.
            py_modules_uris = self._py_modules_manager.get_uris(runtime_env)
            if py_modules_uris is not None:
                for uri in py_modules_uris:
                    if uri not in self._py_modules_uri_cache:
                        per_job_logger.debug(f"Cache miss for URI {uri}.")
                        size_bytes = self._py_modules_manager.create(
                            uri, runtime_env, context, logger=per_job_logger
                        )
                        self._py_modules_uri_cache.add(
                            uri, size_bytes, logger=per_job_logger
                        )
                    else:
                        per_job_logger.debug(f"Cache hit for URI {uri}.")
                        self._py_modules_uri_cache.mark_used(
                            uri, logger=per_job_logger
                        )
            self._py_modules_manager.modify_context(
                py_modules_uris, runtime_env, context
            )

            # Add the mapping of URIs -> the serialized environment to be
            # used for cache invalidation.
            if runtime_env.working_dir_uri():
                uri = runtime_env.working_dir_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.py_modules_uris():
                for uri in runtime_env.py_modules_uris():
                    self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.conda_uri():
                uri = runtime_env.conda_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.pip_uri():
                uri = runtime_env.pip_uri()
                self._uris_to_envs[uri].add(serialized_runtime_env)
            if runtime_env.plugin_uris():
                for uri in runtime_env.plugin_uris():
                    self._uris_to_envs[uri].add(serialized_runtime_env)

            # Run the setup function from all of the plugins.
            for plugin_class_path, config in runtime_env.plugins():
                per_job_logger.debug(
                    f"Setting up runtime env plugin {plugin_class_path}"
                )
                plugin_class = import_attr(plugin_class_path)
                # TODO(simon): implement uri support
                plugin_class.create(
                    "uri not implemented", json.loads(config), context
                )
                plugin_class.modify_context(
                    "uri not implemented", json.loads(config), context
                )

            return context

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, run_setup_with_logger)

    serialized_env = request.serialized_runtime_env

    if serialized_env not in self._env_locks:
        # Async lock to prevent the same env from being installed
        # concurrently.
        self._env_locks[serialized_env] = asyncio.Lock()

    async with self._env_locks[serialized_env]:
        if serialized_env in self._env_cache:
            result = self._env_cache[serialized_env]
            if result.success:
                context = result.result
                self._logger.info(
                    "Runtime env already created "
                    f"successfully. Env: {serialized_env}, "
                    f"context: {context}"
                )
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                    serialized_runtime_env_context=context,
                )
            else:
                error_message = result.result
                self._logger.info(
                    "Runtime env already failed. "
                    f"Env: {serialized_env}, err: {error_message}"
                )
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message,
                )

        if SLEEP_FOR_TESTING_S:
            self._logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
            time.sleep(int(SLEEP_FOR_TESTING_S))

        self._logger.info(f"Creating runtime env: {serialized_env}")
        runtime_env_context: RuntimeEnvContext = None
        error_message = None
        for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
            try:
                runtime_env_context = await _setup_runtime_env(
                    serialized_env,
                    request.serialized_allocated_resource_instances,
                )
                break
            except Exception:
                err_msg = f"Failed to create runtime env {serialized_env}."
                self._logger.exception(err_msg)
                error_message = f"{err_msg}\n{traceback.format_exc()}"
                await asyncio.sleep(
                    runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                )

        if error_message:
            self._logger.error(
                "Runtime env creation failed %d times; not retrying.",
                runtime_env_consts.RUNTIME_ENV_RETRY_TIMES,
            )
            self._env_cache[serialized_env] = CreatedEnvResult(False, error_message)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message,
            )

        serialized_context = runtime_env_context.serialize()
        self._env_cache[serialized_env] = CreatedEnvResult(True, serialized_context)
        self._logger.info(
            "Successfully created runtime env: %s, the context: %s",
            serialized_env,
            serialized_context,
        )
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context,
        )
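# A minimal sketch (assumption, not from the source) of the URI cache interface
# the per-manager loop above exercises: membership tests, `add(uri, size_bytes)`
# on a cache miss, and `mark_used(uri)` on a hit. A real cache would also evict
# entries by total size; that is omitted here.
import logging

class URICacheSketch:
    def __init__(self):
        self._entries = {}  # uri -> size_bytes

    def __contains__(self, uri):
        return uri in self._entries

    def add(self, uri, size_bytes, logger=logging.getLogger(__name__)):
        logger.debug("Caching URI %s (%d bytes).", uri, size_bytes)
        self._entries[uri] = size_bytes

    def mark_used(self, uri, logger=logging.getLogger(__name__)):
        # A real implementation would update the URI's standing in an
        # eviction policy; this sketch only logs the reuse.
        logger.debug("URI %s reused from cache.", uri)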
async def CreateRuntimeEnv(self, request, context):
    async def _setup_runtime_env(serialized_runtime_env, session_dir):
        # This function will be run inside a thread.
        def run_setup_with_logger():
            runtime_env: dict = json.loads(serialized_runtime_env or "{}")
            per_job_logger = self.get_or_create_logger(request.job_id)
            # Here we set the logger context for the setup hook execution.
            # The logger needs to be thread local because setup hooks can
            # run for arbitrary jobs in arbitrary threads.
            with using_thread_local_logger(per_job_logger):
                env_context = self._setup(runtime_env, session_dir)
            return env_context

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, run_setup_with_logger)

    serialized_env = request.serialized_runtime_env

    if serialized_env not in self._env_locks:
        # Async lock to prevent the same env from being installed
        # concurrently.
        self._env_locks[serialized_env] = asyncio.Lock()

    async with self._env_locks[serialized_env]:
        if serialized_env in self._env_cache:
            result = self._env_cache[serialized_env]
            if result.success:
                context = result.result
                logger.info("Runtime env already created successfully. "
                            f"Env: {serialized_env}, context: {context}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                    serialized_runtime_env_context=context)
            else:
                error_message = result.result
                logger.info("Runtime env already failed. "
                            f"Env: {serialized_env}, err: {error_message}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

        logger.info(f"Creating runtime env: {serialized_env}")
        runtime_env_dict = json.loads(serialized_env or "{}")
        uris = runtime_env_dict.get("uris")
        runtime_env_context: RuntimeEnvContext = None
        error_message = None
        for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
            try:
                if uris:
                    # TODO(guyang.sgy): Try `ensure_runtime_env_setup(uris)`
                    # to download packages. But we don't initialize the
                    # internal KV in the agent yet.
                    pass
                runtime_env_context = await _setup_runtime_env(
                    serialized_env, self._session_dir)
                break
            except Exception as ex:
                logger.exception("Runtime env creation failed.")
                error_message = str(ex)
                await asyncio.sleep(
                    runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000)
        if error_message:
            logger.error(
                "Runtime env creation failed %d times; not retrying.",
                runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
            self._env_cache[serialized_env] = CreatedEnvResult(
                False, error_message)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message)

        serialized_context = runtime_env_context.serialize()
        self._env_cache[serialized_env] = CreatedEnvResult(
            True, serialized_context)
        logger.info("Successfully created runtime env: %s, the context: %s",
                    serialized_env, serialized_context)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context)
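# A minimal sketch (assumption, not from the source) of the thread-local logger
# plumbing that `using_thread_local_logger` suggests: each worker thread gets
# its own current logger, so setup hooks running concurrently for different
# jobs do not interleave their log destinations. The names below are
# hypothetical.
import contextlib
import logging
import threading

_log_ctx = threading.local()

@contextlib.contextmanager
def using_thread_local_logger_sketch(logger):
    _log_ctx.logger = logger
    try:
        yield logger
    finally:
        _log_ctx.logger = None

def current_logger():
    # Hypothetical accessor a setup hook could call to log to the
    # per-job logger of the thread it runs in.
    return getattr(_log_ctx, "logger", None) or logging.getLogger(__name__)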
async def CreateRuntimeEnv(self, request, context):
    async def _setup_runtime_env(serialized_runtime_env,
                                 serialized_allocated_resource_instances):
        # This function will be run inside a thread.
        def run_setup_with_logger():
            runtime_env: dict = json.loads(serialized_runtime_env or "{}")
            allocated_resource: dict = json.loads(
                serialized_allocated_resource_instances or "{}")

            # Use a separate logger for each job.
            per_job_logger = self.get_or_create_logger(request.job_id)
            # TODO(chenk008): Add log about allocated_resource to
            # avoid lint error. That will be moved to cgroup plugin.
            per_job_logger.debug(f"Worker has resource: {allocated_resource}")
            context = RuntimeEnvContext(
                env_vars=runtime_env.get("env_vars"))
            self._conda_manager.setup(
                runtime_env, context, logger=per_job_logger)
            self._working_dir_manager.setup(
                runtime_env, context, logger=per_job_logger)

            # Add the mapping of URIs -> the serialized environment to be
            # used for cache invalidation.
            for uri in runtime_env.get("uris", []):
                self._working_dir_uri_to_envs[uri].add(serialized_runtime_env)

            return context

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, run_setup_with_logger)

    serialized_env = request.serialized_runtime_env

    if serialized_env not in self._env_locks:
        # Async lock to prevent the same env from being installed
        # concurrently.
        self._env_locks[serialized_env] = asyncio.Lock()

    async with self._env_locks[serialized_env]:
        if serialized_env in self._env_cache:
            result = self._env_cache[serialized_env]
            if result.success:
                context = result.result
                logger.info("Runtime env already created successfully. "
                            f"Env: {serialized_env}, context: {context}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                    serialized_runtime_env_context=context)
            else:
                error_message = result.result
                logger.info("Runtime env already failed. "
                            f"Env: {serialized_env}, err: {error_message}")
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

        if SLEEP_FOR_TESTING_S:
            logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
            time.sleep(int(SLEEP_FOR_TESTING_S))

        logger.info(f"Creating runtime env: {serialized_env}")
        runtime_env_context: RuntimeEnvContext = None
        error_message = None
        for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
            try:
                runtime_env_context = await _setup_runtime_env(
                    serialized_env,
                    request.serialized_allocated_resource_instances)
                break
            except Exception as ex:
                logger.exception("Runtime env creation failed.")
                error_message = str(ex)
                await asyncio.sleep(
                    runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000)
        if error_message:
            logger.error(
                "Runtime env creation failed %d times; not retrying.",
                runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
            self._env_cache[serialized_env] = CreatedEnvResult(
                False, error_message)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message)

        serialized_context = runtime_env_context.serialize()
        self._env_cache[serialized_env] = CreatedEnvResult(
            True, serialized_context)
        logger.info(
            "Successfully created runtime env: %s, the context: %s",
            serialized_env, serialized_context)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context)
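# A minimal sketch (assumption, not from the source) of the per-job logger
# helper used by the versions above. The real `get_or_create_logger` may attach
# file handlers under the session's log directory; this hypothetical version
# only namespaces a logger by job ID and caches it.
import logging

_per_job_loggers = {}

def get_or_create_logger_sketch(job_id):
    key = job_id or "unknown"
    if key not in _per_job_loggers:
        per_job_logger = logging.getLogger(f"runtime_env_setup-{key}")
        per_job_logger.setLevel(logging.INFO)
        _per_job_loggers[key] = per_job_logger
    return _per_job_loggers[key]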