示例#1
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(serialized_runtime_env, session_dir):
            loop = asyncio.get_event_loop()
            runtime_env: dict = json.loads(serialized_runtime_env or "{}")
            return await loop.run_in_executor(None, self._setup, runtime_env,
                                              session_dir)

        serialized_env = request.serialized_runtime_env
        if serialized_env in self._created_env_cache:
            serialized_context = self._created_env_cache[serialized_env]
            logger.info("Runtime env already created. Env: %s, context: %s",
                        serialized_env,
                        self._created_env_cache[serialized_env])
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context)

        logger.info("Creating runtime env: %s.",
                    request.serialized_runtime_env)
        runtime_env_dict = json.loads(request.serialized_runtime_env or "{}")
        uris = runtime_env_dict.get("uris")
        runtime_env_context: RuntimeEnvContext = None
        error_message = None
        for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
            try:
                if uris:
                    # TODO(guyang.sgy): Try `ensure_runtime_env_setup(uris)`
                    # to download packages.
                    # But we don't initailize internal kv in agent now.
                    pass
                runtime_env_context = await _setup_runtime_env(
                    request.serialized_runtime_env, self._session_dir)
                break
            except Exception as ex:
                logger.exception("Runtime env creation failed.")
                error_message = str(ex)
                await asyncio.sleep(
                    runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000)
        if error_message:
            logger.error(
                "Runtime env creation failed for %d times, "
                "don't retry any more.",
                runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message)

        serialized_context = runtime_env_context.serialize()
        self._created_env_cache[serialized_env] = serialized_context
        logger.info("Successfully created runtime env: %s, the context: %s",
                    request.serialized_runtime_env, serialized_context)
        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            serialized_runtime_env_context=serialized_context)
示例#2
0
    async def CreateRuntimeEnv(self, request, context):
        runtime_env_dict = json.loads(request.serialized_runtime_env or "{}")
        uris = runtime_env_dict.get("uris")
        if uris:
            logger.info("Creating runtime env with uris %s", repr(uris))
            # TODO(guyang.sgy): Try `ensure_runtime_env_setup(uris)`
            # to download packages.
            # But we don't initailize internal kv in agent now.
            pass

        return runtime_env_agent_pb2.CreateRuntimeEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK)
示例#3
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(serialized_runtime_env):
            # This function will be ran inside a thread
            def run_setup_with_logger():
                runtime_env: dict = json.loads(serialized_runtime_env or "{}")

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                context = RuntimeEnvContext(
                    env_vars=runtime_env.get("env_vars"),
                    resources_dir=self._runtime_env_dir)
                setup_conda_or_pip(runtime_env, context, logger=per_job_logger)
                setup_working_dir(runtime_env, context, logger=per_job_logger)
                return context

            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, run_setup_with_logger)

        serialized_env = request.serialized_runtime_env

        if serialized_env not in self._env_locks:
            # async lock to prevent the same env being concurrently installed
            self._env_locks[serialized_env] = asyncio.Lock()

        async with self._env_locks[serialized_env]:
            if serialized_env in self._env_cache:
                serialized_context = self._env_cache[serialized_env]
                result = self._env_cache[serialized_env]
                if result.success:
                    context = result.result
                    logger.info("Runtime env already created successfully. "
                                f"Env: {serialized_env}, context: {context}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                        serialized_runtime_env_context=context)
                else:
                    error_message = result.result
                    logger.info("Runtime env already failed. "
                                f"Env: {serialized_env}, err: {error_message}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                        error_message=error_message)

            if SLEEP_FOR_TESTING_S:
                logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
                time.sleep(int(SLEEP_FOR_TESTING_S))

            logger.info(f"Creating runtime env: {serialized_env}")
            runtime_env_context: RuntimeEnvContext = None
            error_message = None
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    runtime_env_context = await _setup_runtime_env(
                        serialized_env)
                    break
                except Exception as ex:
                    logger.exception("Runtime env creation failed.")
                    error_message = str(ex)
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
                self._env_cache[serialized_env] = CreatedEnvResult(
                    False, error_message)
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

            serialized_context = runtime_env_context.serialize()
            self._env_cache[serialized_env] = CreatedEnvResult(
                True, serialized_context)
            logger.info(
                "Successfully created runtime env: %s, the context: %s",
                serialized_env, serialized_context)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context)
示例#4
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(
            serialized_runtime_env, serialized_allocated_resource_instances
        ):
            # This function will be ran inside a thread
            def run_setup_with_logger():
                runtime_env = RuntimeEnv(serialized_runtime_env=serialized_runtime_env)
                allocated_resource: dict = json.loads(
                    serialized_allocated_resource_instances or "{}"
                )

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                # TODO(chenk008): Add log about allocated_resource to
                # avoid lint error. That will be moved to cgroup plugin.
                per_job_logger.debug(f"Worker has resource :" f"{allocated_resource}")
                context = RuntimeEnvContext(env_vars=runtime_env.env_vars())
                self._container_manager.setup(
                    runtime_env, context, logger=per_job_logger
                )

                for (manager, uri_cache) in [
                    (self._working_dir_manager, self._working_dir_uri_cache),
                    (self._conda_manager, self._conda_uri_cache),
                    (self._pip_manager, self._pip_uri_cache),
                ]:
                    uri = manager.get_uri(runtime_env)
                    if uri is not None:
                        if uri not in uri_cache:
                            per_job_logger.debug(f"Cache miss for URI {uri}.")
                            size_bytes = manager.create(
                                uri, runtime_env, context, logger=per_job_logger
                            )
                            uri_cache.add(uri, size_bytes, logger=per_job_logger)
                        else:
                            per_job_logger.debug(f"Cache hit for URI {uri}.")
                            uri_cache.mark_used(uri, logger=per_job_logger)
                    manager.modify_context(uri, runtime_env, context)

                # Set up py_modules. For now, py_modules uses multiple URIs so
                # the logic is slightly different from working_dir, conda, and
                # pip above.
                py_modules_uris = self._py_modules_manager.get_uris(runtime_env)
                if py_modules_uris is not None:
                    for uri in py_modules_uris:
                        if uri not in self._py_modules_uri_cache:
                            per_job_logger.debug(f"Cache miss for URI {uri}.")
                            size_bytes = self._py_modules_manager.create(
                                uri, runtime_env, context, logger=per_job_logger
                            )
                            self._py_modules_uri_cache.add(
                                uri, size_bytes, logger=per_job_logger
                            )
                        else:
                            per_job_logger.debug(f"Cache hit for URI {uri}.")
                            self._py_modules_uri_cache.mark_used(
                                uri, logger=per_job_logger
                            )
                self._py_modules_manager.modify_context(
                    py_modules_uris, runtime_env, context
                )

                # Add the mapping of URIs -> the serialized environment to be
                # used for cache invalidation.
                if runtime_env.working_dir_uri():
                    uri = runtime_env.working_dir_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.py_modules_uris():
                    for uri in runtime_env.py_modules_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.conda_uri():
                    uri = runtime_env.conda_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.pip_uri():
                    uri = runtime_env.pip_uri()
                    self._uris_to_envs[uri].add(serialized_runtime_env)
                if runtime_env.plugin_uris():
                    for uri in runtime_env.plugin_uris():
                        self._uris_to_envs[uri].add(serialized_runtime_env)

                # Run setup function from all the plugins
                for plugin_class_path, config in runtime_env.plugins():
                    per_job_logger.debug(
                        f"Setting up runtime env plugin {plugin_class_path}"
                    )
                    plugin_class = import_attr(plugin_class_path)
                    # TODO(simon): implement uri support
                    plugin_class.create(
                        "uri not implemented", json.loads(config), context
                    )
                    plugin_class.modify_context(
                        "uri not implemented", json.loads(config), context
                    )

                return context

            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, run_setup_with_logger)

        serialized_env = request.serialized_runtime_env

        if serialized_env not in self._env_locks:
            # async lock to prevent the same env being concurrently installed
            self._env_locks[serialized_env] = asyncio.Lock()

        async with self._env_locks[serialized_env]:
            if serialized_env in self._env_cache:
                serialized_context = self._env_cache[serialized_env]
                result = self._env_cache[serialized_env]
                if result.success:
                    context = result.result
                    self._logger.info(
                        "Runtime env already created "
                        f"successfully. Env: {serialized_env}, "
                        f"context: {context}"
                    )
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                        serialized_runtime_env_context=context,
                    )
                else:
                    error_message = result.result
                    self._logger.info(
                        "Runtime env already failed. "
                        f"Env: {serialized_env}, err: {error_message}"
                    )
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                        error_message=error_message,
                    )

            if SLEEP_FOR_TESTING_S:
                self._logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
                time.sleep(int(SLEEP_FOR_TESTING_S))

            self._logger.info(f"Creating runtime env: {serialized_env}")
            runtime_env_context: RuntimeEnvContext = None
            error_message = None
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    runtime_env_context = await _setup_runtime_env(
                        serialized_env, request.serialized_allocated_resource_instances
                    )
                    break
                except Exception:
                    err_msg = f"Failed to create runtime env {serialized_env}."
                    self._logger.exception(err_msg)
                    error_message = f"{err_msg}\n{traceback.format_exc()}"
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                self._logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES,
                )
                self._env_cache[serialized_env] = CreatedEnvResult(False, error_message)
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message,
                )

            serialized_context = runtime_env_context.serialize()
            self._env_cache[serialized_env] = CreatedEnvResult(True, serialized_context)
            self._logger.info(
                "Successfully created runtime env: %s, the context: %s",
                serialized_env,
                serialized_context,
            )
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context,
            )
示例#5
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(serialized_runtime_env, session_dir):
            # This function will be ran inside a thread
            def run_setup_with_logger():
                runtime_env: dict = json.loads(serialized_runtime_env or "{}")
                per_job_logger = self.get_or_create_logger(request.job_id)
                # Here we set the logger context for the setup hook execution.
                # The logger needs to be thread local because there can be
                # setup hooks ran for arbitrary job in arbitrary threads.
                with using_thread_local_logger(per_job_logger):
                    env_context = self._setup(runtime_env, session_dir)
                return env_context

            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, run_setup_with_logger)

        serialized_env = request.serialized_runtime_env

        if serialized_env not in self._env_locks:
            # async lock to prevent the same env being concurrently installed
            self._env_locks[serialized_env] = asyncio.Lock()

        async with self._env_locks[serialized_env]:
            if serialized_env in self._env_cache:
                serialized_context = self._env_cache[serialized_env]
                result = self._env_cache[serialized_env]
                if result.success:
                    context = result.result
                    logger.info("Runtime env already created successfully. "
                                f"Env: {serialized_env}, context: {context}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                        serialized_runtime_env_context=context)
                else:
                    error_message = result.result
                    logger.info("Runtime env already failed. "
                                f"Env: {serialized_env}, err: {error_message}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                        error_message=error_message)

            logger.info(f"Creating runtime env: {serialized_env}")
            runtime_env_dict = json.loads(serialized_env or "{}")
            uris = runtime_env_dict.get("uris")
            runtime_env_context: RuntimeEnvContext = None
            error_message = None
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    if uris:
                        # TODO(guyang.sgy): Try ensure_runtime_env_setup(uris)
                        # to download packages.
                        # But we don't initailize internal kv in agent now.
                        pass
                    runtime_env_context = await _setup_runtime_env(
                        serialized_env, self._session_dir)
                    break
                except Exception as ex:
                    logger.exception("Runtime env creation failed.")
                    error_message = str(ex)
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
                self._env_cache[serialized_env] = CreatedEnvResult(
                    False, error_message)
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

            serialized_context = runtime_env_context.serialize()
            self._env_cache[serialized_env] = CreatedEnvResult(
                True, serialized_context)
            logger.info(
                "Successfully created runtime env: %s, the context: %s",
                serialized_env, serialized_context)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context)
示例#6
0
    async def CreateRuntimeEnv(self, request, context):
        async def _setup_runtime_env(serialized_runtime_env,
                                     serialized_allocated_resource_instances):
            # This function will be ran inside a thread
            def run_setup_with_logger():
                runtime_env: dict = json.loads(serialized_runtime_env or "{}")
                allocated_resource: dict = json.loads(
                    serialized_allocated_resource_instances or "{}")

                # Use a separate logger for each job.
                per_job_logger = self.get_or_create_logger(request.job_id)
                # TODO(chenk008): Add log about allocated_resource to
                # avoid lint error. That will be moved to cgroup plugin.
                per_job_logger.debug(f"Worker has resource :"
                                     f"{allocated_resource}")
                context = RuntimeEnvContext(
                    env_vars=runtime_env.get("env_vars"))
                self._conda_manager.setup(runtime_env,
                                          context,
                                          logger=per_job_logger)
                self._working_dir_manager.setup(runtime_env,
                                                context,
                                                logger=per_job_logger)

                # Add the mapping of URIs -> the serialized environment to be
                # used for cache invalidation.
                for uri in runtime_env.get("uris", []):
                    self._working_dir_uri_to_envs[uri].add(
                        serialized_runtime_env)

                return context

            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(None, run_setup_with_logger)

        serialized_env = request.serialized_runtime_env

        if serialized_env not in self._env_locks:
            # async lock to prevent the same env being concurrently installed
            self._env_locks[serialized_env] = asyncio.Lock()

        async with self._env_locks[serialized_env]:
            if serialized_env in self._env_cache:
                serialized_context = self._env_cache[serialized_env]
                result = self._env_cache[serialized_env]
                if result.success:
                    context = result.result
                    logger.info("Runtime env already created successfully. "
                                f"Env: {serialized_env}, context: {context}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                        serialized_runtime_env_context=context)
                else:
                    error_message = result.result
                    logger.info("Runtime env already failed. "
                                f"Env: {serialized_env}, err: {error_message}")
                    return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                        status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                        error_message=error_message)

            if SLEEP_FOR_TESTING_S:
                logger.info(f"Sleeping for {SLEEP_FOR_TESTING_S}s.")
                time.sleep(int(SLEEP_FOR_TESTING_S))

            logger.info(f"Creating runtime env: {serialized_env}")
            runtime_env_context: RuntimeEnvContext = None
            error_message = None
            for _ in range(runtime_env_consts.RUNTIME_ENV_RETRY_TIMES):
                try:
                    runtime_env_context = await _setup_runtime_env(
                        serialized_env,
                        request.serialized_allocated_resource_instances)
                    break
                except Exception as ex:
                    logger.exception("Runtime env creation failed.")
                    error_message = str(ex)
                    await asyncio.sleep(
                        runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
                    )
            if error_message:
                logger.error(
                    "Runtime env creation failed for %d times, "
                    "don't retry any more.",
                    runtime_env_consts.RUNTIME_ENV_RETRY_TIMES)
                self._env_cache[serialized_env] = CreatedEnvResult(
                    False, error_message)
                return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                    status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                    error_message=error_message)

            serialized_context = runtime_env_context.serialize()
            self._env_cache[serialized_env] = CreatedEnvResult(
                True, serialized_context)
            logger.info(
                "Successfully created runtime env: %s, the context: %s",
                serialized_env, serialized_context)
            return runtime_env_agent_pb2.CreateRuntimeEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
                serialized_runtime_env_context=serialized_context)