def submit(self, app: Application, mode: RunMode) -> str:
    """
    Launches every replica of every role in ``app`` as a local subprocess
    and returns the id under which the running app is tracked.

    The app id is ``{app.name}_{n}`` where ``n`` is a per-app-name counter
    kept in ``self._ids`` (starting at 0), so repeated submissions of the
    same app name get distinct ids.

    Raises:
        IndexError: if the app cache already holds ``self._cache_size`` apps
            and ``self._evict_lru()`` could not free a slot.

    If fetching an image or starting a process fails part-way through, the
    processes already started by this call are killed and reaped before the
    exception propagates; nothing is recorded in ``self._apps`` in that case.
    """
    if len(self._apps) == self._cache_size and not self._evict_lru():
        raise IndexError(
            f"App cache size ({self._cache_size}) exceeded. Increase the cache size"
        )

    # next sequence number for this app name (first submission gets 0)
    seq_num = self._ids.setdefault(app.name, -1) + 1
    self._ids[app.name] = seq_num
    app_id = f"{app.name}_{seq_num}"

    assert (
        app_id not in self._apps
    ), "no app_id collisons expected since incremental integer suffix is used"

    local_app = _LocalApplication(app.name)
    local_app.set_run_mode(mode)

    # processes started by this call; only used for cleanup on failure
    started = []
    try:
        for role in app.roles:
            container = role.container
            assert (
                container
            ), "all roles in a submitted app must have container association"

            img_root = self._image_fetcher.fetch(container.image)
            cmd = os.path.join(img_root, role.entrypoint)
            for replica_id in range(role.num_replicas):
                args = [cmd] + macros.substitute(
                    role.args, img_root, app_id, str(replica_id)
                )
                log.info("Running %s with env: %s", args, role.env)
                proc = subprocess.Popen(args, env=role.env)
                started.append(proc)
                local_app.add_process(role.name, proc)
    except Exception:
        # local_app is never registered on this path, so nothing else would
        # ever stop these processes; do not leave them orphaned.
        for proc in started:
            proc.kill()
            proc.wait()
        raise

    self._apps[app_id] = local_app
    return app_id
def _to_app_popen_args(self, app_id: str, roles: List[Role], cfg: RunConfig): """ returns the popen args for all processes that needs to be created for the app :: # for each role [ { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]}, { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]}, ... ] # example (app has 2 roles: master (1 replica), trainer (2 replicas) [ { "master" : [ {args: "master.par", env: env, ... other popen args ...} ] }, { "trainer" : [ {args: "trainer.par", env: env, ... other popen args ...}, {args: "trainer.par", env: env, ... other popen args ...} ] }, ] """ app_popen_params = [] for role in roles: container = role.container assert ( container ), "all roles in a submitted app must have container association" image_fetcher = self._get_img_fetcher(cfg) img_root = image_fetcher.fetch(container.image) cmd = os.path.join(img_root, role.entrypoint) role_popen_params = {} for replica_id in range(role.num_replicas): args = [cmd] + macros.substitute(role.args, img_root, app_id, str(replica_id)) replica_popen_params = role_popen_params.setdefault( role.name, []) env_vars = {**self._default_role_envs(), **role.env} params: Dict[str, Any] = {"args": args, "env": env_vars} app_log_dir = self._get_app_log_dir(app_id, cfg) if app_log_dir: base_log_dir = os.path.join(app_log_dir, role.name, str(replica_id)) params["stdout"] = os.path.join(base_log_dir, "stdout.log") params["stderr"] = os.path.join(base_log_dir, "stderr.log") replica_popen_params.append(params) app_popen_params.append(role_popen_params) return app_popen_params
def _to_popen_request(
    self,
    app: Application,
    cfg: RunConfig,
) -> PopenRequest:
    """
    Builds the ``PopenRequest`` describing every replica process of ``app``.

    A unique app id is derived from ``app.name``; the image fetcher and the
    app log dir (plus the flag telling whether std streams are redirected)
    are resolved from ``cfg``. For each replica of each role a
    ``ReplicaParam`` and its log directory
    (``<app_log_dir>/<role_name>/<replica_id>``) are recorded under the
    role's name.
    """
    app_id = make_unique(app.name)
    fetcher = self._get_img_fetcher(cfg)
    app_log_dir, redirect_std = self._get_app_log_dir(app_id, cfg)

    params_by_role: Dict[str, List[ReplicaParam]] = {}
    log_dirs_by_role: Dict[str, List[str]] = {}

    for role in app.roles:
        # setdefault: roles sharing a name accumulate into the same lists
        params_for_role = params_by_role.setdefault(role.name, [])
        log_dirs_for_role = log_dirs_by_role.setdefault(role.name, [])

        img_root = fetcher.fetch(role.container.image)
        entrypoint = os.path.join(img_root, role.entrypoint)

        for index in range(role.num_replicas):
            replica = str(index)
            popen_args = [entrypoint] + macros.substitute(
                role.args, img_root, app_id, replica
            )
            log_dir = os.path.join(app_log_dir, role.name, replica)

            # Top level error file (the agent's, when the role is an elastic
            # role), a.k.a. the scheduler reply file. Listed first so that a
            # value supplied in role.env takes precedence.
            error_file = os.path.join(log_dir, "error.json")
            environment = {"TORCHELASTIC_ERROR_FILE": error_file, **role.env}

            out_path = os.path.join(log_dir, "stdout.log") if redirect_std else None
            err_path = os.path.join(log_dir, "stderr.log") if redirect_std else None

            params_for_role.append(
                ReplicaParam(popen_args, environment, out_path, err_path)
            )
            log_dirs_for_role.append(log_dir)

    return PopenRequest(app_id, app_log_dir, params_by_role, log_dirs_by_role)
def _to_app_popen_args( self, app_id: str, roles: List[Role], app_log_dir: str, redirect_std: bool, cfg: RunConfig, dryrun: bool = True, ): """ returns the popen args for all processes that needs to be created for the app :: # for each role [ { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]}, { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]}, ... ] # example (app has 2 roles: master (1 replica), trainer (2 replicas) [ { "master" : [ {args: "master.par", env: env, ... other popen args ...} ] }, { "trainer" : [ {args: "trainer.par", env: env, ... other popen args ...}, {args: "trainer.par", env: env, ... other popen args ...} ] }, ] """ app_popen_params = [] for role in roles: container = role.container assert ( container ), "all roles in a submitted app must have container association" image_fetcher = self._get_img_fetcher(cfg) img_root = image_fetcher.fetch(container.image) cmd = os.path.join(img_root, role.entrypoint) role_popen_params = {} for replica_id in range(role.num_replicas): args = [cmd] + macros.substitute(role.args, img_root, app_id, str(replica_id)) replica_popen_params = role_popen_params.setdefault( role.name, []) replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id)) if not dryrun: os.makedirs(replica_log_dir) env_vars = { # this is the top level (agent if using elastic role) error file # a.k.a scheduler reply file "TORCHELASTIC_ERROR_FILE": os.path.join(replica_log_dir, "error.json"), **role.env, } params: Dict[str, Any] = {"args": args, "env": env_vars} if redirect_std: params["stdout"] = os.path.join(replica_log_dir, "stdout.log") params["stderr"] = os.path.join(replica_log_dir, "stderr.log") replica_popen_params.append(params) app_popen_params.append(role_popen_params) return app_popen_params