def data_path_nfs_mountpoint(self):
    """Build the NFS mountpoint that exposes the job's data path at /data.

    The NFS server comes from ``get_cluster_nfs_server()`` and the exported
    path from ``get_nfs_path_with_folder("storage", self.data_path)``.

    Returns:
        Whatever ``make_mountpoint`` produces for the assembled parameters.

    Raises:
        AssertionError: if ``self.data_path`` is not a string.
    """
    assert isinstance(self.data_path, str)

    # The server is resolved before the path, in the same order as the
    # dict literal evaluates its values.
    mountpoint_params = {
        "name": "data",
        "mountPath": "/data",
        "mountType": "nfs",
        "server": self.get_cluster_nfs_server(),
        "path": self.get_nfs_path_with_folder("storage", self.data_path),
    }
    mountpoint = make_mountpoint(params=mountpoint_params)

    logger.info("job %s has data path nfs mountpoint: %s", self.job_id,
                mountpoint)
    return mountpoint
def work_path_nfs_mountpoint(self):
    """Build the NFS mountpoint that exposes the job's work path at /work.

    The NFS server comes from ``get_cluster_nfs_server()`` and the exported
    path from ``get_nfs_path_with_folder("work", self.work_path)``.

    Returns:
        Whatever ``make_mountpoint`` produces for the assembled parameters.

    Raises:
        AssertionError: if ``self.work_path`` is not a non-empty string.
    """
    assert isinstance(self.work_path, str) and len(self.work_path) > 0

    # The server is resolved before the path, in the same order as the
    # dict literal evaluates its values.
    mountpoint_params = {
        "name": "work",
        "mountPath": "/work",
        "mountType": "nfs",
        "server": self.get_cluster_nfs_server(),
        "path": self.get_nfs_path_with_folder("work", self.work_path),
    }
    mountpoint = make_mountpoint(params=mountpoint_params)

    logger.info("job %s has work path nfs mountpoint: %s", self.job_id,
                mountpoint)
    return mountpoint
def home_path_nfs_mountpoint(self):
    """Build the NFS mountpoint for the user's home directory.

    The alias returned by ``get_alias()`` is used twice: as the folder
    under the "work" NFS area, and as the container path ``/home/<alias>``.

    Returns:
        Whatever ``make_mountpoint`` produces for the assembled parameters.
    """
    alias = self.get_alias()
    nfs_server = self.get_cluster_nfs_server()
    nfs_path = self.get_nfs_path_with_folder("work", alias)

    mountpoint_params = {
        "name": "home",
        "mountPath": "/home/%s" % alias,
        "mountType": "nfs",
        "server": nfs_server,
        "path": nfs_path,
    }
    mountpoint = make_mountpoint(params=mountpoint_params)

    logger.info("job %s has home path nfs mountpoint: %s", self.job_id,
                mountpoint)
    return mountpoint
def infiniband_mountpoints(self):
    """Build hostPath mountpoints for the configured infiniband mounts.

    Each entry of ``get_infiniband_mounts()`` is read for its "name"
    (lower-cased), "containerPath" and "hostPath" keys.

    Returns:
        A list with one ``make_mountpoint`` result per configured mount, or
        None when ``get_infiniband_mounts()`` does not return a list.
    """
    mounts = self.get_infiniband_mounts()
    if not isinstance(mounts, list):
        return None

    return [
        make_mountpoint(
            params={
                "name": mount["name"].lower(),
                "mountPath": mount["containerPath"],
                "hostPath": mount["hostPath"],
                "mountType": "hostPath",
            })
        for mount in mounts
    ]
def system_mountpoints(self):
    """Return the system-defined mountpoints that apply to this job.

    An entry of ``get_system_mountpoints()`` applies when it has no "vc"
    value (cluster-shared) or when its "vc" equals the job's "vcName".
    Entries for which ``make_mountpoint`` returns None are logged with a
    warning and left out.

    Returns:
        List of the non-None mountpoints, in configuration order.
    """
    vc_name = self.params["vcName"]

    def applies_to_job(mp_param):
        owner_vc = mp_param.get("vc")
        return owner_vc is None or owner_vc == vc_name

    # Filter everything first so no mountpoint is built if any entry is
    # malformed.
    applicable = [
        mp_param for mp_param in self.get_system_mountpoints()
        if applies_to_job(mp_param)
    ]

    mountpoints = []
    for mp_param in applicable:
        mountpoint = make_mountpoint(mp_param)
        if mountpoint is None:
            logger.warning("job %s has mountpoint for param %s None",
                           self.job_id, mp_param)
            continue
        logger.info("job %s has mountpoint: %s", self.job_id, mountpoint)
        mountpoints.append(mountpoint)
    return mountpoints
def generate_params(self, job):
    """Fill in and return the launch parameters for ``job``.

    Works on ``job.params`` in place: registers the job's mountpoints and
    plugins, then adds the init container image, node selector, and
    environment variable entries.

    Returns:
        ``(params, None)`` on success, or
        ``(None, "Missing required parameters!")`` when a required field is
        absent from ``job.params``.

    Raises:
        AssertionError: if ``job`` is not a ``Job``.
        KeyError: if the INIT_CONTAINER_IMAGE environment variable is unset.
    """
    assert isinstance(job, Job)
    params = job.params

    required_fields = (
        "jobtrainingtype",
        "jobName",
        "jobPath",
        "workPath",
        "dataPath",
        "cmd",
        "userId",
        "resourcegpu",
        "userName",
        "vcName",
        "sku",
    )
    if not all(field in params for field in required_fields):
        return None, "Missing required parameters!"

    # Paths backing /job, /work, /home/<alias> and /data.
    job.job_path = params["jobPath"]
    job.work_path = params["workPath"]
    job.data_path = params["dataPath"]

    # /job is always mounted.
    job.add_mountpoints(job.job_path_nfs_mountpoint())

    # /home/<alias>, /work and /data are shared storage. Some clusters
    # dedicate /data to a single VC, so VCs listed as having no shared
    # storage get none of the three.
    vc_without_shared_storage = job.get_vc_without_shared_storage()
    if params["vcName"] not in vc_without_shared_storage:
        for build_mountpoint in (job.home_path_nfs_mountpoint,
                                 job.work_path_nfs_mountpoint,
                                 job.data_path_nfs_mountpoint):
            job.add_mountpoints(build_mountpoint())

    # System provided mountpoints.
    job.add_mountpoints(job.system_mountpoints())

    # User provided mountpoints.
    if "mountpoints" in params:
        for user_mp_params in params["mountpoints"]:
            job.add_mountpoints(make_mountpoint(user_mp_params))

    params["init-container"] = os.environ["INIT_CONTAINER_IMAGE"]
    params["user_email"] = params["userName"]
    params["pod_ip_range"] = job.get_pod_ip_range()

    node_selector = params.setdefault("nodeSelector", {})
    if "sku" in params:
        node_selector["sku"] = params["sku"]

    # Set up VC dedicated node usage.
    hard_assignment = job.get_vc_node_hard_assignment()
    if isinstance(hard_assignment, dict):
        vc = params["vcName"]
        # TODO: Fix the case where CPU worker exists in a GPU pool
        # Only an explicit True pins the job to its VC's nodes.
        pinned_to_vc = hard_assignment.get(vc) is True
        node_selector["vc"] = vc if pinned_to_vc else "default"

    # The GPU count is published under both the DLWS_ and DLTS_ names.
    envs = params.setdefault("envs", [])
    gpus_per_worker = str(params["resourcegpu"])
    for env_name in ("DLWS_NUM_GPU_PER_WORKER", "DLTS_NUM_GPU_PER_WORKER"):
        envs.append({"name": env_name, "value": gpus_per_worker})

    job.add_plugins(job.get_plugins())
    params["plugins"] = job.plugins

    # Must be after job.get_plugins
    # TODO: Make mountpoints independent of job.get_plugins
    params["mountpoints"] = [mp.to_dict() for mp in job.mountpoints]

    # Set up system environment variables, if any.
    system_envs = job.get_system_envs()
    params["envs"].extend(
        {"name": env_name, "value": env_val}
        for env_name, env_val in system_envs.items())

    return params, None
def get_blobfuse_plugins(self, plugins):
    """Construct the blobfuse plugin entries for this job.

    Entries whose account name, account key, container name or mount path
    fail ``invalid_entry`` are skipped. Every remaining entry is passed
    through ``dedup_add`` (matching on equal "name" or equal "mountPath")
    and is also registered as a "blobfuse" mountpoint on the job.

    Args:
        plugins: iterable of blobfuse configurations supporting ``.get``.

    Returns:
        The list built up by ``dedup_add``, or an empty list when
        ``get_enable_blobfuse()`` returns None or False.
    """
    enabled = self.get_enable_blobfuse()
    if enabled is None or enabled is False:
        return []

    def same_mount(lhs, rhs):
        return lhs["name"] == rhs["name"] or \
            lhs["mountPath"] == rhs["mountPath"]

    fast_storage = self.get_local_fast_storage()
    has_fast_storage = fast_storage is not None and fast_storage != ""
    root_tmppath = fast_storage.rstrip("/") if has_fast_storage else None

    collected = []
    for index, plugin in enumerate(plugins):
        account_name = plugin.get("accountName")
        account_key = plugin.get("accountKey")
        container_name = plugin.get("containerName")
        mount_path = plugin.get("mountPath")
        mount_options = plugin.get("mountOptions")

        # Ignore Azure blobfuse with incomplete configurations.
        mandatory = (account_name, account_key, container_name, mount_path)
        if any(invalid_entry(value) for value in mandatory):
            continue

        name = plugin.get("name")
        if name is None:
            name = "%s-blobfuse-%d" % (self.job_id, index)

        # Rebuild the entry from scratch instead of reusing the input dict.
        entry = {
            "enabled": True,
            "name": name,
            "secreds": "%s-blobfuse-%d-secreds" % (self.job_id, index),
            "accountName": b64encode(account_name),
            "accountKey": b64encode(account_key),
            "containerName": container_name,
            "mountPath": mount_path,
            "jobId": self.job_id,
        }

        if root_tmppath is not None:
            # The mount name keeps tmppath unique for each blobfuse mount.
            entry.update(rootTmppath=root_tmppath, tmppath=name)

        # Mount options may be given as a list of strings.
        if isinstance(mount_options, list):
            mount_options = " ".join(mount_options)
        if not invalid_entry(mount_options):
            entry["mountOptions"] = mount_options

        # TODO: Refactor into mountpoint add
        collected = dedup_add(entry, collected, same_mount)

        # "mountType" is set on the same dict after dedup_add has seen it.
        # NOTE(review): the mountpoint is added whatever dedup_add decided
        # - confirm this is intended.
        entry["mountType"] = "blobfuse"
        mountpoint = make_mountpoint(entry)
        if mountpoint is not None:
            self.add_mountpoints(mountpoint)

    return collected