def test_pod_template_with_custom_scheduler(self):
    """With the custom scheduler, GPU limits move into pod annotations."""
    pod_template = PodTemplate(job.get_template(), True)
    gpu_num = 2
    pod_spec = {
        "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0",
        "gpuLimit": gpu_num,
    }
    generated = pod_template.generate_pod(pod_spec)

    # Enabled custom scheduler clears the plain resource limits:
    # spec.containers[].resources.limits
    limits = generated["spec"]["containers"][0]["resources"]["limits"]
    self.assertEqual(0, limits["nvidia.com/gpu"])

    # metadata.annotations["pod.alpha/DeviceInformation"] should be set,
    # carrying the GPU request for this pod.
    device_annotation = json.loads(
        generated["metadata"]["annotations"]["pod.alpha/DeviceInformation"])
    container_info = device_annotation["runningcontainer"][pod_spec["podName"]]
    self.assertEqual(gpu_num, container_info["requests"]["alpha.gpu/numgpu"])

    # topology generation is disabled
    self.assertEqual(
        0, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"])
def test_generate_pods_missing_required_params(self):
    """generate_pods must fail cleanly when the job has no params."""
    pod_template = PodTemplate(job.get_template(), True)
    job.params = {}

    job_description, error = pod_template.generate_pods(job)

    # No description is produced and a human-readable error is returned.
    self.assertIsNone(job_description)
    self.assertTrue(error)
    self.assertEqual("Missing required parameters!", error)
def test_generate_pod_with_labels(self):
    """User-supplied labels are copied into the pod's metadata.labels."""
    pod_template = PodTemplate(job.get_template(), False)
    pod_spec = {
        "gpuLimit": 2,
        "labels": [{"name": "my_label_name", "value": "my_label_value"}],
    }

    generated = pod_template.generate_pod(pod_spec)

    self.assertEqual("my_label_value",
                     generated["metadata"]["labels"]["my_label_name"])
def test_pod_template_without_custer_scheduler(self):
    """Without the custom scheduler, the GPU limit stays on the container."""
    # NOTE: "custer" typo kept in the method name — renaming would change
    # the test id that runners/CI reference.
    pod_template = PodTemplate(job.get_template(), False)
    pod_spec = {"gpuLimit": 2}

    generated = pod_template.generate_pod(pod_spec)

    # Custom scheduler disabled: the resource limit is set directly at
    # spec.containers[].resources.limits
    limits = generated["spec"]["containers"][0]["resources"]["limits"]
    self.assertEqual(pod_spec["gpuLimit"], limits["nvidia.com/gpu"])

    # metadata.annotations["pod.alpha/DeviceInformation"] should be empty
    metadata = generated["metadata"]
    annotation_absent = (
        "annotations" not in metadata
        or "pod.alpha/DeviceInformation" not in metadata["annotations"])
    self.assertTrue(annotation_absent)
def test_generate_pod_with_envs(self):
    """User-supplied envs end up in the first container's env list."""
    pod_template = PodTemplate(job.get_template(), False)
    expected_env = {"name": "my_env_name", "value": "my_env_value"}
    pod_spec = {
        "gpuLimit": 2,
        "envs": [expected_env],
    }

    generated = pod_template.generate_pod(pod_spec)

    self.assertIn(expected_env, generated["spec"]["containers"][0]["env"])
def test_generate_launch_script(self):
    """generate_launch_script returns the bootstrap container command."""
    script_file = PodTemplate.generate_launch_script(
        "ce7dca49-28df-450a-a03b-51b9c2ecc69c",  # job_id
        "/tmp",                                  # path_to_save
        "20000",                                 # user_id
        3,                                       # gpu_num
        "sleep infinity")                        # user_script

    # The return value is the container command, not a file path.
    self.assertListEqual(["bash", "/pod/scripts/bootstrap.sh"], script_file)
def SubmitJob(job):
    """Render a job's pod YAML and create the pods in kubernetes.

    Args:
        job: dict describing the job; must contain "jobId" and a base64
            "jobParams" payload.

    Returns:
        dict with "jobId"/"output" (and "error" on failure); False when the
        job description cannot be generated; None when leftover pods from a
        previous run are still being cleaned up.

    Side effects: writes the rendered YAML under config["storage-mount-path"],
    and updates job status/metadata rows through DataHandler.
    """
    # check if existing any pod with label: run=job_id
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        # Leftover pods from a previous submission — force-delete and bail
        # out; the caller is expected to retry later.
        logging.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmit the job, reset the endpoints
        # update all endpoint to status 'pending', so it would restart when job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert (isinstance(job_object, Job))

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        # Pick the pod template by training type; unknown types are a
        # permanent error for this job.
        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        # Persist the rendered multi-document YAML for debugging/audit.
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            # FIX: Exception has no ".message" attribute (removed in
            # Python 3, deprecated since 2.6) — use str(e) instead.
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        # Give up (mark the job "error") after 5 failed submissions.
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    dataHandler.Close()
    return ret
def test_generate_pods(self):
    """A fully-parameterized RegularJob yields exactly one pod with a command."""
    enable_custom_scheduler = True
    pod_template = PodTemplate(job.get_template(), enable_custom_scheduler)
    job.params = {
        "gid": "20000",
        "uid": "20000",
        "user": "******",
        "mountpoints": [{
            "description": "NFS (remote file share)",
            "enabled": True,
            "containerPath": "/home/user",
            "hostPath": "/dlwsdata/work/user",
            "name": "homefolder"
        }],
        "image": "indexserveregistry.azurecr.io/deepscale:1.0",
        "userId": "20000",
        "dataPath": "",
        "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a",
        "isParent": 1,
        "jobType": "training",
        "jobPath": "user/jobs/190627/140782a0-7f6d-4039-9801-fd6294c7c88a",
        "containerUserId": "0",
        "resourcegpu": 1,
        "env": [],
        "enabledatapath": True,
        "runningasroot": True,
        "interactivePorts": [],
        "preemptionAllowed": False,
        "jobtrainingtype": "RegularJob",
        "do_log": False,
        "is_interactive": False,
        "familyToken": "72fc61265bcb4416b68b44c82d120b3b",
        "enableworkpath": True,
        "vcName": "vc1",
        "userName": "******",
        "workPath": "user",
        "cmd": "sleep infinity",
        "jobName": "test-job",
        "enablejobpath": True,
        "gpuType": "P40",
        "ssh": True
    }

    pods, error = pod_template.generate_pods(job)

    self.assertFalse(error)
    # FIX: was "self.assertTrue(list, type(pods))", which always passes
    # (the first argument "list" is truthy; the second is just the failure
    # message). Assert the intended type check instead.
    self.assertIsInstance(pods, list)
    self.assertEqual(1, len(pods))
    self.assertIsNotNone(pods[0]["spec"]["containers"][0]["command"])
def submit_job_impl(self, job):
    """Render a job's secrets + pod YAML and create them in kubernetes.

    Args:
        job: dict describing the job; must contain "jobId" and a base64
            "jobParams" payload.

    Returns:
        dict with "jobId"/"output" (and "error" on failure); False when the
        job description cannot be generated; None when leftover pods from a
        previous run are still being cleaned up.

    Side effects: writes the rendered YAML under config["storage-mount-path"],
    creates k8s secrets/pods, and updates job rows through DataHandler.
    """
    # check if existing any pod with label: run=job_id
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not self._all_pods_not_existing(job_id):
        # Leftover pods from a previous submission — force-delete and bail
        # out; the caller is expected to retry later.
        logging.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmit the job, reset the endpoints
        # update all endpoint to status 'pending', so it would restart when job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert isinstance(
            job_object,
            Job), "job_object is not of Job, but " + str(type(job_object))

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        # Derive a per-user job token and expose it to the containers as
        # DLTS_JOB_TOKEN. NOTE(review): hashlib.md5 on a str is Python-2
        # style; under Python 3 this would need .encode() — confirm the
        # target runtime before porting.
        if "job_token" not in job_object.params:
            if "user_sign_token" in config and "userName" in job_object.params:
                job_object.params["job_token"] = hashlib.md5(
                    job_object.params["userName"] + ":" +
                    config["user_sign_token"]).hexdigest()
            else:
                job_object.params["job_token"] = "tryme2017"

        if "envs" not in job_object.params:
            job_object.params["envs"] = []
        job_object.params["envs"].append({
            "name": "DLTS_JOB_TOKEN",
            "value": job_object.params["job_token"]
        })

        # Pick the pod template by training type; unknown types are a
        # permanent error for this job.
        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        secret_template = job_object.get_blobfuse_secret_template()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                enable_custom_scheduler=enable_custom_scheduler,
                secret_template=secret_template)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template(),
                                           secret_template=secret_template)
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                deployment_template=job_object.get_deployment_template(),
                enable_custom_scheduler=False,
                secret_template=secret_template)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        # Persist the rendered multi-document YAML for debugging/audit.
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        secrets = pod_template.generate_secrets(job_object)

        job_deployer = JobDeployer()
        try:
            secrets = job_deployer.create_secrets(secrets)
            ret["output"] = "Created secrets: {}. ".format(
                [secret.metadata.name for secret in secrets])
            pods = job_deployer.create_pods(pods)
            ret["output"] += "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            # FIX: Exception has no ".message" attribute (removed in
            # Python 3, deprecated since 2.6) — use str(e) instead.
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        # Give up (mark the job "error") after 5 failed submissions, then
        # best-effort delete whatever was partially created.
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(
                detail, "error", "Server error in job submission")
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))

            # Try to clean up the job
            try:
                job_deployer = JobDeployer()
                job_deployer.delete_job(job_id, force=True)
                logging.info(
                    "Cleaning up job %s succeeded after %d retries of job submission"
                    % (job["jobId"], retries))
            # FIX: was a bare "except:", which also swallows SystemExit and
            # KeyboardInterrupt; narrow to Exception.
            except Exception:
                logging.warning(
                    "Cleaning up job %s failed after %d retries of job submission"
                    % (job["jobId"], retries))

    dataHandler.Close()
    return ret