def UpdateDistJobStatus(job):
    """Reconcile the stored status of a PS-distributed job with Kubernetes.

    The function decodes ``job["jobParams"]`` (base64-encoded JSON), stores the
    status detail reported by ``k8sUtils.GetJobStatus`` and then looks up the
    worker and ps pods labelled with the job id.  Only when the number of pods
    of each role equals ``numpsworker`` / ``numps`` does it go further:

    * a job whose stored status is ``scheduling`` is started through
      ``launch_ps_dist_job``;
    * a job whose stored status is ``running`` is compared with the result of
      ``GetDistJobStatus`` and the database fields are updated accordingly
      (``finished``, ``running``, ``failed``, or the retry logic for
      ``Unknown``).

    Args:
        job: job record (dict-like) providing ``jobId``, ``jobParams`` and
            ``jobStatus``; ``jobDescriptionPath`` is optional.

    Returns:
        None.
    """
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        if "userId" not in jobParams:
            jobParams["userId"] = "0"
        jobId = job["jobId"]

        jobPath, workPath, dataPath = GetStoragePath(
            jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        result, detail = k8sUtils.GetJobStatus(jobId)
        # NOTE(review): UpdateJobStatus in this file stores
        # b64encode(json.dumps(detail)); here the raw detail is encoded.
        # Left unchanged because the type of `detail` is not visible here.
        dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                       base64.b64encode(detail))
        logging.info("job %s status: %s,%s" %
                     (jobId, result, json.dumps(detail)))

        jobDescriptionPath = None
        if "jobDescriptionPath" in job:
            jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                              job["jobDescriptionPath"])

        def deleteJobDescription():
            # Remove the Kubernetes objects created from the description file.
            if jobDescriptionPath is not None and os.path.isfile(
                    jobDescriptionPath):
                k8sUtils.kubectl_delete(jobDescriptionPath)

        runLabel = jobParams["jobId"]
        workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + runLabel)
        psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + runLabel)
        allPodsPresent = (
            "items" in workerPodInfo
            and len(workerPodInfo["items"]) == int(jobParams["numpsworker"])
            and "items" in psPodInfo
            and len(psPodInfo["items"]) == int(jobParams["numps"]))
        if not allPodsPresent:
            return

        if job["jobStatus"] == "scheduling":
            launch_ps_dist_job(jobParams)
        if job["jobStatus"] != "running":
            return

        result, detail = GetDistJobStatus(jobId)
        dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                       base64.b64encode(detail))
        printlog("job %s status: %s" % (jobId, result))

        status = result.strip()
        if status == "Succeeded":
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "finished")
            deleteJobDescription()
        elif status == "Running":
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            if job["jobStatus"] != "running":
                dataHandler.UpdateJobTextField(jobId, "jobStatus", "running")
            if "interactivePort" in jobParams:
                serviceAddress = k8sUtils.GetServiceAddress(jobId)
                serviceAddress = base64.b64encode(json.dumps(serviceAddress))
                dataHandler.UpdateJobTextField(jobId, "endpoints",
                                               serviceAddress)
        elif status == "Failed":
            printlog("Job %s fails, cleaning..." % jobId)
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "failed")
            dataHandler.UpdateJobTextField(jobId, "errorMsg", detail)
            deleteJobDescription()
        elif status == "Unknown":
            if jobId not in UnusualJobs:
                # First sighting: remember when the job became "Unknown".
                UnusualJobs[jobId] = datetime.datetime.now()
            elif (datetime.datetime.now() -
                  UnusualJobs[jobId]).total_seconds() > 300:
                # total_seconds() rather than .seconds: .seconds drops whole
                # days and would under-report long outages.
                del UnusualJobs[jobId]
                retries = dataHandler.AddandGetJobRetries(jobId)
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             jobId)
                    dataHandler.UpdateJobTextField(jobId, "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(jobId, "errorMsg",
                                                   "cannot launch the job.")
                    deleteJobDescription()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (jobId, retries))
                    SubmitJob(job)

        if status != "Unknown" and jobId in UnusualJobs:
            del UnusualJobs[jobId]
    finally:
        # Release the handler on every exit path, as the other status
        # functions in this file do.
        dataHandler.Close()
def SubmitPSDistJob(job):
    """Render and submit the pods of a parameter-server distributed job.

    For every ps / worker replica (``numps`` / ``numpsworker``) a copy of the
    job parameters is specialised (``distId``, ``distRole``, storage paths,
    mount points, container port, optional rack selector), a launch script is
    written into the replica's job directory and ``DistJob.yaml.template`` is
    rendered.  All rendered documents are joined with ``---``, written to the
    job description file and handed to ``k8sUtils.kubectl_create``.  Status,
    description and metadata are then stored through ``DataHandler``.

    Nothing is submitted when ``jobtrainingtype`` is not ``"PSDistJob"``.

    Args:
        job: job record (dict-like) whose ``jobParams`` entry is
            base64-encoded JSON.

    Returns:
        ``dict`` with ``output`` and ``jobId`` on success, or with ``error``
        when an exception was caught; ``False`` when ``jobPath``, ``workPath``
        or ``dataPath`` is missing or blank (the error is recorded with
        ``SetJobError`` first).
    """
    ret = {}
    dataHandler = DataHandler()
    # Bound before the try block so the error handler can always use it, even
    # when decoding job["jobParams"] itself fails.
    jobParams = {}
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]

        distJobParams = {"ps": [], "worker": []}

        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        if jobParams["jobtrainingtype"] != "PSDistJob":
            return ret

        nums = {
            "ps": int(jobParams["numps"]),
            "worker": int(jobParams["numpsworker"]),
        }

        # The path parameters are identical for every replica: validate once.
        requiredPaths = (("jobPath", "job-path"), ("workPath", "work-path"),
                         ("dataPath", "data-path"))
        for key, label in requiredPaths:
            if key not in jobParams or len(jobParams[key].strip()) == 0:
                dataHandler.SetJobError(jobParams["jobId"],
                                        "ERROR: %s does not exist" % label)
                return False

        # Script run in every container: block until the launcher has dropped
        # /opt/run_dist_job and /opt/run_dist_job.sh, then start training.
        # (An alternative with a bounded wait was considered upstream; this
        # version waits indefinitely.)
        launchCMD = (
            '\n'
            '#!/bin/bash\n'
            'mkdir -p /opt\n'
            'echo "[DLWorkspace System]: Waiting for all containers are ready..."\n'
            'while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do\n'
            '    sleep 3\n'
            'done\n'
            'echo "[DLWorkspace System]: All containers are ready, launching training job..."\n'
            'chmod +x /opt/run_dist_job.sh\n'
            '/opt/run_dist_job.sh\n'
        )
        launchCmdSpec = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]

        ENV = Environment(loader=FileSystemLoader("/"))
        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")
        template = ENV.get_template(os.path.abspath(jobTemp))

        jobDescriptionList = []
        for role in ["ps", "worker"]:
            for i in range(nums[role]):
                distJobParam = copy.copy(jobParams)
                # copy.copy is shallow: give each replica its own containers
                # so appended mounts / selectors do not leak into the other
                # replicas or back into jobParams.
                distJobParam["mountpoints"] = list(
                    jobParams.get("mountpoints", []))
                if "nodeSelector" in jobParams:
                    distJobParam["nodeSelector"] = dict(
                        jobParams["nodeSelector"])

                distJobParam["distId"] = "%s%d" % (role, i)
                distJobParam["distRole"] = role
                distJobParam["distJobPath"] = os.path.join(
                    distJobParam["jobPath"], distJobParam["distId"])

                jobPath, workPath, dataPath = GetStoragePath(
                    distJobParam["distJobPath"], distJobParam["workPath"],
                    distJobParam["dataPath"])

                localJobPath = os.path.join(config["storage-mount-path"],
                                            jobPath)
                if not os.path.exists(localJobPath):
                    mkdirsAsUser(localJobPath, distJobParam.get("userId", 0))

                if "cmd" not in distJobParam:
                    distJobParam["cmd"] = ""

                launchScriptPath = os.path.join(
                    localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                with open(launchScriptPath, 'w') as f:
                    f.write(launchCMD)
                distJobParam["LaunchCMD"] = launchCmdSpec

                distJobParam["jobNameLabel"] = ''.join(
                    e for e in distJobParam["jobName"] if e.isalnum())
                distJobParam["userNameLabel"] = getAlias(
                    jobParams["userName"])

                distJobParam["hostjobPath"] = os.path.join(
                    config["storage-mount-path"], jobPath)
                distJobParam["hostworkPath"] = os.path.join(
                    config["storage-mount-path"], workPath)
                distJobParam["hostdataPath"] = os.path.join(
                    config["storage-mount-path"], dataPath)
                distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                distJobParam["mountpoints"].extend([
                    {
                        "name": "nvidia-driver",
                        "containerPath": "/usr/local/nvidia",
                        "hostPath": nvidiaDriverPath,
                    },
                    {
                        "name": "job",
                        "containerPath": "/job",
                        "hostPath": distJobParam["hostjobPath"],
                    },
                    {
                        "name": "work",
                        "containerPath": "/work",
                        "hostPath": distJobParam["hostworkPath"],
                    },
                    {
                        "name": "data",
                        "containerPath": "/data",
                        "hostPath": distJobParam["hostdataPath"],
                    },
                ])

                distJobParam["pod_ip_range"] = config["pod_ip_range"]
                if "usefreeflow" in config and config["usefreeflow"] == "True":
                    distJobParam["usefreeflow"] = config["usefreeflow"]
                else:
                    distJobParam["usefreeflow"] = False

                # Pick a container port in [3000, 4000).  The module-level
                # RNG is used as is; re-seeding it per replica added nothing.
                distJobParam["containerPort"] = int(random.random() * 1000 +
                                                    3000)

                if assignedRack is not None:
                    if "nodeSelector" not in distJobParam:
                        distJobParam["nodeSelector"] = {}
                    distJobParam["nodeSelector"]["rack"] = assignedRack

                jobDescriptionList.append(template.render(job=distJobParam))
                distJobParams[role].append(distJobParam)

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        descriptionDir = os.path.dirname(os.path.realpath(jobDescriptionPath))
        if not os.path.exists(descriptionDir):
            os.makedirs(descriptionDir)
        if os.path.isfile(jobDescriptionPath):
            # A description from an earlier attempt exists: delete what it
            # created before submitting again.
            k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {
            "jobDescriptionPath": jobParams["jobDescriptionPath"],
            "jobPath": jobParams["jobPath"],
            "workPath": jobParams["workPath"],
            # jobParams itself never receives "LaunchCMD" in this function
            # (only the per-replica copies do), so fall back to the command
            # every replica was given instead of raising KeyError after the
            # job has already been created.
            "LaunchCMD": jobParams.get("LaunchCMD", launchCmdSpec),
            "distJobParams": distJobParams,
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit PS dist job failed: %s" % e, exc_info=True)
        ret["error"] = str(e)
        failedJobId = jobParams.get("jobId", job.get("jobId"))
        retries = dataHandler.AddandGetJobRetries(failedJobId)
        if retries >= 5:
            dataHandler.UpdateJobTextField(failedJobId, "jobStatus", "error")
            dataHandler.UpdateJobTextField(failedJobId, "errorMsg",
                                           "Cannot submit job!" + str(e))
    finally:
        dataHandler.Close()

    return ret
def SubmitRegularJob(job):
    """Render and submit a single-pod ("regular") job to Kubernetes.

    Decodes ``job["jobParams"]``, prepares the job directory and launch
    script, renders ``RegularJob.yaml.template`` (plus one
    ``KubeSvc.yaml.template`` document per numeric entry of
    ``interactivePort``), writes the combined description file and submits it
    with ``k8sUtils.kubectl_create``.  Status, description and metadata are
    stored through ``DataHandler``.

    NOTE(review): this file contains a later ``def SubmitRegularJob`` which
    rebinds the name at import time; confirm which definition is intended.

    Args:
        job: job record (dict-like) providing ``jobId`` and base64-encoded
            JSON ``jobParams``.

    Returns:
        ``dict`` with ``output`` and ``jobId`` on success, or with ``error``
        when an exception was caught; ``False`` when ``jobPath`` or
        ``workPath`` is missing or blank.
    """
    # Local import: used to run chown without going through a shell.
    import subprocess

    ret = {}
    dataHandler = DataHandler()
    # Bound before the try block so the error handler can always use it.
    jobParams = {}
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        # dataPath is deliberately not validated here (the check was disabled
        # upstream); it is still read below.
        for key, label in (("jobPath", "job-path"), ("workPath", "work-path")):
            if key not in jobParams or len(jobParams[key].strip()) == 0:
                dataHandler.SetJobError(jobParams["jobId"],
                                        "ERROR: %s does not exist" % label)
                return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        if not os.path.exists(localJobPath):
            owner = jobParams["userId"] if "userId" in jobParams else "0"
            mkdirsAsUser(localJobPath, owner)
            mkdirsAsUser(os.path.join(localJobPath, "models"), owner)

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        if isinstance(jobParams["cmd"],
                      basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                # argv list instead of a shell string: userId and the script
                # path come from the job request and must not be interpreted
                # by a shell.
                subprocess.call(
                    ["chown", "-R",
                     str(jobParams["userId"]), launchScriptPath])
            jobParams["LaunchCMD"] = (
                "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"])

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))
        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        # User-supplied mounts are named after their container path.
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")

        jobParams["mountpoints"].extend([
            {
                "name": "nvidia-driver",
                "containerPath": "/usr/local/nvidia",
                "hostPath": nvidiaDriverPath,
            },
            {
                "name": "job",
                "containerPath": "/job",
                "hostPath": jobParams["hostjobPath"],
            },
            {
                "name": "work",
                "containerPath": "/work",
                "hostPath": jobParams["hostworkPath"],
            },
            {
                "name": "data",
                "containerPath": "/data",
                "hostPath": jobParams["hostdataPath"],
            },
        ])

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)

        template = ENV.get_template(os.path.abspath(jobTemp))
        jobDescriptionList = [template.render(job=jobParams)]

        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            # Accept "," or ";" separated lists; non-numeric entries are
            # ignored.
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = ("interactive-" +
                                          jobParams["jobId"] + "-" + portNum)
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"

                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                jobDescriptionList.append(
                    serviceTemplate.render(svc=jobParams))

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        descriptionDir = os.path.dirname(os.path.realpath(jobDescriptionPath))
        if not os.path.exists(descriptionDir):
            os.makedirs(descriptionDir)
        if os.path.isfile(jobDescriptionPath):
            # A description from an earlier attempt exists: delete what it
            # created before submitting again.
            k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {
            "jobDescriptionPath": jobParams["jobDescriptionPath"],
            "jobPath": jobParams["jobPath"],
            "workPath": jobParams["workPath"],
            "LaunchCMD": jobParams["LaunchCMD"],
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit regular job failed: %s" % e, exc_info=True)
        ret["error"] = str(e)
        failedJobId = jobParams.get("jobId", job.get("jobId"))
        retries = dataHandler.AddandGetJobRetries(failedJobId)
        if retries >= 5:
            dataHandler.UpdateJobTextField(failedJobId, "jobStatus", "error")
            dataHandler.UpdateJobTextField(failedJobId, "errorMsg",
                                           "Cannot submit job!" + str(e))
    finally:
        dataHandler.Close()

    return ret
def SubmitJob(job):
    """Generate the pods of a job and create them through ``JobDeployer``.

    If pods labelled with the job id still exist, the job is force-deleted
    and the function returns without submitting.  Otherwise the job's
    endpoints are reset to ``pending``, the job is loaded through
    ``JobSchema``, user identity (gid / uid / alias) is injected into its
    parameters, a pod template matching ``jobtrainingtype`` generates the
    pods, the YAML description is written under ``storage-mount-path`` and
    the pods are created.  Status, description and metadata are stored
    through ``DataHandler``.

    Args:
        job: job record (dict-like) providing ``jobId`` and base64-encoded
            JSON ``jobParams``.  A ``cluster`` entry is added to it.

    Returns:
        ``None`` when previous pods still had to be cleaned up; ``False``
        when the training type is invalid or pod generation reported an
        error; otherwise a ``dict`` with ``output`` / ``jobId`` or ``error``.
    """
    # check if existing any pod with label: run=job_id
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmit the job, reset the endpoints
        # update all endpoint to status 'pending', so it would restart when
        # job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, _load_errors = JobSchema().load(job)
        # Explicit check instead of assert so it also holds under "python -O".
        if not isinstance(job_object, Job):
            raise TypeError("job_object is not of Job, but " +
                            str(type(job_object)))

        params = json.loads(base64.b64decode(job["jobParams"]))
        job_object.params = params

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(params["userName"])[0]
        params["gid"] = user_info["gid"]
        params["uid"] = user_info["uid"]
        params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        training_type = params["jobtrainingtype"]
        if training_type in ("RegularJob", "InferenceJob"):
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif training_type == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        else:
            dataHandler.SetJobError(
                job_object.job_id,
                "ERROR: invalid jobtrainingtype: %s" % training_type)
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        description_dir = os.path.dirname(local_jobDescriptionPath)
        if not os.path.exists(description_dir):
            os.makedirs(description_dir)
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            # str(e) rather than e.message: the attribute does not exist on
            # Python 3 exceptions and is deprecated on Python 2.
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {
            "jobDescriptionPath": job_description_path,
            "jobPath": job_object.job_path,
            "workPath": job_object.work_path,
            # the command of the first container
            "LaunchCMD": pods[0].spec.containers[0].command,
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    finally:
        # Single release point for every exit path, including the early
        # "return False" branches and a failing error handler.
        dataHandler.Close()

    return ret
def SubmitRegularJob(job):
    """Render and submit a regular job, optionally as a hyper-parameter sweep.

    Decodes ``job["jobParams"]``, prepares the job directory and launch
    script, adds the standard mount points (only where ``CheckMountPoints``
    allows it) and renders ``RegularJob.yaml.template`` once per pod.  When
    all four ``hyperparameter*`` parameters are present, one pod is generated
    per value in ``[start, end]`` stepping by ``step``, each with the
    hyper-parameter exported as an environment variable; otherwise a single
    pod named after the job id is generated.  With ``kube_custom_scheduler``
    enabled in ``config`` the GPU request is moved into the
    ``pod.alpha/DeviceInformation`` annotation.  Numeric ``interactivePort``
    entries add one ``KubeSvc.yaml.template`` document per pod and port.

    NOTE(review): an earlier ``def SubmitRegularJob`` exists in this file;
    being the later definition, this one is the binding in effect after
    import.

    Args:
        job: job record (dict-like) providing ``jobId`` and base64-encoded
            JSON ``jobParams``.

    Returns:
        ``dict`` with ``output`` and ``jobId`` on success, or with ``error``
        when an exception was caught; ``False`` when ``jobPath`` or
        ``workPath`` is missing or blank.
    """
    # Local import: used to run chown without going through a shell.
    import subprocess

    ret = {}
    dataHandler = DataHandler()
    # Bound before the try block so the error handler can always use it.
    jobParams = {}
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        # dataPath is deliberately not validated here (the check was disabled
        # upstream); it is still read below.
        for key, label in (("jobPath", "job-path"), ("workPath", "work-path")):
            if key not in jobParams or len(jobParams[key].strip()) == 0:
                dataHandler.SetJobError(jobParams["jobId"],
                                        "ERROR: %s does not exist" % label)
                return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        if not os.path.exists(localJobPath):
            owner = jobParams["userId"] if "userId" in jobParams else "0"
            mkdirsAsUser(localJobPath, owner)
            mkdirsAsUser(os.path.join(localJobPath, "models"), owner)

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        if isinstance(jobParams["cmd"],
                      basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                # argv list instead of a shell string: userId and the script
                # path come from the job request and must not be interpreted
                # by a shell.
                subprocess.call(
                    ["chown", "-R",
                     str(jobParams["userId"]), launchScriptPath])
            jobParams["LaunchCMD"] = (
                "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"])

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))
        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        # User-supplied mounts are named after their container path.
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")

        userAlias = getAlias(jobParams["userName"])
        standardMounts = [
            {
                "name": "nvidia-driver",
                "containerPath": "/usr/local/nvidia",
                "hostPath": nvidiaDriverPath,
                "enabled": True,
            },
            {
                "name": "job",
                "containerPath": "/job",
                "hostPath": jobParams["hostjobPath"],
                "enabled": True,
            },
            {
                "name": "work",
                "containerPath": "/work",
                "hostPath": jobParams["hostworkPath"],
                "enabled": True,
            },
            {
                "name": "data",
                "containerPath": "/data",
                "hostPath": jobParams["hostdataPath"],
                "enabled": True,
            },
            {
                "name": "sshkey",
                "containerPath": "/home/%s/.ssh" % userAlias,
                "hostPath": os.path.join(config["storage-mount-path"],
                                         GetWorkPath(userAlias) + "/.ssh"),
                "readOnly": True,
                "enabled": True,
            },
        ]
        # Each standard mount is added only if CheckMountPoints accepts it
        # against the mounts collected so far.
        for mp in standardMounts:
            if CheckMountPoints(jobParams["mountpoints"], mp):
                jobParams["mountpoints"].append(mp)

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)

        jobDescriptionList = []
        pods = []
        sweepKeys = ("hyperparametername", "hyperparameterstartvalue",
                     "hyperparameterendvalue", "hyperparameterstep")
        if all(key in jobParams for key in sweepKeys):
            start = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            if step <= 0:
                # A non-positive step would never reach `end` and would keep
                # generating pods forever.
                raise ValueError(
                    "hyperparameterstep must be a positive integer, got %d" %
                    step)
            for index, value in enumerate(range(start, end + 1, step)):
                pods.append({
                    "podName": jobParams["jobId"] + "-pod-" + str(index),
                    "envs": [{
                        "name": jobParams["hyperparametername"],
                        "value": value,
                    }],
                })
        else:
            pods.append({"podName": jobParams["jobId"], "envs": []})

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])

        useCustomScheduler = ("kube_custom_scheduler" in config
                              and config["kube_custom_scheduler"])
        # Captured on first use: jobParams["resourcegpu"] is overwritten with
        # 0 below, and every pod of a sweep must still request the original
        # number of GPUs.
        requestedGpus = None
        template = ENV.get_template(os.path.abspath(jobTemp))

        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if useCustomScheduler:
                if requestedGpus is None:
                    requestedGpus = int(jobParams["resourcegpu"])
                container = {"requests": {"alpha.gpu/numgpu": requestedGpus}}
                podInfo = {"podname": jobParams["podName"]}
                if "useGPUTopology" in jobParams and jobParams[
                        "useGPUTopology"]:
                    # Ask the scheduler to generate the GPU topology.
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 1
                    }
                else:
                    # for cases when desired topology is explicitly given or
                    # not desired
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 0
                    }
                podInfo["runningcontainer"] = {jobParams["podName"]: container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = (
                    "'" + json.dumps(podInfo) + "'")
                # gpu requests specified through annotation
                jobParams["resourcegpu"] = 0

            jobDescriptionList.append(template.render(job=jobParams))

            if ("interactivePort" in jobParams
                    and len(jobParams["interactivePort"].strip()) > 0):
                # Accept "," or ";" separated lists; non-numeric entries are
                # ignored.
                ports = [
                    p.strip()
                    for p in re.split(",|;", jobParams["interactivePort"])
                    if len(p.strip()) > 0 and p.strip().isdigit()
                ]
                for portNum in ports:
                    jobParams["serviceId"] = ("interactive-" +
                                              jobParams["podName"] + "-" +
                                              portNum)
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    serviceTemplate = ENV.get_template(
                        os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                    jobDescriptionList.append(
                        serviceTemplate.render(svc=jobParams))

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        descriptionDir = os.path.dirname(os.path.realpath(jobDescriptionPath))
        if not os.path.exists(descriptionDir):
            os.makedirs(descriptionDir)
        if os.path.isfile(jobDescriptionPath):
            # A description from an earlier attempt exists: delete what it
            # created before submitting again.
            k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {
            "jobDescriptionPath": jobParams["jobDescriptionPath"],
            "jobPath": jobParams["jobPath"],
            "workPath": jobParams["workPath"],
            "LaunchCMD": jobParams["LaunchCMD"],
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit regular job failed: %s" % e, exc_info=True)
        ret["error"] = str(e)
        failedJobId = jobParams.get("jobId", job.get("jobId"))
        retries = dataHandler.AddandGetJobRetries(failedJobId)
        if retries >= 5:
            dataHandler.UpdateJobTextField(failedJobId, "jobStatus", "error")
            dataHandler.UpdateJobTextField(failedJobId, "errorMsg",
                                           "Cannot submit job!" + str(e))
    finally:
        dataHandler.Close()

    return ret
def submit_job_impl(self, job):
    """Generate secrets and pods for a job and create them in the cluster.

    If pods labelled with the job id still exist, the job is force-deleted
    and the method returns without submitting.  Otherwise the job's
    endpoints are reset to ``pending``, the job is loaded through
    ``JobSchema``, identity data (gid, uid, alias, SSH keys) and a job token
    are injected into its parameters, the template class matching
    ``jobtrainingtype`` generates pods and secrets, and both are created via
    ``self.create_secrets`` / ``self.create_pods``.  The resulting status and
    metadata are written with ``DataHandler.UpdateJobTextFields``.

    After 5 failed submissions the job is marked ``error`` and a forced
    delete is attempted as clean-up.

    Args:
        job: job record (dict-like) providing ``jobId`` and encoded JSON
            ``jobParams``.  A ``cluster`` entry is added to it.

    Returns:
        ``None`` when previous pods still had to be cleaned up; ``False``
        when the training type is invalid or pod generation reported an
        error; otherwise a ``dict`` with ``output`` / ``jobId`` or ``error``.
    """
    # check if existing any pod with label: run=job_id
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not self._all_pods_not_existing(job_id):
        logger.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        errors = self.delete_job(job_id, force=True)
        if errors:
            logger.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmit the job, reset the endpoints
        # update all endpoint to status 'pending', so it would restart when
        # job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logger.debug(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, _load_errors = JobSchema().load(job)
        # Explicit check instead of assert so it also holds under "python -O".
        if not isinstance(job_object, Job):
            raise TypeError("job_object is not of Job, but " +
                            str(type(job_object)))

        params = json.loads(b64decode(job["jobParams"]))
        job_object.params = params

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(params["userName"])[0]
        params["gid"] = user_info["gid"]
        params["uid"] = user_info["uid"]
        params["user"] = job_object.get_alias()
        params["private_key"] = user_info["private_key"]
        params["ssh_public_keys"] = params.get("ssh_public_keys", [])
        params["ssh_public_keys"].append(user_info["public_key"])

        if "job_token" not in params:
            # NOTE(review): the token is an unsalted MD5 of
            # "<userName>:<master_token>" with a hard-coded fallback value.
            # Flagged for a security review; behaviour intentionally kept.
            if ("master_token" in config
                    and config["master_token"] is not None
                    and "userName" in params):
                plain_token = params["userName"] + ":" + config["master_token"]
                params["job_token"] = hashlib.md5(
                    plain_token.encode("utf-8")).hexdigest()
            else:
                params["job_token"] = "tryme2017"

        if "envs" not in params:
            params["envs"] = []
        params["envs"].append({
            "name": "DLTS_JOB_TOKEN",
            "value": params["job_token"],
        })

        secret_templates = {
            "blobfuse": job_object.get_blobfuse_secret_template(),
            "imagePull": job_object.get_image_pull_secret_template(),
        }

        training_type = params["jobtrainingtype"]
        if training_type == "RegularJob":
            pod_template = RegularJobTemplate(
                job_object.get_template(), secret_templates=secret_templates)
        elif training_type == "PSDistJob":
            pod_template = DistributeJobTemplate(
                job_object.get_template(), secret_templates=secret_templates)
        elif training_type == "InferenceJob":
            pod_template = InferenceJobTemplate(
                job_object.get_template(),
                deployment_template=job_object.get_deployment_template(),
                secret_templates=secret_templates)
        else:
            dataHandler.SetJobError(
                job_object.job_id,
                "ERROR: invalid jobtrainingtype: %s" % training_type)
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])

        secrets = pod_template.generate_secrets(job_object)

        try:
            secrets = self.create_secrets(secrets)
            ret["output"] = "Created secrets: {}. ".format(
                [secret.metadata.name for secret in secrets])
            created_pods = self.create_pods(pods)
            ret["output"] += "Created pods: {}".format(
                [pod.metadata.name for pod in created_pods])
        except Exception as e:
            # str(e) rather than e.message: the attribute does not exist on
            # Python 3 exceptions, so reading it here would raise inside the
            # handler and discard the real error.
            ret["output"] = "Error: %s" % str(e)
            logger.exception(e)

        ret["jobId"] = job_object.job_id

        jobMeta = {
            "jobPath": job_object.job_path,
            "workPath": job_object.work_path,
            # the command of the first container
            "LaunchCMD": params["cmd"],
        }
        jobMetaStr = b64encode(json.dumps(jobMeta))

        dataFields = {
            "jobStatus": "scheduling",
            "jobDescription": b64encode(job_description),
            "lastUpdated": datetime.datetime.now().isoformat(),
            "jobMeta": jobMetaStr,
        }
        conditionFields = {"jobId": job_object.job_id}
        dataHandler.UpdateJobTextFields(conditionFields, dataFields)
    except Exception as e:
        logger.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(
                detail, "error", "Server error in job submission")

            dataFields = {
                "jobStatus": "error",
                "errorMsg": "Cannot submit job!" + str(e),
                "jobStatusDetail": b64encode(json.dumps(detail)),
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
            # Try to clean up the job (best effort: failure is only logged).
            try:
                self.delete_job(job_id, force=True)
                logger.info(
                    "Cleaning up job %s succeeded after %d retries of job submission"
                    % (job["jobId"], retries))
            except Exception:
                logger.warning(
                    "Cleaning up job %s failed after %d retries of job submission"
                    % (job["jobId"], retries),
                    exc_info=True)
    finally:
        # Single release point for every exit path, including the early
        # "return False" branches and a failing error handler.
        dataHandler.Close()

    return ret
def UpdateJobStatus(job):
    """Refresh the stored status of one job from its Kubernetes state.

    ``job`` is a job record providing ``jobId``, ``jobStatus`` and a
    base64-encoded JSON ``jobParams`` entry; ``jobDescriptionPath`` is
    optional.

    A ``PSDistJob`` that is still ``scheduling`` is handled first: if
    Kubernetes already reports ``Failed``/``Succeeded`` that value is written
    to ``jobStatus`` and the regular handling below runs; otherwise the user
    command is launched once ``k8sUtils.all_pod_ready`` is true and the
    function returns without touching the status.

    Regular handling, keyed on the status from ``k8sUtils.GetJobStatus``:

    * ``Succeeded`` - extract the job log, set ``jobStatus`` to ``finished``
      and delete the job description file if it exists.
    * ``Running`` - set ``jobStatus`` to ``running`` if it is not already.
    * ``Failed`` - extract the job log, set ``jobStatus`` to ``failed``,
      store the status detail as ``errorMsg`` and delete the description file.
    * ``Unknown`` - record when the job was first seen in this state; once it
      has stayed there for more than 300 seconds, count a retry and either
      resubmit the job or, from the fifth retry on, set ``jobStatus`` to
      ``error``.
    * ``PendingHostPort`` - resubmit the job.

    The ``DataHandler`` is closed on every exit path.  Returns ``None``.
    """
    dataHandler = DataHandler()
    # try/finally: the early return below (and any exception) used to skip
    # dataHandler.Close().
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobId = job["jobId"]

        if (job["jobStatus"] == "scheduling"
                and jobParams["jobtrainingtype"] == "PSDistJob"):
            # launch user command only all pods are ready
            result, detail = k8sUtils.GetJobStatus(jobId)
            if result in ["Failed", "Succeeded"]:
                # TODO shouldn't be here, update status
                dataHandler.UpdateJobTextField(jobId, "jobStatus", result)
            else:
                # previously status is 'scheduling', and now all pods are ready
                # TODO check all pods are ready
                if k8sUtils.all_pod_ready(jobId):
                    try:
                        launch_ps_dist_job(jobParams)
                    except Exception as e:
                        print(e)
                # NOTE(review): the return is taken whether or not the pods
                # were ready - confirm this matches the intended nesting.
                return

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        result, detail = k8sUtils.GetJobStatus(jobId)
        dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                       base64.b64encode(json.dumps(detail)))
        logging.info("job %s status: %s,%s" %
                     (jobId, result, json.dumps(detail)))

        if "jobDescriptionPath" in job:
            jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                              job["jobDescriptionPath"])
        else:
            jobDescriptionPath = None

        def delete_job_description():
            # Remove the job's Kubernetes objects via its description file.
            if (jobDescriptionPath is not None
                    and os.path.isfile(jobDescriptionPath)):
                k8sUtils.kubectl_delete(jobDescriptionPath)

        if "userId" not in jobParams:
            jobParams["userId"] = "0"

        status = result.strip()
        if status == "Succeeded":
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "finished")
            delete_job_description()
        elif status == "Running":
            if job["jobStatus"] != "running":
                dataHandler.UpdateJobTextField(jobId, "jobStatus", "running")
        elif status == "Failed":
            printlog("Job %s fails, cleaning..." % jobId)
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "failed")
            dataHandler.UpdateJobTextField(jobId, "errorMsg", detail)
            delete_job_description()
        elif status == "Unknown":
            if jobId not in UnusualJobs:
                UnusualJobs[jobId] = datetime.datetime.now()
            # total_seconds(): timedelta.seconds wraps around every 24 hours.
            elif (datetime.datetime.now() -
                  UnusualJobs[jobId]).total_seconds() > 300:
                del UnusualJobs[jobId]
                retries = dataHandler.AddandGetJobRetries(jobId)
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             jobId)
                    dataHandler.UpdateJobTextField(jobId, "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(jobId, "errorMsg",
                                                   "cannot launch the job.")
                    delete_job_description()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (jobId, retries))
                    SubmitJob(job)
        elif status == "PendingHostPort":
            printlog(
                "Cannot find host ports for job :%s, re-launch the job with different host ports "
                % (jobId))
            SubmitJob(job)

        # Any definite status clears the "unknown since" bookkeeping.
        if status != "Unknown" and jobId in UnusualJobs:
            del UnusualJobs[jobId]
    finally:
        dataHandler.Close()
def SubmitPSDistJob(job):
    """Render and submit the pods of a parameter-server distributed job.

    ``job`` is a job record with ``jobId`` and a base64-encoded JSON
    ``jobParams`` entry.  For every ``ps`` and ``worker`` replica
    (``numps`` / ``numpsworker``) the function

    1. deep-copies the job parameters and adds the replica's role fields,
    2. creates the replica's job directory under ``storage-mount-path``,
    3. writes a ``launch-<jobId>-<role><idx>.sh`` script into it,
    4. fills in mount points, ports, node selectors and user identity, and
    5. renders ``Jobs_Templete/DistJob.yaml.template`` with those values.

    The rendered documents are joined with ``---``, written to the job
    description file, submitted with ``k8sUtils.kubectl_create`` and the job
    record is updated (``jobStatus`` = ``scheduling``, description, meta).

    Returns:
        ``False`` if ``jobPath`` or ``workPath`` is missing or blank (the job
        is flagged through ``SetJobError``); otherwise a dict holding
        ``output`` and ``jobId`` on success or ``error`` when an exception was
        caught.  A caught exception counts as a retry and, from the fifth
        retry on, sets ``jobStatus`` to ``error``.

    The ``DataHandler`` is closed on every exit path.
    """
    # NOTE(review): the line breaks inside both script templates were
    # reconstructed - confirm against the scripts currently deployed.
    # The leading/trailing "" entries keep the newline that follows / precedes
    # the triple quotes of the original literals.
    psLaunchTemplate = "\n".join([
        "",
        "#!/bin/bash",
        'echo "[DLWorkspace System]: Waiting for all containers are ready..."',
        "while [ ! -f /opt/run_dist_job ]; do",
        "    sleep 3",
        "done",
        "sudo chmod 600 -R /home/%s/.ssh &>/dev/null;",
        "sudo chmod 700 /home/%s/.ssh &>/dev/null;",
        "sudo chown -R %s /home/%s/.ssh &>/dev/null;",
        "sudo mkdir -p /root/.ssh &>/dev/null ;",
        "sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;",
        "sudo mkdir -p /opt &>/dev/null;",
        "sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;",
        "JOB_DIR='/home/%s'",
        "WORKER_NUM=%s",
        "echo $JOB_DIR $WORKER_NUM",
        "all_workers_ready=false",
        'while [ "$all_workers_ready" != true ]',
        "do",
        "  # update it to false if any woker is not ready",
        "  all_workers_ready=true",
        "  for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )",
        "  do",
        '    worker="worker${i}"',
        '    file="$JOB_DIR/${worker}/WORKER_READY"',
        "    #echo $file",
        "    if [ ! -f $file ]; then",
        '      echo "${worker} not ready!"',
        "      all_workers_ready=false",
        "      sleep 10",
        "    fi",
        "  done",
        "done",
        'echo "[DLWorkspace System]: All containers are ready, launching training job..."',
        "%s",
        "",
    ])
    workerLaunchTemplate = "\n".join([
        "",
        "while [ ! -f /opt/run_dist_job ]; do",
        "    sleep 3",
        "done",
        "sudo chmod 600 -R /home/%s/.ssh &>/dev/null;",
        "sudo chmod 700 /home/%s/.ssh &>/dev/null;",
        "sudo chown -R %s /home/%s/.ssh &>/dev/null;",
        "sudo mkdir -p /root/.ssh &>/dev/null;",
        "sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;",
        "sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;",
        "# TODO mark the worker as 'READY', better to change to '/pod/READY' later",
        "sudo touch /job/WORKER_READY",
        "sleep infinity",
        "",
    ])

    ret = {}
    # Bound before the try so the except handler can tell whether decoding
    # the parameters got far enough to provide a job id.
    jobParams = None
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]

        distJobParams = {"ps": [], "worker": []}

        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]
        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"]),
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    # jobPath and workPath are mandatory.  dataPath is not
                    # validated here; a missing key raises KeyError below and
                    # is handled as a submission failure.
                    for pathKey, pathError in (
                            ("jobPath", "ERROR: job-path does not exist"),
                            ("workPath", "ERROR: work-path does not exist")):
                        if (pathKey not in distJobParam
                                or len(distJobParam[pathKey].strip()) == 0):
                            dataHandler.SetJobError(distJobParam["jobId"],
                                                    pathError)
                            return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    # TODO ???
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""

                    # The ssh folder permissions are changed in the launch
                    # script because the permission setup in launch_ps_job may
                    # race with init_user.sh, resulting in a "no such user"
                    # error.
                    if role == "ps":
                        launchCMD = psLaunchTemplate % (
                            userAlias, userAlias, userAlias, userAlias,
                            userAlias, distJobParam["jobPath"],
                            jobParams["numpsworker"], distJobParam["cmd"])
                    else:
                        launchCMD = workerLaunchTemplate % (
                            userAlias, userAlias, userAlias, userAlias,
                            userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s-%s%d.sh" %
                        (distJobParam["jobId"], role, i))
                    # TODO need to set up user for distribute jobs
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)

                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)
                    distJobParam[
                        "LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())

                    ENV = Environment(loader=FileSystemLoader("/"))
                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []
                    distJobParam["mountpoints"].extend([
                        {
                            "name": "job",
                            "containerPath": "/job",
                            "hostPath": distJobParam["hostjobPath"]
                        },
                        {
                            "name": "work",
                            "containerPath": "/work",
                            "hostPath": distJobParam["hostworkPath"]
                        },
                        {
                            "name": "data",
                            "containerPath": "/data",
                            "hostPath": distJobParam["hostdataPath"]
                        },
                    ])
                    # Mount points supplied without a name get a generated one.
                    for mountpoint in distJobParam["mountpoints"]:
                        if "name" not in mountpoint:
                            mountpoint["name"] = str(uuid.uuid4()).replace(
                                "-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    distJobParam["usefreeflow"] = (
                        config["usefreeflow"]
                        if "usefreeflow" in config else False)

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    random.seed(datetime.datetime.now())
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(
                            40000, 49999)
                    else:
                        distJobParam["containerPort"] = int(
                            random.random() * 1000 + 3000)

                    if assignedRack is not None:
                        distJobParam.setdefault("nodeSelector",
                                                {})["rack"] = assignedRack
                    if "gpuType" in distJobParam:
                        distJobParam.setdefault(
                            "nodeSelector",
                            {})["gpuType"] = distJobParam["gpuType"]

                    # inject gid, uid and user
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)
                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        jobDescriptionDir = os.path.dirname(
            os.path.realpath(jobDescriptionPath))
        if not os.path.exists(jobDescriptionDir):
            os.makedirs(jobDescriptionDir)
        # A description file left by an earlier submission is deleted from
        # Kubernetes before being overwritten.
        if os.path.isfile(jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {
            "jobDescriptionPath": jobParams["jobDescriptionPath"],
            "jobPath": jobParams["jobPath"],
            "workPath": jobParams["workPath"],
            # "cmd" is only defaulted on the per-replica copies, so read it
            # with the same default instead of failing after the pods exist.
            "LaunchCMD": jobParams.get("cmd", ""),
            "distJobParams": distJobParams,
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)

        # jobParams is still None when decoding it failed; fall back to the
        # id on the job record instead of raising inside the handler.
        if isinstance(jobParams, dict) and "jobId" in jobParams:
            failedJobId = jobParams["jobId"]
        else:
            failedJobId = job["jobId"]

        retries = dataHandler.AddandGetJobRetries(failedJobId)
        if retries >= 5:
            dataHandler.UpdateJobTextField(failedJobId, "jobStatus", "error")
            dataHandler.UpdateJobTextField(failedJobId, "errorMsg",
                                           "Cannot submit job!" + str(e))
    finally:
        # Also reached on the early `return False` paths above.
        dataHandler.Close()

    return ret
def UpdateJobStatus(job):
    """Refresh the stored status of one job from its Kubernetes state.

    ``job`` is a job record providing ``jobId``, ``jobStatus`` and a
    base64-encoded JSON ``jobParams`` entry; ``jobDescriptionPath`` is
    optional.  A ``PSDistJob`` that is still ``scheduling`` first gets its
    user command launched through ``launch_ps_dist_job``.

    The status from ``k8sUtils.GetJobStatus`` is stored as
    ``jobStatusDetail`` and then acted upon:

    * ``Succeeded`` - extract the job log, set ``jobStatus`` to ``finished``
      and delete the job description file if it exists.
    * ``Running`` - set ``jobStatus`` to ``running`` if needed and, for jobs
      with an ``interactivePort``, store the service address as
      ``endpoints``.
    * ``Failed`` - extract the job log, set ``jobStatus`` to ``failed``,
      store the status detail as ``errorMsg`` and delete the description file.
    * ``Unknown`` - record when the job was first seen in this state; once it
      has stayed there for more than 300 seconds, count a retry and either
      resubmit the job or, from the fifth retry on, set ``jobStatus`` to
      ``error``.
    * ``PendingHostPort`` - resubmit the job.

    The ``DataHandler`` is closed on every exit path.  Returns ``None``.
    """
    dataHandler = DataHandler()
    # The handler was never closed here, unlike in the submit functions.
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobId = job["jobId"]
        logging.info("start to update job status...")

        if (job["jobStatus"] == "scheduling"
                and jobParams["jobtrainingtype"] == "PSDistJob"):
            launch_ps_dist_job(jobParams)

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        result, detail = k8sUtils.GetJobStatus(jobId)
        dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                       base64.b64encode(json.dumps(detail)))
        msg = "job %s status, result: %s, detail: %s" % (jobId, result,
                                                         json.dumps(detail))
        logging.info(msg)

        if "jobDescriptionPath" in job:
            jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                              job["jobDescriptionPath"])
        else:
            jobDescriptionPath = None

        def delete_job_description():
            # Remove the job's Kubernetes objects via its description file
            # and log what kubectl reported.
            if (jobDescriptionPath is not None
                    and os.path.isfile(jobDescriptionPath)):
                output = k8sUtils.kubectl_delete(jobDescriptionPath)
                logging.info("kubectl delete " + jobDescriptionPath +
                             " output: " + str(output))

        if "userId" not in jobParams:
            jobParams["userId"] = "0"

        status = result.strip()
        if status == "Succeeded":
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "finished")
            delete_job_description()
        elif status == "Running":
            if job["jobStatus"] != "running":
                dataHandler.UpdateJobTextField(jobId, "jobStatus", "running")
            # NOTE(review): endpoints are refreshed on every poll of a
            # running job, not only on the transition - confirm the nesting.
            if "interactivePort" in jobParams:
                serviceAddress = k8sUtils.GetServiceAddress(jobId)
                serviceAddress = base64.b64encode(json.dumps(serviceAddress))
                dataHandler.UpdateJobTextField(jobId, "endpoints",
                                               serviceAddress)
        elif status == "Failed":
            printlog("Job %s fails, cleaning..." % jobId)
            joblog_manager.extract_job_log(jobId, logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "failed")
            dataHandler.UpdateJobTextField(jobId, "errorMsg", detail)
            delete_job_description()
        elif status == "Unknown":
            if jobId not in UnusualJobs:
                UnusualJobs[jobId] = datetime.datetime.now()
            # total_seconds(): timedelta.seconds wraps around every 24 hours.
            elif (datetime.datetime.now() -
                  UnusualJobs[jobId]).total_seconds() > 300:
                del UnusualJobs[jobId]
                retries = dataHandler.AddandGetJobRetries(jobId)
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             jobId)
                    dataHandler.UpdateJobTextField(jobId, "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(jobId, "errorMsg",
                                                   "cannot launch the job.")
                    delete_job_description()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (jobId, retries))
                    SubmitJob(job)
        elif status == "PendingHostPort":
            printlog(
                "Cannot find host ports for job :%s, re-launch the job with different host ports "
                % (jobId))
            SubmitJob(job)

        # Any definite status clears the "unknown since" bookkeeping.
        if status != "Unknown" and jobId in UnusualJobs:
            del UnusualJobs[jobId]
    finally:
        dataHandler.Close()
def submit_job_impl(self, job):
    """Generate the pods and secrets of a job and create them in Kubernetes.

    ``job`` is a job record with ``jobId`` and a base64-encoded JSON
    ``jobParams`` entry.  If pods of a previous submission still exist they
    are force-deleted and the method returns ``None`` without submitting.

    Otherwise the job's endpoints are reset to ``pending``, the job
    parameters are completed (gid/uid/user, job token, ``DLTS_JOB_TOKEN``
    env), a pod template is chosen by ``jobtrainingtype``, the generated pod
    specs are written to the job description file, and secrets and pods are
    created through ``JobDeployer``.  Finally the job record is updated
    (``jobStatus`` = ``scheduling``, description, meta).

    Returns:
        ``None`` when leftover pods had to be cleaned up first; ``False``
        when ``jobtrainingtype`` is invalid or pod generation reported an
        error (the job is flagged through ``SetJobError``); otherwise a dict
        with ``output`` and ``jobId`` on success or ``error`` when an
        exception was caught.  A caught exception counts as a retry; from
        the fifth retry on the job is set to ``error`` and a force delete of
        the job is attempted.

    The ``DataHandler`` is closed on every exit path after it is created.
    """
    # check if existing any pod with label: run=job_id
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not self._all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()
    try:
        # TODO refine later
        # before resubmit the job, reset the endpoints
        # update all endpoint to status 'pending', so it would restart when
        # job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert isinstance(
            job_object,
            Job), "job_object is not of Job, but " + str(type(job_object))

        params = json.loads(base64.b64decode(job["jobParams"]))
        job_object.params = params

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(params["userName"])[0]
        params["gid"] = user_info["gid"]
        params["uid"] = user_info["uid"]
        params["user"] = job_object.get_alias()

        if "job_token" not in params:
            if "user_sign_token" in config and "userName" in params:
                params["job_token"] = hashlib.md5(
                    params["userName"] + ":" +
                    config["user_sign_token"]).hexdigest()
            else:
                params["job_token"] = "tryme2017"

        if "envs" not in params:
            params["envs"] = []
        params["envs"].append({
            "name": "DLTS_JOB_TOKEN",
            "value": params["job_token"]
        })

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        secret_template = job_object.get_blobfuse_secret_template()
        training_type = params["jobtrainingtype"]
        if training_type == "RegularJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                enable_custom_scheduler=enable_custom_scheduler,
                secret_template=secret_template)
        elif training_type == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template(),
                                           secret_template=secret_template)
        elif training_type == "InferenceJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                deployment_template=job_object.get_deployment_template(),
                enable_custom_scheduler=False,
                secret_template=secret_template)
        else:
            dataHandler.SetJobError(
                job_object.job_id,
                "ERROR: invalid jobtrainingtype: %s" % training_type)
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        description_dir = os.path.dirname(local_jobDescriptionPath)
        if not os.path.exists(description_dir):
            os.makedirs(description_dir)
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        secrets = pod_template.generate_secrets(job_object)

        job_deployer = JobDeployer()
        try:
            secrets = job_deployer.create_secrets(secrets)
            ret["output"] = "Created secrets: {}. ".format(
                [secret.metadata.name for secret in secrets])
            pods = job_deployer.create_pods(pods)
            ret["output"] += "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            # str(e) instead of the deprecated e.message, which is empty for
            # exceptions that do not take exactly one argument.
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)
            # NOTE(review): on this path `pods` still holds the generated
            # specs, not the created objects; confirm the `.spec` access
            # below is meant to fail over to the retry handling.

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {
            "jobDescriptionPath": job_description_path,
            "jobPath": job_object.job_path,
            "workPath": job_object.work_path,
            # the command of the first container
            "LaunchCMD": pods[0].spec.containers[0].command,
        }
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(
                detail, "error", "Server error in job submission")
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))

            # Try to clean up the job.  Best effort: a failure is logged with
            # its traceback but must not mask the submission error.
            try:
                job_deployer = JobDeployer()
                job_deployer.delete_job(job_id, force=True)
                logging.info(
                    "Cleaning up job %s succeeded after %d retries of job submission"
                    % (job["jobId"], retries))
            except Exception:
                logging.warning(
                    "Cleaning up job %s failed after %d retries of job submission"
                    % (job["jobId"], retries),
                    exc_info=True)
    finally:
        # Runs for the `return False` paths and if the handler above raises.
        dataHandler.Close()

    return ret