Example #1
def UpdateDistJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(detail))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
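    # Proceed only when the expected number of worker and ps pods already exist for this job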
    if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(
            jobParams["numpsworker"]) and "items" in psPodInfo and len(
                psPodInfo["items"]) == int(jobParams["numps"]):
        if job["jobStatus"] == "scheduling":
            launch_ps_dist_job(jobParams)
        if job["jobStatus"] == "running":
            result, detail = GetDistJobStatus(job["jobId"])
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                           base64.b64encode(detail))

            printlog("job %s status: %s" % (job["jobId"], result))

            jobDescriptionPath = os.path.join(
                config["storage-mount-path"], job["jobDescriptionPath"]
            ) if "jobDescriptionPath" in job else None

            if result.strip() == "Succeeded":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "finished")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)

            elif result.strip() == "Running":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                if job["jobStatus"] != "running":
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "running")
                if "interactivePort" in jobParams:
                    serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
                    serviceAddress = base64.b64encode(
                        json.dumps(serviceAddress))
                    dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                                   serviceAddress)

            elif result.strip() == "Failed":
                printlog("Job %s fails, cleaning..." % job["jobId"])
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "failed")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               detail)
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)

            elif result.strip() == "Unknown":
                if job["jobId"] not in UnusualJobs:
                    UnusualJobs[job["jobId"]] = datetime.datetime.now()
                elif (datetime.datetime.now() -
                      UnusualJobs[job["jobId"]]).seconds > 300:
                    del UnusualJobs[job["jobId"]]
                    retries = dataHandler.AddandGetJobRetries(job["jobId"])
                    if retries >= 5:
                        printlog("Job %s fails for more than 5 times, abort" %
                                 job["jobId"])
                        dataHandler.UpdateJobTextField(job["jobId"],
                                                       "jobStatus", "error")
                        dataHandler.UpdateJobTextField(
                            job["jobId"], "errorMsg", "cannot launch the job.")
                        if jobDescriptionPath is not None and os.path.isfile(
                                jobDescriptionPath):
                            k8sUtils.kubectl_delete(jobDescriptionPath)
                    else:
                        printlog(
                            "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                            % (job["jobId"], retries))
                        SubmitJob(job)

            if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
                del UnusualJobs[job["jobId"]]

Example #2
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
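        # If racks are configured, pin every replica of this distributed job to one randomly chosen rack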
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])
        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.copy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])

                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False

                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False

                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


################ One choice is to wait only for a certain amount of time.
#					launchCMD = """
##!/bin/bash
#mkdir -p /opt
#echo "[DLWorkspace System]: Waiting for all containers are ready..."
## wait for at most 10 mins.
#for i in {1..200}; do
#	if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#		sleep 3
#	else
#		break
#	fi
#done
#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#	echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
#	exit 1
#else
#	echo "[DLWorkspace System]: All containers are ready, launching training job..."
#	chmod +x /opt/run_dist_job.sh
#	/opt/run_dist_job.sh
#fi
#"""

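                    # Launch script: block until the controller drops the run_dist_job marker files, then run the user's job script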
                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
	sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    distJobParam["mountpoints"].append({
                        "name":
                        "nvidia-driver",
                        "containerPath":
                        "/usr/local/nvidia",
                        "hostPath":
                        nvidiaDriverPath
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

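                    # Pick a pseudo-random container port in the 3000-3999 range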
                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
Example #3
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #	dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #	return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath, jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath, "models"),
                             jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath, "0")
                mkdirsAsUser(os.path.join(localJobPath, "models"), "0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        if isinstance(jobParams["cmd"],
                      basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                os.system("chown -R %s %s" %
                          (jobParams["userId"], launchScriptPath))
            jobParams[
                "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams[
                    "jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

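        # Jinja2 environment rooted at "/" so templates can be looked up by absolute path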
        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")

        jobParams["mountpoints"].append({
            "name": "nvidia-driver",
            "containerPath": "/usr/local/nvidia",
            "hostPath": nvidiaDriverPath
        })
        jobParams["mountpoints"].append({
            "name": "job",
            "containerPath": "/job",
            "hostPath": jobParams["hostjobPath"]
        })
        jobParams["mountpoints"].append({
            "name": "work",
            "containerPath": "/work",
            "hostPath": jobParams["hostworkPath"]
        })
        jobParams["mountpoints"].append({
            "name": "data",
            "containerPath": "/data",
            "hostPath": jobParams["hostdataPath"]
        })
        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)
        template = ENV.get_template(os.path.abspath(jobTemp))
        job_description = template.render(job=jobParams)

        jobDescriptionList = []

        jobDescriptionList.append(job_description)

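        # Emit one Kubernetes Service definition per requested interactive port (comma- or semicolon-separated)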
        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = "interactive-" + jobParams[
                    "jobId"] + "-" + portNum
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"

                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                interactiveMeta = serviceTemplate.render(svc=jobParams)
                jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
Example #4
def SubmitJob(job):
    # check whether any pod with the label run=job_id still exists
    assert ("jobId" in job)
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previous pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmitting the job, reset the endpoints
        # set every endpoint to status 'pending' so it restarts once the job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert (isinstance(job_object, Job))

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

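        # Serialize all generated pod specs into one multi-document YAML file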
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % e.message
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
Example #5
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]


        if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: 
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False


        jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])


        localJobPath = os.path.join(config["storage-mount-path"],jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath,jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath,"0")
                mkdirsAsUser(os.path.join(localJobPath,"models"),"0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
            
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
            jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]


        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath


        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/","")

        mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)                        

        userAlias = getAlias(jobParams["userName"])

        mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)            


        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print ("Render Job: %s" % jobParams)
        jobDescriptionList = []

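        # Hyperparameter sweep: one pod per value from start to end (inclusive) with the given step; otherwise a single pod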
        pods = []
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
                pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
                i += step
                c += 1 
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])


        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

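            # With the custom GPU scheduler, GPU requests are passed via the pod.alpha/DeviceInformation annotation and resourcegpu is zeroed out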
            if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
                container = {}
                container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
                    # add topology constraints explicitly - for testing
                    # if (jobParams["resourcegpu"] >= 2):
                    #     # both cards in same inner group
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 3):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 4):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 5):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 6):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 7):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 8):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
                else:
                    # for cases when the desired topology is explicitly given or not desired
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
                podInfo["runningcontainer"] = {jobParams["podName"] : container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
                jobParams["resourcegpu"] = 0 # gpu requests specified through annotation

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0):
                ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template"))
                    interactiveMeta = serviceTemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)


        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
        if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath) 

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)    
        logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))

        ret["output"] = output
        
        ret["jobId"] = jobParams["jobId"]


        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))


        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
            dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))

    return ret
Example #6
    def submit_job_impl(self, job):
        # check whether any pod with the label run=job_id still exists
        assert ("jobId" in job)
        job_id = job["jobId"]
        if not self._all_pods_not_existing(job_id):
            logger.warning(
                "Waiting until previous pods are cleaned up! Job {}".format(
                    job_id))
            errors = self.delete_job(job_id, force=True)
            if errors:
                logger.warning("Force delete job {}: {}".format(
                    job_id, errors))
            return

        ret = {}
        dataHandler = DataHandler()

        try:
            # TODO refine later
            # before resubmitting the job, reset the endpoints
            # set every endpoint to status 'pending' so it restarts once the job is ready
            endpoints = dataHandler.GetJobEndpoints(job_id)
            for endpoint_id, endpoint in list(endpoints.items()):
                endpoint["status"] = "pending"
                logger.debug("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            job["cluster"] = config
            job_object, errors = JobSchema().load(job)
            # TODO assert job_object is a Job
            assert isinstance(
                job_object,
                Job), "job_object is not of Job, but " + str(type(job_object))

            job_object.params = json.loads(b64decode(job["jobParams"]))

            # inject gid, uid and user
            # TODO it should return only one entry
            user_info = dataHandler.GetIdentityInfo(
                job_object.params["userName"])[0]
            job_object.params["gid"] = user_info["gid"]
            job_object.params["uid"] = user_info["uid"]
            job_object.params["user"] = job_object.get_alias()
            job_object.params["private_key"] = user_info["private_key"]
            job_object.params["ssh_public_keys"] = job_object.params.get(
                "ssh_public_keys", [])
            job_object.params["ssh_public_keys"].append(
                user_info["public_key"])

            if "job_token" not in job_object.params:
                if "master_token" in config and config[
                        "master_token"] is not None and "userName" in job_object.params:
                    plain_token = job_object.params["userName"] + \
                        ":" + config["master_token"]
                    job_object.params["job_token"] = hashlib.md5(
                        plain_token.encode("utf-8")).hexdigest()
                else:
                    job_object.params["job_token"] = "tryme2017"

            if "envs" not in job_object.params:
                job_object.params["envs"] = []
            job_object.params["envs"].append({
                "name":
                "DLTS_JOB_TOKEN",
                "value":
                job_object.params["job_token"]
            })

            blobfuse_secret_template = job_object.get_blobfuse_secret_template(
            )
            image_pull_secret_template = job_object.get_image_pull_secret_template(
            )
            secret_templates = {
                "blobfuse": blobfuse_secret_template,
                "imagePull": image_pull_secret_template
            }
            if job_object.params["jobtrainingtype"] == "RegularJob":
                pod_template = RegularJobTemplate(
                    job_object.get_template(),
                    secret_templates=secret_templates)
            elif job_object.params["jobtrainingtype"] == "PSDistJob":
                pod_template = DistributeJobTemplate(
                    job_object.get_template(),
                    secret_templates=secret_templates)
            elif job_object.params["jobtrainingtype"] == "InferenceJob":
                pod_template = InferenceJobTemplate(
                    job_object.get_template(),
                    deployment_template=job_object.get_deployment_template(),
                    secret_templates=secret_templates)
            else:
                dataHandler.SetJobError(
                    job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                    job_object.params["jobtrainingtype"])
                dataHandler.Close()
                return False

            pods, error = pod_template.generate_pods(job_object)
            if error:
                dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
                dataHandler.Close()
                return False

            job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])

            secrets = pod_template.generate_secrets(job_object)

            try:
                secrets = self.create_secrets(secrets)
                ret["output"] = "Created secrets: {}. ".format(
                    [secret.metadata.name for secret in secrets])
                created_pods = self.create_pods(pods)
                ret["output"] += "Created pods: {}".format(
                    [pod.metadata.name for pod in created_pods])
            except Exception as e:
                ret["output"] = "Error: %s" % e.message
                logger.exception(e)

            ret["jobId"] = job_object.job_id

            jobMeta = {}
            jobMeta["jobPath"] = job_object.job_path
            jobMeta["workPath"] = job_object.work_path
            # the command of the first container
            jobMeta["LaunchCMD"] = job_object.params["cmd"]

            jobMetaStr = b64encode(json.dumps(jobMeta))

            dataFields = {
                "jobStatus": "scheduling",
                "jobDescription": b64encode(job_description),
                "lastUpdated": datetime.datetime.now().isoformat(),
                "jobMeta": jobMetaStr
            }
            conditionFields = {"jobId": job_object.job_id}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
        except Exception as e:
            logger.error("Submit job failed: %s" % job, exc_info=True)
            ret["error"] = str(e)
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                detail = get_job_status_detail(job)
                detail = job_status_detail_with_finished_time(
                    detail, "error", "Server error in job submission")

                dataFields = {
                    "jobStatus": "error",
                    "errorMsg": "Cannot submit job!" + str(e),
                    "jobStatusDetail": b64encode(json.dumps(detail))
                }
                conditionFields = {"jobId": job["jobId"]}
                dataHandler.UpdateJobTextFields(conditionFields, dataFields)
                # Try to clean up the job
                try:
                    self.delete_job(job_id, force=True)
                    logger.info(
                        "Cleaning up job %s succeeded after %d retries of job submission"
                        % (job["jobId"], retries))
                except:
                    logger.warning(
                        "Cleaning up job %s failed after %d retries of job submission"
                        % (job["jobId"], retries))

        dataHandler.Close()
        return ret
Example #7
def UpdateJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        # launch the user command only after all pods are ready
        result, detail = k8sUtils.GetJobStatus(job["jobId"])
        if result in ["Failed", "Succeeded"]:
            # TODO shouldn't be here; update status
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result)
            pass
        else:
            # previously status is 'scheduling', and now all pods are ready
            # TODO check all pods are ready
            if k8sUtils.all_pod_ready(job["jobId"]):
                try:
                    launch_ps_dist_job(jobParams)
                except Exception as e:
                    print(e)
            return

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"
    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))

        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    dataHandler.Close()
Example #8
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

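        # Resolve the user's alias from the email-style userName; the home folder lives under the user's work path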
        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]

        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False
                    #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
                    #    dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
                    #    return False
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    # TODO ???
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


                    # Change the ssh folder permissions here because the permission-setup script
                    # in the launch_ps_job function may race with the init_user.sh script,
                    # which results in a "no such user" error.
                    if role == "ps":
                        launchCMD = """
#!/bin/bash
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done

sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;

sudo mkdir -p /root/.ssh  &>/dev/null ;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config  &>/dev/null;
sudo mkdir -p /opt  &>/dev/null;
sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

JOB_DIR='/home/%s'
WORKER_NUM=%s
echo $JOB_DIR $WORKER_NUM

all_workers_ready=false
while [ "$all_workers_ready" != true ]
do
  # update it to false if any worker is not ready
  all_workers_ready=true

  for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
  do
    worker="worker${i}"
    file="$JOB_DIR/${worker}/WORKER_READY"
    #echo $file

    if [ ! -f $file ]; then
      echo "${worker} not ready!"
      all_workers_ready=false
      sleep 10
    fi
  done
done

echo "[DLWorkspace System]: All containers are ready, launching training job..."
%s
""" % (userAlias, userAlias, userAlias, userAlias, userAlias,
                        distJobParam["jobPath"], jobParams["numpsworker"], distJobParam["cmd"])
                    else:
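                        # Worker launch script: fix ssh permissions, write the WORKER_READY marker, then idle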
                        launchCMD = """
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh  &>/dev/null;
sudo mkdir -p /root/.ssh  &>/dev/null;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile  &>/dev/null;

# TODO mark the worker as 'READY', better to change to '/pod/READY' later
sudo touch /job/WORKER_READY

sleep infinity
""" % (userAlias, userAlias, userAlias, userAlias, userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath,
                        "launch-%s-%s%d.sh" % (distJobParam["jobId"], role, i))
                    # TODO need to set up the user for distributed jobs
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)

                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)

                    distJobParam["LaunchCMD"] = (
                        '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log '
                        '&& runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer)

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")
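                    # Each replica is rendered into its own Kubernetes YAML document from the
                    # Jinja2 template Jobs_Templete/DistJob.yaml.template (see template.render
                    # further below).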

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
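                    # Inside the container, /job, /work and /data are hostPath mounts of the
                    # job, work and data sub-paths of the cluster storage mount.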
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })

                    for idx in range(len(distJobParam["mountpoints"])):
                        if "name" not in distJobParam["mountpoints"][idx]:
                            distJobParam["mountpoints"][idx]["name"] = str(
                                uuid.uuid4()).replace("-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config:
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    random.seed(datetime.datetime.now())
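                    # With hostNetwork the pod shares the node's network namespace, so a random
                    # high port (40000-49999) is chosen, presumably to reduce host-port
                    # collisions; otherwise a port in the 3000-3999 range is used.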
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(
                            40000, 49999)
                    else:
                        distJobParam["containerPort"] = int(random.random() *
                                                            1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    if "gpuType" in distJobParam:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["gpuType"] = distJobParam[
                            "gpuType"]

                    # inject gid, uid and user
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["cmd"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret


def UpdateJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()

        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).total_seconds() > 300:
            del UnusualJobs[job["jobId"]]

            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")

                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    output = k8sUtils.kubectl_delete(jobDescriptionPath)
                    logging.info("kubectl delete " + jobDescriptionPath +
                                 " output: " + str(output))

            else:
                printlog(
                    "Job %s failed in Kubernetes, deleting and re-submitting the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)

    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job %s, re-launching the job with different host ports"
            % job["jobId"])

        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
Example #10
0
    def submit_job_impl(self, job):
        # check whether any pod with the label run=job_id already exists
        assert ("jobId" in job)
        job_id = job["jobId"]
        if not self._all_pods_not_existing(job_id):
            logging.warning(
                "Waiting until previously created pods are cleaned up! Job {}".format(
                    job_id))
            job_deployer = JobDeployer()
            errors = job_deployer.delete_job(job_id, force=True)
            if errors:
                logging.warning("Force delete job {}: {}".format(
                    job_id, errors))
            return

        ret = {}
        dataHandler = DataHandler()

        try:
            # TODO refine later
            # Before resubmitting the job, reset its endpoints:
            # mark every endpoint as 'pending' so it is restarted once the job is ready.
            endpoints = dataHandler.GetJobEndpoints(job_id)
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            job["cluster"] = config
            job_object, errors = JobSchema().load(job)
            # TODO assert job_object is a Job
            assert isinstance(job_object, Job), \
                "job_object is not a Job, but " + str(type(job_object))

            job_object.params = json.loads(base64.b64decode(job["jobParams"]))

            # inject gid, uid and user
            # TODO it should return only one entry
            user_info = dataHandler.GetIdentityInfo(
                job_object.params["userName"])[0]
            job_object.params["gid"] = user_info["gid"]
            job_object.params["uid"] = user_info["uid"]
            job_object.params["user"] = job_object.get_alias()

            if "job_token" not in job_object.params:
                if "user_sign_token" in config and "userName" in job_object.params:
                    job_object.params["job_token"] = hashlib.md5(
                        job_object.params["userName"] + ":" +
                        config["user_sign_token"]).hexdigest()
                else:
                    job_object.params["job_token"] = "tryme2017"

            if "envs" not in job_object.params:
                job_object.params["envs"] = []
            job_object.params["envs"].append({
                "name":
                "DLTS_JOB_TOKEN",
                "value":
                job_object.params["job_token"]
            })
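            # The job token (an md5 of "<userName>:<user_sign_token>" when a signing secret
            # is configured, a fixed placeholder otherwise) is exposed to the job through
            # the DLTS_JOB_TOKEN environment variable.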

            enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
            secret_template = job_object.get_blobfuse_secret_template()
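            # Select the pod template by training type: RegularJob and InferenceJob use
            # PodTemplate (the latter with a deployment template and no custom scheduler),
            # PSDistJob uses DistPodTemplate; any other type is rejected as an error.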
            if job_object.params["jobtrainingtype"] == "RegularJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    enable_custom_scheduler=enable_custom_scheduler,
                    secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "PSDistJob":
                pod_template = DistPodTemplate(job_object.get_template(),
                                               secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "InferenceJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    deployment_template=job_object.get_deployment_template(),
                    enable_custom_scheduler=False,
                    secret_template=secret_template)
            else:
                dataHandler.SetJobError(
                    job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                    job_object.params["jobtrainingtype"])
                dataHandler.Close()
                return False

            pods, error = pod_template.generate_pods(job_object)
            if error:
                dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
                dataHandler.Close()
                return False

            job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
            job_description_path = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
            local_jobDescriptionPath = os.path.realpath(
                os.path.join(config["storage-mount-path"],
                             job_description_path))
            if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
                os.makedirs(os.path.dirname(local_jobDescriptionPath))
            with open(local_jobDescriptionPath, 'w') as f:
                f.write(job_description)

            secrets = pod_template.generate_secrets(job_object)

            job_deployer = JobDeployer()
            try:
                secrets = job_deployer.create_secrets(secrets)
                ret["output"] = "Created secrets: {}. ".format(
                    [secret.metadata.name for secret in secrets])
                pods = job_deployer.create_pods(pods)
                ret["output"] += "Created pods: {}".format(
                    [pod.metadata.name for pod in pods])
            except Exception as e:
                ret["output"] = "Error: %s" % e.message
                logging.error(e, exc_info=True)

            ret["jobId"] = job_object.job_id

            dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                           "scheduling")
            dataHandler.UpdateJobTextField(job_object.job_id,
                                           "jobDescriptionPath",
                                           job_description_path)
            dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                           base64.b64encode(job_description))
            dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                           datetime.datetime.now().isoformat())

            jobMeta = {}
            jobMeta["jobDescriptionPath"] = job_description_path
            jobMeta["jobPath"] = job_object.job_path
            jobMeta["workPath"] = job_object.work_path
            # the command of the first container
            jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

            jobMetaStr = base64.b64encode(json.dumps(jobMeta))
            dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                           jobMetaStr)
        except Exception as e:
            logging.error("Submit job failed: %s" % job, exc_info=True)
            ret["error"] = str(e)
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "Cannot submit job!" + str(e))

                detail = get_job_status_detail(job)
                detail = job_status_detail_with_finished_time(
                    detail, "error", "Server error in job submission")
                dataHandler.UpdateJobTextField(
                    job["jobId"], "jobStatusDetail",
                    base64.b64encode(json.dumps(detail)))

                # Try to clean up the job
                try:
                    job_deployer = JobDeployer()
                    job_deployer.delete_job(job_id, force=True)
                    logging.info(
                        "Cleaning up job %s succeeded after %d retries of job submission"
                        % (job["jobId"], retries))
                except Exception:
                    logging.warning(
                        "Cleaning up job %s failed after %d retries of job submission"
                        % (job["jobId"], retries))

        dataHandler.Close()
        return ret