Example #1
def cleanup_endpoints():
    try:
        data_handler = DataHandler()
        dead_endpoints = data_handler.GetDeadEndpoints()
        for endpoint_id, dead_endpoint in dead_endpoints.items():
            print("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s" %
                  endpoint_id)
            endpoint_description_path = os.path.join(
                config["storage-mount-path"],
                dead_endpoint["endpointDescriptionPath"])
            still_running = get_k8s_endpoint(endpoint_description_path)
            # an empty result means the endpoint no longer exists
            if still_running == "":
                print("Endpoint already gone %s" % endpoint_id)
                status = "stopped"
            else:
                output = k8sUtils.kubectl_delete(endpoint_description_path)
                # kubectl_delete returns 0 on success
                if output == 0:
                    status = "stopped"
                    print("Successfully cleaned up endpoint %s" % endpoint_id)
                else:
                    # keep status defined so the update below never reads an
                    # unbound variable when the delete fails
                    status = "unknown"
                    print("Cleaning up dead endpoint %s failed, endpoint: %s" %
                          (endpoint_id, dead_endpoint))

            dead_endpoint["status"] = status
            dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
            data_handler.UpdateEndpoint(dead_endpoint)
    except Exception:
        traceback.print_exc()
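
The snippet above relies on a get_k8s_endpoint() helper that returns an empty string once the endpoint object no longer exists. That helper is not shown here; the following is only a rough sketch of how such a lookup could be built by shelling out to kubectl (the subprocess approach, function name, and error handling are assumptions, not the project's actual implementation):

import subprocess


def get_k8s_endpoint_sketch(description_path):
    # "kubectl get -f <file>" resolves the objects defined in the manifest;
    # it exits non-zero when they no longer exist, which we map to "".
    try:
        return subprocess.check_output(
            ["kubectl", "get", "-o", "yaml", "-f", description_path])
    except subprocess.CalledProcessError:
        return ""
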
Example #2
def KillJob(job):
    dataHandler = DataHandler()
    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("Killing job %s, with status %s, %s" %
                 (job["jobId"], result, detail))
    if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None:
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          job["jobDescriptionPath"])
        if os.path.isfile(jobDescriptionPath):
            if k8sUtils.kubectl_delete(jobDescriptionPath) == 0:
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "killed")
                return True
            else:
                dataHandler.UpdateJobTextField(
                    job["jobId"], "errorMsg",
                    "Cannot delete job from Kubernetes Cluster!")
    else:
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                       "Cannot find job description file!")

    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
    return False
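
KillJob expects a full job record and reports success as a boolean. A hedged sketch of a caller, assuming a DataHandler accessor that returns jobs whose status has been set to "killing" (GetPendingKillJobs is an illustrative name, not necessarily the real API):

def kill_pending_jobs_sketch():
    # illustrative polling pass; GetPendingKillJobs is an assumed accessor
    data_handler = DataHandler()
    try:
        for job in data_handler.GetPendingKillJobs():
            if not KillJob(job):
                logging.warning("Failed to kill job %s" % job["jobId"])
    finally:
        data_handler.Close()
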
Example #3
def cleanup_endpoints():
    try:
        data_handler = DataHandler()
        try:
            dead_endpoints = data_handler.GetDeadEndpoints()
            for endpoint_id, dead_endpoint in dead_endpoints.items():
                try:
                    logger.info(
                        "\n\n\n\n\n\n----------------Begin to cleanup endpoint %s",
                        endpoint_id)
                    endpoint_description_path = os.path.join(
                        config["storage-mount-path"],
                        dead_endpoint["endpointDescriptionPath"])
                    still_running = get_k8s_endpoint(endpoint_description_path)
                    # an empty result means the endpoint no longer exists
                    if still_running == "":
                        logger.info("Endpoint already gone %s", endpoint_id)
                        status = "stopped"
                    else:
                        output = k8sUtils.kubectl_delete(
                            endpoint_description_path)
                        # 0 for success
                        if output == 0:
                            status = "stopped"
                            logger.info("Succeed cleanup endpoint %s",
                                        endpoint_id)
                        else:
                            # TODO will need to clean it up eventually
                            status = "unknown"
                            logger.info(
                                "Clean dead endpoint %s failed, endpoints: %s",
                                endpoint_id, dead_endpoint)

                    # do not overwrite a "pending" status; "pending" endpoints are planned to be set up later
                    if dead_endpoint["status"] != "pending":
                        dead_endpoint["status"] = status
                    dead_endpoint["lastUpdated"] = datetime.datetime.now(
                    ).isoformat()
                    data_handler.UpdateEndpoint(dead_endpoint)
                except Exception as e:
                    logger.warning(
                        "Clanup endpoint failed {}".format(dead_endpoint),
                        exc_info=True)
        except Exception as e:
            logger.exception("cleanup endpoint failed")
        finally:
            data_handler.Close()
    except Exception as e:
        logger.exception("close data handler failed")
Example #4
def UpdateDistJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
    if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(
            jobParams["numpsworker"]) and "items" in psPodInfo and len(
                psPodInfo["items"]) == int(jobParams["numps"]):
        if job["jobStatus"] == "scheduling":
            launch_ps_dist_job(jobParams)
        if job["jobStatus"] == "running":
            result, detail = GetDistJobStatus(job["jobId"])
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                           base64.b64encode(json.dumps(detail)))

            printlog("job %s status: %s" % (job["jobId"], result))

            jobDescriptionPath = os.path.join(
                config["storage-mount-path"], job["jobDescriptionPath"]
            ) if "jobDescriptionPath" in job else None

            if result.strip() == "Succeeded":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "finished")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)

            elif result.strip() == "Running":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                if job["jobStatus"] != "running":
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "running")
                if "interactivePort" in jobParams:
                    serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
                    serviceAddress = base64.b64encode(
                        json.dumps(serviceAddress))
                    dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                                   serviceAddress)

            elif result.strip() == "Failed":
                printlog("Job %s fails, cleaning..." % job["jobId"])
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "failed")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               detail)
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)

            elif result.strip() == "Unknown":
                if job["jobId"] not in UnusualJobs:
                    UnusualJobs[job["jobId"]] = datetime.datetime.now()
                elif (datetime.datetime.now() -
                      UnusualJobs[job["jobId"]]).seconds > 300:
                    del UnusualJobs[job["jobId"]]
                    retries = dataHandler.AddandGetJobRetries(job["jobId"])
                    if retries >= 5:
                        printlog("Job %s fails for more than 5 times, abort" %
                                 job["jobId"])
                        dataHandler.UpdateJobTextField(job["jobId"],
                                                       "jobStatus", "error")
                        dataHandler.UpdateJobTextField(
                            job["jobId"], "errorMsg", "cannot launch the job.")
                        if jobDescriptionPath is not None and os.path.isfile(
                                jobDescriptionPath):
                            k8sUtils.kubectl_delete(jobDescriptionPath)
                    else:
                        printlog(
                            "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                            % (job["jobId"], retries))
                        SubmitJob(job)

            if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
                del UnusualJobs[job["jobId"]]

    pass
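
The UnusualJobs dictionary above acts as a grace period: a job reported as "Unknown" is only retried or aborted after it has stayed unknown for more than 300 seconds. The same idea, distilled into a standalone helper (the function name and module-level dict are illustrative):

import datetime

UNKNOWN_GRACE_SECONDS = 300  # matches the 300-second window used above
_first_seen_unknown = {}


def should_handle_unknown_sketch(job_id, now=None):
    # Returns True only once the job has been "Unknown" longer than the
    # grace period; callers then decide whether to resubmit or abort.
    now = now or datetime.datetime.now()
    if job_id not in _first_seen_unknown:
        _first_seen_unknown[job_id] = now
        return False
    if (now - _first_seen_unknown[job_id]).seconds > UNKNOWN_GRACE_SECONDS:
        del _first_seen_unknown[job_id]
        return True
    return False
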
Example #5
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #	dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #	return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath, jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath, "models"),
                             jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath, "0")
                mkdirsAsUser(os.path.join(localJobPath, "models"), "0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        if isinstance(jobParams["cmd"],
                      basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()
            if "userId" in jobParams:
                os.system("chown -R %s %s" %
                          (jobParams["userId"], launchScriptPath))
            jobParams[
                "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams[
                    "jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")

        jobParams["mountpoints"].append({
            "name": "nvidia-driver",
            "containerPath": "/usr/local/nvidia",
            "hostPath": nvidiaDriverPath
        })
        jobParams["mountpoints"].append({
            "name": "job",
            "containerPath": "/job",
            "hostPath": jobParams["hostjobPath"]
        })
        jobParams["mountpoints"].append({
            "name": "work",
            "containerPath": "/work",
            "hostPath": jobParams["hostworkPath"]
        })
        jobParams["mountpoints"].append({
            "name": "data",
            "containerPath": "/data",
            "hostPath": jobParams["hostdataPath"]
        })
        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)
        template = ENV.get_template(os.path.abspath(jobTemp))
        job_description = template.render(job=jobParams)

        jobDescriptionList = []

        jobDescriptionList.append(job_description)

        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = "interactive-" + jobParams[
                    "jobId"] + "-" + portNum
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"

                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))

                template = ENV.get_template(serviceTemplate)
                interactiveMeta = template.render(svc=jobParams)
                jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
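
SubmitRegularJob renders its Kubernetes manifest with Jinja2, using an Environment rooted at "/" so templates can be addressed by absolute path. A minimal sketch of that rendering step in isolation (the path argument is illustrative):

import os
from jinja2 import Environment, FileSystemLoader


def render_job_sketch(template_path, job_params):
    # FileSystemLoader("/") lets get_template() accept an absolute path,
    # mirroring ENV = Environment(loader=FileSystemLoader("/")) above.
    env = Environment(loader=FileSystemLoader("/"))
    template = env.get_template(os.path.abspath(template_path))
    return template.render(job=job_params)
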
Example #6
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])
        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.copy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])

                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False

                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False

                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


################ One choice is that we only wait for a certain amount of time.
#					launchCMD = """
##!/bin/bash
#mkdir -p /opt
#echo "[DLWorkspace System]: Waiting for all containers are ready..."
## wait for at most 10 mins.
#for i in {1..200}; do
#	if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#		sleep 3
#	else
#		break
#	fi
#done
#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#	echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
#	exit 1
#else
#	echo "[DLWorkspace System]: All containers are ready, launching training job..."
#	chmod +x /opt/run_dist_job.sh
#	/opt/run_dist_job.sh
#fi
#"""

                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
	sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    f.close()
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    distJobParam["mountpoints"].append({
                        "name":
                        "nvidia-driver",
                        "containerPath":
                        "/usr/local/nvidia",
                        "hostPath":
                        nvidiaDriverPath
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
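
Every ps and worker manifest is rendered separately and then joined into one multi-document YAML file, which kubectl create treats as a set of objects. A small sketch of that assembly step, assuming the rendered manifests are already plain strings:

def write_multi_doc_yaml_sketch(manifests, out_path):
    # "---" on its own line is the YAML document separator, so a single
    # file can carry every ps and worker object for the job.
    combined = "\n---\n".join(manifests)
    with open(out_path, "w") as f:
        f.write(combined)
    return combined
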
Example #7
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]


        if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: 
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False


        jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])


        localJobPath = os.path.join(config["storage-mount-path"],jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath,jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath,"0")
                mkdirsAsUser(os.path.join(localJobPath,"models"),"0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
            
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()    
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
            jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]


        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath


        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/","")

        mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)                        

        userAlias = getAlias(jobParams["userName"])

        mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)            


        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print ("Render Job: %s" % jobParams)
        jobDescriptionList = []

        pods = []
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
                pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
                i += step
                c += 1 
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])


        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
                container = {}
                container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
                    # add topology constraints explicitly - for testing
                    # if (jobParams["resourcegpu"] >= 2):
                    #     # both cards in same inner group
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 3):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 4):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 5):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 6):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 7):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 8):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
                else:
                    # for cases when the desired topology is explicitly given or not desired
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
                podInfo["runningcontainer"] = {jobParams["podName"] : container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
                jobParams["resourcegpu"] = 0 # gpu requests specified through annotation

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0):
                ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template"))

                    stemplate = ENV.get_template(serviceTemplate)
                    interactiveMeta = stemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)


        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
        if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath) 

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)    
        logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))

        ret["output"] = output
        
        ret["jobId"] = jobParams["jobId"]


        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))


        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
            dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))

    return ret
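
This variant can fan a single submission out into several pods when a hyperparameter sweep is requested, giving each pod the same spec plus one environment variable carrying the swept value. The expansion logic on its own, following the fields used above:

def expand_hyperparameter_pods_sketch(job_params):
    # One pod per swept value from start to end (inclusive) by step;
    # without sweep fields, a single pod named after the job is produced.
    pods = []
    sweep_fields = ("hyperparametername", "hyperparameterstartvalue",
                    "hyperparameterendvalue", "hyperparameterstep")
    if all(field in job_params for field in sweep_fields):
        value = int(job_params["hyperparameterstartvalue"])
        end = int(job_params["hyperparameterendvalue"])
        step = int(job_params["hyperparameterstep"])
        index = 0
        while value <= end:
            pods.append({
                "podName": "%s-pod-%d" % (job_params["jobId"], index),
                "envs": [{"name": job_params["hyperparametername"],
                          "value": value}],
            })
            value += step
            index += 1
    else:
        pods.append({"podName": job_params["jobId"], "envs": []})
    return pods

For instance, a job whose sweep runs from 1 to 3 with step 1 would yield three pods, each carrying a single env entry with one swept value.
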
Example #8
def UpdateJobStatus(job, notifier=None, dataHandlerOri=None):
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    result = check_job_status(job["jobId"])
    logging.info("++++++++ Job status: {} {}".format(job["jobId"], result))

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))
    elif result == "Running":
        if job["jobStatus"] != "running":
            started_at = datetime.datetime.now().isoformat()
            detail = [{
                "startedAt": started_at,
                "message": "started at: {}".format(started_at)
            }]
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

    elif result == "Failed":
        logging.warning("Job %s fails, cleaning...", job["jobId"])

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))

        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed")

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result == "Unknown" or result == "NotFound":
        if job["jobId"] not in UnusualJobs:
            logging.warning("!!! Job status ---{}---, job: {}".format(
                result, job["jobId"]))
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        # TODO
        # 1) May need to reduce the timeout.
        #     It takes minutes before a pod turns into "Unknown"; we may not need to wait that long.
        # 2) If the node resumes before we resubmit the job, the job will end up in status 'NotFound'.
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 30:
            del UnusualJobs[job["jobId"]]

            # TODO refine later
            # before resubmit the job, reset the endpoints
            # update all endpoint to status 'pending', so it would restart when job is ready
            endpoints = dataHandler.GetJobEndpoints(job["jobId"])
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            logging.warning(
                "Job {} fails in Kubernetes as {}, delete and re-submit.".
                format(job["jobId"], result))
            KillJob(job["jobId"], "queued")

    if result != "Unknown" and result != "NotFound" and job[
            "jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
    if dataHandlerOri is None:
        dataHandler.Close()
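
Throughout these examples, jobStatusDetail is stored as base64-encoded JSON text. A small sketch of that round trip under Python 2 semantics, where b64encode accepts and returns str:

import base64
import json


def encode_detail_sketch(detail):
    # detail is any JSON-serializable structure,
    # e.g. [{"startedAt": "...", "message": "..."}]
    return base64.b64encode(json.dumps(detail))


def decode_detail_sketch(encoded):
    return json.loads(base64.b64decode(encoded))
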
Example #9
def UpdateJobStatus(job):
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        # launch the user command only when all pods are ready
        result, detail = k8sUtils.GetJobStatus(job["jobId"])
        if result in ["Failed", "Succeeded"]:
            # TODO shouldn't be here; update the status
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result)
            pass
        else:
            # the status was previously 'scheduling'; check whether all pods are now ready
            # TODO check all pods are ready
            if k8sUtils.all_pod_ready(job["jobId"]):
                try:
                    launch_ps_dist_job(jobParams)
                except Exception as e:
                    print(e)
            return

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"
    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))

        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    dataHandler.Close()
Example #10
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]

        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False
                    #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
                    #    dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
                    #    return False
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    # TODO ???
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


# change the ssh folder permissions here because the permission-setup script in the launch_ps_job function may race with the init_user.sh script, resulting in a "no such user" error
                    if role == "ps":
                        launchCMD = """
#!/bin/bash
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done

sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;

sudo mkdir -p /root/.ssh  &>/dev/null ;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config  &>/dev/null;
sudo mkdir -p /opt  &>/dev/null;
sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

JOB_DIR='/home/%s'
WORKER_NUM=%s
echo $JOB_DIR $WORKER_NUM

all_workers_ready=false
while [ "$all_workers_ready" != true ]
do
  # update it to false if any worker is not ready
  all_workers_ready=true

  for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
  do
    worker="worker${i}"
    file="$JOB_DIR/${worker}/WORKER_READY"
    #echo $file

    if [ ! -f $file ]; then
      echo "${worker} not ready!"
      all_workers_ready=false
      sleep 10
    fi
  done
done

echo "[DLWorkspace System]: All containers are ready, launching training job..."
%s
""" % (userAlias, userAlias, userAlias, userAlias, userAlias,
                        distJobParam["jobPath"], jobParams["numpsworker"], distJobParam["cmd"])
                    else:
                        launchCMD = """
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh  &>/dev/null;
sudo mkdir -p /root/.ssh  &>/dev/null;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile  &>/dev/null;

# TODO mark the worker as 'READY', better to change to '/pod/READY' later
sudo touch /job/WORKER_READY

sleep infinity
""" % (userAlias, userAlias, userAlias, userAlias, userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath,
                        "launch-%s-%s%d.sh" % (distJobParam["jobId"], role, i))
                    # TODO need to set up user for distribute jobs
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    f.close()

                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)

                    distJobParam[
                        "LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })

                    # give any caller-supplied mountpoint without a name a generated one
                    for mountpoint in distJobParam["mountpoints"]:
                        if "name" not in mountpoint:
                            mountpoint["name"] = str(uuid.uuid4()).replace("-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config:
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    # host-network jobs pick a port in the 40000-49999 range, presumably
                    # to avoid clashes with other jobs sharing the host; pod-network jobs
                    # can use any port in 3000-3999
                    random.seed(datetime.datetime.now())
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(40000, 49999)
                    else:
                        distJobParam["containerPort"] = random.randint(3000, 3999)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    if "gpuType" in distJobParam:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["gpuType"] = distJobParam[
                            "gpuType"]

                    # inject gid, uid and user
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["cmd"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
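
The submission path above boils down to: render the Jinja2 job template once per role/replica, join the rendered documents with "---" into a single multi-document YAML file, write it under the storage mount, and hand it to kubectl. A minimal, self-contained sketch of that flow follows; the function name, its arguments, and the direct subprocess call to kubectl are illustrative stand-ins, not the project's k8sUtils API.

import os
import subprocess
from jinja2 import Environment, FileSystemLoader


def render_and_submit(template_dir, template_name, per_replica_params, out_path):
    env = Environment(loader=FileSystemLoader(template_dir))
    template = env.get_template(template_name)

    # one rendered document per replica, joined into a multi-document YAML
    documents = [template.render(job=params) for params in per_replica_params]
    job_description = "\n---\n".join(documents)

    out_dir = os.path.dirname(os.path.realpath(out_path))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(out_path, "w") as f:
        f.write(job_description)

    # stand-in for k8sUtils.kubectl_create(out_path); returns the kubectl exit code
    return subprocess.call(["kubectl", "create", "-f", out_path])

The code above additionally deletes any pre-existing description file with kubectl before re-creating it, and records the description path and the base64-encoded YAML in the job database.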
Exemplo n.º 11
0
def UpdateJobStatus(job):

    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()

        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]

            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")

                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    output = k8sUtils.kubectl_delete(jobDescriptionPath)
                    logging.info("kubectl delete " + jobDescriptionPath +
                                 " output: " + str(output))

            else:
                printlog(
                    "Job %s failed in Kubernetes, deleting and re-submitting the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)

    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job %s, re-launching the job with different host ports"
            % (job["jobId"]))

        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
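
The "Unknown" branch above acts as a debounce plus a retry cap: a job must stay in the Unknown state for more than five minutes before it is either re-submitted or, once it has accumulated five retries, marked as an error. A self-contained sketch of that pattern, with hypothetical callback names (the real code also deletes the job's Kubernetes resources before giving up):

import datetime

UNKNOWN_SINCE = {}  # job_id -> when the job was first seen in the Unknown state
GRACE_PERIOD = datetime.timedelta(minutes=5)
MAX_RETRIES = 5


def handle_unknown(job_id, add_and_get_retries, resubmit, mark_error):
    now = datetime.datetime.now()
    first_seen = UNKNOWN_SINCE.setdefault(job_id, now)
    if now - first_seen <= GRACE_PERIOD:
        return  # still within the grace period; wait for the next status poll
    del UNKNOWN_SINCE[job_id]
    if add_and_get_retries(job_id) >= MAX_RETRIES:
        mark_error(job_id, "cannot launch the job.")
    else:
        resubmit(job_id)


def clear_unknown(job_id):
    # call whenever the job reports any status other than Unknown
    UNKNOWN_SINCE.pop(job_id, None)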