Example #1
def _extract_job_log(jobId, logPath, userId):
    dataHandler = None
    try:
        dataHandler = DataHandler()

        old_cursor = dataHandler.GetJobTextField(jobId, "jobLogCursor")
        if old_cursor is not None and len(old_cursor) == 0:
            old_cursor = None
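        # Fetch only the log lines produced after the stored cursor; the new
        # cursor is persisted further below so the next run resumes from it.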
        (pod_logs, new_cursor) = GetJobLog(jobId, cursor=old_cursor)

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)

        for (pod_name, log_text) in pod_logs.items():
            try:
                podLogPath = os.path.join(jobLogDir,
                                          "log-pod-" + pod_name + ".txt")
                with open(podLogPath, 'a', encoding="utf-8") as f:
                    f.write(log_text)
                os.system("chown -R %s %s" % (userId, podLogPath))
            except Exception:
                logger.exception("write pod log of {} failed".format(jobId))

        logging.info("cursor of job %s: %s" % (jobId, new_cursor))
        if new_cursor is not None:
            dataHandler.UpdateJobTextFields({"jobId": jobId},
                                            {"jobLogCursor": new_cursor})

    except Exception:
        logger.exception("extract job log of %s failed", jobId)
    finally:
        if dataHandler is not None:
            dataHandler.Close()
Example #2
def extract_job_log(jobId, logPath, userId):
    try:
        dataHandler = DataHandler()

        # logs = k8sUtils.GetLog(jobId)
        # logs = k8sUtils.getJobConsoleDetail(jobId)
        jupyterLog = k8sUtils.getJupyterInfo(jobId)

        # TODO: Replace joblog manager with elastic search
        logs = k8sUtils.GetLog(jobId, tail=None)

        # Do not overwrite existing logs with empty log
        # DLTS bootstrap will generate logs for all containers.
        # If one container has empty log, skip writing.
        if not logs:
            return

        for log in logs:
            if "containerLog" in log and log["containerLog"] == "":
                return

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)
        logStr = ""
        trimlogstr = ""

        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "        logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += log["containerLog"]
                logStr += jupyterLog
                logStr += "\n\n\n"
                logStr += "=========================================================\n"
                logStr += "        end of logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "\n\n\n"

        logLines = logStr.split('\n')
        length = len(logLines)
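        # Write a single page when the log fits in 2000 lines (clearing any
        # previously paginated output); otherwise split it into 2000-line pages
        # and record the page count in a "max_page" marker file.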
        if len(logStr.strip()) > 0:
            if (length <= 2000):
                if os.path.exists(os.path.join(jobLogDir, "max_page")):
                    os.system("rm -rf %s" % (jobLogDir))
                save_log(jobLogDir, str(jobId), userId, logStr)
            else:
                with open(os.path.join(jobLogDir, "max_page"), 'w') as f:
                    f.write(str(length // 2000 + 1))
                for i in range(1, length // 2000 + 2):
                    trimlogstr = "\n".join(logLines[(i - 1) * 2000:i * 2000])
                    save_log(jobLogDir, str(jobId), userId, trimlogstr, i)

    except Exception as e:
        logger.error(e)
Example #3
    def generate_launch_script(job_id, path_to_save, user_id, gpu_num,
                               user_script):
        if not os.path.exists(path_to_save):
            mkdirsAsUser(path_to_save, user_id)

        file_name = "job_command.sh"
        launch_script_file = os.path.join(path_to_save, file_name)
        with open(launch_script_file, 'w') as f:
            f.write(user_script)
        os.system("sudo chown %s %s" % (user_id, launch_script_file))
        launch_cmd = ["bash", "/pod/scripts/bootstrap.sh"]
        return launch_cmd
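A minimal call sketch for the helper above, assuming it is reachable as a static method or module-level function; the job id, save path, and user id below are hypothetical:

launch_cmd = generate_launch_script(
    "job-0001",                                # hypothetical job id
    "/dlwsdata/work/alice/jobs/job-0001",      # hypothetical path_to_save
    "1000",                                    # user id that will own job_command.sh
    1,                                         # gpu_num (not used inside the helper)
    "#!/bin/bash\npython train.py\n")          # user_script written to job_command.sh
# launch_cmd == ["bash", "/pod/scripts/bootstrap.sh"]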
Example #4
    def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd):
        # Change the ssh folder permission here because the permission-setup
        # script in the launch_ps_job function may race with the init_user.sh
        # script, which results in a "no such user" error.

        local_pod_path = os.path.join(config["storage-mount-path"], "work/", job_path, "{}-{}".format(dist_role, dist_role_idx))
        if not os.path.exists(local_pod_path):
            mkdirsAsUser(local_pod_path, user_id)
        file_name = "job_command.sh"
        launch_script_file = os.path.join(local_pod_path, file_name)
        with open(launch_script_file, 'w') as f:
            f.write(cmd)

        launchCMD = ["bash", "/pod/scripts/bootstrap.sh"]
        return launchCMD
Example #5
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #	dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #	return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath, jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath, "models"),
                             jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath, "0")
                mkdirsAsUser(os.path.join(localJobPath, "models"), "0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        if isinstance(jobParams["cmd"],
                      basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                os.system("chown -R %s %s" %
                          (jobParams["userId"], launchScriptPath))
            jobParams[
                "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams[
                    "jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")

        jobParams["mountpoints"].append({
            "name": "nvidia-driver",
            "containerPath": "/usr/local/nvidia",
            "hostPath": nvidiaDriverPath
        })
        jobParams["mountpoints"].append({
            "name": "job",
            "containerPath": "/job",
            "hostPath": jobParams["hostjobPath"]
        })
        jobParams["mountpoints"].append({
            "name": "work",
            "containerPath": "/work",
            "hostPath": jobParams["hostworkPath"]
        })
        jobParams["mountpoints"].append({
            "name": "data",
            "containerPath": "/data",
            "hostPath": jobParams["hostdataPath"]
        })
        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)
        template = ENV.get_template(os.path.abspath(jobTemp))
        job_description = template.render(job=jobParams)

        jobDescriptionList = []

        jobDescriptionList.append(job_description)

        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = "interactive-" + jobParams[
                    "jobId"] + "-" + portNum
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"

                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                interactiveMeta = serviceTemplate.render(svc=jobParams)
                jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
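A minimal invocation sketch for SubmitRegularJob above. The payload is hypothetical but mirrors the keys the function reads; config, DataHandler, and the YAML templates are assumed to be set up as in the example (Python 2, so base64.b64encode accepts the str returned by json.dumps):

import base64
import json

params = {                                  # hypothetical job parameters
    "jobId": "job-0001",
    "jobName": "demo job",
    "userName": "alice@example.com",
    "userId": "1000",
    "jobPath": "alice/jobs/job-0001",
    "workPath": "alice",
    "dataPath": "imagenet",
    "cmd": "python train.py",
}
job = {"jobId": params["jobId"],
       "jobParams": base64.b64encode(json.dumps(params))}
ret = SubmitRegularJob(job)                 # {"output": ..., "jobId": ...} on success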
Example #6
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])
        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.copy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])

                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False

                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False

                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


# One alternative is to wait for only a limited time before giving up:
#					launchCMD = """
##!/bin/bash
#mkdir -p /opt
#echo "[DLWorkspace System]: Waiting for all containers are ready..."
## wait for at most 10 mins.
#for i in {1..200}; do
#	if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#		sleep 3
#	else
#		break
#	fi
#done
#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#	echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
#	exit 1
#else
#	echo "[DLWorkspace System]: All containers are ready, launching training job..."
#	chmod +x /opt/run_dist_job.sh
#	/opt/run_dist_job.sh
#fi
#"""

                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
	sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    distJobParam["mountpoints"].append({
                        "name":
                        "nvidia-driver",
                        "containerPath":
                        "/usr/local/nvidia",
                        "hostPath":
                        nvidiaDriverPath
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
Example #7
def extract_job_log(jobId, logPath, userId):
    try:
        dataHandler = DataHandler()

        # TODO: Replace joblog manager with elastic search
        logs = k8sUtils.GetLog(jobId, tail=None)

        # Do not overwrite existing logs with empty log
        # DLTS bootstrap will generate logs for all containers.
        # If one container has empty log, skip writing.
        for log in logs:
            if "containerLog" in log and log["containerLog"] == "":
                return

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)
        logStr = ""
        trimlogstr = ""

        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "        logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += "=========================================================\n"
                logStr += "        end of logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "\n\n\n"

                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "        logs from pod: %s\n" % log["podName"]
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                logLines = log["containerLog"].split('\n')
                if (len(logLines) < 3000):
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "        Note: the log is too long to display in the webpage.\n"
                    trimlogstr += "        Only the last 2000 lines are shown here.\n"
                    trimlogstr += "        Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"

                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception:
                    logger.exception("write container log failed")

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog",
                                           base64.b64encode(trimlogstr))
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))

    except Exception as e:
        logger.error(e)
Example #8
def SubmitRegularJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]


        if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
            return False

        if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: 
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
            return False

        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: 
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False


        jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])


        localJobPath = os.path.join(config["storage-mount-path"],jobPath)

        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath,jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath,"0")
                mkdirsAsUser(os.path.join(localJobPath,"models"),"0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
            
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
            jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]


        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath


        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/","")

        mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)                        

        userAlias = getAlias(jobParams["userName"])

        mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)            


        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print ("Render Job: %s" % jobParams)
        jobDescriptionList = []

        pods = []
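        # Hyper-parameter sweep: build one pod per value from the start value to
        # the end value (stepping by hyperparameterstep) and pass the value to
        # the container through an environment variable; otherwise build a
        # single pod for the job.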
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
                pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
                i += step
                c += 1 
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])


        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
                container = {}
                container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
                    # add topology constraints explicitly - for testing
                    # if (jobParams["resourcegpu"] >= 2):
                    #     # both cards in same inner group
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 3):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 4):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 5):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 6):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 7):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 8):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
                else:
                    # for cases when the desired topology is explicitly given or not desired
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
                podInfo["runningcontainer"] = {jobParams["podName"] : container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
                jobParams["resourcegpu"] = 0 # gpu requests specified through annotation

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0):
                ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template"))
                    interactiveMeta = serviceTemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)


        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
        if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath) 

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)    
        logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))

        ret["output"] = output
        
        ret["jobId"] = jobParams["jobId"]


        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))


        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
    except Exception as e:
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
            dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))

    return ret
Example #9
def extract_job_log(jobId, logPath, userId):
    try:
        dataHandler = DataHandler()

        logs = k8sUtils.GetLog(jobId)

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)
        logStr = ""
        trimlogstr = ""

        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "        logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += "=========================================================\n"
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += "=========================================================\n"
                logStr += "        end of logs from pod: %s\n" % log["podName"]
                logStr += "=========================================================\n"
                logStr += "\n\n\n"

                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "        logs from pod: %s\n" % log["podName"]
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                trimlogstr += "=========================================================\n"
                logLines = log["containerLog"].split('\n')
                if (len(logLines) < 3000):
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "        end of logs from pod: %s\n" % log[
                        "podName"]
                    trimlogstr += "        Note: the log is too long to display in the webpage.\n"
                    trimlogstr += "        Only the last 2000 lines are shown here.\n"
                    trimlogstr += "        Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += "=========================================================\n"
                    trimlogstr += "\n\n\n"

                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception as e:
                    print(e)

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog",
                                           base64.b64encode(trimlogstr))
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))

    except Exception as e:
        logging.error(e)
Example #10
def SubmitPSDistJob(job):
    ret = {}
    dataHandler = DataHandler()

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]

        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False
                    #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
                    #    dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
                    #    return False
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)

                    # TODO ???
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""


                    # Change the ssh folder permission here because the
                    # permission-setup script in the launch_ps_job function may
                    # race with the init_user.sh script, which results in a
                    # "no such user" error.
                    if role == "ps":
                        launchCMD = """
#!/bin/bash
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done

sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;

sudo mkdir -p /root/.ssh  &>/dev/null ;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config  &>/dev/null;
sudo mkdir -p /opt  &>/dev/null;
sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

JOB_DIR='/home/%s'
WORKER_NUM=%s
echo $JOB_DIR $WORKER_NUM

all_workers_ready=false
while [ "$all_workers_ready" != true ]
do
  # update it to false if any worker is not ready
  all_workers_ready=true

  for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
  do
    worker="worker${i}"
    file="$JOB_DIR/${worker}/WORKER_READY"
    #echo $file

    if [ ! -f $file ]; then
      echo "${worker} not ready!"
      all_workers_ready=false
      sleep 10
    fi
  done
done

echo "[DLWorkspace System]: All containers are ready, launching training job..."
%s
""" % (userAlias, userAlias, userAlias, userAlias, userAlias,
                        distJobParam["jobPath"], jobParams["numpsworker"], distJobParam["cmd"])
                    else:
                        launchCMD = """
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh  &>/dev/null;
sudo mkdir -p /root/.ssh  &>/dev/null;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile  &>/dev/null;

# TODO mark the worker as 'READY', better to change to '/pod/READY' later
sudo touch /job/WORKER_READY

sleep infinity
""" % (userAlias, userAlias, userAlias, userAlias, userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath,
                        "launch-%s-%s%d.sh" % (distJobParam["jobId"], role, i))
                    # TODO need to set up user for distributed jobs
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
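                    # Wrap the per-role launch script: run /dlws/init_user.sh
                    # first, then execute the script as ${DLWS_USER_NAME} via
                    # runuser.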

                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)

                    distJobParam[
                        "LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []

                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
                    distJobParam["mountpoints"].append({
                        "name":
                        "job",
                        "containerPath":
                        "/job",
                        "hostPath":
                        distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "work",
                        "containerPath":
                        "/work",
                        "hostPath":
                        distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name":
                        "data",
                        "containerPath":
                        "/data",
                        "hostPath":
                        distJobParam["hostdataPath"]
                    })

                    for idx in range(len(distJobParam["mountpoints"])):
                        if "name" not in distJobParam["mountpoints"][idx]:
                            distJobParam["mountpoints"][idx]["name"] = str(
                                uuid.uuid4()).replace("-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config:
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    random.seed(datetime.datetime.now())
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(
                            40000, 49999)
                    else:
                        distJobParam["containerPort"] = int(random.random() *
                                                            1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    if "gpuType" in distJobParam:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["gpuType"] = distJobParam[
                            "gpuType"]

                    # inject gid, uid and user
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)

                    jobDescriptionList.append(job_description)

                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["cmd"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret