示例#1
0
def GetClusterStatus():
    """Fetch the latest cluster status snapshot from the data store.

    Returns:
        Tuple ``(cluster_status, last_update_time)`` exactly as produced by
        ``DataHandler.GetClusterStatus()``.
    """
    # NOTE(review): removed an unused local (`job = None`) from the original.
    dataHandler = DataHandler()
    try:
        cluster_status, last_update_time = dataHandler.GetClusterStatus()
    finally:
        # Always release the handler, even if the status query raises.
        dataHandler.Close()
    return cluster_status, last_update_time
示例#2
0
def TakeJobActions(jobs):
    """Decide and apply scheduling actions (submit / pre-empt) for jobs.

    Three passes over the active jobs, in FIFO order with non-preemptible
    jobs first:
      1. grant jobs from their VC's local GPU share (strict FIFO: a blocked
         non-preemptible job blocks its resource category for later jobs);
      2. let still-unsatisfied preemptible jobs consume leftover global
         capacity;
      3. submit queued jobs that won a grant, and kill (back to "queued")
         running/scheduling preemptible jobs that did not.

    Args:
        jobs: iterable of job records; each has "jobStatus", "jobTime",
            "vcName" and a base64-encoded JSON "jobParams" blob.
    """
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    clusterStatus, _ = dataHandler.GetClusterStatus()
    dataHandler.Close()

    # Global pool = total GPU capacity minus unschedulable GPUs.
    globalTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
    globalReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"])

    localResInfo = ResourceInfo()
    globalResInfo = ResourceInfo.Difference(globalTotalRes, globalReservedRes)

    # Each VC contributes its quota minus a proportional share of the
    # unschedulable GPUs to the per-VC ("local") pool.
    for vc in vcList:
        vcTotalRes = ResourceInfo(json.loads(vc["quota"]), vc["vcName"])
        clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"],
                                       vc["vcName"])
        clusterReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"],
                                          vc["vcName"])
        vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes,
                                                       clusterTotalRes)
        localResInfo.Add(ResourceInfo.Difference(vcTotalRes, vcReservedRes))

    jobsInfo = []
    for job in jobs:
        # Only pending/active jobs participate in scheduling decisions.
        if job["jobStatus"] in ("queued", "scheduling", "running"):
            singleJobInfo = {}
            singleJobInfo["job"] = job
            singleJobInfo["jobParams"] = json.loads(
                base64.b64decode(job["jobParams"]))
            jobGpuType = "any"
            if "gpuType" in singleJobInfo["jobParams"]:
                jobGpuType = singleJobInfo["jobParams"]["gpuType"]
            singleJobInfo["localResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(singleJobInfo["jobParams"])},
                job["vcName"])
            singleJobInfo["globalResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(singleJobInfo["jobParams"])})
            # Sort key: non-preemptible ("0_") before preemptible ("1_"),
            # then FIFO by submission time.
            singleJobInfo["sortKey"] = str(job["jobTime"])
            if singleJobInfo["jobParams"]["preemptionAllowed"]:
                singleJobInfo["sortKey"] = "1_" + singleJobInfo["sortKey"]
            else:
                singleJobInfo["sortKey"] = "0_" + singleJobInfo["sortKey"]
            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)

    jobsInfo.sort(key=JobInfoSorter)

    logging.info("TakeJobActions : local resources : %s",
                 localResInfo.CategoryToCountMap)
    logging.info("TakeJobActions : global resources : %s",
                 globalResInfo.CategoryToCountMap)

    # Pass 1: grant from the per-VC local pool.
    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s",
                     sji["jobParams"]["jobName"],
                     sji["localResInfo"].CategoryToCountMap, sji["sortKey"])
        if sji["jobParams"]["preemptionAllowed"]:
            localResInfo.UnblockResourceCategory(sji["localResInfo"])

        if localResInfo.CanSatisfy(sji["localResInfo"]):
            localResInfo.Subtract(sji["localResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info("TakeJobActions : local assignment : %s : %s",
                         sji["jobParams"]["jobName"],
                         sji["localResInfo"].CategoryToCountMap)
        elif not sji["jobParams"]["preemptionAllowed"]:
            # FIFO scheduling: an unsatisfied non-preemptible job blocks its
            # category so later jobs cannot jump the queue.
            localResInfo.BlockResourceCategory(sji["localResInfo"])

    # Pass 2: preemptible jobs may take leftover global capacity.
    for sji in jobsInfo:
        if sji["jobParams"]["preemptionAllowed"] and not sji["allowed"]:
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info("TakeJobActions : job : %s : %s",
                             sji["jobParams"]["jobName"],
                             sji["globalResInfo"].CategoryToCountMap)
                # Strict FIFO policy not required for global (bonus) tokens
                # since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info("TakeJobActions : global assignment : %s : %s",
                             sji["jobParams"]["jobName"],
                             sji["globalResInfo"].CategoryToCountMap)

    logging.info("TakeJobActions : global resources : %s",
                 globalResInfo.CategoryToCountMap)

    # Pass 3: act on the decisions.
    for sji in jobsInfo:
        if sji["job"]["jobStatus"] == "queued" and sji["allowed"]:
            SubmitJob(sji["job"])
            logging.info("TakeJobActions : submitting job : %s : %s : %s",
                         sji["jobParams"]["jobName"],
                         sji["jobParams"]["jobId"], sji["sortKey"])
        elif (sji["jobParams"]["preemptionAllowed"]
              and sji["job"]["jobStatus"] in ("scheduling", "running")
              and not sji["allowed"]):
            KillJob(sji["job"], "queued")
            logging.info("TakeJobActions : pre-empting job : %s : %s : %s",
                         sji["jobParams"]["jobName"],
                         sji["jobParams"]["jobId"], sji["sortKey"])

    logging.info("TakeJobActions : job desired actions taken")
示例#3
0
def TakeJobActions(jobs):
    """Decide and apply scheduling actions (submit / pre-empt) for jobs.

    Jobs are ranked by a composite sort key (non-preemptible first, then by
    status, priority, and submission time), granted GPUs from their VC's
    share first, after which preemptible jobs may consume leftover global
    capacity. Queued winners are submitted; running/scheduling preemptible
    losers are killed back to "queued".

    Args:
        jobs: iterable of job records; each has "jobStatus", "jobTime",
            "vcName" and a base64-encoded JSON "jobParams" blob.
    """
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    clusterStatus, _ = dataHandler.GetClusterStatus()
    dataHandler.Close()

    # Global pool = total GPU capacity minus reserved GPUs.
    cluster_gpu_capacity = clusterStatus["gpu_capacity"]
    cluster_gpu_reserved = clusterStatus["gpu_reserved"]
    globalTotalRes = ResourceInfo(cluster_gpu_capacity)
    globalReservedRes = ResourceInfo(cluster_gpu_reserved)

    # NOTE(review): removed an unused local (`localResInfo = ResourceInfo()`)
    # from the original; per-VC bookkeeping lives in vc_resources instead.
    vc_resources = {}
    globalResInfo = ResourceInfo.Difference(globalTotalRes, globalReservedRes)

    priority_dict = get_priority_dict()
    logging.info("Job priority dict: %s", priority_dict)

    # Each VC gets its quota minus a proportional share of reserved GPUs.
    for vc in vcList:
        vcTotalRes = ResourceInfo(json.loads(vc["quota"]))
        clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
        clusterReservedRes = ResourceInfo(clusterStatus["gpu_reserved"])
        vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes,
                                                       clusterTotalRes)
        vc_resources[vc["vcName"]] = ResourceInfo.Difference(
            vcTotalRes, vcReservedRes)

    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] in ["queued", "scheduling", "running"]:
            singleJobInfo = {}
            singleJobInfo["job"] = job
            job_params = json.loads(base64.b64decode(job["jobParams"]))
            singleJobInfo["preemptionAllowed"] = job_params[
                "preemptionAllowed"]
            singleJobInfo["jobId"] = job_params["jobId"]
            jobGpuType = "any"
            if "gpuType" in job_params:
                jobGpuType = job_params["gpuType"]
            singleJobInfo["globalResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(job_params)})

            # Job lists will be sorted based on and in the order of below
            # 1. non-preemptible precedes preemptible
            # 2. running precedes scheduling, precedes queued
            # 3. larger priority value precedes lower priority value
            # 4. early job time precedes later job time

            # Non-Preemptible jobs first
            preemptible = 1 if singleJobInfo["preemptionAllowed"] else 0

            # Job status: running (0) < scheduling (1) < queued (2)
            job_status = 0
            if job["jobStatus"] == "scheduling":
                job_status = 1
            elif job["jobStatus"] == "queued":
                job_status = 2

            # Invert the priority so that a larger priority value yields a
            # lexically smaller (earlier) sort key.
            reverse_priority = get_job_priority(priority_dict,
                                                singleJobInfo["jobId"])
            priority = 999999 - reverse_priority

            # Job time
            job_time = str(job["jobTime"])

            singleJobInfo["sortKey"] = "{}_{}_{:06d}_{}".format(
                preemptible, job_status, priority, job_time)

            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)

    jobsInfo.sort(key=lambda x: x["sortKey"])

    logging.info("TakeJobActions : local resources : %s", vc_resources)
    logging.info("TakeJobActions : global resources : %s",
                 globalResInfo.CategoryToCountMap)

    # Pass 1: grant jobs from their VC's local share.
    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s", sji["jobId"],
                     sji["globalResInfo"].CategoryToCountMap, sji["sortKey"])
        vc_name = sji["job"]["vcName"]
        vc_resource = vc_resources[vc_name]

        if vc_resource.CanSatisfy(sji["globalResInfo"]):
            vc_resource.Subtract(sji["globalResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info("TakeJobActions : local assignment : %s : %s",
                         sji["jobId"],
                         sji["globalResInfo"].CategoryToCountMap)

    # Pass 2: preemptible jobs may take leftover global capacity.
    for sji in jobsInfo:
        if sji["preemptionAllowed"] and not sji["allowed"]:
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info("TakeJobActions : job : %s : %s", sji["jobId"],
                             sji["globalResInfo"].CategoryToCountMap)
                # Strict FIFO policy not required for global (bonus) tokens
                # since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info("TakeJobActions : global assignment : %s : %s",
                             sji["jobId"],
                             sji["globalResInfo"].CategoryToCountMap)

    logging.info("TakeJobActions : global resources : %s",
                 globalResInfo.CategoryToCountMap)

    # Pass 3: act on the decisions; one job's failure must not stop the rest.
    for sji in jobsInfo:
        try:
            if sji["job"]["jobStatus"] == "queued" and sji["allowed"]:
                SubmitJob(sji["job"])
                logging.info("TakeJobActions : submitting job : %s : %s",
                             sji["jobId"], sji["sortKey"])
            elif (sji["preemptionAllowed"]
                  and sji["job"]["jobStatus"] in ("scheduling", "running")
                  and not sji["allowed"]):
                KillJob(sji["job"]["jobId"], "queued")
                logging.info("TakeJobActions : pre-empting job : %s : %s",
                             sji["jobId"], sji["sortKey"])
        except Exception:
            # logging.exception records the traceback; the original bound an
            # unused `e` and passed exc_info=True to logging.error.
            logging.exception("Process job failed %s", sji["job"])

    logging.info("TakeJobActions : job desired actions taken")
示例#4
0
def GetVC(userName, vcName):
    """Return the VC record for ``vcName`` enriched with GPU usage/status.

    The record is only returned when ``userName`` has at least ``User``
    permission on the VC; otherwise ``None``.

    Args:
        userName: identity of the requesting user.
        vcName: name of the virtual cluster to look up.

    Returns:
        The enriched VC dict, or None when not found / not authorized.
    """
    ret = None

    data_handler = DataHandler()

    cluster_status, _ = data_handler.GetClusterStatus()
    cluster_total = cluster_status["gpu_capacity"]
    # NOTE(review): "gpu_avaliable" / "AvaliableJobNum" below are misspelled
    # keys of the existing stored/served schema and must keep this spelling.
    cluster_available = cluster_status["gpu_avaliable"]
    cluster_reserved = cluster_status["gpu_reserved"]

    # Per-user non-preemptable / preemptable GPU usage in the target VC.
    user_status = collections.defaultdict(lambda: ResourceInfo())
    user_status_preemptable = collections.defaultdict(lambda: ResourceInfo())

    vc_list = getClusterVCs()
    vc_info = {}
    vc_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    vc_preemptable_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))

    for vc in vc_list:
        vc_info[vc["vcName"]] = json.loads(vc["quota"])

    active_job_list = data_handler.GetActiveJobList()
    # Last use of the handler; close it like the sibling functions do.
    data_handler.Close()

    # Aggregate GPU usage per VC, split by preemptability.
    for job in active_job_list:
        jobParam = json.loads(base64.b64decode(job["jobParams"]))
        if "gpuType" in jobParam:
            gpu_count = GetJobTotalGpu(jobParam)
            if not jobParam["preemptionAllowed"]:
                vc_usage[job["vcName"]][jobParam["gpuType"]] += gpu_count
            else:
                vc_preemptable_usage[job["vcName"]][
                    jobParam["gpuType"]] += gpu_count

    result = quota.calculate_vc_gpu_counts(cluster_total, cluster_available,
                                           cluster_reserved, vc_info, vc_usage)

    vc_total, vc_used, vc_available, vc_unschedulable = result

    for vc in vc_list:
        if vc["vcName"] == vcName and AuthorizationManager.HasAccess(
                userName, ResourceType.VC, vcName, Permission.User):

            num_active_jobs = 0
            for job in active_job_list:
                if job["vcName"] == vcName and job["jobStatus"] == "running":
                    num_active_jobs += 1
                    username = job["userName"]
                    jobParam = json.loads(base64.b64decode(job["jobParams"]))
                    if "gpuType" in jobParam:
                        usage = ResourceInfo(
                            {jobParam["gpuType"]: GetJobTotalGpu(jobParam)})
                        # defaultdict creates the ResourceInfo on first
                        # access; the original's explicit membership checks
                        # were redundant.
                        if not jobParam["preemptionAllowed"]:
                            user_status[username].Add(usage)
                        else:
                            user_status_preemptable[username].Add(usage)

            vc["gpu_capacity"] = vc_total[vcName]
            vc["gpu_used"] = vc_used[vcName]
            vc["gpu_preemptable_used"] = vc_preemptable_usage[vcName]
            vc["gpu_unschedulable"] = vc_unschedulable[vcName]
            vc["gpu_avaliable"] = vc_available[vcName]
            vc["AvaliableJobNum"] = num_active_jobs
            vc["node_status"] = cluster_status["node_status"]

            # .items() instead of the Python-2-only .iteritems() so this
            # code also runs on Python 3.
            vc["user_status"] = []
            for user_name, user_gpu in user_status.items():
                # TODO: job_manager.getAlias should be put in a util file
                user_name = user_name.split("@")[0].strip()
                vc["user_status"].append({
                    "userName": user_name,
                    "userGPU": user_gpu.ToSerializable()
                })

            vc["user_status_preemptable"] = []
            for user_name, user_gpu in user_status_preemptable.items():
                user_name = user_name.split("@")[0].strip()
                vc["user_status_preemptable"].append({
                    "userName": user_name,
                    "userGPU": user_gpu.ToSerializable()
                })

            # Best-effort enrichment: missing gpu_idle data must not fail
            # the whole request.
            try:
                gpu_idle_url = config["gpu_reporter"] + '/gpu_idle'
                gpu_idle_params = {"vc": vcName}
                gpu_idle_response = requests.get(gpu_idle_url,
                                                 params=gpu_idle_params)
                vc["gpu_idle"] = gpu_idle_response.json()
            except Exception:
                logger.exception("Failed to fetch gpu_idle from gpu-exporter")

            ret = vc
            break
    return ret