def GetClusterStatus():
    """Fetch the most recent cluster status snapshot from the data store.

    Returns:
        tuple: ``(cluster_status, last_update_time)`` exactly as returned by
        ``DataHandler.GetClusterStatus()``.
    """
    # Fix: removed unused local `job = None` from the original.
    dataHandler = DataHandler()
    try:
        cluster_status, last_update_time = dataHandler.GetClusterStatus()
    finally:
        # Always release the handler, even if the status query raises.
        dataHandler.Close()
    return cluster_status, last_update_time
def TakeJobActions(jobs):
    """Legacy scheduling pass: decide which jobs to submit or pre-empt.

    NOTE(review): this definition is shadowed by the later ``TakeJobActions``
    further down in this file, so this version is dead code unless the later
    one is removed.

    Three phases over the active job list:
      1. per-VC ("local") quota assignment in sorted (FIFO-ish) order,
      2. leftover global ("bonus") capacity for pre-emptible jobs only,
      3. apply decisions: submit allowed queued jobs, kill disallowed
         pre-emptible running/scheduling jobs back to "queued".

    Args:
        jobs: iterable of job rows; each has "jobStatus", "jobTime", "vcName"
              and a base64-encoded JSON "jobParams" blob.
    """
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    clusterStatus, dummy = dataHandler.GetClusterStatus()
    dataHandler.Close()
    # Global pool = total capacity minus unschedulable GPUs.
    globalTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
    globalReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"])
    localResInfo = ResourceInfo()
    globalResInfo = ResourceInfo.Difference(globalTotalRes, globalReservedRes)

    # Per-VC usable quota: the VC's quota minus its proportional share of the
    # cluster's unschedulable GPUs (GetFraction scales reserved by quota/total).
    for vc in vcList:
        vcTotalRes = ResourceInfo(json.loads(vc["quota"]), vc["vcName"])
        clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"],
                                       vc["vcName"])
        clusterReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"],
                                          vc["vcName"])
        vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes,
                                                       clusterTotalRes)
        localResInfo.Add(ResourceInfo.Difference(vcTotalRes, vcReservedRes))

    # Collect every job that still needs a scheduling decision.
    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] == "queued" or job[
                "jobStatus"] == "scheduling" or job["jobStatus"] == "running":
            singleJobInfo = {}
            singleJobInfo["job"] = job
            singleJobInfo["jobParams"] = json.loads(
                base64.b64decode(job["jobParams"]))
            jobGpuType = "any"
            if "gpuType" in singleJobInfo["jobParams"]:
                jobGpuType = singleJobInfo["jobParams"]["gpuType"]
            # localResInfo is tagged with the VC name; globalResInfo is not.
            singleJobInfo["localResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(singleJobInfo["jobParams"])},
                job["vcName"])
            singleJobInfo["globalResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(singleJobInfo["jobParams"])})
            # Sort key: non-preemptible ("0_") before preemptible ("1_"),
            # then by job time (string compare).
            singleJobInfo["sortKey"] = str(job["jobTime"])
            if singleJobInfo["jobParams"]["preemptionAllowed"]:
                singleJobInfo["sortKey"] = "1_" + singleJobInfo["sortKey"]
            else:
                singleJobInfo["sortKey"] = "0_" + singleJobInfo["sortKey"]
            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)
    jobsInfo.sort(key=JobInfoSorter)

    logging.info("TakeJobActions : local resources : %s" %
                 (localResInfo.CategoryToCountMap))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Phase 1: assign per-VC quota in sorted order.
    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s" %
                     (sji["jobParams"]["jobName"],
                      sji["localResInfo"].CategoryToCountMap, sji["sortKey"]))
        # Preemptible jobs may reuse a category a non-preemptible job blocked.
        if sji["jobParams"]["preemptionAllowed"]:
            localResInfo.UnblockResourceCategory(sji["localResInfo"])
        if (localResInfo.CanSatisfy(sji["localResInfo"])):
            localResInfo.Subtract(sji["localResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info("TakeJobActions : local assignment : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["localResInfo"].CategoryToCountMap))
        elif not sji["jobParams"]["preemptionAllowed"]:
            # A non-preemptible job that cannot fit blocks its category so
            # later (younger) jobs cannot jump ahead of it.
            localResInfo.BlockResourceCategory(
                sji["localResInfo"])  #FIFO scheduling

    #logging.info("TakeJobActions : local resources : %s" % (localResInfo.CategoryToCountMap))
    #logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap))

    # Phase 2: hand leftover global capacity to pre-emptible jobs only.
    for sji in jobsInfo:
        if (sji["jobParams"]["preemptionAllowed"]
                and sji["allowed"] == False):
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info("TakeJobActions : job : %s : %s" %
                             (sji["jobParams"]["jobName"],
                              sji["globalResInfo"].CategoryToCountMap))
                # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info("TakeJobActions : global assignment : %s : %s" %
                             (sji["jobParams"]["jobName"],
                              sji["globalResInfo"].CategoryToCountMap))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Phase 3: act on the decisions — submit or pre-empt (back to "queued").
    for sji in jobsInfo:
        if sji["job"]["jobStatus"] == "queued" and sji["allowed"] == True:
            SubmitJob(sji["job"])
            logging.info("TakeJobActions : submitting job : %s : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["jobParams"]["jobId"], sji["sortKey"]))
        elif sji["jobParams"]["preemptionAllowed"] and (
                sji["job"]["jobStatus"] == "scheduling"
                or sji["job"]["jobStatus"] == "running"
        ) and sji["allowed"] == False:
            # NOTE(review): this version passes the whole job row to KillJob;
            # the later TakeJobActions passes only the jobId — confirm which
            # signature KillJob expects.
            KillJob(sji["job"], "queued")
            logging.info("TakeJobActions : pre-empting job : %s : %s : %s" %
                         (sji["jobParams"]["jobName"],
                          sji["jobParams"]["jobId"], sji["sortKey"]))
    logging.info("TakeJobActions : job desired actions taken")
def TakeJobActions(jobs):
    """Scheduling pass: decide which jobs to submit or pre-empt.

    This (second) definition shadows the earlier ``TakeJobActions`` in this
    file. Three phases over the active job list:
      1. per-VC quota assignment in priority order,
      2. leftover global ("bonus") capacity for pre-emptible jobs only,
      3. apply decisions: submit allowed queued jobs, kill disallowed
         pre-emptible scheduling/running jobs back to "queued".

    Args:
        jobs: iterable of job rows; each has "jobStatus", "jobTime", "vcName"
              and a base64-encoded JSON "jobParams" blob.
    """
    dataHandler = DataHandler()
    vcList = dataHandler.ListVCs()
    clusterStatus, _ = dataHandler.GetClusterStatus()
    dataHandler.Close()

    cluster_gpu_capacity = clusterStatus["gpu_capacity"]
    cluster_gpu_reserved = clusterStatus["gpu_reserved"]
    # Global pool = total capacity minus reserved GPUs.
    globalTotalRes = ResourceInfo(cluster_gpu_capacity)
    globalReservedRes = ResourceInfo(cluster_gpu_reserved)

    vc_resources = {}
    # Fix: removed unused local `localResInfo = ResourceInfo()` — this
    # version tracks per-VC capacity in `vc_resources` instead.
    globalResInfo = ResourceInfo.Difference(globalTotalRes, globalReservedRes)

    priority_dict = get_priority_dict()
    logging.info("Job priority dict: {}".format(priority_dict))

    # Per-VC usable quota: the VC's quota minus its proportional share of the
    # cluster's reserved GPUs (GetFraction scales reserved by quota/total).
    for vc in vcList:
        vcTotalRes = ResourceInfo(json.loads(vc["quota"]))
        clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
        clusterReservedRes = ResourceInfo(clusterStatus["gpu_reserved"])
        vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes,
                                                       clusterTotalRes)
        vc_resources[vc["vcName"]] = ResourceInfo.Difference(
            vcTotalRes, vcReservedRes)

    # Collect every job that still needs a scheduling decision.
    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] in ["queued", "scheduling", "running"]:
            singleJobInfo = {}
            singleJobInfo["job"] = job
            job_params = json.loads(base64.b64decode(job["jobParams"]))
            singleJobInfo["preemptionAllowed"] = job_params[
                "preemptionAllowed"]
            singleJobInfo["jobId"] = job_params["jobId"]
            jobGpuType = "any"
            if "gpuType" in job_params:
                jobGpuType = job_params["gpuType"]
            singleJobInfo["globalResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(job_params)})

            # Job lists will be sorted based on and in the order of below
            # 1. non-preemptible precedes preemptible
            # 2. running precedes scheduling, precedes queued
            # 3. larger priority value precedes lower priority value
            # 4. early job time precedes later job time

            # Non-Preemptible jobs first
            preemptible = 1 if singleJobInfo["preemptionAllowed"] else 0

            # Job status: running=0, scheduling=1, queued=2
            job_status = 0
            if job["jobStatus"] == "scheduling":
                job_status = 1
            elif job["jobStatus"] == "queued":
                job_status = 2

            # Priority value, inverted so a larger priority sorts first
            # under ascending string order.
            reverse_priority = get_job_priority(priority_dict,
                                                singleJobInfo["jobId"])
            priority = 999999 - reverse_priority

            # Job time
            job_time = str(job["jobTime"])
            singleJobInfo["sortKey"] = "{}_{}_{:06d}_{}".format(
                preemptible, job_status, priority, job_time)
            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)
    jobsInfo.sort(key=lambda x: x["sortKey"])

    logging.info("TakeJobActions : local resources : %s" % (vc_resources))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Phase 1: assign per-VC quota in sorted order.
    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s" %
                     (sji["jobId"], sji["globalResInfo"].CategoryToCountMap,
                      sji["sortKey"]))
        vc_name = sji["job"]["vcName"]
        vc_resource = vc_resources[vc_name]
        if (vc_resource.CanSatisfy(sji["globalResInfo"])):
            vc_resource.Subtract(sji["globalResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info(
                "TakeJobActions : local assignment : %s : %s" %
                (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))

    # Phase 2: hand leftover global capacity to pre-emptible jobs only.
    for sji in jobsInfo:
        if sji["preemptionAllowed"] and (sji["allowed"] is False):
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info(
                    "TakeJobActions : job : %s : %s" %
                    (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))
                # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info(
                    "TakeJobActions : global assignment : %s : %s" %
                    (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Phase 3: act on the decisions. Each job is processed independently so
    # one failure does not abort the remaining submissions/pre-emptions.
    for sji in jobsInfo:
        try:
            if sji["job"]["jobStatus"] == "queued" and (sji["allowed"] is
                                                        True):
                SubmitJob(sji["job"])
                logging.info("TakeJobActions : submitting job : %s : %s" %
                             (sji["jobId"], sji["sortKey"]))
            elif sji["preemptionAllowed"] and (
                    sji["job"]["jobStatus"] == "scheduling"
                    or sji["job"]["jobStatus"] == "running") and (
                        sji["allowed"] is False):
                KillJob(sji["job"]["jobId"], "queued")
                logging.info("TakeJobActions : pre-empting job : %s : %s" %
                             (sji["jobId"], sji["sortKey"]))
        except Exception:
            # Fix: dropped the unused `as e` binding; exc_info=True already
            # attaches the traceback to the log record.
            logging.error("Process job failed {}".format(sji["job"]),
                          exc_info=True)
    logging.info("TakeJobActions : job desired actions taken")
def GetVC(userName, vcName):
    """Build the status view of one virtual cluster for an authorized user.

    Computes cluster-wide and per-VC GPU accounting from active jobs, then,
    if `userName` has User permission on `vcName`, decorates that VC row with
    capacity/usage counters, per-user GPU usage, node status and (best-effort)
    idle-GPU data from the gpu-reporter service.

    Args:
        userName: requesting user (full name; aliases are derived by
                  splitting at "@").
        vcName: name of the VC to look up.

    Returns:
        The decorated VC dict, or None when the VC is not found or the user
        lacks access.
    """
    ret = None
    data_handler = DataHandler()
    cluster_status, _ = data_handler.GetClusterStatus()
    cluster_total = cluster_status["gpu_capacity"]
    # NOTE: "avaliable" misspelling is the established key name in the
    # status payload — do not "fix" it without migrating producers/consumers.
    cluster_available = cluster_status["gpu_avaliable"]
    cluster_reserved = cluster_status["gpu_reserved"]

    # Per-user GPU aggregates, split by preemptibility.
    user_status = collections.defaultdict(lambda: ResourceInfo())
    user_status_preemptable = collections.defaultdict(lambda: ResourceInfo())

    vc_list = getClusterVCs()
    vc_info = {}
    # vc -> gpuType -> gpu count, split by preemptibility.
    vc_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    vc_preemptable_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    for vc in vc_list:
        vc_info[vc["vcName"]] = json.loads(vc["quota"])

    active_job_list = data_handler.GetActiveJobList()
    # Fix: release the handler (was leaked); consistent with the other
    # functions in this module. No DB access happens after this point.
    data_handler.Close()

    # Tally non-preemptible vs preemptible GPU usage per VC and gpuType.
    for job in active_job_list:
        jobParam = json.loads(base64.b64decode(job["jobParams"]))
        if "gpuType" in jobParam:
            if not jobParam["preemptionAllowed"]:
                vc_usage[job["vcName"]][jobParam["gpuType"]] += GetJobTotalGpu(
                    jobParam)
            else:
                vc_preemptable_usage[job["vcName"]][
                    jobParam["gpuType"]] += GetJobTotalGpu(jobParam)

    result = quota.calculate_vc_gpu_counts(cluster_total, cluster_available,
                                           cluster_reserved, vc_info,
                                           vc_usage)
    vc_total, vc_used, vc_available, vc_unschedulable = result

    for vc in vc_list:
        if vc["vcName"] == vcName and AuthorizationManager.HasAccess(
                userName, ResourceType.VC, vcName, Permission.User):
            # Count running jobs in this VC and aggregate per-user GPU usage.
            num_active_jobs = 0
            for job in active_job_list:
                if job["vcName"] == vcName and job["jobStatus"] == "running":
                    num_active_jobs += 1
                    username = job["userName"]
                    jobParam = json.loads(base64.b64decode(job["jobParams"]))
                    if "gpuType" in jobParam:
                        if not jobParam["preemptionAllowed"]:
                            # Membership check is redundant with defaultdict
                            # but kept to preserve original behavior.
                            if username not in user_status:
                                user_status[username] = ResourceInfo()
                            user_status[username].Add(
                                ResourceInfo({
                                    jobParam["gpuType"]:
                                    GetJobTotalGpu(jobParam)
                                }))
                        else:
                            if username not in user_status_preemptable:
                                user_status_preemptable[
                                    username] = ResourceInfo()
                            user_status_preemptable[username].Add(
                                ResourceInfo({
                                    jobParam["gpuType"]:
                                    GetJobTotalGpu(jobParam)
                                }))

            vc["gpu_capacity"] = vc_total[vcName]
            vc["gpu_used"] = vc_used[vcName]
            vc["gpu_preemptable_used"] = vc_preemptable_usage[vcName]
            vc["gpu_unschedulable"] = vc_unschedulable[vcName]
            vc["gpu_avaliable"] = vc_available[vcName]
            vc["AvaliableJobNum"] = num_active_jobs
            vc["node_status"] = cluster_status["node_status"]

            # NOTE(review): iteritems() is Python 2 only — this module
            # appears to target py2; change to items() if ported to py3.
            vc["user_status"] = []
            for user_name, user_gpu in user_status.iteritems():
                # TODO: job_manager.getAlias should be put in a util file
                user_name = user_name.split("@")[0].strip()
                vc["user_status"].append({
                    "userName": user_name,
                    "userGPU": user_gpu.ToSerializable()
                })

            vc["user_status_preemptable"] = []
            for user_name, user_gpu in user_status_preemptable.iteritems():
                user_name = user_name.split("@")[0].strip()
                vc["user_status_preemptable"].append({
                    "userName": user_name,
                    "userGPU": user_gpu.ToSerializable()
                })

            # Best-effort: idle-GPU data comes from an external reporter;
            # the VC view is still useful without it, so only log failures.
            try:
                gpu_idle_url = config["gpu_reporter"] + '/gpu_idle'
                gpu_idle_params = {"vc": vcName}
                gpu_idle_response = requests.get(gpu_idle_url,
                                                 params=gpu_idle_params)
                gpu_idle_json = gpu_idle_response.json()
                vc["gpu_idle"] = gpu_idle_json
            except Exception:
                logger.exception(
                    "Failed to fetch gpu_idle from gpu-exporter")
            ret = vc
            break
    return ret