def job_status_detail_with_finished_time(job_status_detail, status, msg=""):
    """Stamp a terminal status onto every pod entry of a job-status detail.

    Called when a job succeeds/fails/is killed/has an error. Non-list,
    non-None input is passed back unchanged; otherwise every pod entry
    receives `finishedAt` (and `startedAt`, for jobs that finished before a
    start was ever recorded) plus an appended status-change message.

    Note: the pod detail dicts are updated in place; the returned list holds
    the same dict objects.
    """
    # Anything that is neither None nor a list is returned untouched.
    if job_status_detail is not None and not isinstance(job_status_detail, list):
        return job_status_detail

    # Guarantee at least one entry so the status message has somewhere to go.
    if not job_status_detail:
        job_status_detail = [{}]

    finished_at = k8sUtils.localize_time(datetime.datetime.now())
    status_change_message = "{} at {}. {}".format(status, finished_at, msg)

    stamped = []
    for pod_detail in job_status_detail:
        # A fast-finishing job may never have had startedAt recorded; use the
        # finish time for both stamps in that case.
        pod_detail.setdefault("startedAt", finished_at)
        pod_detail.setdefault("finishedAt", finished_at)
        if "message" in pod_detail:
            pod_detail["message"] += "\n" + status_change_message
        else:
            pod_detail["message"] = status_change_message
        stamped.append(pod_detail)
    return stamped
def UpdateJobStatus(redis_conn, launcher, job, notifier=None, dataHandlerOri=None):
    """Synchronize the database record of ``job`` with its live launcher status.

    Depending on what the launcher reports ("Succeeded", "Running", "Failed",
    "Unknown"/"NotFound", "Pending"), this updates jobStatus/jobStatusDetail
    in the DB, extracts job logs, deletes or re-queues the underlying
    Kubernetes job, and optionally sends a state-change notification.

    Args:
        redis_conn: Redis connection used for state-latency bookkeeping.
        launcher: launcher abstraction providing get_job_status / scale_job /
            delete_job / kill_job.
        job: job record dict; its current jobStatus must be "scheduling" or
            "running" (asserted below).
        notifier: optional notifier; when given, a state-change message is
            sent on transitions.
        dataHandlerOri: optional pre-opened DataHandler. When None, a handler
            is created here and is always closed before returning.
    """
    assert job["jobStatus"] == "scheduling" or job["jobStatus"] == "running"

    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri

    try:
        jobParams = json.loads(b64decode(job["jobParams"]))

        result, details, diagnostics = launcher.get_job_status(job["jobId"])
        logger.info("Job status: %s %s", job["jobId"], result)

        jobPath, workPath, dataPath = GetStoragePath(
            jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        if "userId" not in jobParams:
            jobParams["userId"] = "0"

        if result == "Succeeded":
            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "finished")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "finished"
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
            launcher.delete_job(job["jobId"], force=True)

            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))
        elif result == "Running":
            update_job_state_latency(redis_conn, job["jobId"], "running")
            launcher.scale_job(job)
            # Only write the transition once: the DB still says
            # "scheduling" the first time we observe the job running.
            if job["jobStatus"] != "running":
                started_at = k8sUtils.localize_time(datetime.datetime.now())
                detail = [{
                    "startedAt": started_at,
                    "message": "started at: {}".format(started_at)
                }]
                dataFields = {
                    "jobStatusDetail": b64encode(json.dumps(detail)),
                    "jobStatus": "running"
                }
                conditionFields = {"jobId": job["jobId"]}
                dataHandler.UpdateJobTextFields(conditionFields, dataFields)
                if notifier is not None:
                    notifier.notify(
                        notify.new_job_state_change_message(
                            job["userName"], job["jobId"], result.strip()))
        elif result == "Failed":
            now = datetime.datetime.now()
            # BUGFIX: previously re-decoded job["jobParams"] through an
            # inconsistently named `base64decode` helper (`b64decode` is used
            # everywhere else in this function); reuse the params parsed above.
            # BUGFIX: `.seconds` is only the seconds *component* of a timedelta
            # (wraps every 24h); use total_seconds() for the elapsed time.
            if jobParams.get("debug") is True \
                    and (now - job["jobTime"]).total_seconds() < 60:
                logger.info("leave job %s there for debug for 60s",
                            job["jobId"])
                return

            logger.warning("Job %s fails, cleaning...", job["jobId"])

            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))

            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "failed")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "failed",
                "errorMsg": diagnostics
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
            launcher.delete_job(job["jobId"], force=True)
        elif result == "Unknown" or result == "NotFound":
            if job["jobId"] not in UnusualJobs:
                logger.warning("!!! Job status ---{}---, job: {}".format(
                    result, job["jobId"]))
                UnusualJobs[job["jobId"]] = datetime.datetime.now()
            # TODO
            # 1) May need to reduce the timeout.
            #    It takes minutes before pod turns into "Unknown", we may
            #    don't need to wait so long.
            # 2) If node resume before we resubmit the job, the job will end
            #    in status 'NotFound'.
            # BUGFIX: total_seconds() instead of the wrapping `.seconds`.
            elif (datetime.datetime.now() -
                  UnusualJobs[job["jobId"]]).total_seconds() > 30:
                del UnusualJobs[job["jobId"]]

                # TODO refine later
                # before resubmit the job, reset the endpoints
                # update all endpoint to status 'pending', so it would
                # restart when job is ready
                endpoints = dataHandler.GetJobEndpoints(job["jobId"])
                for endpoint_id, endpoint in list(endpoints.items()):
                    endpoint["status"] = "pending"
                    logger.debug("Reset endpoint status to 'pending': {}".format(
                        endpoint_id))
                    dataHandler.UpdateEndpoint(endpoint)

                logger.warning(
                    "Job {} fails in Kubernetes as {}, delete and re-submit.".
                    format(job["jobId"], result))
                launcher.kill_job(job["jobId"], "queued")
                if notifier is not None:
                    notifier.notify(
                        notify.new_job_state_change_message(
                            job["userName"], job["jobId"], result.strip()))
        elif result == "Pending":
            _, detail = k8sUtils.GetJobStatus(job["jobId"])
            dataHandler.UpdateJobTextFields(
                {"jobId": job["jobId"]},
                {"jobStatusDetail": b64encode(json.dumps(detail))})

        # A job seen in a normal state again is no longer "unusual".
        if result != "Unknown" and result != "NotFound" and job[
                "jobId"] in UnusualJobs:
            del UnusualJobs[job["jobId"]]
    finally:
        # BUGFIX: the original only closed the locally created handler on the
        # normal exit path, leaking it on the early debug `return` (and on
        # any exception). Close it unconditionally here.
        if dataHandlerOri is None:
            dataHandler.Close()