def fix_endpoints(runnings):
    if len(runnings) == 0:
        logger.debug("no running endpoints to fix")
        return

    resp = k8s_core_api.list_namespaced_pod(
        namespace="default",
        label_selector="type=job",
    )

    # only consider job pods created within the last hour
    start = pytz.UTC.localize(datetime.datetime.now() -
                              datetime.timedelta(hours=1))
    pods = {
        pod.metadata.name: pod
        for pod in resp.items
        if pod.metadata.creation_timestamp > start
    }
    logger.info("get running pods %s", pods.keys())

    with DataHandler() as data_handler:
        for endpoint_id, point in runnings.items():
            update_file_modification_time("endpoint_manager")
            if is_need_fix(endpoint_id, point, pods):
                delete_k8s_endpoint(point["id"])
                point["status"] = "pending"
                logger.info("reset endpoint %s to pending", endpoint_id)
                data_handler.UpdateEndpoint(point)
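Every manager loop in this module calls update_file_modification_time to signal liveness. A minimal sketch of what such a helper could look like — the /tmp path scheme is an assumption, not taken from this code:

```python
import pathlib

def update_file_modification_time(name):
    # Hypothetical watchdog helper: touch a per-manager file so an
    # external liveness probe can check its mtime. The /tmp path
    # scheme here is an assumption.
    pathlib.Path("/tmp", name).touch(exist_ok=True)
```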
def update_job_logs():
    while True:
        try:
            dataHandler = DataHandler()
            pendingJobs = dataHandler.GetPendingJobs()
            dataHandler.Close()

            for job in pendingJobs:
                update_file_modification_time("joblog_manager")
                try:
                    if job["jobStatus"] == "running":
                        logger.info("updating job logs for job %s",
                                    job["jobId"])
                        jobParams = json.loads(
                            base64.b64decode(job["jobParams"].encode(
                                "utf-8")).decode("utf-8"))
                        jobPath, workPath, dataPath = GetStoragePath(
                            jobParams["jobPath"], jobParams["workPath"],
                            jobParams["dataPath"])
                        localJobPath = os.path.join(
                            config["storage-mount-path"], jobPath)
                        logPath = os.path.join(localJobPath,
                                               "logs/joblog.txt")
                        extract_job_log(job["jobId"], logPath,
                                        jobParams["userId"])
                except Exception:
                    logger.exception("handling logs for job %s failed",
                                     job["jobId"])
        except Exception:
            logger.exception("get pending jobs failed")
        time.sleep(1)
def Run(): register_stack_trace_dump() create_log() logger.info("start to update job logs ...") while True: update_file_modification_time("joblog_manager") with manager_iteration_histogram.labels("joblog_manager").time(): try: update_job_logs() except Exception as e: logger.exception("update job logs failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() logger.info("start to update user directory...") while True: update_file_modification_time("user_manager") with manager_iteration_histogram.labels("user_manager").time(): try: set_user_directory() except Exception as e: logger.exception("set user directory failed") time.sleep(1)
def set_user_directory():
    dataHandler = DataHandler()
    users = dataHandler.GetUsers()
    for username, userid, public_key, private_key in users:
        update_file_modification_time("user_manager")

        # Normalize "user@domain", "domain/user" and "domain\user" to a
        # plain account name.
        if "@" in username:
            username = username.split("@")[0]
        if "/" in username:
            username = username.split("/")[1]
        if "\\" in username:
            username = username.split("\\")[1]

        userpath = os.path.join(config["storage-mount-path"],
                                "work/" + username)
        if not os.path.exists(userpath):
            logger.info("Found a new user %s", username)
            logger.info("Creating home directory %s for user %s", userpath,
                        username)
            os.system("mkdir -p " + userpath)
            os.system("chown -R " + str(userid) + ":" + "500000513 " +
                      userpath)

        ssh_path = os.path.join(userpath, ".ssh")
        if not os.path.exists(ssh_path):
            os.system("mkdir -p " + ssh_path)

        sshkeypath = os.path.join(userpath, ".ssh/id_rsa")
        pubkeypath = os.path.join(userpath, ".ssh/id_rsa.pub")
        authorized_keyspath = os.path.join(userpath, ".ssh/authorized_keys")
        if not os.path.exists(sshkeypath):
            logger.info("Creating sshkey for user %s", username)
            with open(sshkeypath, "w") as wf:
                wf.write(private_key)
            with open(pubkeypath, "w") as wf:
                wf.write(public_key)
            os.system("chown -R " + str(userid) + ":" + "500000513 " +
                      userpath)
            # Permission of .ssh has to be 700; otherwise users cannot
            # access .ssh via the Samba file share.
            os.system("chmod 700 " + os.path.dirname(sshkeypath))
            os.system("chmod 600 " + sshkeypath)
            os.system("chmod 600 " + pubkeypath)

        if not os.path.exists(authorized_keyspath):
            logger.info("Creating authorized_keys for user %s", username)
            with open(authorized_keyspath, "w") as wf:
                wf.write("\n")
                wf.write(public_key)
            os.system("chown -R " + str(userid) + ":" + "500000513 " +
                      authorized_keyspath)
            os.system("chmod 644 " + authorized_keyspath)
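set_user_directory shells out via os.system with string concatenation, so a username containing spaces or shell metacharacters would break the command, or worse, inject into it. A sketch of the same mkdir/chown step using subprocess.run with argument lists; provision_home is a hypothetical helper, and the group id 500000513 is the one hard-coded above:

```python
import subprocess

def provision_home(userpath, userid, gid="500000513"):
    # List-form arguments are passed to the kernel verbatim, so no
    # shell parsing happens and an odd username stays one argument.
    subprocess.run(["mkdir", "-p", userpath], check=True)
    subprocess.run(["chown", "-R", "%s:%s" % (userid, gid), userpath],
                   check=True)
```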
def run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("db_manager") with manager_iteration_histogram.labels("db_manager").time(): try: delete_old_cluster_status(CLUSTER_STATUS_EXPIRY) delete_old_inactive_jobs(JOBS_EXPIRY) except: logger.exception("Deleting old cluster status failed", exc_info=True) time.sleep(86400)
def Run(): register_stack_trace_dump() create_log() logger.info("start to DoDataConvert...") while True: update_file_modification_time("DataConvert") with manager_iteration_histogram.labels("data_convert").time(): try: DoDataConvert() except Exception as e: logger.exception("do dataConvert failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("endpoint_manager") with manager_iteration_histogram.labels("endpoint_manager").time(): # start endpoints start_endpoints() time.sleep(1) # clean up endpoints for jobs which is NOT running cleanup_endpoints() time.sleep(1)
def Run(): register_stack_trace_dump() create_log() logging.info("start to update nodes usage information ...") config["cluster_status"] = None while True: update_file_modification_time("node_manager") with manager_iteration_histogram.labels("node_manager").time(): try: get_cluster_status() except Exception as e: logging.exception("get cluster status failed") time.sleep(30)
def start_endpoints():
    # running endpoints are returned so the caller can re-check their status
    runnings = {}

    try:
        data_handler = DataHandler()
        try:
            pendings, runnings = data_handler.GetPendingEndpoints()

            for endpoint_id, endpoint in pendings.items():
                update_file_modification_time("endpoint_manager")

                try:
                    job = data_handler.GetJob(jobId=endpoint["jobId"])[0]
                    logger.info("checking endpoint %s, status is %s",
                                endpoint["jobId"], job["jobStatus"])
                    if job["jobStatus"] != "running":
                        continue

                    point = get_k8s_endpoint(endpoint["id"])
                    logger.debug("get endpoint %s", endpoint["jobId"])
                    if point is not None:
                        endpoint["status"] = "running"
                        # only retain spec here; some other fields contain
                        # datetime objects and cannot be serialized to json
                        endpoint["endpointDescription"] = {
                            "spec": point.spec.to_dict()
                        }
                        pod = k8sUtils.get_pod("default",
                                               endpoint["podName"])
                        if pod is not None:
                            logger.info(
                                "update endpoint's nodeName %s, %s",
                                endpoint["jobId"], pod.spec.node_name)
                            endpoint["nodeName"] = pod.spec.node_name
                    else:
                        start_endpoint(endpoint)

                    endpoint["lastUpdated"] = \
                        datetime.datetime.now().isoformat()
                    data_handler.UpdateEndpoint(endpoint)
                except Exception:
                    logger.exception("Process endpoint failed %s", endpoint)
        except Exception:
            logger.exception("start endpoint failed")
        finally:
            data_handler.Close()
    except Exception:
        logger.exception("close data handler failed")
    return runnings
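start_endpoints reads point.spec back from get_k8s_endpoint, which suggests each endpoint is backed by a Kubernetes object that has a .spec, such as a Service. A sketch of what that lookup could be, assuming a Service in the default namespace named after the endpoint id — this is an assumption, not confirmed by the code here:

```python
from kubernetes.client.rest import ApiException

def get_k8s_endpoint(endpoint_id):
    # Hypothetical lookup: treat a missing Service as "not started yet"
    # and let any other API error propagate to the caller.
    try:
        return k8s_core_api.read_namespaced_service(endpoint_id, "default")
    except ApiException as e:
        if e.status == 404:
            return None
        raise
```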
def Run(): register_stack_trace_dump() notifier = notify.Notifier(config.get("job-manager")) notifier.start() create_log() while True: update_file_modification_time("job_manager") with manager_iteration_histogram.labels("job_manager").time(): try: config["racks"] = k8sUtils.get_node_labels("rack") config["skus"] = k8sUtils.get_node_labels("sku") except Exception as e: logging.exception("get node labels failed") try: dataHandler = DataHandler() pendingJobs = dataHandler.GetPendingJobs() TakeJobActions(pendingJobs) pendingJobs = dataHandler.GetPendingJobs() logging.info("Updating status for %d jobs" % len(pendingJobs)) for job in pendingJobs: try: logging.info("Processing job: %s, status: %s" % (job["jobId"], job["jobStatus"])) if job["jobStatus"] == "killing": KillJob(job["jobId"], "killed") elif job["jobStatus"] == "pausing": KillJob(job["jobId"], "paused") elif job["jobStatus"] == "scheduling" or job[ "jobStatus"] == "running": UpdateJobStatus(job, notifier) elif job["jobStatus"] == "unapproved": ApproveJob(job) except Exception as e: logging.warning(e, exc_info=True) except Exception as e: logging.warning("Process job failed!", exc_info=True) finally: try: dataHandler.Close() except: pass time.sleep(1)
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("endpoint_manager") with manager_iteration_histogram.labels("endpoint_manager").time(): try: runnings = start_endpoints() fix_endpoints(runnings) # clean up endpoints for jobs which is NOT running cleanup_endpoints() except Exception: logger.exception("processing this round of endpoints failed") time.sleep(1)
def Run(): register_stack_trace_dump() create_log() while True: update_file_modification_time("command_manager") with manager_iteration_histogram.labels("command_manager").time(): try: dataHandler = DataHandler() pendingCommands = dataHandler.GetPendingCommands() for command in pendingCommands: try: logger.info("Processing command: %s", command["id"]) RunCommand(command) except Exception as e: logger.exception("run command failed") except Exception as e: logger.exception("getting command failed") time.sleep(1)
def run():
    register_stack_trace_dump()
    create_log()

    def update():
        update_file_modification_time("db_manager")

    while True:
        update()

        with manager_iteration_histogram.labels("db_manager").time():
            try:
                delete_old_cluster_status(CLUSTER_STATUS_EXPIRY)
                # the query below is too time-consuming since lastUpdated
                # in the job table is not indexed
                # delete_old_inactive_jobs(JOBS_EXPIRY)
            except Exception:
                logger.exception("Deleting old cluster status failed")
        sleep_with_update(86400, update)
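Unlike the earlier db_manager run() above, this version sleeps via sleep_with_update so the watchdog file keeps getting touched during the day-long pause. A minimal sketch of such a helper, assuming a fixed refresh interval — the 10-second step is a guess:

```python
import time

def sleep_with_update(total_seconds, update, step=10):
    # Hypothetical helper: sleep in short increments, invoking the
    # update callback each step so liveness checks keep passing.
    deadline = time.time() + total_seconds
    while time.time() < deadline:
        update()
        time.sleep(min(step, max(0.0, deadline - time.time())))
```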
def Run(redis_port, target_status):
    register_stack_trace_dump()
    process_name = "job_manager_" + target_status
    create_log(process_name=process_name)

    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()

    launcher_type = config.get("job-manager", {}).get("launcher", "python")
    if launcher_type == "python":
        launcher = PythonLauncher()
    elif launcher_type == "controller":
        launcher = LauncherStub()
    else:
        logger.error("unknown launcher_type %s", launcher_type)
        sys.exit(2)
    launcher.start()

    redis_conn = redis.StrictRedis(host="localhost", port=redis_port, db=0)

    while True:
        update_file_modification_time(process_name)

        with manager_iteration_histogram.labels(process_name).time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception:
                logger.exception("get node labels failed")

            try:
                # wait for tasks from the previous batch to finish
                launcher.wait_tasks_done()

                data_handler = DataHandler()
                if target_status == "queued":
                    jobs = data_handler.GetJobList(
                        "all", "all", num=None,
                        status="queued,scheduling,running")
                    take_job_actions(data_handler, redis_conn, launcher,
                                     jobs)
                else:
                    jobs = data_handler.GetJobList("all", "all", num=None,
                                                   status=target_status)
                    logger.info("Updating status for %d %s jobs", len(jobs),
                                target_status)
                    for job in jobs:
                        logger.info("Processing job: %s, status: %s",
                                    job["jobId"], job["jobStatus"])
                        if job["jobStatus"] == "killing":
                            launcher.kill_job(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            launcher.kill_job(job["jobId"], "paused")
                        elif job["jobStatus"] in ("running", "scheduling"):
                            UpdateJobStatus(redis_conn, launcher, job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(redis_conn, job,
                                       dataHandlerOri=data_handler)
                        else:
                            logger.error("unknown job status %s for job %s",
                                         job["jobStatus"], job["jobId"])
            except Exception:
                logger.exception("Process jobs failed!")
            finally:
                try:
                    data_handler.Close()
                except Exception:
                    pass
        time.sleep(1)
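Run(redis_port, target_status) is parameterized by a single status, which implies one worker process per status. A sketch of how a launcher could fan these out with multiprocessing — the status list and redis port are assumptions, not taken from this code:

```python
import multiprocessing

if __name__ == "__main__":
    # Hypothetical fan-out: one long-running process per job status,
    # all talking to the same local redis instance.
    statuses = ["queued", "running", "killing", "pausing", "unapproved"]
    procs = [
        multiprocessing.Process(target=Run, args=(6379, status))
        for status in statuses
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```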