def fix_endpoints(runnings):
    if len(runnings) == 0:
        logger.debug("no running endpoints to fix")
        return

    resp = k8s_core_api.list_namespaced_pod(
        namespace="default",
        pretty="pretty_example",
        label_selector="type=job",
    )
    start = pytz.UTC.localize(datetime.datetime.now() -
                              datetime.timedelta(hours=1))
    pods = {
        pod.metadata.name: pod
        for pod in resp.items
        if pod.metadata.creation_timestamp > start
    }
    logger.info("get running pods %s", pods.keys())

    with DataHandler() as data_handler:
        for endpoint_id, point in runnings.items():
            update_file_modification_time("endpoint_manager")

            if is_need_fix(endpoint_id, point, pods):
                delete_k8s_endpoint(point["id"])
                point["status"] = "pending"
                logger.info("reset endpoint %s to pending", endpoint_id)
                data_handler.UpdateEndpoint(point)
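
The snippet above relies on an is_need_fix helper that is not shown here. A minimal sketch follows, assuming it simply flags a running endpoint whose backing pod no longer appears among the recently created job pods; the actual criteria may differ.

def is_need_fix(endpoint_id, point, pods):
    # Hypothetical sketch: a "running" endpoint needs fixing when the pod it
    # points at is missing from the recently created job pods, e.g. because
    # the pod was deleted or recreated. "podName" mirrors the field used in
    # start_endpoints below.
    pod_name = point.get("podName")
    if pod_name is None:
        return False
    return pod_name not in pods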
Example #2
def update_job_logs():
    while True:
        try:
            dataHandler = DataHandler()
            pendingJobs = dataHandler.GetPendingJobs()
            dataHandler.Close()
            for job in pendingJobs:
                update_file_modification_time("joblog_manager")
                try:
                    if job["jobStatus"] == "running":
                        logger.info("updating job logs for job %s" %
                                    job["jobId"])
                        jobParams = json.loads(
                            base64.b64decode(job["jobParams"].encode(
                                "utf-8")).decode("utf-8"))
                        jobPath, workPath, dataPath = GetStoragePath(
                            jobParams["jobPath"], jobParams["workPath"],
                            jobParams["dataPath"])
                        localJobPath = os.path.join(
                            config["storage-mount-path"], jobPath)
                        logPath = os.path.join(localJobPath, "logs/joblog.txt")

                        extract_job_log(job["jobId"], logPath,
                                        jobParams["userId"])
                except Exception as e:
                    logger.exception("handling logs from %s", job["jobId"])
        except Exception as e:
            logger.exception("get pending jobs failed")

        time.sleep(1)
Example #3
def Run():
    register_stack_trace_dump()
    create_log()
    logger.info("start to update job logs ...")

    while True:
        update_file_modification_time("joblog_manager")

        with manager_iteration_histogram.labels("joblog_manager").time():
            try:
                update_job_logs()
            except Exception as e:
                logger.exception("update job logs failed")
        time.sleep(1)
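
Every manager loop in these examples calls update_file_modification_time before doing work. The helper itself is not shown; the sketch below is an assumption that it merely touches a per-manager marker file (the /tmp location is made up here) so an external liveness probe can detect a stalled loop from the file's mtime.

import os

def update_file_modification_time(manager_name):
    # Hypothetical sketch: touch a marker file named after the manager so an
    # external liveness check can spot a loop that has stopped iterating.
    # The /tmp directory is an assumption, not the project's actual path.
    path = os.path.join("/tmp", manager_name)
    with open(path, "a"):
        os.utime(path, None)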
Example #4
def Run():
    register_stack_trace_dump()
    create_log()
    logger.info("start to update user directory...")

    while True:
        update_file_modification_time("user_manager")

        with manager_iteration_histogram.labels("user_manager").time():
            try:
                set_user_directory()
            except Exception as e:
                logger.exception("set user directory failed")
        time.sleep(1)
Example #5
def set_user_directory():
    dataHandler = DataHandler()
    users = dataHandler.GetUsers()
    for username, userid, public_key, private_key in users:
        update_file_modification_time("user_manager")

        if "@" in username:
            username = username.split("@")[0]
        if "/" in username:
            username = username.split("/")[1]
        if "\\" in username:
            username = username.split("\\")[1]
        userpath = os.path.join(config["storage-mount-path"],
                                "work/" + username)
        if not os.path.exists(userpath):
            logger.info("Found a new user %s" % username)
            logger.info("Creating home directory %s for user %s" %
                        (userpath, username))
            os.system("mkdir -p " + userpath)
            os.system("chown -R " + str(userid) + ":" + "500000513 " + userpath)

        ssh_path = os.path.join(userpath, ".ssh")
        if not os.path.exists(ssh_path):
            os.system("mkdir -p " + ssh_path)

        sshkeypath = os.path.join(userpath, ".ssh/id_rsa")
        pubkeypath = os.path.join(userpath, ".ssh/id_rsa.pub")
        authorized_keyspath = os.path.join(userpath, ".ssh/authorized_keys")

        if not os.path.exists(sshkeypath):
            logger.info("Creating sshkey for user %s" % (username))
            with open(sshkeypath, "w") as wf:
                wf.write(private_key)
            with open(pubkeypath, "w") as wf:
                wf.write(public_key)
            os.system("chown -R " + str(userid) + ":" + "500000513 " + userpath)
            # Permission of .ssh has to be 700; otherwise users cannot access
            # .ssh via the Samba file share.
            os.system("chmod 700 " + os.path.dirname(sshkeypath))
            os.system("chmod 600 " + sshkeypath)
            os.system("chmod 600 " + pubkeypath)

        if not os.path.exists(authorized_keyspath):
            logger.info("Creating authorized_keys for user %s" % (username))
            with open(authorized_keyspath, "w") as wf:
                wf.write("\n")
                wf.write(public_key)
            os.system("chown -R " + str(userid) + ":" + "500000513 " +
                      authorized_keyspath)
            os.system("chmod 644 " + authorized_keyspath)
Example #6
def run():
    register_stack_trace_dump()
    create_log()
    while True:
        update_file_modification_time("db_manager")

        with manager_iteration_histogram.labels("db_manager").time():
            try:
                delete_old_cluster_status(CLUSTER_STATUS_EXPIRY)
                delete_old_inactive_jobs(JOBS_EXPIRY)
            except Exception:
                logger.exception("Deleting old cluster status failed")
        time.sleep(86400)
Example #7
def Run():
    register_stack_trace_dump()
    create_log()
    logger.info("start to DoDataConvert...")

    while True:
        update_file_modification_time("DataConvert")

        with manager_iteration_histogram.labels("data_convert").time():
            try:
                DoDataConvert()
            except Exception as e:
                logger.exception("do dataConvert failed")
        time.sleep(1)
Example #8
def Run():
    register_stack_trace_dump()
    create_log()

    while True:
        update_file_modification_time("endpoint_manager")

        with manager_iteration_histogram.labels("endpoint_manager").time():
            # start endpoints
            start_endpoints()
            time.sleep(1)

            # clean up endpoints for jobs which are NOT running
            cleanup_endpoints()
        time.sleep(1)
Example #9
def Run():
    register_stack_trace_dump()
    create_log()
    logging.info("start to update nodes usage information ...")
    config["cluster_status"] = None

    while True:
        update_file_modification_time("node_manager")

        with manager_iteration_histogram.labels("node_manager").time():
            try:
                get_cluster_status()
            except Exception as e:
                logging.exception("get cluster status failed")
        time.sleep(30)
Example #10
def start_endpoints():
    runnings = {}  # endpoints with status "running", returned for re-checking

    try:
        data_handler = DataHandler()
        try:
            pendings, runnings = data_handler.GetPendingEndpoints()

            for endpoint_id, endpoint in pendings.items():
                update_file_modification_time("endpoint_manager")

                try:
                    job = data_handler.GetJob(jobId=endpoint["jobId"])[0]
                    logger.info("checking endpoint %s, status is %s",
                                endpoint["jobId"], job["jobStatus"])
                    if job["jobStatus"] != "running":
                        continue

                    point = get_k8s_endpoint(endpoint["id"])
                    logger.debug("get endpoint %s", endpoint["jobId"])
                    if point is not None:
                        endpoint["status"] = "running"
                        # only retain spec here; other fields contain
                        # datetime objects that cannot be serialized to JSON
                        endpoint["endpointDescription"] = {
                            "spec": point.spec.to_dict()
                        }
                        pod = k8sUtils.get_pod("default", endpoint["podName"])
                        if pod is not None:
                            logger.info("update endpoint's nodeName %s, %s",
                                        endpoint["jobId"], pod.spec.node_name)
                            endpoint["nodeName"] = pod.spec.node_name
                    else:
                        start_endpoint(endpoint)

                    endpoint["lastUpdated"] = datetime.datetime.now().isoformat(
                    )
                    data_handler.UpdateEndpoint(endpoint)
                except Exception as e:
                    logger.exception("Process endpoint failed %s", endpoint)
        except Exception as e:
            logger.exception("start endpoint failed")
        finally:
            data_handler.Close()
    except Exception as e:
        logger.exception("close data handler failed")
    return runnings
Example #11
def Run():
    register_stack_trace_dump()
    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()
    create_log()

    while True:
        update_file_modification_time("job_manager")

        with manager_iteration_histogram.labels("job_manager").time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception as e:
                logging.exception("get node labels failed")

            try:
                dataHandler = DataHandler()
                pendingJobs = dataHandler.GetPendingJobs()
                TakeJobActions(pendingJobs)

                pendingJobs = dataHandler.GetPendingJobs()
                logging.info("Updating status for %d jobs" % len(pendingJobs))
                for job in pendingJobs:
                    try:
                        logging.info("Processing job: %s, status: %s" %
                                     (job["jobId"], job["jobStatus"]))
                        if job["jobStatus"] == "killing":
                            KillJob(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            KillJob(job["jobId"], "paused")
                        elif job["jobStatus"] == "scheduling" or job[
                                "jobStatus"] == "running":
                            UpdateJobStatus(job, notifier)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(job)
                    except Exception as e:
                        logging.warning(e, exc_info=True)
            except Exception as e:
                logging.warning("Process job failed!", exc_info=True)
            finally:
                try:
                    dataHandler.Close()
                except:
                    pass

        time.sleep(1)
Example #12
def Run():
    register_stack_trace_dump()
    create_log()

    while True:
        update_file_modification_time("endpoint_manager")

        with manager_iteration_histogram.labels("endpoint_manager").time():
            try:
                runnings = start_endpoints()

                fix_endpoints(runnings)

                # clean up endpoints for jobs which are NOT running
                cleanup_endpoints()
            except Exception:
                logger.exception("processing this round of endpoints failed")
        time.sleep(1)
Example #13
def Run():
    register_stack_trace_dump()
    create_log()

    while True:
        update_file_modification_time("command_manager")

        with manager_iteration_histogram.labels("command_manager").time():
            try:
                dataHandler = DataHandler()
                pendingCommands = dataHandler.GetPendingCommands()
                for command in pendingCommands:
                    try:
                        logger.info("Processing command: %s", command["id"])
                        RunCommand(command)
                    except Exception as e:
                        logger.exception("run command failed")
            except Exception as e:
                logger.exception("getting command failed")
        time.sleep(1)
Example #14
def run():
    register_stack_trace_dump()
    create_log()

    update = lambda: update_file_modification_time("db_manager")
    while True:
        update()

        with manager_iteration_histogram.labels("db_manager").time():
            try:
                delete_old_cluster_status(CLUSTER_STATUS_EXPIRY)
                # the query below is too time-consuming since lastUpdated in
                # the job table is not indexed
                # delete_old_inactive_jobs(JOBS_EXPIRY)
            except Exception:
                logger.exception("Deleting old cluster status failed")

        sleep_with_update(86400, update)
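
This db_manager variant sleeps a full day between iterations, so sleep_with_update (not shown) presumably refreshes the liveness marker during the pause instead of only once per loop. A minimal sketch under that assumption:

import time

def sleep_with_update(total_seconds, update, interval=60):
    # Hypothetical sketch: sleep in short slices and invoke the update
    # callback after each slice, so the liveness file keeps being touched
    # during the 86400-second gap between iterations.
    remaining = total_seconds
    while remaining > 0:
        step = min(interval, remaining)
        time.sleep(step)
        update()
        remaining -= step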
Example #15
def Run(redis_port, target_status):
    register_stack_trace_dump()
    process_name = "job_manager_" + target_status

    create_log(process_name=process_name)

    notifier = notify.Notifier(config.get("job-manager"))
    notifier.start()

    launcher_type = config.get("job-manager", {}).get("launcher", "python")
    if launcher_type == "python":
        launcher = PythonLauncher()
    elif launcher_type == "controller":
        launcher = LauncherStub()
    else:
        logger.error("unknown launcher_type %s", launcher_type)
        sys.exit(2)
    launcher.start()

    redis_conn = redis.StrictRedis(host="localhost", port=redis_port, db=0)

    while True:
        update_file_modification_time(process_name)

        with manager_iteration_histogram.labels(process_name).time():
            try:
                config["racks"] = k8sUtils.get_node_labels("rack")
                config["skus"] = k8sUtils.get_node_labels("sku")
            except Exception as e:
                logger.exception("get node labels failed")

            try:
                # wait until tasks from the previous batch are done
                launcher.wait_tasks_done()

                data_handler = DataHandler()

                if target_status == "queued":
                    jobs = data_handler.GetJobList(
                        "all",
                        "all",
                        num=None,
                        status="queued,scheduling,running")
                    take_job_actions(data_handler, redis_conn, launcher, jobs)
                else:
                    jobs = data_handler.GetJobList("all",
                                                   "all",
                                                   num=None,
                                                   status=target_status)
                    logger.info("Updating status for %d %s jobs", len(jobs),
                                target_status)

                    for job in jobs:
                        logger.info("Processing job: %s, status: %s" %
                                    (job["jobId"], job["jobStatus"]))
                        if job["jobStatus"] == "killing":
                            launcher.kill_job(job["jobId"], "killed")
                        elif job["jobStatus"] == "pausing":
                            launcher.kill_job(job["jobId"], "paused")
                        elif job["jobStatus"] == "running":
                            UpdateJobStatus(redis_conn,
                                            launcher,
                                            job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "scheduling":
                            UpdateJobStatus(redis_conn,
                                            launcher,
                                            job,
                                            notifier,
                                            dataHandlerOri=data_handler)
                        elif job["jobStatus"] == "unapproved":
                            ApproveJob(redis_conn,
                                       job,
                                       dataHandlerOri=data_handler)
                        else:
                            logger.error("unknown job status %s for job %s",
                                         job["jobStatus"], job["jobId"])
            except Exception as e:
                logger.exception("Process jobs failed!")
            finally:
                try:
                    data_handler.Close()
                except:
                    pass

        time.sleep(1)
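
Each Run loop starts with register_stack_trace_dump, whose implementation is not included in these snippets. A common pattern that matches the name, sketched here purely as an assumption, is to install a signal handler that logs every thread's stack so a hung manager can be inspected without restarting it:

import logging
import signal
import sys
import traceback

def register_stack_trace_dump():
    # Hypothetical sketch: on SIGUSR1, log the current stack of every thread.
    def _dump(signum, frame):
        for thread_id, stack in sys._current_frames().items():
            logging.info("Thread %s:\n%s", thread_id,
                         "".join(traceback.format_stack(stack)))
    signal.signal(signal.SIGUSR1, _dump)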