Example #1
class DADispatcher(DAServer):
    """
    Service for job status bookkeeping.
    Protocol:
      SRV: send READY
      CLT: send START, EXIT, or DONE
    """

    STATES = ["UNKNOWN", "CREATED", "PENDING", "RUNNING", "DONE", "FAILED", "EXITED"]

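    # Supported back ends. FALLBACK maps a cluster to the one a long-pending job
    # is moved to ("" means no fallback); MAXACTIVE caps the number of jobs that
    # may be active on each cluster at once (see submitOne and dispatch).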
    CLUSTERS = ["lsf", "interactive", "local"]
    FALLBACK = {"lsf": "interactive", "interactive": "", "local": ""}
    MAXACTIVE = {"lsf": 40, "interactive": 10, "local": 20}

    class JobInfo(object):
        """
        Struct for job information.
        """

        def __init__(self, name_):
            self.name = name_
            # Random 4-letter key used to authenticate status reports from the job.
            self.key = "".join(random.sample(string.ascii_lowercase, 4))
            self.cluster = ""
            self.proc = None
            self.node = ""
            self.state = "CREATED"
            self.lastHB = 0

    def __init__(self, workspace_, resubmit=False, terminal=None):
        DAServer.__init__(self, "dispatch")

        self._workspace = workspace_
        self._webDir = ""
        self._jobInfo = {}
        self._resubmit = resubmit
        self._readyJobs = dict([(k, []) for k in DADispatcher.CLUSTERS])
        self._activeJobs = dict([(k, []) for k in DADispatcher.CLUSTERS])
        self._lock = threading.Lock()
        self._stateChanged = threading.Event()
        if terminal:
            self._terminal = terminal
        else:
            self._terminal = Terminal(TERMNODE, verbose=True)

        self.options = dict([(k, "") for k in DADispatcher.CLUSTERS])

    def __del__(self):
        for cluster in DADispatcher.CLUSTERS:
            for jobInfo in self._activeJobs[cluster]:
                self.kill(jobInfo)

    def authenticate(self, jobName_, key_):
        try:
            return key_ == self._jobInfo[jobName_].key
        except KeyError:
            return False

    def canServe(self, jobName_):
        if jobName_ in self._jobInfo:
            return 1
        else:
            return -1

    def serve(self, request_, jobName_):
        jobInfo = self._jobInfo[jobName_]

        jobInfo.lastHB = time.time()

        try:
            response = request_.recv(1024)
            request_.send("OK")
            if response == "HB":
                self.log("Heart beat from", jobName_)
                return
            elif response in DADispatcher.STATES:
                self.log("Set state", jobName_, response)
            else:
                # Unrecognized message; treated as UNKNOWN below.
                raise Exception()
        except:
            response = "UNKNOWN"

        with self._lock:
            try:
                jobInfo.state = response

                finished = False

                if jobInfo.state == "DONE":
                    self._activeJobs[jobInfo.cluster].remove(jobInfo)
                    finished = True
                elif jobInfo.state == "FAILED":
                    self._activeJobs[jobInfo.cluster].remove(jobInfo)
                    if self._resubmit:
                        self._readyJobs[jobInfo.cluster].append(jobInfo)

                    finished = True

                if finished:
                    if jobInfo.cluster == "interactive":
                        jobInfo.proc.close()
                    elif jobInfo.cluster == "local":
                        jobInfo.proc.communicate()

            except:
                self.log("Exception while serving", jobName_, "\n", excDump())

            if jobInfo.state == "FAILED":
                with open(self._workspace + "/logs/" + jobName_ + ".fail", "w") as failLog:
                    pass

        self._stateChanged.set()

    def createJob(self, jobName_, cluster_, append=True):
        jobInfo = DADispatcher.JobInfo(jobName_)
        jobInfo.cluster = cluster_
        self._jobInfo[jobName_] = jobInfo
        if append:
            self._readyJobs[cluster_].append(jobInfo)
        if DEBUG:
            self.log("Created", jobName_)
        return jobInfo

    def submitOne(self, cluster, logdir=""):
        if len(self._activeJobs[cluster]) >= DADispatcher.MAXACTIVE[cluster] or len(self._readyJobs[cluster]) == 0:
            return False

        with self._lock:
            try:
                jobInfo = self._readyJobs[cluster].pop(0)
            except IndexError:
                return False

        if DEBUG:
            self.log("submit", jobInfo.name)

        if self.submit(jobInfo, logdir):
            with self._lock:
                self._activeJobs[cluster].append(jobInfo)
            return True
        else:
            with self._lock:
                self._readyJobs[cluster].append(jobInfo)
            return False

    def submit(self, jobInfo_, logdir=""):
        self.log("Submitting job ", jobInfo_.name)

        if not logdir:
            logdir = self._workspace + "/logs"

        try:
            if jobInfo_.cluster == "lsf":
                command = "bsub -J {jobName} -o {log} -cwd '$TMPDIR' {options} 'source {environment};darun.py {workspace} {jobName} {key}'".format(
                    jobName=jobInfo_.name,
                    log=logdir + "/" + jobInfo_.name + ".log",
                    options=self.options["lsf"],
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    key=jobInfo_.key,
                )

                self.log(command)

                bsubout = self._terminal.communicate(command)

                success = False
                if len(bsubout) != 0 and "submitted" in bsubout[0]:
                    matches = re.search("<([0-9]+)>", bsubout[0])
                    if matches:
                        success = True

                if not success:
                    self.log("bsub failed")
                    raise Exception

                self.log("lxbatch job ID for {0} is {1}".format(jobInfo_.name, matches.group(1)))

                proc = matches.group(1)  # for LSF jobs, proc holds the batch job ID string
                node = ""

            elif jobInfo_.cluster == "interactive":
                node = TERMNODE

                if LOADBALANCE:
                    hostProc = subprocess.Popen(["host", TERMNODE], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out, err = hostProc.communicate()
                    for line in out.split("\n"):
                        if "has address" in line:
                            addr = line.split()[3]
                            for term in Terminal.OPENTERMS:
                                if term.addr == addr:
                                    break
                            else:
                                # No open terminal uses this address yet; pick it.
                                node = addr
                                break

                command = "cd $TMPDIR;source {environment};darun.py -p {workspace} {jobName} {key} >> {log} 2>&1;exit".format(
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    jobName=jobInfo_.name,
                    key=jobInfo_.key,
                    log=logdir + "/" + jobInfo_.name + ".log",
                )

                self.log(node + ":", command)

                term = Terminal(node)
                term.write(command)

                self.log("Command issued to", term.node)

                proc = term
                node = term.node

            elif jobInfo_.cluster == "local":
                command = "cd {tmpdir};source {environment};darun.py -p {workspace} {jobName} {key} >> {log} 2>&1".format(
                    tmpdir=TMPDIR,
                    environment=self._workspace + "/environment",
                    workspace=self._workspace,
                    jobName=jobInfo_.name,
                    key=jobInfo_.key,
                    log=logdir + "/" + jobInfo_.name + ".log",
                )

                self.log(command)

                proc = subprocess.Popen(
                    command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
                )  # stdout will be redirected to a log file within the job

                self.log("Subprocess started")

                node = "localhost"

        except:
            return False

        with self._lock:
            jobInfo_.proc = proc
            jobInfo_.state = "PENDING"
            jobInfo_.node = node
            jobInfo_.lastHB = time.time()

        self._stateChanged.set()

        return True

    def kill(self, jobInfo_):
        if jobInfo_.cluster == "lsf":
            response = self._terminal.communicate("bkill {0}".format(jobInfo_.proc))
            for line in response:
                self.log(line)

        elif jobInfo_.cluster == "interactive":
            if jobInfo_.proc.isOpen():
                jobInfo_.proc.close(force=True)

        elif jobInfo_.cluster == "local":
            if jobInfo_.proc.poll() is None:
                jobInfo_.proc.terminate()

        try:
            self._jobInfo.pop(jobInfo_.name)
        except KeyError:
            self.log("Exception while trying to remove", jobInfo_.name)

    def dispatch(self, logdir=""):
        monitorTerminate = threading.Event()
        monitorThread = threading.Thread(target=self.monitor, args=(monitorTerminate,), name="monitor")
        monitorThread.daemon = True
        monitorThread.start()

        while True:
            submitted = False
            for cluster in DADispatcher.CLUSTERS:
                if self.submitOne(cluster, logdir):
                    submitted = True

            if submitted:
                continue

            with self._lock:
                nReady = sum(len(jobs) for jobs in self._readyJobs.values())
                nActive = sum(len(jobs) for jobs in self._activeJobs.values())

            if nReady == 0 and nActive == 0:
                break

            self._stateChanged.wait(60.0)

            if not self._stateChanged.isSet():  # timeout
                self.log("No job changed state in the last 60 seconds. Now checking for stale jobs..")
                exited = []
                longpend = []

                if len(self._activeJobs["lsf"]) != 0:
                    lsfNodes = {}
                    try:
                        # Drop the header line of the bjobs output.
                        response = self._terminal.communicate("bjobs")[1:]
                        if DEBUG:
                            print "bjobs", response

                        for line in response:
                            words = line.split()
                            # Map LSF job ID to the execution host.
                            lsfNodes[words[0]] = words[5]

                    except:
                        self.log("Job status query failed")

                with self._lock:
                    for jobInfo in self._activeJobs["lsf"]:
                        # Two checks: if the LSF job ID is still listed, record its node;
                        # if not, the job may have exited abnormally, so inspect its state.
                        if jobInfo.proc in lsfNodes:
                            if not jobInfo.node:
                                jobInfo.node = lsfNodes[jobInfo.proc]

                            if jobInfo.lastHB < time.time() - 120:
                                self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                                if jobInfo.state == "PENDING":
                                    longpend.append(jobInfo)
                                elif jobInfo.state == "RUNNING":
                                    exited.append(jobInfo)
                        else:
                            self.log(jobInfo.name, "disappeared from LSF job list")
                            exited.append(jobInfo)

                    for jobInfo in self._activeJobs["interactive"]:
                        if not jobInfo.proc.isOpen():
                            exited.append(jobInfo)
                        elif jobInfo.lastHB < time.time() - 120:
                            self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                            if jobInfo.state == "PENDING":
                                longpend.append(jobInfo)
                            elif jobInfo.state == "RUNNING":
                                exited.append(jobInfo)

                    for jobInfo in self._activeJobs["local"]:
                        if jobInfo.proc.poll() is not None:
                            exited.append(jobInfo)
                        elif jobInfo.lastHB < time.time() - 120:
                            self.log("No heartbeat from", jobInfo.name, "for 120 seconds")
                            if jobInfo.state == "PENDING":
                                longpend.append(jobInfo)
                            elif jobInfo.state == "RUNNING":
                                exited.append(jobInfo)

                    for jobInfo in exited:
                        self.log("Set state", jobInfo.name, "EXITED")
                        self.kill(jobInfo)  # removes from self._jobInfo
                        jobInfo.state = "EXITED"
                        self._activeJobs[jobInfo.cluster].remove(jobInfo)

                for jobInfo in exited:
                    with open(self._workspace + "/logs/" + jobInfo.name + ".fail", "w") as failLog:
                        pass

                resubmit = []
                if self._resubmit:
                    resubmit += exited

                available = dict(
                    [(c, DADispatcher.MAXACTIVE[c] - len(self._activeJobs[c])) for c in DADispatcher.CLUSTERS]
                )
                for jobInfo in longpend:
                    fallback = DADispatcher.FALLBACK[jobInfo.cluster]
                    if fallback and available[fallback] > 0:
                        # This job did not start in time and there is a space in the fallback queue
                        with self._lock:
                            self.kill(jobInfo)  # removes from self._jobInfo
                            jobInfo.state = "EXITED"
                            self._activeJobs[jobInfo.cluster].remove(jobInfo)
                            jobInfo.cluster = fallback
                            resubmit.append(jobInfo)
                            available[fallback] -= 1

                for jobInfo in resubmit:
                    newJobInfo = self.createJob(jobInfo.name, jobInfo.cluster, append=False)

                    if self.submit(newJobInfo, logdir):
                        with self._lock:
                            self._activeJobs[jobInfo.cluster].append(newJobInfo)
                    else:
                        with self._lock:
                            self._readyJobs[jobInfo.cluster].append(newJobInfo)

                    self._stateChanged.set()

            time.sleep(1)  # allow the monitor thread to catch up
            self._stateChanged.clear()

        monitorTerminate.set()
        self._stateChanged.set()
        monitorThread.join()

    def monitor(self, _terminate):
        self.printStatus()
        self.printStatusWeb()
        lastWebUpdate = time.time()

        while True:
            self._stateChanged.wait(10.0)
            if _terminate.isSet():
                break

            self.printStatus()
            if time.time() > lastWebUpdate + 60.0:
                self.printStatusWeb()
                lastWebUpdate = time.time()

    def countJobs(self):
        jobCounts = dict((key, 0) for key in DADispatcher.STATES)

        with self._lock:
            for jobInfo in self._jobInfo.values():
                jobCounts[jobInfo.state] += 1

        return jobCounts

    def setWebDir(self, dir_):
        self._webDir = dir_
        try:
            os.mkdir(self._webDir)
        except OSError:
            pass

    def printStatus(self):
        jobCounts = self.countJobs()

        line = ""
        for state in DADispatcher.STATES:
            line += " {state}: {n}".format(state=state, n=jobCounts[state])

        line = "\r" + line
        line += " " * 10
        sys.stdout.write(line)
        if DEBUG:
            sys.stdout.write("\n")
        sys.stdout.flush()

    def printStatusWeb(self, copyLogs=False):
        if not self._webDir:
            return

        if copyLogs:
            logDir = self._webDir + "/logs"
            if not os.path.exists(logDir):
                os.mkdir(logDir)

            for fileName in os.listdir(self._workspace + "/logs"):
                sourceName = self._workspace + "/logs/" + fileName
                destName = logDir + "/" + fileName
                if not os.path.exists(destName) or os.stat(sourceName).st_mtime > os.stat(destName).st_mtime:
                    shutil.copy(sourceName, destName)

        allJobs = self._jobInfo.keys()

        summaryName = self._webDir + "/summary.dat"
        if not os.path.exists(summaryName):
            with open(summaryName, "w") as summaryFile:
                # add more info in the future?
                summaryFile.write("workspace = " + self._workspace)

        statusDir = self._webDir + "/status"

        if not os.path.exists(statusDir):
            os.makedirs(statusDir)
            for job in allJobs:
                open(statusDir + "/" + job + ".UNKNOWN", "w").close()

        with self._lock:
            for statusFile in os.listdir(statusDir):
                jobName = statusFile[: statusFile.rfind(".")]
                if jobName not in self._jobInfo:
                    continue
                current = statusFile[statusFile.rfind(".") + 1 :]
                actual = self._jobInfo[jobName].state
                if current != actual:
                    os.rename(statusDir + "/" + statusFile, statusDir + "/" + jobName + "." + actual)
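
A minimal usage sketch (not part of the original example) showing how the dispatcher above might be driven; the paths, job names, and bsub option string are placeholders, and the surrounding DAServer/Terminal setup is assumed to exist elsewhere:

# Hypothetical driver code; workspace/web directory paths, job names, and the
# bsub option string are placeholders.
dispatcher = DADispatcher("/path/to/workspace", resubmit=True)
dispatcher.options["lsf"] = "-q 8nh"  # extra bsub options (placeholder)
dispatcher.setWebDir("/path/to/webdir")  # optional: enables printStatusWeb output

for i in range(100):
    dispatcher.createJob("job{0:03d}".format(i), "lsf")

# Blocks until no job is ready or active; failed jobs are requeued because
# resubmit=True, and a monitor thread prints status while dispatching.
dispatcher.dispatch(logdir="/path/to/workspace/logs")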
Example #2
        jobConfig["outputFile"] = options.outputFile.strip()
        jobConfig["addSuffix"] = not options.noSuffix
        jobConfig["reducer"] = options.reducer.strip()
        jobConfig["maxSize"] = options.maxSize

        if jobConfig["reducer"] != "None" and not jobConfig["outputFile"]:
            raise RuntimeError("Reducer requires output file name specification")

        ### OPEN WORKSPACE ###

        os.mkdir(workspace)
        os.mkdir(workspace + "/inputs")
        os.mkdir(workspace + "/logs")

        if options.environment.strip():
            cmds = terminal.communicate(options.environment.strip())
            with open(workspace + "/environment", "w") as envFile:
                for cmd in cmds:
                    envFile.write(cmd + "\n")

    print "Using {0} as workspace".format(workspace)

    ### RUNTIME-SPECIFIC CONFIGURATIONS ###

    jobConfig["taskID"] = taskID
    jobConfig["serverHost"] = os.environ["HOSTNAME"]
    jobConfig["serverPort"] = tcpServer.server_address[1]
    jobConfig["serverWorkDir"] = TMPDIR + "/" + taskID
    jobConfig["logDir"] = HTMLDIR + "/" + taskID + "/logs"
    # In principle the log directory can be anywhere; we place it directly under HTMLDIR for convenience.
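
The serverHost and serverPort recorded above are what a running job (darun.py) would use to report back to DADispatcher.serve() in Example #1. The sketch below is a hypothetical illustration of that exchange only; the handshake in which the DAServer base class identifies and authenticates the job (name and key, see authenticate()) is not shown in this section and is assumed to happen first:

# Hypothetical client-side status report (Python 2 string sockets, matching the
# code above). Assumes the DAServer name/key handshake already happened on this
# connection; only the message handled by serve() is shown.
import socket

sock = socket.create_connection((jobConfig["serverHost"], jobConfig["serverPort"]))
sock.send("DONE")    # or "HB" for a heartbeat, or any other name in DADispatcher.STATES
ack = sock.recv(16)  # serve() acknowledges every message with "OK"
sock.close()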