def testJobQueue(self, testJobs=1000):
    """
    The Mesos JobQueue sorts MesosShape objects by requirement, and this test ensures that
    the sorting is what is expected: non-preemptable job groups come first, with priority
    given to larger jobs.
    """
    from toil.batchSystems.mesos import JobQueue
    jobQueue = JobQueue()
    for jobNum in range(0, testJobs):
        testJob = self._getJob(cores=random.choice(list(range(10))),
                               preemptable=random.choice([True, False]))
        jobQueue.insertJob(testJob, testJob.resources)

    sortedTypes = jobQueue.sortedTypes
    self.assertGreaterEqual(20, len(sortedTypes))
    self.assertTrue(all(sortedTypes[i] <= sortedTypes[i + 1] for i in range(len(sortedTypes) - 1)))

    preemptable = sortedTypes.pop(0).preemptable
    for jtype in sortedTypes:
        # All non-preemptable jobTypes must be first in sorted order
        if preemptable:
            # All the rest of the jobTypes must be preemptable as well
            assert jtype.preemptable
        elif jtype.preemptable:
            # We have reached our first preemptable job
            preemptable = jtype.preemptable

    # Make sure the proper number of jobs are in the queue
    self.assertEqual(len(jobQueue.jobIDs()), testJobs)

    testJob = self._getJob(cores=random.choice(list(range(10))))
    jobQueue.insertJob(testJob, testJob.resources)
    testJobs += 1
    self.assertEqual(len(jobQueue.jobIDs()), testJobs)

    tmpJob = None
    while not jobQueue.typeEmpty(testJob.resources):
        testJobs -= 1
        tmpJob = jobQueue.nextJobOfType(testJob.resources)
        self.assertEqual(len(jobQueue.jobIDs()), testJobs)
    # Ensure FIFO: the last job inserted must also be the last job retrieved for its type
    self.assertIs(testJob, tmpJob)
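# --- Usage sketch (illustrative only, not part of the test suite) ---------------------------
# A minimal, self-contained illustration of the JobQueue behaviour asserted by the test above.
# The import location of MesosShape and the use of SimpleNamespace as a stand-in for a queued
# ToilJob are assumptions; MesosShape's keyword arguments are taken from how issueBatchJob
# constructs it later in this file.
def _jobQueueUsageSketch():
    from types import SimpleNamespace

    from toil.batchSystems.mesos import JobQueue, MesosShape  # MesosShape location assumed

    queue = JobQueue()
    for jobID, (cores, preemptable) in enumerate([(8, False), (1, False), (8, True)]):
        shape = MesosShape(wallTime=0, memory=2 * 1024**3, cores=cores,
                           disk=10 * 1024**3, preemptable=preemptable)
        # The queue only needs an object with a jobID and resources; SimpleNamespace is enough.
        queue.insertJob(SimpleNamespace(jobID=jobID, resources=shape), shape)

    # Non-preemptable job types sort ahead of preemptable ones, larger requirements first,
    # and jobs of the same type come back out in FIFO order.
    for jobType in list(queue.sortedTypes):
        while not queue.typeEmpty(jobType):
            job = queue.nextJobOfType(jobType)
            print(jobType.preemptable, jobType.cores, job.jobID)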
class MesosBatchSystem(BatchSystemSupport,
                       AbstractScalableBatchSystem,
                       mesos.interface.Scheduler):
    """
    A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
    tasks over a cluster of slave nodes. A Mesos framework consists of a scheduler and an
    executor. This class acts as the scheduler and is typically run on the master node that also
    runs the Mesos master process with which the scheduler communicates via a driver component.
    The executor is implemented in a separate class. It is run on each slave node and
    communicates with the Mesos slave process via another driver object. The scheduler may also
    be run on a separate node from the master, which we then call somewhat ambiguously the
    driver node.
    """

    @classmethod
    def supportsHotDeployment(cls):
        return True

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    class ExecutorInfo(object):
        def __init__(self, nodeAddress, slaveId, nodeInfo, lastSeen):
            super(MesosBatchSystem.ExecutorInfo, self).__init__()
            self.nodeAddress = nodeAddress
            self.slaveId = slaveId
            self.nodeInfo = nodeInfo
            self.lastSeen = lastSeen

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)

        # The hot-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues to which Toil assigns jobs. Each queue represents a job type,
        # defined by resource usage.
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesosMasterAddress = config.mesosMasterAddress

        # Written to when Mesos kills tasks, as directed by Toil
        self.killedJobIds = set()

        # The IDs of jobs to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs was called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids. This is somewhat redundant since Mesos returns the
        # number of workers per node. However, that information isn't guaranteed to reach the
        # leader, so we also track the state here. When the information is returned from Mesos,
        # prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # See self.nodeFiltering
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an
        # approximation of the truth. Recently launched nodes may be absent from this set for a
        # while and a node's absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since non-preemptability is a stronger
        # requirement. If we tracked the set of preemptable nodes instead, we'd have to use
        # absence as an indicator of non-preemptability and could therefore be misled into
        # believing that a recently launched preemptable node was non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        self.unusedJobID = itertools.count()
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs.
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self.ignoredNodes = set()

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def ignoreNode(self, nodeAddress):
        self.ignoredNodes.add(nodeAddress)

    def unignoreNode(self, nodeAddress):
        self.ignoredNodes.remove(nodeAddress)

    def issueBatchJob(self, jobNode):
        """
        Issues the given command, returning a unique jobID. The command is the string to run;
        memory is an int giving the number of bytes the job needs to run in; cores is the
        number of CPUs needed for the job; and error-file is the path of the file to place any
        stderr/stdout in.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)
        jobID = next(self.unusedJobID)
        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=ResourceRequirement(**jobNode._requirements),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=self.environment.copy(),
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
        # TODO: round all elements of resources
        self.jobQueues.insertJob(job, jobType)
        self.taskResources[jobID] = job.resources
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):
        # FIXME: probably still racy
        assert self.driver is not None
        localSet = set()
        for jobID in jobIDs:
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            self.intendedKill.add(jobID)
            # FIXME: a bit too expensive for my taste
            if jobID in self.getIssuedBatchJobIDs():
                taskId = mesos_pb2.TaskID()
                taskId.value = str(jobID)
                self.driver.killTask(taskId)
            else:
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        while localSet:
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                self.killedJobIds -= intersection
            else:
                time.sleep(1)

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds)

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            jobId, exitValue, wallTime = item
            try:
                self.intendedKill.remove(jobId)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', jobId, exitValue,
                          '???' if wallTime is None else str(wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', jobId)

    def nodeInUse(self, nodeIP):
        return nodeIP in self.hostToJobIDs

    @contextmanager
    def nodeFiltering(self, filter):
        self.nodeFilter = [filter]
        yield
        self.nodeFilter = []

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return self.reconciliationPeriod

    @classmethod
    def getRescueBatchJobFrequency(cls):
        return 30 * 60  # Half an hour

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo instance representing our executor implementation.
        """
        # The executor program is installed as a setuptools entry point by setup.py
        info = mesos_pb2.ExecutorInfo()
        info.name = "toil"
        info.command.value = resolveEntryPoint('_toil_mesos_executor')
        info.executor_id.value = "toil-%i" % os.getpid()
        info.source = pwd.getpwuid(os.getuid()).pw_name
        return info

    def _startDriver(self):
        """
        Starts the Mesos driver thread which handles the scheduler's communication with the
        Mesos master.
        """
        framework = mesos_pb2.FrameworkInfo()
        framework.user = ""  # Have Mesos fill in the current user.
        framework.name = "toil"
        framework.principal = framework.name
        self.driver = mesos.native.MesosSchedulerDriver(self,
                                                        framework,
                                                        self._resolveAddress(self.mesosMasterAddress),
                                                        True)  # enable implicit acknowledgements
        assert self.driver.start() == mesos_pb2.DRIVER_RUNNING

    @staticmethod
    def _resolveAddress(address):
        """
        Resolves the host in the given string. The input is of the form host[:port]. This method
        is idempotent, i.e. the host may already be a dotted IP address.

        >>> # noinspection PyProtectedMember
        >>> f=MesosBatchSystem._resolveAddress
        >>> f('localhost')
        '127.0.0.1'
        >>> f('127.0.0.1')
        '127.0.0.1'
        >>> f('localhost:123')
        '127.0.0.1:123'
        >>> f('127.0.0.1:123')
        '127.0.0.1:123'
        """
        address = address.split(':')
        assert len(address) in (1, 2)
        address[0] = socket.gethostbyname(address[0])
        return ':'.join(address)

    def shutdown(self):
        log.debug("Stopping Mesos driver")
        self.driver.stop()
        log.debug("Joining Mesos driver")
        driver_result = self.driver.join()
        log.debug("Joined Mesos driver")
        if driver_result != mesos_pb2.DRIVER_STOPPED:
            raise RuntimeError("Mesos driver failed with %i" % driver_result)

    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master.
        """
        log.debug("Registered with framework ID %s", frameworkId.value)

    def _declineAllOffers(self, driver, offers):
        for offer in offers:
            log.debug("Declining offer %s.", offer.id.value)
            driver.declineOffer(offer.id)

    def _parseOffer(self, offer):
        cores = 0
        memory = 0
        disk = 0
        preemptable = None
        for attribute in offer.attributes:
            if attribute.name == 'preemptable':
                assert preemptable is None, "Attribute 'preemptable' occurs more than once."
                preemptable = strict_bool(attribute.text.value)
        if preemptable is None:
            log.debug('Slave not marked as either preemptable or not. Assuming non-preemptable.')
            preemptable = False
        for resource in offer.resources:
            if resource.name == "cpus":
                cores += resource.scalar.value
            elif resource.name == "mem":
                memory += resource.scalar.value
            elif resource.name == "disk":
                disk += resource.scalar.value
        return cores, memory, disk, preemptable

    def _prepareToRun(self, jobType, offer):
        # Get the first element to ensure FIFO
        job = self.jobQueues.nextJobOfType(jobType)
        task = self._newMesosTask(job, offer)
        return task

    def _updateStateToRunning(self, offer, runnableTasks):
        for task in runnableTasks:
            resourceKey = int(task.task_id.value)
            resources = self.taskResources[resourceKey]
            slaveIP = socket.gethostbyname(offer.hostname)
            try:
                self.hostToJobIDs[slaveIP].append(resourceKey)
            except KeyError:
                self.hostToJobIDs[slaveIP] = [resourceKey]

            self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
                                                                   slaveID=offer.slave_id.value,
                                                                   slaveIP=slaveIP,
                                                                   executorID=task.executor.executor_id.value,
                                                                   cores=resources.cores,
                                                                   memory=resources.memory)
            del self.taskResources[resourceKey]
            log.debug('Launched Mesos task %s.', task.task_id.value)

    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework.
        """
        self._trackOfferedNodes(offers)

        jobTypes = self.jobQueues.sorted()

        # TODO: We may want to assert that numIssued >= numRunning
        if not jobTypes or len(self.getIssuedBatchJobIDs()) == len(self.getRunningBatchJobIDs()):
            log.debug('There are no queued tasks. Declining Mesos offers.')
            # Without jobs, we can get stuck with no jobs and no new offers until we decline it.
            self._declineAllOffers(driver, offers)
            return

        unableToRun = True
        # Right now, gives priority to largest jobs
        for offer in offers:
            if offer.hostname in self.ignoredNodes:
                log.debug("Declining offer %s because node %s is designated for termination" %
                          (offer.id.value, offer.hostname))
                driver.declineOffer(offer.id)
                continue
            runnableTasks = []
            # TODO: In an offer, can there ever be more than one resource with the same name?
            offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer)
            log.debug('Got offer %s for a %spreemptable slave with %.2f MiB memory, %.2f core(s) '
                      'and %.2f MiB of disk.', offer.id.value,
                      '' if offerPreemptable else 'non-',
                      offerMemory, offerCores, offerDisk)
            remainingCores = offerCores
            remainingMemory = offerMemory
            remainingDisk = offerDisk

            for jobType in jobTypes:
                runnableTasksOfType = []
                # Because we are not removing from the list until outside of the while loop, we
                # must decrement the number of jobs left to run ourselves to avoid an infinite
                # loop.
                nextToLaunchIndex = 0
                # Toil specifies disk and memory in bytes but Mesos uses MiB
                while (not self.jobQueues.typeEmpty(jobType)
                       # On a non-preemptable node we can run any job, on a preemptable node we
                       # can only run preemptable jobs:
                       and (not offerPreemptable or jobType.preemptable)
                       and remainingCores >= jobType.cores
                       and remainingDisk >= toMiB(jobType.disk)
                       and remainingMemory >= toMiB(jobType.memory)):
                    task = self._prepareToRun(jobType, offer)
                    # TODO: this used to be a conditional but Hannes wanted it changed to an assert
                    # TODO: ... so we can understand why it exists.
                    assert int(task.task_id.value) not in self.runningJobMap
                    runnableTasksOfType.append(task)
                    log.debug("Preparing to launch Mesos task %s using offer %s ...",
                              task.task_id.value, offer.id.value)
                    remainingCores -= jobType.cores
                    remainingMemory -= toMiB(jobType.memory)
                    remainingDisk -= toMiB(jobType.disk)
                    nextToLaunchIndex += 1
                else:
                    log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
                              '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
                              'and %(disk)s of disk on a %(non)spreemptable slave.',
                              dict(offer=offer.id.value,
                                   requirements=jobType.__dict__,
                                   non='' if offerPreemptable else 'non-',
                                   memory=fromMiB(offerMemory),
                                   cores=offerCores,
                                   disk=fromMiB(offerDisk)))
                runnableTasks.extend(runnableTasksOfType)
            # Launch all runnable tasks together so we only call launchTasks once per offer
            if runnableTasks:
                unableToRun = False
                driver.launchTasks(offer.id, runnableTasks)
                self._updateStateToRunning(offer, runnableTasks)
            else:
                log.debug('Although there are queued jobs, none of them could be run with offer '
                          '%s extended to the framework.', offer.id)
                driver.declineOffer(offer.id)

        if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
            self.lastTimeOfferLogged = time.time()
            log.debug('Although there are queued jobs, none of them were able to run in '
                      'any of the offers extended to the framework. There are currently '
                      '%i jobs running. Enable debug level logging to see more details about '
                      'job types and offers received.', len(self.runningJobMap))

    def _trackOfferedNodes(self, offers):
        for offer in offers:
            nodeAddress = socket.gethostbyname(offer.hostname)
            self._registerNode(nodeAddress, offer.slave_id.value)
            preemptable = False
            for attribute in offer.attributes:
                if attribute.name == 'preemptable':
                    preemptable = strict_bool(attribute.text.value)
            if preemptable:
                try:
                    self.nonPreemptableNodes.remove(offer.slave_id.value)
                except KeyError:
                    pass
            else:
                self.nonPreemptableNodes.add(offer.slave_id.value)

    def _filterOfferedNodes(self, offers):
        if not self.nodeFilter:
            return offers
        executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname))
                              for offer in offers]
        executorInfos = [_f for _f in executorInfoOrNone if _f]
        executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
        ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
        return [offer for offer in offers
                if socket.gethostbyname(offer.hostname) in ipsToConsider]

    def _newMesosTask(self, job, offer):
        """
        Build the Mesos task object for a given Toil job and Mesos offer.
        """
        task = mesos_pb2.TaskInfo()
        task.task_id.value = str(job.jobID)
        task.slave_id.value = offer.slave_id.value
        task.name = job.name
        task.data = pickle.dumps(job)
        task.executor.MergeFrom(self.executor)

        cpus = task.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = job.resources.cores

        disk = task.resources.add()
        disk.name = "disk"
        disk.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.disk) > 1:
            disk.scalar.value = toMiB(job.resources.disk)
        else:
            log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.disk)
            disk.scalar.value = 1

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        if toMiB(job.resources.memory) > 1:
            mem.scalar.value = toMiB(job.resources.memory)
        else:
            log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.memory)
            mem.scalar.value = 1
        return task

    def statusUpdate(self, driver, update):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost and so the task is
        lost, a task finishes and an executor sends a status update saying so, etc). Note that
        returning from this callback _acknowledges_ receipt of this status update! If for
        whatever reason the scheduler aborts during this callback (or the process exits) another
        status update will be delivered (note, however, that this is currently not true if the
        slave sending the status update is lost/fails during that time).
        """
        jobID = int(update.task_id.value)
        stateName = mesos_pb2.TaskState.Name(update.state)
        log.debug("Job %i is in state '%s'.", jobID, stateName)

        def jobEnded(_exitStatus, wallTime=None):
            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                self.killedJobIds.add(jobID)
            self.updatedJobsQueue.put((jobID, _exitStatus, wallTime))
            slaveIP = None
            try:
                slaveIP = self.runningJobMap[jobID].slaveIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                del self.runningJobMap[jobID]
            try:
                self.hostToJobIDs[slaveIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

        if update.state == mesos_pb2.TASK_FINISHED:
            jobEnded(0, wallTime=unpack('d', update.data)[0])
        elif update.state == mesos_pb2.TASK_FAILED:
            try:
                exitStatus = int(update.message)
            except ValueError:
                exitStatus = 255
                log.warning("Job %i failed with message '%s'", jobID, update.message)
            else:
                log.warning('Job %i failed with exit status %i', jobID, exitStatus)
            jobEnded(exitStatus)
        elif update.state in (mesos_pb2.TASK_LOST, mesos_pb2.TASK_KILLED, mesos_pb2.TASK_ERROR):
            log.warning("Job %i is in unexpected state %s with message '%s'.",
                        jobID, stateName, update.message)
            jobEnded(255)

    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
        Invoked when an executor sends a message.
        """
        log.debug('Got framework message from executor %s running on slave %s: %s',
                  executorId.value, slaveId.value, message)
        message = ast.literal_eval(message)
        assert isinstance(message, dict)
        # Handle the mandatory fields of a message
        nodeAddress = message.pop('address')
        executor = self._registerNode(nodeAddress, slaveId.value)
        # Handle optional message fields
        for k, v in iteritems(message):
            if k == 'nodeInfo':
                assert isinstance(v, dict)
                resources = [taskData for taskData in itervalues(self.runningJobMap)
                             if taskData.executorID == executorId.value]
                requestedCores = sum(taskData.cores for taskData in resources)
                requestedMemory = sum(taskData.memory for taskData in resources)
                executor.nodeInfo = NodeInfo(requestedCores=requestedCores,
                                             requestedMemory=requestedMemory,
                                             **v)
                self.executors[nodeAddress] = executor
            else:
                raise RuntimeError("Unknown message field '%s'." % k)

    def _registerNode(self, nodeAddress, slaveId):
        executor = self.executors.get(nodeAddress)
        if executor is None or executor.slaveId != slaveId:
            executor = self.ExecutorInfo(nodeAddress=nodeAddress,
                                         slaveId=slaveId,
                                         nodeInfo=None,
                                         lastSeen=time.time())
            self.executors[nodeAddress] = executor
        else:
            executor.lastSeen = time.time()
        return executor

    def getNodes(self, preemptable=None, timeout=600):
        timeout = timeout or sys.maxsize
        return {nodeAddress: executor.nodeInfo
                for nodeAddress, executor in iteritems(self.executors)
                if time.time() - executor.lastSeen < timeout
                and (preemptable is None
                     or preemptable == (executor.slaveId not in self.nonPreemptableNodes))}

    def reregistered(self, driver, masterInfo):
        """
        Invoked when the scheduler re-registers with a newly elected Mesos master.
        """
        log.debug('Registered with new master')

    def executorLost(self, driver, executorId, slaveId, status):
        """
        Invoked when an executor has exited/terminated.
        """
        log.warning("Executor '%s' lost.", executorId)

    @classmethod
    def setOptions(cls, setOption):
        setOption("mesosMasterAddress", None, None, 'localhost:5050')
class MesosBatchSystem(BatchSystemLocalSupport,
                       AbstractScalableBatchSystem,
                       Scheduler):
    """
    A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
    tasks over a cluster of agent nodes. A Mesos framework consists of a scheduler and an
    executor. This class acts as the scheduler and is typically run on the master node that also
    runs the Mesos master process with which the scheduler communicates via a driver component.
    The executor is implemented in a separate class. It is run on each agent node and
    communicates with the Mesos agent process via another driver object. The scheduler may also
    be run on a separate node from the master, which we then call somewhat ambiguously the
    driver node.
    """

    @classmethod
    def supportsAutoDeployment(cls):
        return True

    @classmethod
    def supportsWorkerCleanup(cls):
        return True

    class ExecutorInfo:
        def __init__(self, nodeAddress, agentId, nodeInfo, lastSeen):
            super(MesosBatchSystem.ExecutorInfo, self).__init__()
            self.nodeAddress = nodeAddress
            self.agentId = agentId
            self.nodeInfo = nodeInfo
            self.lastSeen = lastSeen

    def __init__(self, config, maxCores, maxMemory, maxDisk):
        super().__init__(config, maxCores, maxMemory, maxDisk)

        # The auto-deployed resource representing the user script. Will be passed along in every
        # Mesos task. Also see setUserScript().
        self.userScript = None
        """
        :type: toil.resource.Resource
        """

        # Dictionary of queues to which Toil assigns jobs. Each queue represents a job type,
        # defined by resource usage.
        self.jobQueues = JobQueue()

        # Address of the Mesos master in the form host:port where host can be an IP or a hostname
        self.mesos_endpoint = config.mesos_endpoint

        # Written to when Mesos kills tasks, as directed by Toil.
        # Jobs must not enter this set until they are removed from runningJobMap.
        self.killedJobIds = set()

        # The IDs of jobs to be killed
        self.killJobIds = set()

        # Contains jobs on which killBatchJobs was called, regardless of whether or not they
        # actually were killed or ended by themselves
        self.intendedKill = set()

        # Map of host address to job ids. This is somewhat redundant since Mesos returns the
        # number of workers per node. However, that information isn't guaranteed to reach the
        # leader, so we also track the state here. When the information is returned from Mesos,
        # prefer that information over this attempt at state tracking.
        self.hostToJobIDs = {}

        # See self.nodeFilter
        self.nodeFilter = []

        # Dict of launched jobIDs to TaskData objects
        self.runningJobMap = {}

        # Mesos has no easy way of getting a task's resources so we track them here
        self.taskResources = {}

        # Queue of jobs whose status has been updated, according to Mesos
        self.updatedJobsQueue = Queue()

        # The Mesos driver used by this scheduler
        self.driver = None

        # The string framework ID that we are assigned when registering with the Mesos master
        self.frameworkId = None

        # A dictionary mapping a node's IP to an ExecutorInfo object describing important
        # properties of our executor running on that node. Only an approximation of the truth.
        self.executors = {}

        # A dictionary mapping back from agent ID to the last observed IP address of its node.
        self.agentsByID = {}

        # A set of Mesos agent IDs, one for each agent running on a
        # non-preemptable node. Only an approximation of the truth. Recently
        # launched nodes may be absent from this set for a while and a node's
        # absence from this set does not imply its preemptability. But it is
        # generally safer to assume a node is preemptable since
        # non-preemptability is a stronger requirement. If we tracked the set
        # of preemptable nodes instead, we'd have to use absence as an
        # indicator of non-preemptability and could therefore be misled into
        # believing that a recently launched preemptable node was
        # non-preemptable.
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs.
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self.ignoredNodes = set()

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def ignoreNode(self, nodeAddress):
        self.ignoredNodes.add(nodeAddress)

    def unignoreNode(self, nodeAddress):
        self.ignoredNodes.remove(nodeAddress)

    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
        """
        Issues the given command, returning a unique jobID. The command is the string to run;
        memory is an int giving the number of bytes the job needs to run in; cores is the
        number of CPUs needed for the job; and error-file is the path of the file to place any
        stderr/stdout in.
        """
        localID = self.handleLocalJob(jobNode)
        if localID:
            return localID

        mesos_resources = {
            "memory": jobNode.memory,
            "cores": jobNode.cores,
            "disk": jobNode.disk,
            "preemptable": jobNode.preemptable
        }
        self.checkResourceRequest(
            memory=mesos_resources["memory"],
            cores=mesos_resources["cores"],
            disk=mesos_resources["disk"]
        )

        jobID = self.getNextJobID()
        environment = self.environment.copy()
        if job_environment:
            environment.update(job_environment)

        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=MesosShape(wallTime=0, **mesos_resources),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=environment,
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))

        # TODO: round all elements of resources
        self.taskResources[jobID] = job.resources
        self.jobQueues.insertJob(job, jobType)
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):
        # Some jobs may be local. Kill them first.
        self.killLocalJobs(jobIDs)

        # The driver thread does the actual work of killing the remote jobs.
        # We have to give it instructions, and block until the jobs are killed.
        assert self.driver is not None

        # This is the set of jobs that this invocation has asked to be killed,
        # but which haven't been killed yet.
        localSet = set()

        for jobID in jobIDs:
            # Queue the job up to be killed
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            # Record that we meant to kill it, in case it finishes up by itself.
            self.intendedKill.add(jobID)

            if jobID in self.getIssuedBatchJobIDs():
                # Since the job has been issued, we have to kill it
                taskId = addict.Dict()
                taskId.value = str(jobID)
                log.debug("Kill issued job %s" % str(jobID))
                self.driver.killTask(taskId)
            else:
                # This job was never issued. Maybe it is a local job.
                # We don't have to kill it.
                log.debug("Skip non-issued job %s" % str(jobID))
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        # Now localSet just has the non-local/issued jobs that we asked to kill
        while localSet:
            # Wait until they are all dead
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                # When jobs are killed that we asked for, clear them out of
                # killedJobIds where the other thread put them
                self.killedJobIds -= intersection
            else:
                time.sleep(1)
        # Now all the jobs we asked to kill are dead. We know they are no
        # longer running, because that update happens before their IDs go into
        # killedJobIds. So we can safely return.

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds) + list(self.getIssuedLocalJobIDs())

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        currentTime.update(self.getRunningLocalJobIDs())
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        local_tuple = self.getUpdatedLocalJob(0)
        if local_tuple:
            return local_tuple
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            try:
                self.intendedKill.remove(item.jobID)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', item.jobID,
                          item.exitStatus,
                          '???' if item.wallTime is None else str(item.wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', item.jobID)

    def nodeInUse(self, nodeIP: str) -> bool:
        return nodeIP in self.hostToJobIDs

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return 1

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo-shaped object representing our executor
        implementation.
        """
        # The executor program is installed as a setuptools entry point by setup.py
        info = addict.Dict()
        info.name = "toil"
        info.command.value = resolveEntryPoint('_toil_mesos_executor')
        info.executor_id.value = "toil-%i" % os.getpid()
        info.source = pwd.getpwuid(os.getuid()).pw_name
        return info

    def _startDriver(self):
        """
        Starts the Mesos driver thread which handles the scheduler's communication with the
        Mesos master.
        """
        framework = addict.Dict()
        framework.user = getpass.getuser()  # We must determine the user name ourselves with pymesos
        framework.name = "toil"
        framework.principal = framework.name
        # Make the driver which implements most of the scheduler logic and calls back to us for
        # the user-defined parts. Make sure it will call us with nice namespace-y addicts.
        self.driver = MesosSchedulerDriver(self,
                                           framework,
                                           self._resolveAddress(self.mesos_endpoint),
                                           use_addict=True,
                                           implicit_acknowledgements=True)
        self.driver.start()

    @staticmethod
    def _resolveAddress(address):
        """
        Resolves the host in the given string. The input is of the form host[:port]. This method
        is idempotent, i.e. the host may already be a dotted IP address.

        >>> # noinspection PyProtectedMember
        >>> f=MesosBatchSystem._resolveAddress
        >>> f('localhost')
        '127.0.0.1'
        >>> f('127.0.0.1')
        '127.0.0.1'
        >>> f('localhost:123')
        '127.0.0.1:123'
        >>> f('127.0.0.1:123')
        '127.0.0.1:123'
        """
        address = address.split(':')
        assert len(address) in (1, 2)
        address[0] = socket.gethostbyname(address[0])
        return ':'.join(address)

    def shutdown(self) -> None:
        self.shutdownLocal()
        log.debug("Stopping Mesos driver")
        self.driver.stop()
        log.debug("Joining Mesos driver")
        driver_result = self.driver.join()
        log.debug("Joined Mesos driver")
        if driver_result is not None and driver_result != 'DRIVER_STOPPED':
            # TODO: The docs say join should return a code, but it keeps returning
            # None when apparently successful. So tolerate that here too.
            raise RuntimeError("Mesos driver failed with %s" % driver_result)

    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master.
        """
        log.debug("Registered with framework ID %s", frameworkId.value)
        # Save the framework ID
        self.frameworkId = frameworkId.value

    def _declineAllOffers(self, driver, offers):
        for offer in offers:
            driver.declineOffer(offer.id)

    def _parseOffer(self, offer):
        cores = 0
        memory = 0
        disk = 0
        preemptable = None
        for attribute in offer.attributes:
            if attribute.name == 'preemptable':
                assert preemptable is None, "Attribute 'preemptable' occurs more than once."
                preemptable = strict_bool(attribute.text.value)
        if preemptable is None:
            log.debug('Agent not marked as either preemptable or not. Assuming non-preemptable.')
            preemptable = False
        for resource in offer.resources:
            if resource.name == "cpus":
                cores += resource.scalar.value
            elif resource.name == "mem":
                memory += resource.scalar.value
            elif resource.name == "disk":
                disk += resource.scalar.value
        return cores, memory, disk, preemptable

    def _prepareToRun(self, jobType, offer):
        # Get the first element to ensure FIFO
        job = self.jobQueues.nextJobOfType(jobType)
        task = self._newMesosTask(job, offer)
        return task

    def _updateStateToRunning(self, offer, runnableTasks):
        for task in runnableTasks:
            resourceKey = int(task.task_id.value)
            resources = self.taskResources[resourceKey]
            agentIP = socket.gethostbyname(offer.hostname)
            try:
                self.hostToJobIDs[agentIP].append(resourceKey)
            except KeyError:
                self.hostToJobIDs[agentIP] = [resourceKey]

            self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
                                                                   agentID=offer.agent_id.value,
                                                                   agentIP=agentIP,
                                                                   executorID=task.executor.executor_id.value,
                                                                   cores=resources.cores,
                                                                   memory=resources.memory)
            del self.taskResources[resourceKey]
            log.debug('Launched Mesos task %s.', task.task_id.value)

    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework.
        """
        self._trackOfferedNodes(offers)

        jobTypes = self.jobQueues.sortedTypes

        if not jobTypes:
            # Without jobs, we can get stuck with no jobs and no new offers until we decline it.
            self._declineAllOffers(driver, offers)
            return

        unableToRun = True
        # Right now, gives priority to largest jobs
        for offer in offers:
            if offer.hostname in self.ignoredNodes:
                driver.declineOffer(offer.id)
                continue
            runnableTasks = []
            # TODO: In an offer, can there ever be more than one resource with the same name?
            offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer)
            log.debug('Got offer %s for a %spreemptable agent with %.2f MiB memory, %.2f core(s) '
                      'and %.2f MiB of disk.', offer.id.value,
                      '' if offerPreemptable else 'non-',
                      offerMemory, offerCores, offerDisk)
            remainingCores = offerCores
            remainingMemory = offerMemory
            remainingDisk = offerDisk

            for jobType in jobTypes:
                runnableTasksOfType = []
                # Because we are not removing from the list until outside of the while loop, we
                # must decrement the number of jobs left to run ourselves to avoid an infinite
                # loop.
                nextToLaunchIndex = 0
                # Toil specifies disk and memory in bytes but Mesos uses MiB
                while (not self.jobQueues.typeEmpty(jobType)
                       # On a non-preemptable node we can run any job, on a preemptable node we
                       # can only run preemptable jobs:
                       and (not offerPreemptable or jobType.preemptable)
                       and remainingCores >= jobType.cores
                       and remainingDisk >= b_to_mib(jobType.disk)
                       and remainingMemory >= b_to_mib(jobType.memory)):
                    task = self._prepareToRun(jobType, offer)
                    # TODO: this used to be a conditional but Hannes wanted it changed to an assert
                    # TODO: ... so we can understand why it exists.
                    assert int(task.task_id.value) not in self.runningJobMap
                    runnableTasksOfType.append(task)
                    log.debug("Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, "
                              "and %.2f MiB disk using offer %s ...", task.task_id.value,
                              jobType.cores, b_to_mib(jobType.memory), b_to_mib(jobType.disk),
                              offer.id.value)
                    remainingCores -= jobType.cores
                    remainingMemory -= b_to_mib(jobType.memory)
                    remainingDisk -= b_to_mib(jobType.disk)
                    nextToLaunchIndex += 1
                if not self.jobQueues.typeEmpty(jobType):
                    # Report that the remaining jobs cannot be run with the current resources
                    log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
                              '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
                              'and %(disk)s of disk on a %(non)spreemptable agent.',
                              dict(offer=offer.id.value,
                                   requirements=jobType.__dict__,
                                   non='' if offerPreemptable else 'non-',
                                   memory=mib_to_b(offerMemory),
                                   cores=offerCores,
                                   disk=mib_to_b(offerDisk)))
                runnableTasks.extend(runnableTasksOfType)
            # Launch all runnable tasks together so we only call launchTasks once per offer
            if runnableTasks:
                unableToRun = False
                driver.launchTasks(offer.id, runnableTasks)
                self._updateStateToRunning(offer, runnableTasks)
            else:
                log.debug('Although there are queued jobs, none of them could be run with offer '
                          '%s extended to the framework.', offer.id)
                driver.declineOffer(offer.id)

        if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
            self.lastTimeOfferLogged = time.time()
            log.debug('Although there are queued jobs, none of them were able to run in '
                      'any of the offers extended to the framework. There are currently '
                      '%i jobs running. Enable debug level logging to see more details about '
                      'job types and offers received.', len(self.runningJobMap))

    def _trackOfferedNodes(self, offers):
        for offer in offers:
            # All AgentID messages are required to have a value according to the Mesos Protobuf file.
            assert 'value' in offer.agent_id
            try:
                nodeAddress = socket.gethostbyname(offer.hostname)
            except:
                log.debug("Failed to resolve hostname %s" % offer.hostname)
                raise
            self._registerNode(nodeAddress, offer.agent_id.value)
            preemptable = False
            for attribute in offer.attributes:
                if attribute.name == 'preemptable':
                    preemptable = strict_bool(attribute.text.value)
            if preemptable:
                try:
                    self.nonPreemptableNodes.remove(offer.agent_id.value)
                except KeyError:
                    pass
            else:
                self.nonPreemptableNodes.add(offer.agent_id.value)

    def _filterOfferedNodes(self, offers):
        if not self.nodeFilter:
            return offers
        executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname))
                              for offer in offers]
        executorInfos = [_f for _f in executorInfoOrNone if _f]
        executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
        ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
        return [offer for offer in offers
                if socket.gethostbyname(offer.hostname) in ipsToConsider]

    def _newMesosTask(self, job, offer):
        """
        Build the Mesos task object for a given Toil job and Mesos offer.
        """
        task = addict.Dict()
        task.task_id.value = str(job.jobID)
        task.agent_id.value = offer.agent_id.value
        task.name = job.name
        task.data = encode_data(pickle.dumps(job))
        task.executor = addict.Dict(self.executor)

        task.resources = []

        task.resources.append(addict.Dict())
        cpus = task.resources[-1]
        cpus.name = 'cpus'
        cpus.type = 'SCALAR'
        cpus.scalar.value = job.resources.cores

        task.resources.append(addict.Dict())
        disk = task.resources[-1]
        disk.name = 'disk'
        disk.type = 'SCALAR'
        if b_to_mib(job.resources.disk) > 1:
            disk.scalar.value = b_to_mib(job.resources.disk)
        else:
            log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.disk)
            disk.scalar.value = 1

        task.resources.append(addict.Dict())
        mem = task.resources[-1]
        mem.name = 'mem'
        mem.type = 'SCALAR'
        if b_to_mib(job.resources.memory) > 1:
            mem.scalar.value = b_to_mib(job.resources.memory)
        else:
            log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
                        job.jobID, job.resources.memory)
            mem.scalar.value = 1
        return task

    def statusUpdate(self, driver, update):
        """
        Invoked when the status of a task has changed (e.g., an agent is lost and so the task is
        lost, a task finishes and an executor sends a status update saying so, etc). Note that
        returning from this callback _acknowledges_ receipt of this status update! If for
        whatever reason the scheduler aborts during this callback (or the process exits) another
        status update will be delivered (note, however, that this is currently not true if the
        agent sending the status update is lost/fails during that time).
        """
        jobID = int(update.task_id.value)
        log.debug("Job %i is in state '%s' due to reason '%s'.", jobID, update.state, update.reason)

        def jobEnded(_exitStatus, wallTime=None, exitReason=None):
            """
            Notify external observers of the job ending.
            """
            self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=_exitStatus,
                                                          wallTime=wallTime, exitReason=exitReason))
            agentIP = None
            try:
                agentIP = self.runningJobMap[jobID].agentIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                # Mark the job as no longer running. We MUST do this BEFORE
                # saying we killed the job, or it will be possible for another
                # thread to kill a job and then see it as running.
                del self.runningJobMap[jobID]

            try:
                self.hostToJobIDs[agentIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                # We were asked to kill this job, so say that we have done so.
                # We do this LAST, after all status updates for the job have
                # been handled, to ensure a consistent view of the scheduler
                # state from other threads.
                self.killedJobIds.add(jobID)

        if update.state == 'TASK_FINISHED':
            # We get the running time of the job via the timestamp, which is in job-local time
            # in seconds
            labels = update.labels.labels
            wallTime = None
            for label in labels:
                if label['key'] == 'wallTime':
                    wallTime = float(label['value'])
                    break
            assert wallTime is not None
            jobEnded(0, wallTime=wallTime, exitReason=BatchJobExitReason.FINISHED)
        elif update.state == 'TASK_FAILED':
            try:
                exitStatus = int(update.message)
            except ValueError:
                exitStatus = EXIT_STATUS_UNAVAILABLE_VALUE
                log.warning("Job %i failed with message '%s' due to reason '%s' on executor '%s' "
                            "on agent '%s'.", jobID, update.message, update.reason,
                            update.executor_id, update.agent_id)
            else:
                log.warning("Job %i failed with exit status %i and message '%s' due to reason "
                            "'%s' on executor '%s' on agent '%s'.", jobID, exitStatus,
                            update.message, update.reason, update.executor_id, update.agent_id)
            jobEnded(exitStatus, exitReason=BatchJobExitReason.FAILED)
        elif update.state == 'TASK_LOST':
            log.warning("Job %i is lost.", jobID)
            jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, exitReason=BatchJobExitReason.LOST)
        elif update.state in ('TASK_KILLED', 'TASK_ERROR'):
            log.warning("Job %i is in unexpected state %s with message '%s' due to reason '%s'.",
                        jobID, update.state, update.message, update.reason)
            jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE,
                     exitReason=(BatchJobExitReason.KILLED if update.state == 'TASK_KILLED'
                                 else BatchJobExitReason.ERROR))

        if 'limitation' in update:
            log.warning("Job limit info: %s" % update.limitation)

    def frameworkMessage(self, driver, executorId, agentId, message):
        """
        Invoked when an executor sends a message.
        """
        # Take it out of base 64 encoding from Protobuf
        message = decode_data(message).decode()

        log.debug('Got framework message from executor %s running on agent %s: %s',
                  executorId.value, agentId.value, message)
        message = ast.literal_eval(message)
        assert isinstance(message, dict)
        # Handle the mandatory fields of a message
        nodeAddress = message.pop('address')
        executor = self._registerNode(nodeAddress, agentId.value)
        # Handle optional message fields
        for k, v in message.items():
            if k == 'nodeInfo':
                assert isinstance(v, dict)
                resources = [taskData for taskData in self.runningJobMap.values()
                             if taskData.executorID == executorId.value]
                requestedCores = sum(taskData.cores for taskData in resources)
                requestedMemory = sum(taskData.memory for taskData in resources)
                executor.nodeInfo = NodeInfo(requestedCores=requestedCores,
                                             requestedMemory=requestedMemory,
                                             **v)
                self.executors[nodeAddress] = executor
            else:
                raise RuntimeError("Unknown message field '%s'." % k)

    def _registerNode(self, nodeAddress, agentId, nodePort=5051):
        """
        Called when we get communication from an agent. Remembers the information about the
        agent by address, and the agent address by agent ID.
        """
        executor = self.executors.get(nodeAddress)
        if executor is None or executor.agentId != agentId:
            executor = self.ExecutorInfo(nodeAddress=nodeAddress,
                                         agentId=agentId,
                                         nodeInfo=None,
                                         lastSeen=time.time())
            self.executors[nodeAddress] = executor
        else:
            executor.lastSeen = time.time()

        # Record the IP under the agent id
        self.agentsByID[agentId] = nodeAddress

        return executor

    def getNodes(self, preemptable: Optional[bool] = None, timeout: Optional[int] = None) -> Dict[str, NodeInfo]:
        """
        Return all nodes that match:
         - preemptable status (None includes all)
         - timeout period (seen within the last # seconds, or None for all)
        """
        nodes = dict()
        for node_ip, executor in self.executors.items():
            if preemptable is None or (preemptable == (executor.agentId not in self.nonPreemptableNodes)):
                if timeout is None or (time.time() - executor.lastSeen < timeout):
                    nodes[node_ip] = executor.nodeInfo
        return nodes

    def reregistered(self, driver, masterInfo):
        """
        Invoked when the scheduler re-registers with a newly elected Mesos master.
        """
        log.debug('Registered with new master')

    def _handleFailedExecutor(self, agentID, executorID=None):
        """
        Should be called when we find out an executor has failed.

        Gets the log from some container (since we are never handed a container ID) that ran on
        the given executor on the given agent, if the agent is still up, and dumps it to our
        log. All IDs are strings.

        If executorID is None, dumps all executors from the agent.

        Useful for debugging failing executor code.
        """
        log.warning("Handling failure of executor '%s' on agent '%s'.", executorID, agentID)

        try:
            # Look up the IP. We should always know it unless we get answers
            # back without having accepted offers.
            agentAddress = self.agentsByID[agentID]

            # For now we assume the agent is always on the same port. We could
            # maybe sniff this from the URL that comes in the offer but it's
            # not guaranteed to be there.
            agentPort = 5051

            # We need the container ID to read the log, but we are never given
            # it, and I can't find a good way to list it, because the API only
            # seems to report running containers. So we dump all the available
            # files with /files/debug and look for one that looks right.
            filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % \
                (agentAddress, agentPort)

            # Download all the root mount points, which are in an object from
            # mounted name to real name
            filesDict = json.loads(urlopen(filesQueryURL).read())

            log.debug('Available files: %s', repr(filesDict.keys()))

            # Generate filenames for each container pointing to where stderr should be
            stderrFilenames = []
            # And look for the actual agent logs.
            agentLogFilenames = []
            for filename in filesDict:
                if (self.frameworkId in filename and agentID in filename and
                        (executorID is None or executorID in filename)):
                    stderrFilenames.append("%s/stderr" % filename)
                elif filename.endswith("log"):
                    agentLogFilenames.append(filename)

            if len(stderrFilenames) == 0:
                log.warning("Could not find any containers in '%s'." % filesDict)

            for stderrFilename in stderrFilenames:
                try:
                    # According to
                    # http://mesos.apache.org/documentation/latest/sandbox/ we can use
                    # the web API to fetch the error log.
                    errorLogURL = "http://%s:%d/files/download?path=%s" % \
                        (agentAddress, agentPort, quote_plus(stderrFilename))

                    log.warning("Attempting to retrieve executor error log: %s", errorLogURL)

                    for line in urlopen(errorLogURL):
                        # Warn all the lines of the executor's error log
                        log.warning("Executor: %s", line.rstrip())

                except Exception as e:
                    log.warning("Could not retrieve executor log due to: '%s'.", e)
                    log.warning(traceback.format_exc())

            for agentLogFilename in agentLogFilenames:
                try:
                    agentLogURL = "http://%s:%d/files/download?path=%s" % \
                        (agentAddress, agentPort, quote_plus(agentLogFilename))

                    log.warning("Attempting to retrieve agent log: %s", agentLogURL)

                    for line in urlopen(agentLogURL):
                        # Warn all the lines of the agent's log
                        log.warning("Agent: %s", line.rstrip())
                except Exception as e:
                    log.warning("Could not retrieve agent log due to: '%s'.", e)
                    log.warning(traceback.format_exc())

        except Exception as e:
            log.warning("Could not retrieve logs due to: '%s'.", e)
            log.warning(traceback.format_exc())

    def executorLost(self, driver, executorId, agentId, status):
        """
        Invoked when an executor has exited/terminated abnormally.
        """
        failedId = executorId.get('value', None)

        log.warning("Executor '%s' reported lost with status '%s'.", failedId, status)

        self._handleFailedExecutor(agentId.value, failedId)

    @classmethod
    def get_default_mesos_endpoint(cls) -> str:
        """
        Get the default IP/hostname and port that we will look for Mesos at.
        """
        return f'{get_public_ip()}:5050'

    @classmethod
    def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
        parser.add_argument("--mesosEndpoint", "--mesosMaster", dest="mesos_endpoint",
                            default=cls.get_default_mesos_endpoint(),
                            help="The host and port of the Mesos master separated by a colon. "
                                 "(default: %(default)s)")

    @classmethod
    def setOptions(cls, setOption):
        setOption("mesos_endpoint", None, None, cls.get_default_mesos_endpoint(),
                  old_names=["mesosMasterAddress"])
def __init__(self, config, maxCores, maxMemory, maxDisk): super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk) # The hot-deployed resource representing the user script. Will be passed along in every # Mesos task. Also see setUserScript(). self.userScript = None """ :type: toil.resource.Resource """ # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type, # defined by resource usage self.jobQueues = JobQueue() # Address of the Mesos master in the form host:port where host can be an IP or a hostname self.mesosMasterAddress = config.mesosMasterAddress # Written to when Mesos kills tasks, as directed by Toil self.killedJobIds = set() # The IDs of job to be killed self.killJobIds = set() # Contains jobs on which killBatchJobs were called, regardless of whether or not they # actually were killed or ended by themselves self.intendedKill = set() # Map of host address to job ids # this is somewhat redundant since Mesos returns the number of workers per # node. However, that information isn't guaranteed to reach the leader, # so we also track the state here. When the information is returned from # mesos, prefer that information over this attempt at state tracking. self.hostToJobIDs = {} # see self.setNodeFilter self.nodeFilter = [] # Dict of launched jobIDs to TaskData objects self.runningJobMap = {} # Mesos has no easy way of getting a task's resources so we track them here self.taskResources = {} # Queue of jobs whose status has been updated, according to Mesos self.updatedJobsQueue = Queue() # The Mesos driver used by this scheduler self.driver = None # A dictionary mapping a node's IP to an ExecutorInfo object describing important # properties of our executor running on that node. Only an approximation of the truth. self.executors = {} # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an # approximation of the truth. Recently launched nodes may be absent from this set for a # while and a node's absence from this set does not imply its preemptability. But it is # generally safer to assume a node is preemptable since non-preemptability is a stronger # requirement. If we tracked the set of preemptable nodes instead, we'd have to use # absence as an indicator of non-preemptability and could therefore be misled into # believeing that a recently launched preemptable node was non-preemptable. self.nonPreemptableNodes = set() self.executor = self._buildExecutor() self.unusedJobID = itertools.count() self.lastReconciliation = time.time() self.reconciliationPeriod = 120 # These control how frequently to log a message that would indicate if no jobs are # currently able to run on the offers given. This can happen if the cluster is busy # or if the nodes in the cluster simply don't have enough resources to run the jobs self.lastTimeOfferLogged = 0 self.logPeriod = 30 # seconds self._startDriver()
class MesosBatchSystem(BatchSystemSupport, AbstractScalableBatchSystem, mesos.interface.Scheduler): """ A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos tasks over a cluster of slave nodes. A Mesos framework consists of a scheduler and an executor. This class acts as the scheduler and is typically run on the master node that also runs the Mesos master process with which the scheduler communicates via a driver component. The executor is implemented in a separate class. It is run on each slave node and communicates with the Mesos slave process via another driver object. The scheduler may also be run on a separate node from the master, which we then call somewhat ambiguously the driver node. """ @classmethod def supportsHotDeployment(cls): return True @classmethod def supportsWorkerCleanup(cls): return True class ExecutorInfo(object): def __init__(self, nodeAddress, slaveId, nodeInfo, lastSeen): super(MesosBatchSystem.ExecutorInfo, self).__init__() self.nodeAddress = nodeAddress self.slaveId = slaveId self.nodeInfo = nodeInfo self.lastSeen = lastSeen def __init__(self, config, maxCores, maxMemory, maxDisk): super(MesosBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk) # The hot-deployed resource representing the user script. Will be passed along in every # Mesos task. Also see setUserScript(). self.userScript = None """ :type: toil.resource.Resource """ # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type, # defined by resource usage self.jobQueues = JobQueue() # Address of the Mesos master in the form host:port where host can be an IP or a hostname self.mesosMasterAddress = config.mesosMasterAddress # Written to when Mesos kills tasks, as directed by Toil self.killedJobIds = set() # The IDs of job to be killed self.killJobIds = set() # Contains jobs on which killBatchJobs were called, regardless of whether or not they # actually were killed or ended by themselves self.intendedKill = set() # Map of host address to job ids # this is somewhat redundant since Mesos returns the number of workers per # node. However, that information isn't guaranteed to reach the leader, # so we also track the state here. When the information is returned from # mesos, prefer that information over this attempt at state tracking. self.hostToJobIDs = {} # see self.setNodeFilter self.nodeFilter = [] # Dict of launched jobIDs to TaskData objects self.runningJobMap = {} # Mesos has no easy way of getting a task's resources so we track them here self.taskResources = {} # Queue of jobs whose status has been updated, according to Mesos self.updatedJobsQueue = Queue() # The Mesos driver used by this scheduler self.driver = None # A dictionary mapping a node's IP to an ExecutorInfo object describing important # properties of our executor running on that node. Only an approximation of the truth. self.executors = {} # A set of Mesos slave IDs, one for each slave running on a non-preemptable node. Only an # approximation of the truth. Recently launched nodes may be absent from this set for a # while and a node's absence from this set does not imply its preemptability. But it is # generally safer to assume a node is preemptable since non-preemptability is a stronger # requirement. If we tracked the set of preemptable nodes instead, we'd have to use # absence as an indicator of non-preemptability and could therefore be misled into # believeing that a recently launched preemptable node was non-preemptable. 
        self.nonPreemptableNodes = set()

        self.executor = self._buildExecutor()

        self.unusedJobID = itertools.count()
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # These control how frequently to log a message that would indicate if no jobs are
        # currently able to run on the offers given. This can happen if the cluster is busy
        # or if the nodes in the cluster simply don't have enough resources to run the jobs.
        self.lastTimeOfferLogged = 0
        self.logPeriod = 30  # seconds

        self._startDriver()

    def setUserScript(self, userScript):
        self.userScript = userScript

    def issueBatchJob(self, jobNode):
        """
        Issues the given job, returning a unique jobID. The job's command is the string to run,
        memory is an int giving the number of bytes the job needs to run in, cores is the number
        of CPUs needed, and the error file is the path of the file in which to place any
        stderr/stdout.
        """
        self.checkResourceRequest(jobNode.memory, jobNode.cores, jobNode.disk)
        jobID = next(self.unusedJobID)
        job = ToilJob(jobID=jobID,
                      name=str(jobNode),
                      resources=ResourceRequirement(**jobNode._requirements),
                      command=jobNode.command,
                      userScript=self.userScript,
                      environment=self.environment.copy(),
                      workerCleanupInfo=self.workerCleanupInfo)
        jobType = job.resources
        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
        # TODO: round all elements of resources
        self.jobQueues.insertJob(job, jobType)
        self.taskResources[jobID] = job.resources
        log.debug("... queued")
        return jobID

    def killBatchJobs(self, jobIDs):
        # FIXME: probably still racy
        assert self.driver is not None
        localSet = set()
        for jobID in jobIDs:
            self.killJobIds.add(jobID)
            localSet.add(jobID)
            self.intendedKill.add(jobID)
            # FIXME: a bit too expensive for my taste
            if jobID in self.getIssuedBatchJobIDs():
                taskId = mesos_pb2.TaskID()
                taskId.value = str(jobID)
                self.driver.killTask(taskId)
            else:
                self.killJobIds.remove(jobID)
                localSet.remove(jobID)
        while localSet:
            intersection = localSet.intersection(self.killedJobIds)
            if intersection:
                localSet -= intersection
                self.killedJobIds -= intersection
            else:
                time.sleep(1)

    def getIssuedBatchJobIDs(self):
        jobIds = set(self.jobQueues.jobIDs())
        jobIds.update(list(self.runningJobMap.keys()))
        return list(jobIds)

    def getRunningBatchJobIDs(self):
        currentTime = dict()
        for jobID, data in list(self.runningJobMap.items()):
            currentTime[jobID] = time.time() - data.startTime
        return currentTime

    def getUpdatedBatchJob(self, maxWait):
        while True:
            try:
                item = self.updatedJobsQueue.get(timeout=maxWait)
            except Empty:
                return None
            jobId, exitValue, wallTime = item
            try:
                self.intendedKill.remove(jobId)
            except KeyError:
                log.debug('Job %s ended with status %i, took %s seconds.', jobId, exitValue,
                          '???' if wallTime is None else str(wallTime))
                return item
            else:
                log.debug('Job %s ended naturally before it could be killed.', jobId)

    def nodeInUse(self, nodeIP):
        return nodeIP in self.hostToJobIDs

    @contextmanager
    def nodeFiltering(self, filter):
        self.nodeFilter = [filter]
        yield
        self.nodeFilter = []

    def getWaitDuration(self):
        """
        Gets the period of time to wait (floating point, in seconds) between checking for
        missing/overlong jobs.
        """
        return self.reconciliationPeriod

    @classmethod
    def getRescueBatchJobFrequency(cls):
        return 30 * 60  # Half an hour

    def _buildExecutor(self):
        """
        Creates and returns an ExecutorInfo instance representing our executor implementation.
""" # The executor program is installed as a setuptools entry point by setup.py info = mesos_pb2.ExecutorInfo() info.name = "toil" info.command.value = resolveEntryPoint('_toil_mesos_executor') info.executor_id.value = "toil-%i" % os.getpid() info.source = pwd.getpwuid(os.getuid()).pw_name return info def _startDriver(self): """ The Mesos driver thread which handles the scheduler's communication with the Mesos master """ framework = mesos_pb2.FrameworkInfo() framework.user = "" # Have Mesos fill in the current user. framework.name = "toil" framework.principal = framework.name self.driver = mesos.native.MesosSchedulerDriver(self, framework, self._resolveAddress(self.mesosMasterAddress), True) # enable implicit acknowledgements assert self.driver.start() == mesos_pb2.DRIVER_RUNNING @staticmethod def _resolveAddress(address): """ Resolves the host in the given string. The input is of the form host[:port]. This method is idempotent, i.e. the host may already be a dotted IP address. >>> # noinspection PyProtectedMember >>> f=MesosBatchSystem._resolveAddress >>> f('localhost') '127.0.0.1' >>> f('127.0.0.1') '127.0.0.1' >>> f('localhost:123') '127.0.0.1:123' >>> f('127.0.0.1:123') '127.0.0.1:123' """ address = address.split(':') assert len(address) in (1, 2) address[0] = socket.gethostbyname(address[0]) return ':'.join(address) def shutdown(self): log.debug("Stopping Mesos driver") self.driver.stop() log.debug("Joining Mesos driver") driver_result = self.driver.join() log.debug("Joined Mesos driver") if driver_result != mesos_pb2.DRIVER_STOPPED: raise RuntimeError("Mesos driver failed with %i", driver_result) def registered(self, driver, frameworkId, masterInfo): """ Invoked when the scheduler successfully registers with a Mesos master """ log.debug("Registered with framework ID %s", frameworkId.value) def _declineAllOffers(self, driver, offers): for offer in offers: log.debug("Declining offer %s.", offer.id.value) driver.declineOffer(offer.id) def _parseOffer(self, offer): cores = 0 memory = 0 disk = 0 preemptable = None for attribute in offer.attributes: if attribute.name == 'preemptable': assert preemptable is None, "Attribute 'preemptable' occurs more than once." preemptable = strict_bool(attribute.text.value) if preemptable is None: log.debug('Slave not marked as either preemptable or not. Assuming non-preemptable.') preemptable = False for resource in offer.resources: if resource.name == "cpus": cores += resource.scalar.value elif resource.name == "mem": memory += resource.scalar.value elif resource.name == "disk": disk += resource.scalar.value return cores, memory, disk, preemptable def _prepareToRun(self, jobType, offer): # Get the first element to insure FIFO job = self.jobQueues.nextJobOfType(jobType) task = self._newMesosTask(job, offer) return task def _updateStateToRunning(self, offer, runnableTasks): for task in runnableTasks: resourceKey = int(task.task_id.value) resources = self.taskResources[resourceKey] slaveIP = socket.gethostbyname(offer.hostname) try: self.hostToJobIDs[slaveIP].append(resourceKey) except KeyError: self.hostToJobIDs[slaveIP] = [resourceKey] self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(), slaveID=offer.slave_id.value, slaveIP=slaveIP, executorID=task.executor.executor_id.value, cores=resources.cores, memory=resources.memory) del self.taskResources[resourceKey] log.debug('Launched Mesos task %s.', task.task_id.value) def resourceOffers(self, driver, offers): """ Invoked when resources have been offered to this framework. 
""" self._trackOfferedNodes(offers) jobTypes = self.jobQueues.sorted() # TODO: We may want to assert that numIssued >= numRunning if not jobTypes or len(self.getIssuedBatchJobIDs()) == len(self.getRunningBatchJobIDs()): log.debug('There are no queued tasks. Declining Mesos offers.') # Without jobs, we can get stuck with no jobs and no new offers until we decline it. self._declineAllOffers(driver, offers) return unableToRun = True # Right now, gives priority to largest jobs for offer in offers: runnableTasks = [] # TODO: In an offer, can there ever be more than one resource with the same name? offerCores, offerMemory, offerDisk, offerPreemptable = self._parseOffer(offer) log.debug('Got offer %s for a %spreemptable slave with %.2f MiB memory, %.2f core(s) ' 'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptable else 'non-', offerMemory, offerCores, offerDisk) remainingCores = offerCores remainingMemory = offerMemory remainingDisk = offerDisk for jobType in jobTypes: runnableTasksOfType = [] # Because we are not removing from the list until outside of the while loop, we # must decrement the number of jobs left to run ourselves to avoid an infinite # loop. nextToLaunchIndex = 0 # Toil specifies disk and memory in bytes but Mesos uses MiB while ( not self.jobQueues.typeEmpty(jobType) # On a non-preemptable node we can run any job, on a preemptable node we # can only run preemptable jobs: and (not offerPreemptable or jobType.preemptable) and remainingCores >= jobType.cores and remainingDisk >= toMiB(jobType.disk) and remainingMemory >= toMiB(jobType.memory)): task = self._prepareToRun(jobType, offer) # TODO: this used to be a conditional but Hannes wanted it changed to an assert # TODO: ... so we can understand why it exists. assert int(task.task_id.value) not in self.runningJobMap runnableTasksOfType.append(task) log.debug("Preparing to launch Mesos task %s using offer %s ...", task.task_id.value, offer.id.value) remainingCores -= jobType.cores remainingMemory -= toMiB(jobType.memory) remainingDisk -= toMiB(jobType.disk) nextToLaunchIndex += 1 else: log.debug('Offer %(offer)s not suitable to run the tasks with requirements ' '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores ' 'and %(disk)s of disk on a %(non)spreemptable slave.', dict(offer=offer.id.value, requirements=jobType.__dict__, non='' if offerPreemptable else 'non-', memory=fromMiB(offerMemory), cores=offerCores, disk=fromMiB(offerDisk))) runnableTasks.extend(runnableTasksOfType) # Launch all runnable tasks together so we only call launchTasks once per offer if runnableTasks: unableToRun = False driver.launchTasks(offer.id, runnableTasks) self._updateStateToRunning(offer, runnableTasks) else: log.debug('Although there are queued jobs, none of them could be run with offer %s ' 'extended to the framework.', offer.id) driver.declineOffer(offer.id) if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod): self.lastTimeOfferLogged = time.time() log.debug('Although there are queued jobs, none of them were able to run in ' 'any of the offers extended to the framework. There are currently ' '%i jobs running. 
Enable debug level logging to see more details about ' 'job types and offers received.', len(self.runningJobMap)) def _trackOfferedNodes(self, offers): for offer in offers: nodeAddress = socket.gethostbyname(offer.hostname) self._registerNode(nodeAddress, offer.slave_id.value) preemptable = False for attribute in offer.attributes: if attribute.name == 'preemptable': preemptable = strict_bool(attribute.text.value) if preemptable: try: self.nonPreemptableNodes.remove(offer.slave_id.value) except KeyError: pass else: self.nonPreemptableNodes.add(offer.slave_id.value) def _filterOfferedNodes(self, offers): if not self.nodeFilter: return offers executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers] executorInfos = [_f for _f in executorInfoOrNone if _f] executorsToConsider = list(filter(self.nodeFilter[0], executorInfos)) ipsToConsider = {ex.nodeAddress for ex in executorsToConsider} return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider] def _newMesosTask(self, job, offer): """ Build the Mesos task object for a given the Toil job and Mesos offer """ task = mesos_pb2.TaskInfo() task.task_id.value = str(job.jobID) task.slave_id.value = offer.slave_id.value task.name = job.name task.data = pickle.dumps(job) task.executor.MergeFrom(self.executor) cpus = task.resources.add() cpus.name = "cpus" cpus.type = mesos_pb2.Value.SCALAR cpus.scalar.value = job.resources.cores disk = task.resources.add() disk.name = "disk" disk.type = mesos_pb2.Value.SCALAR if toMiB(job.resources.disk) > 1: disk.scalar.value = toMiB(job.resources.disk) else: log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.", job.jobID, job.resources.disk) disk.scalar.value = 1 mem = task.resources.add() mem.name = "mem" mem.type = mesos_pb2.Value.SCALAR if toMiB(job.resources.memory) > 1: mem.scalar.value = toMiB(job.resources.memory) else: log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.", job.jobID, job.resources.memory) mem.scalar.value = 1 return task def statusUpdate(self, driver, update): """ Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes and an executor sends a status update saying so, etc). Note that returning from this callback _acknowledges_ receipt of this status update! If for whatever reason the scheduler aborts during this callback (or the process exits) another status update will be delivered (note, however, that this is currently not true if the slave sending the status update is lost/fails during that time). 
""" jobID = int(update.task_id.value) stateName = mesos_pb2.TaskState.Name(update.state) log.debug("Job %i is in state '%s'.", jobID, stateName) def jobEnded(_exitStatus, wallTime=None): try: self.killJobIds.remove(jobID) except KeyError: pass else: self.killedJobIds.add(jobID) self.updatedJobsQueue.put((jobID, _exitStatus, wallTime)) slaveIP = None try: slaveIP = self.runningJobMap[jobID].slaveIP except KeyError: log.warning("Job %i returned exit code %i but isn't tracked as running.", jobID, _exitStatus) else: del self.runningJobMap[jobID] try: self.hostToJobIDs[slaveIP].remove(jobID) except KeyError: log.warning("Job %i returned exit code %i from unknown host.", jobID, _exitStatus) if update.state == mesos_pb2.TASK_FINISHED: jobEnded(0, wallTime=unpack('d', update.data)[0]) elif update.state == mesos_pb2.TASK_FAILED: try: exitStatus = int(update.message) except ValueError: exitStatus = 255 log.warning("Job %i failed with message '%s'", jobID, update.message) else: log.warning('Job %i failed with exit status %i', jobID, exitStatus) jobEnded(exitStatus) elif update.state in (mesos_pb2.TASK_LOST, mesos_pb2.TASK_KILLED, mesos_pb2.TASK_ERROR): log.warning("Job %i is in unexpected state %s with message '%s'.", jobID, stateName, update.message) jobEnded(255) def frameworkMessage(self, driver, executorId, slaveId, message): """ Invoked when an executor sends a message. """ log.debug('Got framework message from executor %s running on slave %s: %s', executorId.value, slaveId.value, message) message = ast.literal_eval(message) assert isinstance(message, dict) # Handle the mandatory fields of a message nodeAddress = message.pop('address') executor = self._registerNode(nodeAddress, slaveId.value) # Handle optional message fields for k, v in iteritems(message): if k == 'nodeInfo': assert isinstance(v, dict) resources = [taskData for taskData in itervalues(self.runningJobMap) if taskData.executorID == executorId.value] requestedCores = sum(taskData.cores for taskData in resources) requestedMemory = sum(taskData.memory for taskData in resources) executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v) self.executors[nodeAddress] = executor else: raise RuntimeError("Unknown message field '%s'." % k) def _registerNode(self, nodeAddress, slaveId): executor = self.executors.get(nodeAddress) if executor is None or executor.slaveId != slaveId: executor = self.ExecutorInfo(nodeAddress=nodeAddress, slaveId=slaveId, nodeInfo=None, lastSeen=time.time()) self.executors[nodeAddress] = executor else: executor.lastSeen = time.time() return executor def getNodes(self, preemptable=None, timeout=600): timeout = timeout or sys.maxsize return {nodeAddress: executor.nodeInfo for nodeAddress, executor in iteritems(self.executors) if time.time() - executor.lastSeen < timeout and (preemptable is None or preemptable == (executor.slaveId not in self.nonPreemptableNodes))} def reregistered(self, driver, masterInfo): """ Invoked when the scheduler re-registers with a newly elected Mesos master. """ log.debug('Registered with new master') def executorLost(self, driver, executorId, slaveId, status): """ Invoked when an executor has exited/terminated. """ log.warning("Executor '%s' lost.", executorId) @classmethod def setOptions(cl, setOption): setOption("mesosMasterAddress", None, None, 'localhost:5050')
def testJobQueue(self, testJobs=1000):
    from toil.batchSystems.mesos import JobQueue
    jobQueue = JobQueue()
    for jobNum in range(0, testJobs):
        testJob = self._getJob(cores=random.choice(range(10)),
                               preemptable=random.choice([True, False]))
        jobQueue.insertJob(testJob, testJob.resources)

    sortedTypes = jobQueue.sorted()
    # test this is properly sorted
    self.assertGreaterEqual(20, len(sortedTypes))
    self.assertTrue(all(sortedTypes[i] <= sortedTypes[i + 1]
                        for i in range(len(sortedTypes) - 1)))
    preemptable = sortedTypes.pop(0).preemptable
    for jtype in sortedTypes:
        # all non preemptable jobTypes must be first in sorted order
        if preemptable:
            # all the rest of the jobTypes must be preemptable as well
            assert jtype.preemptable
        elif jtype.preemptable:
            # we have reached our first preemptable job
            preemptable = jtype.preemptable

    # make sure proper number of jobs are in queue
    self.assertEqual(len(jobQueue.jobIDs()), testJobs)
    testJob = self._getJob(cores=random.choice(range(10)))
    jobQueue.insertJob(testJob, testJob.resources)
    testJobs += 1
    self.assertEqual(len(jobQueue.jobIDs()), testJobs)

    tmpJob = None
    while not jobQueue.typeEmpty(testJob.resources):
        testJobs -= 1
        tmpJob = jobQueue.nextJobOfType(testJob.resources)
        self.assertEqual(len(jobQueue.jobIDs()), testJobs)
    # Ensure FIFO
    self.assertIs(testJob, tmpJob)
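
# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the test suite) of the JobQueue behaviour  #
# the test above relies on: jobs sharing one resource shape live in a single  #
# per-type queue and are handed back in FIFO order. `firstJob` and            #
# `secondJob` are assumed to be job objects that share the same `.resources`  #
# shape, e.g. two jobs built by the test's _getJob() helper with identical    #
# requirements.                                                               #
# --------------------------------------------------------------------------- #
def _exampleFifoWithinType(firstJob, secondJob):
    from toil.batchSystems.mesos import JobQueue
    queue = JobQueue()
    # Both jobs have the same resource requirements and so belong to one type.
    queue.insertJob(firstJob, firstJob.resources)
    queue.insertJob(secondJob, secondJob.resources)
    assert not queue.typeEmpty(firstJob.resources)
    # FIFO within a type: the job inserted first is returned first.
    assert queue.nextJobOfType(firstJob.resources) is firstJob
    assert queue.nextJobOfType(firstJob.resources) is secondJob
    assert queue.typeEmpty(firstJob.resources)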