def __init__(self, config, maxCpus, maxMemory):
    AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory)  # Call the parent constructor
    if maxMemory != sys.maxint:
        logger.warn("A max memory has been specified for the parasol batch system class of %i, but currently "
                    "this batchsystem interface does not support such limiting" % maxMemory)
    # Keep the name of the results file for the pstat2 command.
    self.parasolCommand = config.attrib["parasol_command"]
    self.parasolResultsFile = getParasolResultsFileName(config.attrib["job_store"])
    # Reset the batchjob queue and results (initially, we do this again once we've killed the jobs)
    self.queuePattern = re.compile(r"q\s+([0-9]+)")
    self.runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
    self.killBatchJobs(self.getIssuedBatchJobIDs())  # Kill any jobs on the current stack
    logger.info("Going to sleep for a few seconds to kill any existing jobs")
    time.sleep(5)  # Give the batch system a few seconds to sort itself out.
    logger.info("Removed any old jobs from the queue")
    # Reset the batchjob queue and results
    exitValue = popenParasolCommand("%s -results=%s clear sick" %
                                    (self.parasolCommand, self.parasolResultsFile), False)[0]
    if exitValue is not None:
        logger.warn("Could not clear sick status of the parasol batch %s" % self.parasolResultsFile)
    exitValue = popenParasolCommand("%s -results=%s flushResults" %
                                    (self.parasolCommand, self.parasolResultsFile), False)[0]
    if exitValue is not None:
        logger.warn("Could not flush the parasol batch %s" % self.parasolResultsFile)
    open(self.parasolResultsFile, 'w').close()
    logger.info("Reset the results queue")
    # Stuff to allow max cpus to work
    self.outputQueue1 = Queue()
    self.outputQueue2 = Queue()
    # worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
    # worker.setDaemon(True)
    worker = Process(target=getUpdatedJob, args=(self.parasolResultsFile, self.outputQueue1, self.outputQueue2))
    worker.daemon = True
    worker.start()
    self.usedCpus = 0
    self.jobIDsToCpu = {}
def __init__(self, config, maxCores, maxMemory, maxDisk):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)  # Call the parent constructor
    if maxMemory != sys.maxint:
        logger.warn("A max memory has been specified for the parasol batch system class of %i, but currently "
                    "this batchsystem interface does not support such limiting" % maxMemory)
    # Keep the name of the results file for the pstat2 command.
    self.parasolCommand = config.parasolCommand
    self.parasolResultsFile = getParasolResultsFileName(config.jobStore)
    # Reset the job queue and results (initially, we do this again once we've killed the jobs)
    self.queuePattern = re.compile(r"q\s+([0-9]+)")
    self.runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
    self.killBatchJobs(self.getIssuedBatchJobIDs())  # Kill any jobs on the current stack
    logger.info("Going to sleep for a few seconds to kill any existing jobs")
    time.sleep(5)  # Give the batch system a few seconds to sort itself out.
    logger.info("Removed any old jobs from the queue")
    # Reset the job queue and results
    exitValue = popenParasolCommand("%s -results=%s clear sick" %
                                    (self.parasolCommand, self.parasolResultsFile), False)[0]
    if exitValue is not None:
        logger.warn("Could not clear sick status of the parasol batch %s" % self.parasolResultsFile)
    exitValue = popenParasolCommand("%s -results=%s flushResults" %
                                    (self.parasolCommand, self.parasolResultsFile), False)[0]
    if exitValue is not None:
        logger.warn("Could not flush the parasol batch %s" % self.parasolResultsFile)
    open(self.parasolResultsFile, 'w').close()
    logger.info("Reset the results queue")
    # Stuff to allow max cpus to work
    self.outputQueue1 = Queue()
    self.outputQueue2 = Queue()
    # worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
    # worker.setDaemon(True)
    worker = Process(target=getUpdatedJob, args=(self.parasolResultsFile, self.outputQueue1, self.outputQueue2))
    worker.daemon = True
    worker.start()
    self.usedCpus = 0
    self.jobIDsToCpu = {}
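# The two Parasol constructors above hand getUpdatedJob to a daemon Process together with the
# results file path and two queues, but the function itself is defined elsewhere. The sketch below
# is only a plausible shape for such a reader; it assumes each appended results line begins with an
# exit status followed by the job ID, which is a simplification of the real Parasol results format.
import time


def getUpdatedJobSketch(parasolResultsFile, outputQueue1, outputQueue2):
    with open(parasolResultsFile, 'r') as fileHandle:
        while True:
            line = fileHandle.readline()
            if not line:
                time.sleep(1)  # nothing new yet; keep polling the growing results file
                continue
            fields = line.split()
            status, jobID = int(fields[0]), int(fields[1])
            # One queue is drained when the batch system asks for updated jobs, the other lets it
            # decrement its used-CPU counter.
            outputQueue1.put((jobID, status))
            outputQueue2.put(jobID)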
def shutdown(self, driver):
    log.critical("Shutting down executor...")
    for taskId, pid in self.runningTasks.items():
        self.killTask(driver, taskId)
    Resource.cleanSystem()
    AbstractBatchSystem.workerCleanup(self.workerCleanupInfo)
    log.critical("Executor shut down")
def __init__(self, config, maxCores, maxMemory, maxDisk, masterAddress,
             userScript=None, toilDistribution=None):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    # The hot-deployed resources representing the user script and the toil distribution
    # respectively. Will be passed along in every Mesos task. See
    # toil.common.HotDeployedResource for details.
    self.userScript = userScript
    self.toilDistribution = toilDistribution
    # Written to when Mesos kills tasks, as directed by Toil
    self.killedSet = set()
    # Dictionary of queues, which Toil assigns jobs to. Each queue represents a job type,
    # defined by resource usage.
    self.jobQueueList = defaultdict(list)
    # Address of the Mesos master in the form host:port, where host can be an IP or a hostname
    self.masterAddress = masterAddress
    # Set of jobs to kill, by jobID
    self.killSet = set()
    # Contains jobs on which killBatchJobs was called, regardless of whether they were actually
    # killed or ended by themselves.
    self.intendedKill = set()
    # Dict of launched jobIDs to TaskData named tuples. Contains start time, executorID and slaveID.
    self.runningJobMap = {}
    # Queue of jobs whose status has been updated, according to Mesos. Required by Toil.
    self.updatedJobsQueue = Queue()
    # Whether to use implicit/explicit acknowledgements
    self.implicitAcknowledgements = self.getImplicit()
    # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
    self.driver = None
    # The Mesos executor object, which is merged into Mesos tasks as they are built
    self.executor = self.buildExecutor()
    self.nextJobID = 0
    self.lastReconciliation = time.time()
    self.reconciliationPeriod = 120
    # Start the driver
    self._startDriver()
def __init__(self, config, maxCores, maxMemory, maxDisk):
    if maxCores > self.numCores:
        log.warn('Limiting maxCores to CPU count of system (%i).', self.numCores)
        maxCores = self.numCores
    if maxMemory > self.physicalMemory:
        log.warn('Limiting maxMemory to physically available memory (%i).', self.physicalMemory)
        maxMemory = self.physicalMemory
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    assert self.maxCores >= self.minCores
    assert self.maxMemory >= 1
    # The scale allows the user to apply a factor to each task's cores requirement, thereby
    # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
    # (scale > 1).
    self.scale = config.scale
    # Number of worker threads that will be started
    self.numWorkers = int(self.maxCores / self.minCores)
    # A counter to generate job IDs and a lock to guard it
    self.jobIndex = 0
    self.jobIndexLock = Lock()
    # A dictionary mapping IDs of submitted jobs to the command line
    self.jobs = {}
    """ :type: dict[str,str] """
    # A queue of jobs waiting to be executed. Consumed by the workers.
    self.inputQueue = Queue()
    # A queue of finished jobs. Produced by the workers.
    self.outputQueue = Queue()
    # A dictionary mapping IDs of currently running jobs to their Info objects
    self.runningJobs = {}
    """ :type: dict[str,Info] """
    # The list of worker threads
    self.workerThreads = []
    """ :type: list[Thread] """
    # A pool representing available CPU in units of minCores
    self.coreFractions = ResourcePool(self.numWorkers)
    # A lock to work around the lack of thread-safety in Python's subprocess module
    self.popenLock = Lock()
    # A pool representing available memory in bytes
    self.memory = ResourcePool(self.maxMemory)
    log.info('Setting up the thread pool with %i workers, '
             'given a minimum CPU fraction of %f '
             'and a maximum CPU value of %i.', self.numWorkers, self.minCores, maxCores)
    for i in xrange(self.numWorkers):
        worker = Thread(target=self.worker, args=(self.inputQueue,))
        self.workerThreads.append(worker)
        worker.start()
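# Both coreFractions and memory above are ResourcePool instances, a class defined elsewhere. A
# plausible minimal implementation is a counter guarded by a Condition, roughly as sketched below;
# the field and method details are assumptions, not the actual class.
from threading import Condition


class ResourcePoolSketch(object):
    def __init__(self, initialValue):
        self.value = initialValue
        self.condition = Condition()

    def acquire(self, amount=1):
        with self.condition:
            while amount > self.value:
                self.condition.wait()  # block until enough of the resource is free
            self.value -= amount

    def release(self, amount=1):
        with self.condition:
            self.value += amount
            self.condition.notifyAll()  # wake any workers blocked in acquire()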
def __init__(self, config, maxCores, maxMemory, maxDisk):
    assert type(maxCores) == int
    if maxCores > self.numCores:
        logger.warn('Limiting maxCores to CPU count of system (%i).', self.numCores)
        maxCores = self.numCores
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    assert self.maxCores >= 1
    assert self.maxMemory >= 1
    # The scale allows the user to apply a factor to each task's cores requirement, thereby
    # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
    # (scale > 1).
    self.scale = config.scale
    # The minimal fractional CPU. Tasks with a smaller cores requirement will be rounded up to
    # this value. One important invariant of this class is that each worker thread represents a
    # CPU requirement of minCores, meaning that we can never run more than numCores / minCores
    # jobs concurrently. With minCores set to .1, a task with cores=1 will occupy 10 workers.
    # One of these workers will be blocked on the Popen.wait() call for the worker.py child
    # process, the others will be blocked on acquiring the core semaphore.
    self.minCores = 0.1
    # Number of worker threads that will be started
    self.numWorkers = int(self.maxCores / self.minCores)
    # A counter to generate job IDs and a lock to guard it
    self.jobIndex = 0
    self.jobIndexLock = Lock()
    # A dictionary mapping IDs of submitted jobs to those jobs
    self.jobs = {}
    # A queue of jobs waiting to be executed. Consumed by the workers.
    self.inputQueue = Queue()
    # A queue of finished jobs. Produced by the workers.
    self.outputQueue = Queue()
    # A dictionary mapping IDs of currently running jobs to their Info objects
    self.runningJobs = {}
    # The list of worker threads
    self.workerThreads = []
    # A semaphore representing available CPU in units of minCores
    self.coreSemaphore = Semaphore(self.numWorkers)
    # A counter representing failed acquisitions of the semaphore, also in units of minCores,
    # and a lock to guard it
    self.coreOverflow = 0
    self.coreOverflowLock = Lock()
    # A lock to work around the lack of thread-safety in Python's subprocess module
    self.popenLock = Lock()
    # A counter representing available memory in bytes
    self.memoryPool = self.maxMemory
    # A condition object used to guard it (a semaphore would force us to acquire each unit of
    # memory individually)
    self.memoryCondition = Condition()
    logger.info('Setting up the thread pool with %i workers, '
                'given a minimum CPU fraction of %f '
                'and a maximum CPU value of %i.', self.numWorkers, self.minCores, maxCores)
    for i in xrange(self.numWorkers):
        worker = Thread(target=self.worker, args=(self.inputQueue,))
        self.workerThreads.append(worker)
        worker.start()
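# To make the minCores invariant in the comment above concrete: with minCores = 0.1, the size of
# the thread pool and the number of workers a single-core task ties up work out as follows (the
# maxCores = 4 figure is only an example value).
import math

exampleMaxCores = 4
exampleMinCores = 0.1
numWorkers = int(exampleMaxCores / exampleMinCores)               # 40 worker threads
workersPerSingleCoreTask = int(math.ceil(1.0 / exampleMinCores))  # a cores=1 task occupies 10
# At most numWorkers / workersPerSingleCoreTask = 4 such tasks run at once, i.e. exactly the
# maxCores limit expressed in units of minCores.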
def __init__(self, config, maxCores, maxMemory, maxDisk):
    if maxCores > self.numCores:
        log.warn('Limiting maxCores to CPU count of system (%i).', self.numCores)
        maxCores = self.numCores
    if maxMemory > self.physicalMemory:
        log.warn('Limiting maxMemory to physically available memory (%i).', self.physicalMemory)
        maxMemory = self.physicalMemory
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    assert self.maxCores >= self.minCores
    assert self.maxMemory >= 1
    # The scale allows the user to apply a factor to each task's cores requirement, thereby
    # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
    # (scale > 1).
    self.scale = config.scale
    # Number of worker threads that will be started
    self.numWorkers = int(self.maxCores / self.minCores)
    # A counter to generate job IDs and a lock to guard it
    self.jobIndex = 0
    self.jobIndexLock = Lock()
    # A dictionary mapping IDs of submitted jobs to the command line
    self.jobs = {}
    """ :type: dict[str,str] """
    # A queue of jobs waiting to be executed. Consumed by the workers.
    self.inputQueue = Queue()
    # A queue of finished jobs. Produced by the workers.
    self.outputQueue = Queue()
    # A dictionary mapping IDs of currently running jobs to their Info objects
    self.runningJobs = {}
    """ :type: dict[str,Info] """
    # The list of worker threads
    self.workerThreads = []
    """ :type: list[Thread] """
    # A pool representing available CPU in units of minCores
    self.coreFractions = ResourcePool(self.numWorkers)
    # A lock to work around the lack of thread-safety in Python's subprocess module
    self.popenLock = Lock()
    # A pool representing available memory in bytes
    self.memory = ResourcePool(self.maxMemory)
    log.info('Setting up the thread pool with %i workers, '
             'given a minimum CPU fraction of %f '
             'and a maximum CPU value of %i.', self.numWorkers, self.minCores, maxCores)
    for i in xrange(self.numWorkers):
        worker = Thread(target=self.worker, args=(self.inputQueue,))
        self.workerThreads.append(worker)
        worker.start()
def __init__(self, config, maxCpus, maxMemory, maxDisk, masterIP,
             useBadExecutor=False, userScript=None, toilDistribution=None):
    AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory, maxDisk)
    # The hot-deployed resources representing the user script and the toil distribution
    # respectively. Will be passed along in every Mesos task. See
    # toil.common.HotDeployedResource for details.
    self.userScript = userScript
    self.toilDistribution = toilDistribution
    # Written to when Mesos kills tasks, as directed by Toil
    self.killedSet = set()
    # Dictionary of queues, which Toil assigns jobs to. Each queue represents a batchjob type,
    # defined by resource usage.
    self.jobQueueList = defaultdict(list)
    # IP of the Mesos master. Specified in MesosBatchSystem, currently loopback.
    self.masterIP = masterIP
    # Set of jobs to kill, by jobID
    self.killSet = set()
    # Dict of launched jobIDs to TaskData named tuples. Contains start time, executorID and slaveID.
    self.runningJobMap = {}
    # Queue of jobs whose status has been updated, according to Mesos. Required by Toil.
    self.updatedJobsQueue = Queue()
    # Whether to use implicit/explicit acknowledgements
    self.implicitAcknowledgements = self.getImplicit()
    # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
    self.driver = None
    # The Mesos executor object, which is merged into Mesos tasks as they are built
    self.executor = self.buildExecutor(bad=useBadExecutor)
    self.nextJobID = 0
    self.lastReconciliation = time.time()
    self.reconciliationPeriod = 120
    # Start the driver
    self._startDriver()
def __init__(self, config, maxCpus, maxMemory, maxDisk, badWorker=False):
    assert type(maxCpus) == int
    if maxCpus > self.numCores:
        logger.warn('Limiting maxCpus to CPU count of system (%i).', self.numCores)
        maxCpus = self.numCores
    AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory, maxDisk)
    assert self.maxCpus >= 1
    assert self.maxMemory >= 1
    # The scale allows the user to apply a factor to each task's CPU requirement, thereby
    # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
    # (scale > 1).
    self.scale = float(config.attrib['scale'])
    # The minimal fractional CPU. Tasks with a smaller CPU requirement will be rounded up to
    # this value. One important invariant of this class is that each worker thread represents a
    # CPU requirement of minCpu, meaning that we can never run more than numCores / minCpu jobs
    # concurrently. With minCpu set to .1, a task with cpu=1 will occupy 10 workers. One of
    # these workers will be blocked on the Popen.wait() call for the worker.py child process,
    # the others will be blocked on acquiring the CPU semaphore.
    self.minCpu = 0.1
    # Number of worker threads that will be started
    self.numWorkers = int(self.maxCpus / self.minCpu)
    # A counter to generate batchjob IDs and a lock to guard it
    self.jobIndex = 0
    self.jobIndexLock = Lock()
    # A dictionary mapping IDs of submitted jobs to those jobs
    self.jobs = {}
    # A queue of jobs waiting to be executed. Consumed by the workers.
    self.inputQueue = Queue()
    # A queue of finished jobs. Produced by the workers.
    self.outputQueue = Queue()
    # A dictionary mapping IDs of currently running jobs to their Info objects
    self.runningJobs = {}
    # The list of worker threads
    self.workerThreads = []
    # A semaphore representing available CPU in units of minCpu
    self.cpuSemaphore = Semaphore(self.numWorkers)
    # A counter representing failed acquisitions of the semaphore, also in units of minCpu,
    # and a lock to guard it
    self.cpuOverflow = 0
    self.cpuOverflowLock = Lock()
    # A lock to work around the lack of thread-safety in Python's subprocess module
    self.popenLock = Lock()
    # A counter representing available memory in bytes
    self.memoryPool = self.maxMemory
    # A condition object used to guard it (a semaphore would force us to acquire each unit of
    # memory individually)
    self.memoryCondition = Condition()
    logger.info('Setting up the thread pool with %i workers, '
                'given a minimum CPU fraction of %f '
                'and a maximum CPU value of %i.', self.numWorkers, self.minCpu, maxCpus)
    self.workerFn = self.badWorker if badWorker else self.worker
    for i in xrange(self.numWorkers):
        worker = Thread(target=self.workerFn, args=(self.inputQueue,))
        self.workerThreads.append(worker)
        worker.start()
def shutdown(self):
    """
    Cleanly terminate the worker threads: add one sentinel to inputQueue per worker, then join
    all worker threads.
    """
    # Remove the reference to inputQueue (raises an exception if inputQueue is used after this
    # method is called)
    inputQueue = self.inputQueue
    self.inputQueue = None
    for i in xrange(self.numWorkers):
        inputQueue.put(None)
    for thread in self.workerThreads:
        thread.join()
    AbstractBatchSystem.workerCleanup(self.workerCleanupInfo)
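# The shutdown above uses the standard sentinel pattern: push one None per worker thread so that
# every thread wakes up, sees the sentinel and exits before join(). A self-contained toy version
# of the same pattern, independent of this class:
from Queue import Queue      # this module is named queue on Python 3
from threading import Thread


def sentinelWorker(inputQueue):
    while True:
        item = inputQueue.get()
        if item is None:  # sentinel: no more work, leave the loop so the thread can be joined
            break
        print('processing %r' % (item,))


toyQueue = Queue()
toyThreads = [Thread(target=sentinelWorker, args=(toyQueue,)) for _ in range(4)]
for t in toyThreads:
    t.start()
for item in ['a', 'b', 'c']:
    toyQueue.put(item)
for _ in toyThreads:          # one sentinel per worker thread
    toyQueue.put(None)
for t in toyThreads:
    t.join()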
def __init__(self, config, maxCores, maxMemory):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory)  # Call the parent constructor
    self.lsfResultsFile = self._getResultsFileName(config.jobStore)
    # Reset the job queue and results (initially, we do this again once we've killed the jobs)
    self.lsfResultsFileHandle = open(self.lsfResultsFile, 'w')
    # We lose any previous state in this file, and ensure the file's existence
    self.lsfResultsFileHandle.close()
    self.currentjobs = set()
    self.obtainSystemConstants()
    self.jobIDs = dict()
    self.lsfJobIDs = dict()
    self.nextJobID = 0
    self.newJobsQueue = Queue()
    self.updatedJobsQueue = Queue()
    self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self)
    self.worker.setDaemon(True)
    self.worker.start()
def __init__(self, config, maxCores, maxMemory):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory)  # Call the parent constructor
    self.lsfResultsFile = getParasolResultsFileName(config.jobStore)
    # Reset the job queue and results (initially, we do this again once we've killed the jobs)
    self.lsfResultsFileHandle = open(self.lsfResultsFile, 'w')
    # We lose any previous state in this file, and ensure the file's existence
    self.lsfResultsFileHandle.close()
    self.currentjobs = set()
    self.obtainSystemConstants()
    self.jobIDs = dict()
    self.lsfJobIDs = dict()
    self.nextJobID = 0
    self.newJobsQueue = Queue()
    self.updatedJobsQueue = Queue()
    self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self)
    self.worker.setDaemon(True)
    self.worker.start()
def __init__(self, config, maxCores, maxMemory, maxDisk):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    self.gridengineResultsFile = self._getResultsFileName(config.jobStore)
    # Reset the job queue and results (initially, we do this again once we've killed the jobs)
    self.gridengineResultsFileHandle = open(self.gridengineResultsFile, 'w')
    # We lose any previous state in this file, and ensure the file's existence
    self.gridengineResultsFileHandle.close()
    self.currentJobs = set()
    self.maxCPU, self.maxMEM = self.obtainSystemConstants()
    self.nextJobID = 0
    self.newJobsQueue = Queue()
    self.updatedJobsQueue = Queue()
    self.killQueue = Queue()
    self.killedJobsQueue = Queue()
    self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self.killQueue,
                         self.killedJobsQueue, self)
    self.worker.start()
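# The killQueue/killedJobsQueue pair above suggests a request/acknowledge handshake between the
# batch system and its worker thread: the batch system enqueues job IDs to kill and then waits on
# killedJobsQueue until each one is confirmed. The fragment below is only a hedged sketch of the
# worker side of that handshake; handleKills and runningJobIDs are illustrative names, not part of
# the real Worker class.
import subprocess


def handleKills(killQueue, killedJobsQueue, runningJobIDs):
    while not killQueue.empty():
        jobID = killQueue.get()
        if jobID in runningJobIDs:
            # qdel is Grid Engine's job-deletion command; the mapping from Toil job IDs to Grid
            # Engine job IDs is assumed to be tracked in runningJobIDs.
            subprocess.check_call(['qdel', str(runningJobIDs[jobID])])
        killedJobsQueue.put(jobID)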
def __init__(self, config, maxCores, maxMemory, maxDisk):
    AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
    if maxMemory != sys.maxint:
        logger.warn('The Parasol batch system does not support maxMemory.')
    # Keep the name of the results file for the pstat2 command.
    command = config.parasolCommand
    if os.path.sep not in command:
        try:
            command = next(which(command))
        except StopIteration:
            raise RuntimeError("Can't find %s on PATH." % command)
    logger.info('Using Parasol at %s', command)
    self.parasolCommand = command
    self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)
    # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
    # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
    # memory) tuples for each batch. A new batch is created whenever a job has a new unique
    # combination of cpu and memory requirements.
    self.resultsFiles = dict()
    self.maxBatches = config.parasolMaxBatches
    # Allows the worker process to send back the IDs of jobs that have finished, so the batch
    # system can decrease its used cpus counter
    self.cpuUsageQueue = Queue()
    # Also stores finished job IDs, but is read by getUpdatedJobIDs().
    self.updatedJobsQueue = Queue()
    # Use this to stop the worker when shutting down
    self.running = True
    self.worker = Thread(target=self.updatedJobWorker, args=())
    self.worker.start()
    self.usedCpus = 0
    self.jobIDsToCpu = {}
    # Set of jobs that have been issued but aren't known to have finished or been killed yet.
    # Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
    # removed in killBatchJobs.
    self.runningJobs = set()
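# The comment above explains that each (cpu, memory) combination maps to its own results file and
# hence its own Parasol batch. Below is a hypothetical helper showing how that dictionary could be
# populated lazily; the file naming scheme is an assumption, and the real class does this work
# inside its job-issuing code.
import os


def resultsFileFor(resultsFiles, resultsDir, cores, memory):
    key = (cores, memory)
    if key not in resultsFiles:
        path = os.path.join(resultsDir, 'results-%s-%s' % (cores, memory))
        open(path, 'w').close()  # a new resource combination opens a new Parasol batch
        resultsFiles[key] = path
    return resultsFiles[key]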
def setEnv(self, name, value=None):
    if value and ' ' in value:
        raise ValueError('Parasol does not support spaces in environment variable values.')
    return AbstractBatchSystem.setEnv(self, name, value)
def setEnv(self, name, value=None):
    if value and ',' in value:
        raise ValueError("GridEngine does not support commas in environment variable values")
    return AbstractBatchSystem.setEnv(self, name, value)
def setEnv(self, name, value=None):
    if value and ',' in value:
        raise ValueError("GridEngine does not support commas in environment variable values")
    return AbstractBatchSystem.setEnv(self, name, value)
def __init__(self, config, batchSystem1, batchSystem2, batchSystemChoiceFn):
    AbstractBatchSystem.__init__(self, config, 0, 0)  # Call the parent constructor
    self.batchSystem1 = batchSystem1
    self.batchSystem2 = batchSystem2
    self.batchSystemChoiceFn = batchSystemChoiceFn
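# The combining constructor above only stores its arguments; the interesting part is how
# batchSystemChoiceFn would route each job to one of the two wrapped batch systems. The helper
# below is a hedged sketch of that routing; both the signature and the dispatch logic are
# illustrative rather than the actual implementation.
def chooseAndIssue(combinedBatchSystem, command, memory, cores, disk):
    target = (combinedBatchSystem.batchSystem1
              if combinedBatchSystem.batchSystemChoiceFn(command, memory, cores, disk)
              else combinedBatchSystem.batchSystem2)
    return target.issueBatchJob(command, memory, cores, disk)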
def setEnv(self, name, value=None):
    if value and ' ' in value:
        raise ValueError('Parasol does not support spaces in environment variable values.')
    return AbstractBatchSystem.setEnv(self, name, value)