Example #1
 def __init__(self, config, maxCpus, maxMemory):
     AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory) #Call the parent constructor
     if maxMemory != sys.maxint:
         logger.warn("A max memory has been specified for the parasol batch system class of %i, but currently "
                     "this batchsystem interface does not support such limiting" % maxMemory)
     #Keep the name of the results file for the pstat2 command.
     self.parasolCommand = config.attrib["parasol_command"]
     self.parasolResultsFile = getParasolResultsFileName(config.attrib["job_store"])
     #Reset the batchjob queue and results (initially, we do this again once we've killed the jobs)
     self.queuePattern = re.compile(r"q\s+([0-9]+)")
     self.runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     self.killBatchJobs(self.getIssuedBatchJobIDs()) #Kill any jobs on the current stack
     logger.info("Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5) #Give the batch system a few seconds to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the batchjob queue and results
     exitValue = popenParasolCommand("%s -results=%s clear sick" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue is not None:
         logger.warn("Could not clear sick status of the parasol batch %s" % self.parasolResultsFile)
     exitValue = popenParasolCommand("%s -results=%s flushResults" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue is not None:
         logger.warn("Could not flush the parasol batch %s" % self.parasolResultsFile)
     open(self.parasolResultsFile, 'w').close()
     logger.info("Reset the results queue")
     #Stuff to allow the max cpus limit to work
     self.outputQueue1 = Queue()
     self.outputQueue2 = Queue()
     #worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
     #worker.setDaemon(True)
     worker = Process(target=getUpdatedJob, args=(self.parasolResultsFile, self.outputQueue1, self.outputQueue2))
     worker.daemon = True
     worker.start()
     self.usedCpus = 0
     self.jobIDsToCpu = {}
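
The two compiled patterns above are what the batch system uses to pull job IDs out of `parasol pstat2` output. A minimal sketch of what they match, using made-up output lines purely for illustration:

    import re

    queuePattern = re.compile(r"q\s+([0-9]+)")
    runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")

    # Hypothetical pstat2-style lines, invented for this sketch.
    print(queuePattern.match("q 1234").group(1))                              # '1234'
    print(runningPattern.match("r 1234 user batch.sh 5678 node01").groups())  # ('1234', '5678')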
Example #2
 def __init__(self, config, maxCores, maxMemory, maxDisk):
     AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk) #Call the parent constructor
     if maxMemory != sys.maxint:
         logger.warn("A max memory has been specified for the parasol batch system class of %i, but currently "
                     "this batchsystem interface does not support such limiting" % maxMemory)
     #Keep the name of the results file for the pstat2 command.
     self.parasolCommand = config.parasolCommand
     self.parasolResultsFile = getParasolResultsFileName(config.jobStore)
     #Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.queuePattern = re.compile(r"q\s+([0-9]+)")
     self.runningPattern = re.compile(r"r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     self.killBatchJobs(self.getIssuedBatchJobIDs()) #Kill any jobs on the current stack
     logger.info("Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5) #Give the batch system a few seconds to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the job queue and results
     exitValue = popenParasolCommand("%s -results=%s clear sick" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue is not None:
         logger.warn("Could not clear sick status of the parasol batch %s" % self.parasolResultsFile)
     exitValue = popenParasolCommand("%s -results=%s flushResults" % (self.parasolCommand, self.parasolResultsFile), False)[0]
     if exitValue is not None:
         logger.warn("Could not flush the parasol batch %s" % self.parasolResultsFile)
     open(self.parasolResultsFile, 'w').close()
     logger.info("Reset the results queue")
     #Stuff to allow the max cpus limit to work
     self.outputQueue1 = Queue()
     self.outputQueue2 = Queue()
     #worker = Thread(target=getUpdatedJob, args=(self.parasolResultsFileHandle, self.outputQueue1, self.outputQueue2))
     #worker.setDaemon(True)
     worker = Process(target=getUpdatedJob, args=(self.parasolResultsFile, self.outputQueue1, self.outputQueue2))
     worker.daemon = True
     worker.start()
     self.usedCpus = 0
     self.jobIDsToCpu = {}
Example #3
 def shutdown(self, driver):
     log.critical("Shutting down executor...")
     for taskId, pid in self.runningTasks.items():
         self.killTask(driver, taskId)
     Resource.cleanSystem()
     AbstractBatchSystem.workerCleanup(self.workerCleanupInfo)
     log.critical("Executor shut down")
Example #4
    def __init__(self,
                 config,
                 maxCores,
                 maxMemory,
                 maxDisk,
                 masterAddress,
                 userScript=None,
                 toilDistribution=None):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory,
                                     maxDisk)
        # The hot-deployed resources representing the user script and the toil distribution
        # respectively. Will be passed along in every Mesos task. See
        # toil.common.HotDeployedResource for details.
        self.userScript = userScript
        self.toilDistribution = toilDistribution

        # Written to when mesos kills tasks, as directed by toil
        self.killedSet = set()

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueueList = defaultdict(list)

        # Address of Mesos master in the form host:port where host can be an IP or a hostname
        self.masterAddress = masterAddress

        # Set of jobs to kill, by jobID.
        self.killSet = set()

        # Contains jobs on which killBatchJobs was called, regardless of whether they
        # actually were killed or ended by themselves.
        self.intendedKill = set()

        # Dict of launched jobIDs to TaskData named tuple. Contains start time, executorID, and
        # slaveID.
        self.runningJobMap = {}

        # Queue of jobs whose status has been updated, according to mesos. Req'd by toil
        self.updatedJobsQueue = Queue()

        # Whether to use implicit/explicit acknowledgments
        self.implicitAcknowledgements = self.getImplicit()

        # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
        self.driver = None

        # FIXME: This comment makes no sense to me

        # Returns Mesos executor object, which is merged into Mesos tasks as they are built
        self.executor = self.buildExecutor()

        self.nextJobID = 0
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # Start the driver
        self._startDriver()
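
The `jobQueueList` above groups issued jobs into one queue per job type, where a type is defined by resource usage. A minimal sketch of that idea, assuming the key is simply a (cores, memory, disk) tuple and the job is a plain dict (both are illustrative stand-ins, not the actual Toil types):

    from collections import defaultdict

    jobQueueList = defaultdict(list)

    def issueJob(job):
        # Hypothetical: jobs with identical resource requirements share one queue.
        jobType = (job['cores'], job['memory'], job['disk'])
        jobQueueList[jobType].append(job)

    issueJob({'cores': 1, 'memory': 2 ** 31, 'disk': 2 ** 30, 'command': '_toil_worker ...'})
    issueJob({'cores': 1, 'memory': 2 ** 31, 'disk': 2 ** 30, 'command': '_toil_worker ...'})
    print(len(jobQueueList))  # 1 -- both jobs landed in the same per-type queue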
Example #5
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        if maxCores > self.numCores:
            log.warn('Limiting maxCores to CPU count of system (%i).',
                     self.numCores)
            maxCores = self.numCores
        if maxMemory > self.physicalMemory:
            log.warn('Limiting maxMemory to physically available memory (%i).',
                     self.physicalMemory)
            maxMemory = self.physicalMemory
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory,
                                     maxDisk)
        assert self.maxCores >= self.minCores
        assert self.maxMemory >= 1

        # The scale allows the user to apply a factor to each task's cores requirement, thereby
        # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
        # (scale > 1).
        self.scale = config.scale
        # Number of worker threads that will be started
        self.numWorkers = int(self.maxCores / self.minCores)
        # A counter to generate job IDs and a lock to guard it
        self.jobIndex = 0
        self.jobIndexLock = Lock()
        # A dictionary mapping IDs of submitted jobs to the command line
        self.jobs = {}
        """
        :type: dict[str,str]
        """
        # A queue of jobs waiting to be executed. Consumed by the workers.
        self.inputQueue = Queue()
        # A queue of finished jobs. Produced by the workers.
        self.outputQueue = Queue()
        # A dictionary mapping IDs of currently running jobs to their Info objects
        self.runningJobs = {}
        """
        :type: dict[str,Info]
        """
        # The list of worker threads
        self.workerThreads = []
        """
        :type: list[Thread]
        """
        # A pool representing available CPU in units of minCores
        self.coreFractions = ResourcePool(self.numWorkers)
        # A lock to work around the lack of thread-safety in Python's subprocess module
        self.popenLock = Lock()
        # A pool representing available memory in bytes
        self.memory = ResourcePool(self.maxMemory)
        log.info(
            'Setting up the thread pool with %i workers, '
            'given a minimum CPU fraction of %f '
            'and a maximum CPU value of %i.', self.numWorkers, self.minCores,
            maxCores)
        for i in xrange(self.numWorkers):
            worker = Thread(target=self.worker, args=(self.inputQueue, ))
            self.workerThreads.append(worker)
            worker.start()
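
The `scale` and `minCores` settings above determine how many of the `numWorkers` core fractions a job occupies. A rough worked example, under the assumption that a job's core request is multiplied by `scale` and rounded up to whole `minCores` units (the rounding rule here is an illustration, not necessarily Toil's exact formula):

    import math

    minCores = 0.1   # smallest schedulable CPU fraction (illustrative value)
    maxCores = 8
    scale = 0.5      # scale < 1 squeezes more tasks onto each core

    numWorkers = int(maxCores / minCores)   # 80 worker threads, i.e. 80 core fractions

    def coreFractionsNeeded(jobCores):
        # Hypothetical rounding: scale the request, then round up to minCores units.
        return int(math.ceil(jobCores * scale / minCores))

    print(numWorkers)               # 80
    print(coreFractionsNeeded(1))   # 5 -- a cores=1 job takes 5 of the 80 fractions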
Example #6
 def __init__(self, config, maxCores, maxMemory, maxDisk):
     assert type(maxCores) == int
     if maxCores > self.numCores:
         logger.warn('Limiting maxCores to CPU count of system (%i).',
                     self.numCores)
         maxCores = self.numCores
     AbstractBatchSystem.__init__(self, config, maxCores, maxMemory,
                                  maxDisk)
     assert self.maxCores >= 1
     assert self.maxMemory >= 1
     # The scale allows the user to apply a factor to each task's cores requirement, thereby squeezing more tasks
     # onto each core (scale < 1) or stretching tasks over more cores (scale > 1).
     self.scale = config.scale
     # The minimal fractional CPU. Tasks with a smaller cores requirement will be rounded up to this value. One
     # important invariant of this class is that each worker thread represents a CPU requirement of minCores,
     # meaning that we can never run more than numCores / minCores jobs concurrently. With minCores set to .1,
     # a task with cores=1 will occupy 10 workers. One of these workers will be blocked on the Popen.wait() call for
     # the worker.py child process, the others will be blocked on acquiring the core semaphore.
     self.minCores = 0.1
     # Number of worker threads that will be started
     self.numWorkers = int(self.maxCores / self.minCores)
     # A counter to generate job IDs and a lock to guard it
     self.jobIndex = 0
     self.jobIndexLock = Lock()
     # A dictionary mapping IDs of submitted jobs to those jobs
     self.jobs = {}
     # A queue of jobs waiting to be executed. Consumed by the workers.
     self.inputQueue = Queue()
     # A queue of finished jobs. Produced by the workers.
     self.outputQueue = Queue()
     # A dictionary mapping IDs of currently running jobs to their Info objects
     self.runningJobs = {}
     # The list of worker threads
     self.workerThreads = []
     # A semaphore representing available CPU in units of minCores
     self.coreSemaphore = Semaphore(self.numWorkers)
     # A counter representing failed acquisitions of the semaphore, also in units of minCores, and a lock to guard it
     self.coreOverflow = 0
     self.coreOverflowLock = Lock()
     # A lock to work around the lack of thread-safety in Python's subprocess module
     self.popenLock = Lock()
     # A counter representing available memory in bytes
     self.memoryPool = self.maxMemory
     # A condition object used to guard it (a semaphore would force us to acquire each unit of memory individually)
     self.memoryCondition = Condition()
     logger.info(
         'Setting up the thread pool with %i workers, '
         'given a minimum CPU fraction of %f '
         'and a maximum CPU value of %i.', self.numWorkers, self.minCores,
         maxCores)
     for i in xrange(self.numWorkers):
         worker = Thread(target=self.worker, args=(self.inputQueue, ))
         self.workerThreads.append(worker)
         worker.start()
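
The memory bookkeeping above keeps a plain byte counter guarded by a `Condition`, so a job's whole allocation can be claimed in one step instead of acquiring a semaphore once per unit. A minimal sketch of that pattern (the function names and pool size are invented for illustration):

    from threading import Condition

    memoryPool = 16 * 1024 ** 3      # hypothetical: 16 GiB of schedulable memory
    memoryCondition = Condition()

    def acquireMemory(jobMemory):
        # Block until enough memory is free, then claim it all at once.
        global memoryPool
        with memoryCondition:
            while memoryPool < jobMemory:
                memoryCondition.wait()
            memoryPool -= jobMemory

    def releaseMemory(jobMemory):
        global memoryPool
        with memoryCondition:
            memoryPool += jobMemory
            memoryCondition.notify_all()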
Example #7
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        if maxCores > self.numCores:
            log.warn('Limiting maxCores to CPU count of system (%i).', self.numCores)
            maxCores = self.numCores
        if maxMemory > self.physicalMemory:
            log.warn('Limiting maxMemory to physically available memory (%i).', self.physicalMemory)
            maxMemory = self.physicalMemory
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
        assert self.maxCores >= self.minCores
        assert self.maxMemory >= 1

        # The scale allows the user to apply a factor to each task's cores requirement, thereby
        # squeezing more tasks onto each core (scale < 1) or stretching tasks over more cores
        # (scale > 1).
        self.scale = config.scale
        # Number of worker threads that will be started
        self.numWorkers = int(self.maxCores / self.minCores)
        # A counter to generate job IDs and a lock to guard it
        self.jobIndex = 0
        self.jobIndexLock = Lock()
        # A dictionary mapping IDs of submitted jobs to the command line
        self.jobs = {}
        """
        :type: dict[str,str]
        """
        # A queue of jobs waiting to be executed. Consumed by the workers.
        self.inputQueue = Queue()
        # A queue of finished jobs. Produced by the workers.
        self.outputQueue = Queue()
        # A dictionary mapping IDs of currently running jobs to their Info objects
        self.runningJobs = {}
        """
        :type: dict[str,Info]
        """
        # The list of worker threads
        self.workerThreads = []
        """
        :type: list[Thread]
        """
        # A pool representing available CPU in units of minCores
        self.coreFractions = ResourcePool(self.numWorkers)
        # A lock to work around the lack of thread-safety in Python's subprocess module
        self.popenLock = Lock()
        # A pool representing available memory in bytes
        self.memory = ResourcePool(self.maxMemory)
        log.info('Setting up the thread pool with %i workers, '
                 'given a minimum CPU fraction of %f '
                 'and a maximum CPU value of %i.', self.numWorkers, self.minCores, maxCores)
        for i in xrange(self.numWorkers):
            worker = Thread(target=self.worker, args=(self.inputQueue,))
            self.workerThreads.append(worker)
            worker.start()
Example #8
    def __init__(self,
                 config,
                 maxCpus,
                 maxMemory,
                 maxDisk,
                 masterIP,
                 useBadExecutor=False,
                 userScript=None,
                 toilDistribution=None):
        AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory, maxDisk)
        # The hot-deployed resources representing the user script and the toil distribution respectively. Will be
        # passed along in every Mesos task. See toil.common.HotDeployedResource for details.
        self.userScript = userScript
        self.toilDistribution = toilDistribution

        # Written to when mesos kills tasks, as directed by toil
        self.killedSet = set()

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a batchjob type,
        # defined by resource usage
        self.jobQueueList = defaultdict(list)

        # IP of the Mesos master. Specified in MesosBatchSystem, currently loopback
        self.masterIP = masterIP

        # Set of jobs to kill, by jobID.
        self.killSet = set()

        # Dict of launched jobIDs to TaskData named tuple. Contains start time, executorID, and slaveID.
        self.runningJobMap = {}

        # Queue of jobs whose status has been updated, according to mesos. Req'd by toil
        self.updatedJobsQueue = Queue()

        # Whether to use implicit/explicit acknowledgments
        self.implicitAcknowledgements = self.getImplicit()

        # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
        self.driver = None

        # FIXME: This comment makes no sense to me

        # Returns Mesos executor object, which is merged into Mesos tasks as they are built
        self.executor = self.buildExecutor(bad=useBadExecutor)

        self.nextJobID = 0
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # Start the driver
        self._startDriver()
Example #9
    def __init__(self, config, maxCores, maxMemory, maxDisk, masterAddress,
                 userScript=None, toilDistribution=None):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
        # The hot-deployed resources representing the user script and the toil distribution
        # respectively. Will be passed along in every Mesos task. See
        # toil.common.HotDeployedResource for details.
        self.userScript = userScript
        self.toilDistribution = toilDistribution

        # Written to when mesos kills tasks, as directed by toil
        self.killedSet = set()

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a job type,
        # defined by resource usage
        self.jobQueueList = defaultdict(list)

        # Address of Mesos master in the form host:port where host can be an IP or a hostname
        self.masterAddress = masterAddress

        # Set of jobs to kill, by jobID.
        self.killSet = set()

        # Contains jobs on which killBatchJobs was called, regardless of whether they
        # actually were killed or ended by themselves.
        self.intendedKill = set()

        # Dict of launched jobIDs to TaskData named tuple. Contains start time, executorID, and
        # slaveID.
        self.runningJobMap = {}

        # Queue of jobs whose status has been updated, according to mesos. Req'd by toil
        self.updatedJobsQueue = Queue()

        # Whether to use implicit/explicit acknowledgments
        self.implicitAcknowledgements = self.getImplicit()

        # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
        self.driver = None

        # FIXME: This comment makes no sense to me

        # Returns Mesos executor object, which is merged into Mesos tasks as they are built
        self.executor = self.buildExecutor()

        self.nextJobID = 0
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # Start the driver
        self._startDriver()
Example #10
 def __init__(self, config, maxCpus, maxMemory, maxDisk, badWorker=False):
     assert type(maxCpus) == int
     if maxCpus > self.numCores:
         logger.warn('Limiting maxCpus to CPU count of system (%i).', self.numCores)
         maxCpus = self.numCores
     AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory, maxDisk)
     assert self.maxCpus >= 1
     assert self.maxMemory >= 1
     # The scale allows the user to apply a factor to each task's CPU requirement, thereby squeezing more tasks
     # onto each core (scale < 1) or stretching tasks over more cores (scale > 1).
     self.scale = float(config.attrib['scale'])
     # The minimal fractional CPU. Tasks with a smaller CPU requirement will be rounded up to this value. One
     # important invariant of this class is that each worker thread represents a CPU requirement of minCpu,
     # meaning that we can never run more than numCores / minCpu jobs concurrently. With minCpu set to .1,
     # a task with cpu=1 will occupy 10 workers. One of these workers will be blocked on the Popen.wait() call for
     # the worker.py child process, the others will be blocked on acquiring the CPU semaphore.
     self.minCpu = 0.1
     # Number of worker threads that will be started
     self.numWorkers = int(self.maxCpus / self.minCpu)
     # A counter to generate batchjob IDs and a lock to guard it
     self.jobIndex = 0
     self.jobIndexLock = Lock()
     # A dictionary mapping IDs of submitted jobs to those jobs
     self.jobs = {}
     # A queue of jobs waiting to be executed. Consumed by the workers.
     self.inputQueue = Queue()
     # A queue of finished jobs. Produced by the workers.
     self.outputQueue = Queue()
     # A dictionary mapping IDs of currently running jobs to their Info objects
     self.runningJobs = {}
     # The list of worker threads
     self.workerThreads = []
     # A semaphore representing available CPU in units of minCpu
     self.cpuSemaphore = Semaphore(self.numWorkers)
     # A counter representing failed acquisitions of the semaphore, also in units of minCpu, and a lock to guard it
     self.cpuOverflow = 0
     self.cpuOverflowLock = Lock()
     # A lock to work around the lack of thread-safety in Python's subprocess module
     self.popenLock = Lock()
     # A counter representing available memory in bytes
     self.memoryPool = self.maxMemory
     # A condition object used to guard it (a semaphore would force us to acquire each unit of memory individually)
     self.memoryCondition = Condition()
     logger.info('Setting up the thread pool with %i workers, '
                 'given a minimum CPU fraction of %f '
                 'and a maximum CPU value of %i.', self.numWorkers, self.minCpu, maxCpus)
     self.workerFn = self.badWorker if badWorker else self.worker
     for i in xrange(self.numWorkers):
         worker = Thread(target=self.workerFn, args=(self.inputQueue,))
         self.workerThreads.append(worker)
         worker.start()
Example #11
    def shutdown(self):
        """
        Cleanly terminate worker threads. Add sentinels to inputQueue equal to maxThreads. Join
        all worker threads.
        """
        # Remove reference to inputQueue (raises exception if inputQueue is used after method call)
        inputQueue = self.inputQueue
        self.inputQueue = None
        for i in xrange(self.numWorkers):
            inputQueue.put(None)

        for thread in self.workerThreads:
            thread.join()
        AbstractBatchSystem.workerCleanup(self.workerCleanupInfo)
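
The shutdown above works because each worker thread treats a `None` item on the input queue as a sentinel and exits. A minimal sketch of the consuming side of that contract (the worker body is simplified; actually running a job is replaced by a print):

    from Queue import Queue        # `queue` on Python 3
    from threading import Thread

    def worker(inputQueue):
        while True:
            job = inputQueue.get()
            if job is None:        # sentinel from shutdown(): stop this worker
                break
            print('running %s' % job)   # stand-in for executing the job

    inputQueue = Queue()
    workers = [Thread(target=worker, args=(inputQueue,)) for _ in range(2)]
    for w in workers:
        w.start()
    inputQueue.put('job-0')
    for _ in workers:              # one sentinel per worker, as shutdown() does
        inputQueue.put(None)
    for w in workers:
        w.join()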
Example #12
    def __init__(self, config, maxCores, maxMemory):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory) #Call the parent constructor
        self.lsfResultsFile = self._getResultsFileName(config.jobStore)
        #Reset the job queue and results (initially, we do this again once we've killed the jobs)
        self.lsfResultsFileHandle = open(self.lsfResultsFile, 'w')
        self.lsfResultsFileHandle.close() #We lose any previous state in this file, and ensure the file's existence
        self.currentjobs = set()
        self.obtainSystemConstants()
        self.jobIDs = dict()
        self.lsfJobIDs = dict()
        self.nextJobID = 0

        self.newJobsQueue = Queue()
        self.updatedJobsQueue = Queue()
        self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self)
        self.worker.setDaemon(True)
        self.worker.start()
Example #13
    def __init__(self, config, maxCores, maxMemory):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory) #Call the parent constructor
        self.lsfResultsFile = getParasolResultsFileName(config.jobStore)
        #Reset the job queue and results (initially, we do this again once we've killed the jobs)
        self.lsfResultsFileHandle = open(self.lsfResultsFile, 'w')
        self.lsfResultsFileHandle.close() #We lose any previous state in this file, and ensure the file's existence
        self.currentjobs = set()
        self.obtainSystemConstants()
        self.jobIDs = dict()
        self.lsfJobIDs = dict()
        self.nextJobID = 0

        self.newJobsQueue = Queue()
        self.updatedJobsQueue = Queue()
        self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self)
        self.worker.setDaemon(True)
        self.worker.start()
Example #14
 def __init__(self, config, maxCores, maxMemory, maxDisk):
     AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
     self.gridengineResultsFile = self._getResultsFileName(config.jobStore)
     # Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.gridengineResultsFileHandle = open(self.gridengineResultsFile, 'w')
     # We lose any previous state in this file, and ensure the file's existence
     self.gridengineResultsFileHandle.close()
     self.currentJobs = set()
     self.maxCPU, self.maxMEM = self.obtainSystemConstants()
     self.nextJobID = 0
     self.newJobsQueue = Queue()
     self.updatedJobsQueue = Queue()
     self.killQueue = Queue()
     self.killedJobsQueue = Queue()
     self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue, self.killQueue,
                          self.killedJobsQueue, self)
     self.worker.start()
Example #15
 def __init__(self, config, maxCores, maxMemory, maxDisk):
     AbstractBatchSystem.__init__(self, config, maxCores, maxMemory,
                                  maxDisk)
     self.gridengineResultsFile = self._getResultsFileName(config.jobStore)
     # Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.gridengineResultsFileHandle = open(self.gridengineResultsFile,
                                             'w')
     # We lose any previous state in this file, and ensure the file's existence
     self.gridengineResultsFileHandle.close()
     self.currentJobs = set()
     self.maxCPU, self.maxMEM = self.obtainSystemConstants()
     self.nextJobID = 0
     self.newJobsQueue = Queue()
     self.updatedJobsQueue = Queue()
     self.killQueue = Queue()
     self.killedJobsQueue = Queue()
     self.worker = Worker(self.newJobsQueue, self.updatedJobsQueue,
                          self.killQueue, self.killedJobsQueue, self)
     self.worker.start()
Example #16
    def __init__(self, config, maxCpus, maxMemory, maxDisk, masterIP, useBadExecutor=False, userScript=None, toilDistribution=None):
        AbstractBatchSystem.__init__(self, config, maxCpus, maxMemory, maxDisk)
        # The hot-deployed resources representing the user script and the toil distribution respectively. Will be
        # passed along in every Mesos task. See toil.common.HotDeployedResource for details.
        self.userScript = userScript
        self.toilDistribution = toilDistribution

        # Written to when mesos kills tasks, as directed by toil
        self.killedSet = set()

        # Dictionary of queues, which toil assigns jobs to. Each queue represents a batchjob type,
        # defined by resource usage
        self.jobQueueList = defaultdict(list)

        # IP of the Mesos master. Specified in MesosBatchSystem, currently loopback
        self.masterIP = masterIP

        # Set of jobs to kill, by jobID.
        self.killSet = set()

        # Dict of launched jobIDs to TaskData named tuple. Contains start time, executorID, and slaveID.
        self.runningJobMap = {}

        # Queue of jobs whose status has been updated, according to mesos. Req'd by toil
        self.updatedJobsQueue = Queue()

        # Whether to use implicit/explicit acknowledgments
        self.implicitAcknowledgements = self.getImplicit()

        # Reference to the Mesos driver used by this scheduler, to be instantiated in run()
        self.driver = None

        # FIXME: This comment makes no sense to me

        # Returns Mesos executor object, which is merged into Mesos tasks as they are built
        self.executor = self.buildExecutor(bad=useBadExecutor)

        self.nextJobID = 0
        self.lastReconciliation = time.time()
        self.reconciliationPeriod = 120

        # Start the driver
        self._startDriver()
Example #17
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory,
                                     maxDisk)
        if maxMemory != sys.maxint:
            logger.warn('The Parasol batch system does not support maxMemory.')
        # Keep the name of the results file for the pstat2 command.
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = next(which(command))
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.info('Using Parasol at %s', command)
        self.parasolCommand = command
        self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)

        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches

        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        #  system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()

        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()

        # Use this to stop the worker when shutting down
        self.running = True

        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
        self.usedCpus = 0
        self.jobIDsToCpu = {}

        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        #  Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        #  removed in killBatchJobs.
        self.runningJobs = set()
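
The `resultsFiles` dictionary above creates a new Parasol batch, i.e. a new results file, the first time a (cpu, memory) combination is seen. A rough sketch of that lazy lookup, with a made-up file naming scheme purely for illustration:

    import os
    import tempfile

    parasolResultsDir = tempfile.mkdtemp()
    resultsFiles = {}

    def resultsFileFor(cores, memory):
        # Hypothetical helper: one batch (results file) per unique (cores, memory) pair.
        key = (cores, memory)
        if key not in resultsFiles:
            resultsFiles[key] = os.path.join(parasolResultsDir,
                                             'results-%s-%s' % (cores, memory))
        return resultsFiles[key]

    print(resultsFileFor(1, 2 ** 31) == resultsFileFor(1, 2 ** 31))  # True: same batch
    print(resultsFileFor(2, 2 ** 31) == resultsFileFor(1, 2 ** 31))  # False: new batch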
Example #18
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        AbstractBatchSystem.__init__(self, config, maxCores, maxMemory, maxDisk)
        if maxMemory != sys.maxint:
            logger.warn('The Parasol batch system does not support maxMemory.')
        # Keep the name of the results file for the pstat2 command.
        command = config.parasolCommand
        if os.path.sep not in command:
            try:
                command = next(which(command))
            except StopIteration:
                raise RuntimeError("Can't find %s on PATH." % command)
        logger.info('Using Parasol at %s', command)
        self.parasolCommand = command
        self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)

        # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
        # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
        # memory) tuples for each batch. A new batch is created whenever a job has a new unique
        # combination of cpu and memory requirements.
        self.resultsFiles = dict()
        self.maxBatches = config.parasolMaxBatches

        # Allows the worker process to send back the IDs of jobs that have finished, so the batch
        #  system can decrease its used cpus counter
        self.cpuUsageQueue = Queue()

        # Also stores finished job IDs, but is read by getUpdatedJobIDs().
        self.updatedJobsQueue = Queue()

        # Use this to stop the worker when shutting down
        self.running = True

        self.worker = Thread(target=self.updatedJobWorker, args=())
        self.worker.start()
        self.usedCpus = 0
        self.jobIDsToCpu = {}

        # Set of jobs that have been issued but aren't known to have finished or been killed yet.
        #  Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
        #  removed in killBatchJobs.
        self.runningJobs = set()
Example #19
 def setEnv(self, name, value=None):
     if value and ' ' in value:
         raise ValueError('Parasol does not support spaces in environment variable values.')
     return AbstractBatchSystem.setEnv(self, name, value)
Example #20
 def setEnv(self, name, value=None):
     if value and ',' in value:
         raise ValueError(
             "GridEngine does not support commata in environment variable values"
         )
     return AbstractBatchSystem.setEnv(self, name, value)
Example #21
 def setEnv(self, name, value=None):
     if value and ',' in value:
         raise ValueError("GridEngine does not support commata in environment variable values")
     return AbstractBatchSystem.setEnv(self, name, value)
Example #22
 def __init__(self, config, batchSystem1, batchSystem2, batchSystemChoiceFn):
     AbstractBatchSystem.__init__(self, config, 0, 0) #Call the parent constructor
     self.batchSystem1 = batchSystem1
     self.batchSystem2 = batchSystem2
     self.batchSystemChoiceFn = batchSystemChoiceFn
Example #23
 def setEnv(self, name, value=None):
     if value and ' ' in value:
         raise ValueError(
             'Parasol does not support spaces in environment variable values.'
         )
     return AbstractBatchSystem.setEnv(self, name, value)
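
For illustration, here is the same guard as in the example above exercised on its own, outside the class (the standalone function is a stand-in for the real method):

    def setEnv(name, value=None):
        # Same check as above: Parasol cannot pass values containing spaces.
        if value and ' ' in value:
            raise ValueError('Parasol does not support spaces in environment variable values.')

    setEnv('TMPDIR', '/scratch/tmp')       # accepted
    try:
        setEnv('OPTS', '--foo --bar')      # rejected: the value contains a space
    except ValueError as e:
        print(e)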