Example #1
 def testBatchResourceLimits(self):
     jobNode1 = JobNode(command="sleep 1000",
                        requirements=dict(memory=1 << 30,
                                          cores=1,
                                          disk=1000,
                                          preemptable=preemptable),
                        jobName='testResourceLimits',
                        unitName=None,
                        jobStoreID='1')
     job1 = self.batchSystem.issueBatchJob(jobNode1)
     self.assertIsNotNone(job1)
     jobNode2 = JobNode(command="sleep 1000",
                        requirements=dict(memory=2 << 30,
                                          cores=1,
                                          disk=1000,
                                          preemptable=preemptable),
                        jobName='testResourceLimits',
                        unitName=None,
                        jobStoreID='2')
     job2 = self.batchSystem.issueBatchJob(jobNode2)
     self.assertIsNotNone(job2)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 2)
     # It would be better to directly check that the batches have the correct memory and cpu
     # values, but Parasol seems to slightly change the values sometimes.
     self.assertNotEqual(batches[0]['ram'], batches[1]['ram'])
     # Need to kill one of the jobs because there are only two cores available
     self.batchSystem.killBatchJobs([job2])
     job3 = self.batchSystem.issueBatchJob(jobNode1)
     self.assertIsNotNone(job3)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 1)
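
Several of these examples reference module-level test fixtures, preemptable (used above) and defaultRequirements (used in the later examples), that are defined elsewhere in the test module. A minimal sketch of what they would need to provide, mirroring the requirements dict shown in Example #8; the concrete values here are illustrative assumptions:

preemptable = False

# A requirements dict in the shape JobNode expects; the numbers are placeholders.
defaultRequirements = dict(memory=int(100e6),
                           cores=1,
                           disk=1000,
                           preemptable=preemptable)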
Example #2
        def testSetEnv(self):
            # Parasol disobeys shell rules and stupidly splits the command at
            # the space character into arguments before exec'ing it, whether
            # the space is quoted, escaped or not.

            script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'

            # Escape the semicolons
            script_protected = script_shell.replace(';', '\\;')

            # Build a command that convinces bash to paste all of its arguments back together and run them
            command = "bash -c \"\\${@}\" bash eval " + script_protected
            jobNode4 = JobNode(command=command,
                               jobName='test4',
                               unitName=None,
                               jobStoreID='4',
                               requirements=defaultRequirements)
            job4 = self.batchSystem.issueBatchJob(jobNode4)
            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)
            self.assertEqual(exitStatus, 42)
            self.assertEqual(jobID, job4)
            # Now set the variable and ensure that it is present
            self.batchSystem.setEnv('FOO', 'bar')
            jobNode5 = JobNode(command=command,
                               jobName='test5',
                               unitName=None,
                               jobStoreID='5',
                               requirements=defaultRequirements)
            job5 = self.batchSystem.issueBatchJob(jobNode5)
            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)
            self.assertEqual(exitStatus, 23)
            self.assertEqual(jobID, job5)
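
For reference, this is what the composed command looks like once the escaping above is applied. The snippet below is purely illustrative and just prints the string handed to the batch system:

script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'
script_protected = script_shell.replace(';', '\\;')
command = "bash -c \"\\${@}\" bash eval " + script_protected
print(command)
# bash -c "\${@}" bash eval if [ "x${FOO}" == "xbar" ] \; then exit 23 \; else exit 42 \; fi
# Parasol splits this on spaces; the inner bash pastes its arguments back together
# and evals them as a single shell command, restoring the original script.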
Example #3
        def testSetEnv(self):
            # Parasol disobeys shell rules and stupidly splits the command at the space character
            # before exec'ing it, whether the space is quoted, escaped or not. This means that we
            # can't have escaped or quoted spaces in the command line, so we can't use bash -c
            # '...' or python -c '...'. The safest thing to do here is to script the test and
            # invoke that script rather than inline the test via -c.
            def assertEnv():
                import os, sys
                sys.exit(23 if os.getenv('FOO') == 'bar' else 42)

            script_body = dedent('\n'.join(getsource(assertEnv).split('\n')[1:]))
            with tempFileContaining(script_body, suffix='.py') as script_path:
                # First, ensure that the test fails if the variable is *not* set
                command = sys.executable + ' ' + script_path
                jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                                   jobStoreID='4', requirements=defaultRequirements)
                job4 = self.batchSystem.issueBatchJob(jobNode4)
                jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
                self.assertEqual(exitStatus, 42)
                self.assertEqual(jobID, job4)
                # Now set the variable and ensure that it is present
                self.batchSystem.setEnv('FOO', 'bar')
                jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                                   jobStoreID='5', requirements=defaultRequirements)
                job5 = self.batchSystem.issueBatchJob(jobNode5)
                jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
                self.assertEqual(exitStatus, 23)
                self.assertEqual(jobID, job5)
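
The tempFileContaining helper used above comes from the shared test utilities. A minimal sketch of an equivalent context manager, assuming only the behaviour the test relies on (write the content to a temporary file, yield its path, delete it afterwards):

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def tempFileContaining(content, suffix=''):
    # Create a temporary file holding the given content and yield its path.
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(content)
        yield path
    finally:
        os.unlink(path)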
Example #4
        def testRunJobs(self):
            jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                               jobStoreID='1', requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                               jobStoreID='2', requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            # Now at some point we want these jobs to become running
            # But since we may be testing against a live cluster (Kubernetes)
            # we want to handle weird cases and high cluster load as much as we can.

            # Wait a bit for any Docker images to download and for the
            # jobs to have a chance to start.
            # TODO: We insist on neither of these ever finishing when we test
            # getUpdatedBatchJob, and the sleep time is longer than the time we
            # should spend waiting for both to start, so if our cluster can
            # only run one job at a time, we will fail the test.
            runningJobIDs = self._waitForJobsToStart(2, tries=120)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            # We would like to have this touch something on the filesystem and
            # then check for it having happened, but we can't guarantee that
            # the batch system will run against the same filesystem we are
            # looking at.
            jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                               jobStoreID='3', requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
            jobID, exitStatus, wallTime = jobUpdateInfo.jobID, jobUpdateInfo.exitStatus, jobUpdateInfo.wallTime
            log.info('Third job completed: {} {} {}'.format(jobID, exitStatus, wallTime))

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(jobID, job3)
            self.assertEqual(exitStatus, 0)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            # TODO: Work out a way to check if the job we asked to run actually ran.
            # Don't just believe the batch system, but don't assume it ran on this machine either.
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
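
_waitForJobsToIssue and _waitForJobsToStart are polling helpers defined on the test class. A minimal sketch of the start-waiting variant, assuming getRunningBatchJobIDs returns a mapping of job ID to seconds running (consistent with the assertEqual({}, ...) checks above); the one-second retry interval is an assumption:

import time

def _waitForJobsToStart(self, numJobs, tries=120):
    # Poll the batch system until at least numJobs report as running,
    # or give up after the given number of tries.
    runningIDs = []
    for _ in range(tries):
        runningIDs = list(self.batchSystem.getRunningBatchJobIDs().keys())
        if len(runningIDs) >= numJobs:
            break
        time.sleep(1)
    return runningIDs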
Example #5
        def testRunJobs(self):
            jobNode1 = JobNode(command='sleep 1000',
                               jobName='test1',
                               unitName=None,
                               jobStoreID='1',
                               requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000',
                               jobName='test2',
                               unitName=None,
                               jobStoreID='2',
                               requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            runningJobIDs = self._waitForJobsToStart(2)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            # We would like to have this touch something on the filesystem and
            # then check for it having happened, but we can't guarantee that
            # the batch system will run against the same filesystem we are
            # looking at.
            jobNode3 = JobNode(command="mktemp -d",
                               jobName='test3',
                               unitName=None,
                               jobStoreID='3',
                               requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(jobID, job3)
            self.assertEqual(exitStatus, 0)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            # TODO: Work out a way to check if the job we asked to run actually ran.
            # Don't just believe the batch system, but don't assume it ran on this machine either.
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
Example #6
        def testRunJobs(self):
            testPath = os.path.join(self.tempDir, "test.txt")
            jobNode1 = JobNode(command='sleep 1000',
                               jobName='test1',
                               unitName=None,
                               jobStoreID='1',
                               requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000',
                               jobName='test2',
                               unitName=None,
                               jobStoreID='2',
                               requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            runningJobIDs = self._waitForJobsToStart(2)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            self.assertFalse(os.path.exists(testPath))
            jobNode3 = JobNode(command="touch %s" % testPath,
                               jobName='test3',
                               unitName=None,
                               jobStoreID='3',
                               requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(exitStatus, 0)
            self.assertEqual(jobID, job3)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            if not os.path.exists(testPath):
                time.sleep(20)
            self.assertTrue(os.path.exists(testPath))
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
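
The fixed 20-second sleep above tolerates shared-filesystem latency but is brittle. A small polling helper along these lines (purely illustrative, not part of the original test) would make the wait explicit and return as soon as the file appears:

import os
import time

def waitForPath(path, timeout=20, interval=0.5):
    # Poll until the path exists or the timeout elapses.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path):
            return True
        time.sleep(interval)
    return os.path.exists(path)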
Example #7
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs, jobShape):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain that
        autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        mock.start()
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        try:
            # Add 100 jobs to complete
            list(map(lambda x: mock.addJob(jobShape=jobShape),
                     list(range(numJobs))))
            list(map(lambda x: mock.addJob(jobShape=jobShape, preemptable=True),
                     list(range(numPreemptableJobs))))

            # Add some completed jobs
            for preemptable in (True, False):
                if (preemptable and numPreemptableJobs > 0) or (not preemptable and numJobs > 0):
                    # Add 1000 random jobs
                    for _ in range(1000):
                        x = mock.getNodeShape(nodeType=jobShape)
                        iJ = JobNode(jobStoreID=1,
                                     requirements=dict(
                                         memory=random.choice(list(range(1, x.memory))),
                                         cores=random.choice(list(range(1, x.cores))),
                                         disk=random.choice(list(range(1, x.disk))),
                                         preemptable=preemptable),
                                     command=None,
                                     jobName='testClusterScaling', unitName='')
                        clusterScaler.addCompletedJob(iJ, random.choice(list(range(1, x.wallTime))))

            startTime = time.time()
            # Wait while the cluster processes the jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0 or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.debug("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                            "preemptable queue size: %s, preemptable workers: %s" %
                            (mock.getNumberOfJobsIssued(preemptable=False),
                             mock.getNumberOfNodes(preemptable=False),
                             mock.getNumberOfJobsIssued(preemptable=True),
                             mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.debug("We waited %s for cluster to finish" % (time.time() - startTime))
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Print some info about the autoscaling
        logger.debug("Total-jobs: %s: Max-workers: %s, "
                     "Total-worker-time: %s, Worker-time-per-job: %s" %
                    (mock.totalJobs, sum(mock.maxWorkers.values()),
                     mock.totalWorkerTime,
                     old_div(mock.totalWorkerTime, mock.totalJobs) if mock.totalJobs > 0 else 0.0))
Example #8
 def setUp(self):
     super(AbstractJobStoreTest.Test, self).setUp()
     self.namePrefix = 'jobstore-test-' + str(uuid.uuid4())
     self.master = self._createJobStore()
     self.config = self._createConfig()
     self.master.initialize(self.config)
     self.arbitraryRequirements = {'memory': 1, 'disk': 2, 'cores': 1, 'preemptable': False}
     self.arbitraryJob = JobNode(command='command',
                                 jobStoreID=None,
                                 jobName='arbitrary', unitName=None,
                                 requirements=self.arbitraryRequirements)
Example #9
    def testIgnoreNode(self):
        self.batchSystem.ignoreNode('localhost')
        jobNode = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                           jobStoreID='1', requirements=defaultRequirements)
        job = self.batchSystem.issueBatchJob(jobNode)

        issuedID = self._waitForJobsToIssue(1)
        self.assertEqual(set(issuedID), {job})

        runningJobIDs = self._waitForJobsToStart(1)
        # Make sure the job is NOT running
        self.assertEqual(set(runningJobIDs), set())
Example #10
 def test(self):
     # We'll use fractions to avoid rounding errors. Remember that not every fraction can be
     # represented as a floating point number.
     F = Fraction
     # This test isn't general enough to cover every possible value of minCores in
     # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
     minCores = F(1, 10)
     self.assertEquals(float(minCores), SingleMachineBatchSystem.minCores)
     for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
         for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
             for load in (F(1, 10), F(1), F(10)):
                 jobs = int(maxCores / coresPerJob * load)
                 if jobs >= 1 and minCores <= coresPerJob < maxCores:
                     self.assertEquals(maxCores, float(maxCores))
                     bs = SingleMachineBatchSystem(
                         config=hidden.AbstractBatchSystemTest.createConfig(),
                         maxCores=float(maxCores),
                         # Ensure that memory or disk requirements don't get in the way.
                         maxMemory=jobs * 10,
                         maxDisk=jobs * 10)
                     try:
                         jobIds = set()
                         for i in range(0, int(jobs)):
                             jobIds.add(bs.issueBatchJob(JobNode(command=self.scriptCommand(),
                                                                 requirements=dict(
                                                                     cores=float( coresPerJob),
                                                                     memory=1, disk=1,
                                                                     preemptable=preemptable),
                                                                 jobName=str(i), unitName='', jobStoreID=str(i))))
                         self.assertEquals(len(jobIds), jobs)
                         while jobIds:
                             job = bs.getUpdatedBatchJob(maxWait=10)
                             self.assertIsNotNone(job)
                             jobId, status, wallTime = job
                             self.assertEquals(status, 0)
                             # would raise KeyError on absence
                             jobIds.remove(jobId)
                     finally:
                         bs.shutdown()
                     concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                     self.assertEquals(concurrentTasks, 0)
                     log.info('maxCores: {maxCores}, '
                              'coresPerJob: {coresPerJob}, '
                              'load: {load}'.format(**locals()))
                     # This is the key assertion:
                     expectedMaxConcurrentTasks = min(old_div(maxCores, coresPerJob), jobs)
                     self.assertEquals(maxConcurrentTasks, expectedMaxConcurrentTasks)
                     resetCounters(self.counterPath)
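
getCounters and resetCounters read and reset the concurrency counter file maintained by the script that scriptCommand() issues. A minimal sketch consistent with how the test uses them; the comma-separated "current,max" file format is an assumption:

def getCounters(path):
    # Return (concurrentTasks, maxConcurrentTasks) as recorded in the counter file.
    with open(path, 'r') as f:
        concurrentTasks, maxConcurrentTasks = (int(i) for i in f.read().split(','))
    return concurrentTasks, maxConcurrentTasks

def resetCounters(path):
    # Reset both counters to zero.
    with open(path, 'w') as f:
        f.write('0,0')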
Example #11
    def testClusterScalingMultipleNodeTypes(self):

        smallNode = Shape(20, 5, 10, 10, False)
        mediumNode = Shape(20, 10, 10, 10, False)
        largeNode = Shape(20, 20, 10, 10, False)

        numJobs = 100

        config = Config()

        # Make defaults dummy values
        config.defaultMemory = 1
        config.defaultCores = 1
        config.defaultDisk = 1

        # No preemptable nodes/jobs
        config.preemptableNodeTypes = []
        config.minPreemptableNodes = []
        config.maxPreemptableNodes = []  # No preemptable nodes

        # Make sure the node types don't have to be ordered
        config.nodeTypes = [largeNode, smallNode, mediumNode]
        config.minNodes = [0, 0, 0]
        config.maxNodes = [10, 10]  # test expansion of this list

        # Algorithm parameters
        config.targetTime = defaultTargetTime
        config.betaInertia = 0.1
        config.scaleInterval = 3

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        mock.start()

        try:
            # Add small jobs
            list(
                map(lambda x: mock.addJob(jobShape=smallNode),
                    list(range(numJobs))))
            list(
                map(lambda x: mock.addJob(jobShape=mediumNode),
                    list(range(numJobs))))

            # Add medium completed jobs
            for i in range(1000):
                iJ = JobNode(jobStoreID=1,
                             requirements=dict(memory=random.choice(
                                 range(smallNode.memory, mediumNode.memory)),
                                               cores=mediumNode.cores,
                                               disk=largeNode.cores,
                                               preemptable=False),
                             command=None,
                             jobName='testClusterScaling',
                             unitName='')
                clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

            while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
                logger.debug("%i nodes currently provisioned" %
                             mock.getNumberOfNodes())
                # Make sure there are no large nodes
                self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
                clusterScaler.check()
                time.sleep(0.5)
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Make sure jobs ran on both the small and medium node types
        self.assertTrue(mock.totalJobs > 0)
        self.assertTrue(mock.maxWorkers[smallNode] > 0)
        self.assertTrue(mock.maxWorkers[mediumNode] > 0)

        self.assertEqual(mock.maxWorkers[largeNode], 0)
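
The Shape objects above bundle a node's resources. Judging from the attribute accesses in these examples (wallTime, memory, cores, disk) and the five positional arguments used to build the node types, a minimal equivalent would look as follows; the exact field order is an inference, not taken from the source:

from collections import namedtuple

Shape = namedtuple('Shape', ['wallTime', 'memory', 'cores', 'disk', 'preemptable'])

# Shape(20, 5, 10, 10, False) then reads as: wall time 20, memory 5, 10 cores,
# disk 10, not preemptable.
smallNode = Shape(wallTime=20, memory=5, cores=10, disk=10, preemptable=False)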
Example #12
        def test(self):
            """
            This is a front-to-back test of the "happy" path in a job store, i.e. covering things
            that occur in the day-to-day life of a job store. The purist might insist that this be
            split up into several cases and I agree wholeheartedly.
            """
            master = self.master

            # Test initial state
            #
            self.assertFalse(master.exists('foo'))
            self.assertRaises(NoSuchJobException, master.load, 'foo')

            # Create parent job and verify its existence/properties
            #
            masterRequirements = dict(memory=12,
                                      cores=34,
                                      disk=35,
                                      preemptable=True)
            jobNodeOnMaster = JobNode(command='master1',
                                      requirements=masterRequirements,
                                      jobName='test1',
                                      unitName='onMaster',
                                      jobStoreID=None,
                                      predecessorNumber=0)
            jobOnMaster = master.create(jobNodeOnMaster)
            self.assertTrue(master.exists(jobOnMaster.jobStoreID))
            self.assertEquals(jobOnMaster.command, 'master1')
            self.assertEquals(jobOnMaster.memory, masterRequirements['memory'])
            self.assertEquals(jobOnMaster.cores, masterRequirements['cores'])
            self.assertEquals(jobOnMaster.disk, masterRequirements['disk'])
            self.assertEquals(jobOnMaster.preemptable,
                              masterRequirements['preemptable'])
            self.assertEquals(jobOnMaster.jobName, 'test1')
            self.assertEquals(jobOnMaster.unitName, 'onMaster')
            self.assertEquals(jobOnMaster.stack, [])
            self.assertEquals(jobOnMaster.predecessorNumber, 0)
            self.assertEquals(jobOnMaster.predecessorsFinished, set())
            self.assertEquals(jobOnMaster.logJobStoreFileID, None)

            # Create a second instance of the job store, simulating a worker ...
            #
            worker = self._createJobStore()
            worker.resume()
            self.assertEquals(worker.config, self.config)
            self.assertIsNot(worker.config, self.config)
            # ... and load the parent job there.
            jobOnWorker = worker.load(jobOnMaster.jobStoreID)
            self.assertEquals(jobOnMaster, jobOnWorker)

            # Update state on job
            #
            # The following demonstrates the job update pattern, where files to be deleted are
            # referenced in the "filesToDelete" list, which is persisted to disk first. If things
            # go wrong during the update, this list of files to delete is used to remove the
            # unneeded files.
            jobOnWorker.filesToDelete = ['1', '2']
            worker.update(jobOnWorker)
            # Check jobs to delete persisted
            self.assertEquals(
                master.load(jobOnWorker.jobStoreID).filesToDelete, ['1', '2'])
            # Create children
            childRequirements1 = dict(memory=23,
                                      cores=45,
                                      disk=46,
                                      preemptable=True)
            jobNodeOnChild1 = JobNode(command='child1',
                                      requirements=childRequirements1,
                                      jobName='test2',
                                      unitName='onChild1',
                                      jobStoreID=None)
            childRequirements2 = dict(memory=34,
                                      cores=56,
                                      disk=57,
                                      preemptable=False)
            jobNodeOnChild2 = JobNode(command='master1',
                                      requirements=childRequirements2,
                                      jobName='test3',
                                      unitName='onChild2',
                                      jobStoreID=None)
            child1 = worker.create(jobNodeOnChild1)
            child2 = worker.create(jobNodeOnChild2)
            # Update parent
            jobOnWorker.stack.append((child1, child2))
            jobOnWorker.filesToDelete = []
            worker.update(jobOnWorker)

            # Check equivalence between master and worker
            #
            self.assertNotEquals(jobOnWorker, jobOnMaster)
            # Reload parent job on master
            jobOnMaster = master.load(jobOnMaster.jobStoreID)
            self.assertEquals(jobOnWorker, jobOnMaster)
            # Load children on master and check equivalence
            self.assertEquals(master.load(child1.jobStoreID), child1)
            self.assertEquals(master.load(child2.jobStoreID), child2)

            # Test changing and persisting job state across multiple jobs
            #
            childJobs = [
                worker.load(childNode.jobStoreID)
                for childNode in jobOnMaster.stack[-1]
            ]
            for childJob in childJobs:
                childJob.logJobStoreFileID = str(uuid.uuid4())
                childJob.remainingRetryCount = 66
                self.assertNotEquals(childJob,
                                     master.load(childJob.jobStoreID))
            for childJob in childJobs:
                worker.update(childJob)
            for childJob in childJobs:
                self.assertEquals(master.load(childJob.jobStoreID), childJob)
                self.assertEquals(worker.load(childJob.jobStoreID), childJob)

            # Test job iterator - the results of the iterator are affected by eventual
            # consistency. We cannot guarantee all jobs will appear, but we can assert that all
            # jobs that show up are a subset of all existing jobs. If we had deleted jobs before
            # this, we would have to worry about ghost jobs appearing, and this assertion would
            # not be valid.
            self.assertTrue(
                set(childJobs + [jobOnMaster]) >= set(worker.jobs()))
            self.assertTrue(
                set(childJobs + [jobOnMaster]) >= set(master.jobs()))

            # Test job deletions
            #
            # First delete parent, this should have no effect on the children
            self.assertTrue(master.exists(jobOnMaster.jobStoreID))
            self.assertTrue(worker.exists(jobOnMaster.jobStoreID))
            master.delete(jobOnMaster.jobStoreID)
            self.assertFalse(master.exists(jobOnMaster.jobStoreID))
            self.assertFalse(worker.exists(jobOnMaster.jobStoreID))

            for childJob in childJobs:
                self.assertTrue(master.exists(childJob.jobStoreID))
                self.assertTrue(worker.exists(childJob.jobStoreID))
                master.delete(childJob.jobStoreID)
                self.assertFalse(master.exists(childJob.jobStoreID))
                self.assertFalse(worker.exists(childJob.jobStoreID))
                self.assertRaises(NoSuchJobException, worker.load,
                                  childJob.jobStoreID)
                self.assertRaises(NoSuchJobException, master.load,
                                  childJob.jobStoreID)

            try:
                with master.readSharedFileStream('missing') as _:
                    pass
                self.fail('Expecting NoSuchFileException')
            except NoSuchFileException:
                pass

            # Test shared files: Write shared file on master, ...
            #
            with master.writeSharedFileStream('foo') as f:
                f.write('bar')
            # ... read that file on worker, ...
            with worker.readSharedFileStream('foo') as f:
                self.assertEquals('bar', f.read())
            # ... and read it again on master.
            with master.readSharedFileStream('foo') as f:
                self.assertEquals('bar', f.read())

            with master.writeSharedFileStream('nonEncrypted',
                                              isProtected=False) as f:
                f.write('bar')
            self.assertUrl(master.getSharedPublicUrl('nonEncrypted'))
            self.assertRaises(NoSuchFileException, master.getSharedPublicUrl,
                              'missing')

            # Test per-job files: Create empty file on master, ...
            #
            # First recreate job
            jobOnMaster = master.create(jobNodeOnMaster)
            fileOne = worker.getEmptyFileStoreID(jobOnMaster.jobStoreID)
            # Check file exists
            self.assertTrue(worker.fileExists(fileOne))
            self.assertTrue(master.fileExists(fileOne))
            # ... write to the file on worker, ...
            with worker.updateFileStream(fileOne) as f:
                f.write('one')
            # ... read the file as a stream on the master, ....
            with master.readFileStream(fileOne) as f:
                self.assertEquals(f.read(), 'one')

            # ... and copy it to a temporary physical file on the master.
            fh, path = tempfile.mkstemp()
            try:
                os.close(fh)
                tmpPath = path + '.read-only'
                master.readFile(fileOne, tmpPath)
                try:
                    shutil.copyfile(tmpPath, path)
                finally:
                    os.unlink(tmpPath)
                with open(path, 'r+') as f:
                    self.assertEquals(f.read(), 'one')
                    # Write a different string to the local file ...
                    f.seek(0)
                    f.truncate(0)
                    f.write('two')
                # ... and create a second file from the local file.
                fileTwo = master.writeFile(path, jobOnMaster.jobStoreID)
                with worker.readFileStream(fileTwo) as f:
                    self.assertEquals(f.read(), 'two')
                # Now update the first file from the local file ...
                master.updateFile(fileOne, path)
                with worker.readFileStream(fileOne) as f:
                    self.assertEquals(f.read(), 'two')
            finally:
                os.unlink(path)
            # Create a third file to test the last remaining method.
            with worker.writeFileStream(jobOnMaster.jobStoreID) as (f,
                                                                    fileThree):
                f.write('three')
            with master.readFileStream(fileThree) as f:
                self.assertEquals(f.read(), 'three')
            # Delete a file explicitly but leave files for the implicit deletion through the parent
            worker.deleteFile(fileOne)

            # Check the file is gone
            #
            for store in worker, master:
                self.assertFalse(store.fileExists(fileOne))
                self.assertRaises(NoSuchFileException, store.readFile, fileOne,
                                  '')
                try:
                    with store.readFileStream(fileOne) as _:
                        pass
                    self.fail('Expecting NoSuchFileException')
                except NoSuchFileException:
                    pass

            # Test stats and logging
            #
            stats = None

            def callback(f2):
                stats.add(f2.read())

            stats = set()
            self.assertEquals(0, master.readStatsAndLogging(callback))
            self.assertEquals(set(), stats)
            worker.writeStatsAndLogging('1')
            self.assertEquals(1, master.readStatsAndLogging(callback))
            self.assertEquals({'1'}, stats)
            self.assertEquals(0, master.readStatsAndLogging(callback))
            worker.writeStatsAndLogging('1')
            worker.writeStatsAndLogging('2')
            stats = set()
            self.assertEquals(2, master.readStatsAndLogging(callback))
            self.assertEquals({'1', '2'}, stats)
            largeLogEntry = os.urandom(self._largeLogEntrySize())
            stats = set()
            worker.writeStatsAndLogging(largeLogEntry)
            self.assertEquals(1, master.readStatsAndLogging(callback))
            self.assertEquals({largeLogEntry}, stats)

            # test the readAll parameter
            self.assertEqual(
                4, master.readStatsAndLogging(callback, readAll=True))

            # Delete parent
            #
            master.delete(jobOnMaster.jobStoreID)
            self.assertFalse(master.exists(jobOnMaster.jobStoreID))
Example #13
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain
        that autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        logger.info("Creating dummy batch system and scalar")

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ClusterScaler(mock, mock, config)
        clusterScaler.start()
        try:
            # Add 100 jobs to complete
            logger.info("Creating test jobs")
            map(lambda x: mock.addJob(), range(numJobs))
            map(lambda x: mock.addJob(preemptable=True),
                range(numPreemptableJobs))

            # Add some completed jobs
            for preemptable in (True, False):
                if (preemptable and numPreemptableJobs > 0) or (not preemptable and numJobs > 0):
                    # Add 1000 random jobs
                    for i in xrange(1000):
                        x = mock.getNodeShape(preemptable)
                        iJ = JobNode(jobStoreID=1,
                                     requirements=dict(
                                         memory=random.choice(
                                             range(1, x.memory)),
                                         cores=random.choice(range(1,
                                                                   x.cores)),
                                         disk=random.choice(range(1, x.disk)),
                                         preemptable=preemptable),
                                     command=None,
                                     jobName='testClusterScaling',
                                     unitName='')
                        clusterScaler.addCompletedJob(
                            iJ, random.choice(range(1, x.wallTime)))

            logger.info("Waiting for jobs to be processed")
            startTime = time.time()
            # Wait while the cluster processes the jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0
                   or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.info(
                    "Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                    "preemptable queue size: %s, preemptable workers: %s" %
                    (mock.getNumberOfJobsIssued(preemptable=False),
                     mock.getNumberOfNodes(preemptable=False),
                     mock.getNumberOfJobsIssued(preemptable=True),
                     mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.info("We waited %s for cluster to finish" %
                        (time.time() - startTime))
        finally:
            clusterScaler.shutdown()

        # Print some info about the autoscaling
        for i, bs in enumerate(mock.delegates):
            preemptable = bool(i)
            logger.info("Preemptable: %s, Total-jobs: %s: Max-workers: %s,"
                        " Total-worker-time: %s, Worker-time-per-job: %s" %
                        (preemptable, bs.totalJobs, bs.maxWorkers,
                         bs.totalWorkerTime, bs.totalWorkerTime /
                         bs.totalJobs if bs.totalJobs > 0 else 0.0))