def testBatchResourceLimits(self):
    """
    Issue two jobs that differ only in their memory requirement and check that they end up in
    two batches with different 'ram' values, then kill one job, re-issue the other one and
    check that a single batch is left.
    """
    def nodeWithMemory(memory, storeID):
        # The jobs are identical apart from the memory requirement and the job store ID.
        return JobNode(command="sleep 1000",
                       requirements=dict(memory=memory, cores=1, disk=1000,
                                         preemptable=preemptable),
                       jobName='testResourceLimits', unitName=None, jobStoreID=storeID)

    smallNode = nodeWithMemory(1 << 30, '1')
    smallJob = self.batchSystem.issueBatchJob(smallNode)
    self.assertIsNotNone(smallJob)

    bigNode = nodeWithMemory(2 << 30, '2')
    bigJob = self.batchSystem.issueBatchJob(bigNode)
    self.assertIsNotNone(bigJob)

    batchList = self._getBatchList()
    self.assertEqual(len(batchList), 2)
    # It would be better to directly check that the batches have the correct memory and cpu
    # values, but Parasol seems to slightly change the values sometimes.
    firstBatch, secondBatch = batchList[0], batchList[1]
    self.assertNotEqual(firstBatch['ram'], secondBatch['ram'])

    # Need to kill one of the jobs because there are only two cores available
    self.batchSystem.killBatchJobs([bigJob])
    reissuedJob = self.batchSystem.issueBatchJob(smallNode)
    self.assertIsNotNone(reissuedJob)
    self.assertEqual(len(self._getBatchList()), 1)
def testSetEnv(self):
    """
    Check that a variable passed to setEnv() is visible to jobs issued afterwards.

    The job's command exits with 23 when FOO equals "bar" and with 42 otherwise. It is issued
    once before and once after calling setEnv('FOO', 'bar').
    """
    # Parasol disobeys shell rules and stupidly splits the command at
    # the space character into arguments before exec'ing it, whether
    # the space is quoted, escaped or not.
    script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'

    # Escape the semicolons. The backslash is written as '\\' because '\;' is not a
    # recognized escape sequence and triggers a warning on current Python versions.
    script_protected = script_shell.replace(';', '\\;')

    # Turn into a string which convinces bash to take all args and paste them back together
    # and run them
    command = "bash -c \"\\${@}\" bash eval " + script_protected

    # First, ensure that the job reports the variable as missing while it is *not* set
    jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                       jobStoreID='4', requirements=defaultRequirements)
    job4 = self.batchSystem.issueBatchJob(jobNode4)
    update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    # Fail with a clear assertion instead of an unpacking error if no update arrived.
    self.assertIsNotNone(update)
    jobID, exitStatus, wallTime = update
    self.assertEqual(exitStatus, 42)
    self.assertEqual(jobID, job4)

    # Now set the variable and ensure that it is present
    self.batchSystem.setEnv('FOO', 'bar')
    jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                       jobStoreID='5', requirements=defaultRequirements)
    job5 = self.batchSystem.issueBatchJob(jobNode5)
    update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    self.assertIsNotNone(update)
    jobID, exitStatus, wallTime = update
    self.assertEqual(exitStatus, 23)
    self.assertEqual(jobID, job5)
def testSetEnv(self):
    """
    Check that a variable passed to setEnv() is visible to jobs issued afterwards.

    The job runs a small Python script that exits with 23 when FOO equals "bar" and with 42
    otherwise. It is issued once before and once after calling setEnv('FOO', 'bar').
    """
    # Parasol disobeys shell rules and stupidly splits the command at the space character
    # before exec'ing it, whether the space is quoted, escaped or not. This means that we
    # can't have escaped or quoted spaces in the command line. So we can't use bash -c
    # '...' or python -c '...'. The safest thing to do here is to script the test and
    # invoke that script rather than inline the test via -c.

    # NOTE: the source text of this function's body is what gets written to the script
    # file below, so keep it self-contained (it must do its own imports).
    def assertEnv():
        import os, sys
        sys.exit(23 if os.getenv('FOO') == 'bar' else 42)

    # Drop the 'def' line and dedent the rest to obtain a stand-alone script.
    script_body = dedent('\n'.join(getsource(assertEnv).split('\n')[1:]))

    with tempFileContaining(script_body, suffix='.py') as script_path:
        # First, ensure that the test fails if the variable is *not* set
        command = sys.executable + ' ' + script_path
        jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                           jobStoreID='4', requirements=defaultRequirements)
        job4 = self.batchSystem.issueBatchJob(jobNode4)
        update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
        # Fail with a clear assertion instead of an unpacking error if no update arrived.
        self.assertIsNotNone(update)
        jobID, exitStatus, wallTime = update
        self.assertEqual(exitStatus, 42)
        self.assertEqual(jobID, job4)

        # Now set the variable and ensure that it is present
        self.batchSystem.setEnv('FOO', 'bar')
        jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                           jobStoreID='5', requirements=defaultRequirements)
        job5 = self.batchSystem.issueBatchJob(jobNode5)
        update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
        self.assertIsNotNone(update)
        jobID, exitStatus, wallTime = update
        self.assertEqual(exitStatus, 23)
        self.assertEqual(jobID, job5)
def testRunJobs(self):
    """
    Issue two long-running jobs, wait for them to be issued and running, kill them, then
    issue a short job and check that it is the only one reported by getUpdatedBatchJob().
    """
    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    # Now at some point we want these jobs to become running
    # But since we may be testing against a live cluster (Kubernetes)
    # we want to handle weird cases and high cluster load as much as we can.

    # Wait a bit for any Dockers to download and for the
    # jobs to have a chance to start.
    # TODO: We insist on neither of these ever finishing when we test
    # getUpdatedBatchJob, and the sleep time is longer than the time we
    # should spend waiting for both to start, so if our cluster can
    # only run one job at a time, we will fail the test.
    runningJobIDs = self._waitForJobsToStart(2, tries=120)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    # We would like to have this touch something on the filesystem and
    # then check for it having happened, but we can't guarantee that
    # the batch system will run against the same filesystem we are
    # looking at.
    jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    # Fail with a clear assertion instead of an AttributeError if no update arrived.
    self.assertIsNotNone(jobUpdateInfo)
    jobID, exitStatus, wallTime = jobUpdateInfo.jobID, jobUpdateInfo.exitStatus, jobUpdateInfo.wallTime
    log.info('Third job completed: {} {} {}'.format(jobID, exitStatus, wallTime))

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(jobID, job3)
    self.assertEqual(exitStatus, 0)
    if self.supportsWallTime():
        self.assertGreater(wallTime, 0)
    else:
        self.assertIsNone(wallTime)
    # TODO: Work out a way to check if the job we asked to run actually ran.
    # Don't just believe the batch system, but don't assume it ran on this machine either.
    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
def testRunJobs(self):
    """
    Issue two long-running jobs, wait for them to be issued and running, kill them, then
    issue a short job and check that it is the only one reported by getUpdatedBatchJob().
    """
    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    runningJobIDs = self._waitForJobsToStart(2)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    # We would like to have this touch something on the filesystem and
    # then check for it having happened, but we can't guarantee that
    # the batch system will run against the same filesystem we are
    # looking at.
    jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    # Fail with a clear assertion instead of an unpacking error if no update arrived.
    self.assertIsNotNone(update)
    jobID, exitStatus, wallTime = update

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(jobID, job3)
    self.assertEqual(exitStatus, 0)
    if self.supportsWallTime():
        self.assertGreater(wallTime, 0)
    else:
        self.assertIsNone(wallTime)
    # TODO: Work out a way to check if the job we asked to run actually ran.
    # Don't just believe the batch system, but don't assume it ran on this machine either.
    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
def testRunJobs(self):
    """
    Issue two long-running jobs, wait for them to be issued and running, kill them, then
    issue a job that touches a file under self.tempDir and check both that it is the only
    job reported by getUpdatedBatchJob() and that the file shows up.
    """
    testPath = os.path.join(self.tempDir, "test.txt")
    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    runningJobIDs = self._waitForJobsToStart(2)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    self.assertFalse(os.path.exists(testPath))
    jobNode3 = JobNode(command="touch %s" % testPath, jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    update = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    # Fail with a clear assertion instead of an unpacking error if no update arrived.
    self.assertIsNotNone(update)
    jobID, exitStatus, wallTime = update

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(exitStatus, 0)
    self.assertEqual(jobID, job3)
    if self.supportsWallTime():
        self.assertGreater(wallTime, 0)
    else:
        self.assertIsNone(wallTime)

    # Give the file up to 20 seconds to appear, but poll so that the test continues as
    # soon as it does instead of always sleeping for the whole period.
    deadline = time.time() + 20
    while not os.path.exists(testPath) and time.time() < deadline:
        time.sleep(0.5)
    self.assertTrue(os.path.exists(testPath))
    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
def _testClusterScaling(self, config, numJobs, numPreemptableJobs, jobShape):
    """
    Test the ClusterScaler class with different patterns of job creation. Tests ascertain
    that autoscaling occurs and that all the jobs are run.

    :param config: configuration handed to the mock batch system/provisioner and the scaler
    :param numJobs: number of non-preemptable jobs to add
    :param numPreemptableJobs: number of preemptable jobs to add
    :param jobShape: shape passed to mock.addJob() and used as node type for mock.getNodeShape()
    """
    # First do simple test of creating preemptable and non-preemptable jobs and check the
    # jobs are completed okay, then print the amount of worker time expended and the total
    # number of worker nodes used.
    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    mock.start()
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    try:
        # Add the jobs to complete. Plain loops are used because the calls are made purely
        # for their side effect.
        for _ in range(numJobs):
            mock.addJob(jobShape=jobShape)
        for _ in range(numPreemptableJobs):
            mock.addJob(jobShape=jobShape, preemptable=True)

        # Add some completed jobs
        for preemptable in (True, False):
            if (preemptable and numPreemptableJobs > 0) or (not preemptable and numJobs > 0):
                # Add 1000 random jobs
                for _ in range(1000):
                    x = mock.getNodeShape(nodeType=jobShape)
                    # randrange(1, n) picks from the same values as choice(range(1, n))
                    # without materializing the whole range.
                    iJ = JobNode(jobStoreID=1,
                                 requirements=dict(memory=random.randrange(1, x.memory),
                                                   cores=random.randrange(1, x.cores),
                                                   disk=random.randrange(1, x.disk),
                                                   preemptable=preemptable),
                                 command=None,
                                 jobName='testClusterScaling', unitName='')
                    clusterScaler.addCompletedJob(iJ, random.randrange(1, x.wallTime))

        startTime = time.time()
        # Wait while the cluster processes the jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.debug("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                         "preemptable queue size: %s, preemptable workers: %s",
                         mock.getNumberOfJobsIssued(preemptable=False),
                         mock.getNumberOfNodes(preemptable=False),
                         mock.getNumberOfJobsIssued(preemptable=True),
                         mock.getNumberOfNodes(preemptable=True))
            clusterScaler.check()
            time.sleep(0.5)
        logger.debug("We waited %s for cluster to finish", time.time() - startTime)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Print some info about the autoscaling
    workerTimePerJob = (old_div(mock.totalWorkerTime, mock.totalJobs)
                        if mock.totalJobs > 0 else 0.0)
    logger.debug("Total-jobs: %s: Max-workers: %s, "
                 "Total-worker-time: %s, Worker-time-per-job: %s",
                 mock.totalJobs, sum(mock.maxWorkers.values()),
                 mock.totalWorkerTime, workerTimePerJob)
def setUp(self):
    """
    Create and initialize the job store under test with a unique name prefix, and prepare an
    arbitrary set of requirements plus a job node using them for the test methods.
    """
    super(AbstractJobStoreTest.Test, self).setUp()

    uniqueSuffix = str(uuid.uuid4())
    self.namePrefix = 'jobstore-test-' + uniqueSuffix

    self.master = self._createJobStore()
    self.config = self._createConfig()
    self.master.initialize(self.config)

    requirements = dict(memory=1, disk=2, cores=1, preemptable=False)
    self.arbitraryRequirements = requirements
    self.arbitraryJob = JobNode(
        command='command',
        jobStoreID=None,
        jobName='arbitrary',
        unitName=None,
        requirements=requirements,
    )
def testIgnoreNode(self):
    """
    Tell the batch system to ignore 'localhost' and check that a job issued afterwards is
    reported as issued but not as running.
    """
    self.batchSystem.ignoreNode('localhost')

    sleeper = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                      jobStoreID='1', requirements=defaultRequirements)
    issuedJob = self.batchSystem.issueBatchJob(sleeper)

    issued = self._waitForJobsToIssue(1)
    self.assertEqual(set(issued), {issuedJob})

    # Make sure job is NOT running
    running = self._waitForJobsToStart(1)
    self.assertEqual(set(running), set())
def test(self):
    """
    Run batches of jobs through SingleMachineBatchSystem for several combinations of
    maxCores, coresPerJob and load, and check that every job succeeds and that the maximum
    number of concurrent tasks read back from the counter file matches the expectation.
    """
    # We'll use fractions to avoid rounding errors. Remember that not every fraction can be
    # represented as a floating point number.
    F = Fraction
    # This test isn't general enough to cover every possible value of minCores in
    # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
    minCores = F(1, 10)
    self.assertEqual(float(minCores), SingleMachineBatchSystem.minCores)
    for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
        for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
            for load in (F(1, 10), F(1), F(10)):
                jobs = int(maxCores / coresPerJob * load)
                # Skip combinations that yield no job or an out-of-range job size.
                if not (jobs >= 1 and minCores <= coresPerJob < maxCores):
                    continue
                self.assertEqual(maxCores, float(maxCores))
                bs = SingleMachineBatchSystem(
                    config=hidden.AbstractBatchSystemTest.createConfig(),
                    maxCores=float(maxCores),
                    # Ensure that memory or disk requirements don't get in the way.
                    maxMemory=jobs * 10,
                    maxDisk=jobs * 10)
                try:
                    jobIds = set()
                    for i in range(0, int(jobs)):
                        jobIds.add(bs.issueBatchJob(
                            JobNode(command=self.scriptCommand(),
                                    requirements=dict(cores=float(coresPerJob),
                                                      memory=1, disk=1,
                                                      preemptable=preemptable),
                                    jobName=str(i), unitName='', jobStoreID=str(i))))
                    self.assertEqual(len(jobIds), jobs)
                    while jobIds:
                        job = bs.getUpdatedBatchJob(maxWait=10)
                        self.assertIsNotNone(job)
                        jobId, status, wallTime = job
                        self.assertEqual(status, 0)
                        # would raise KeyError on absence
                        jobIds.remove(jobId)
                finally:
                    bs.shutdown()
                concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                self.assertEqual(concurrentTasks, 0)
                # Name the logged values explicitly rather than passing all of locals().
                log.info('maxCores: {maxCores}, '
                         'coresPerJob: {coresPerJob}, '
                         'load: {load}'.format(maxCores=maxCores,
                                               coresPerJob=coresPerJob,
                                               load=load))
                # This is the key assertion:
                expectedMaxConcurrentTasks = min(old_div(maxCores, coresPerJob), jobs)
                self.assertEqual(maxConcurrentTasks, expectedMaxConcurrentTasks)
                resetCounters(self.counterPath)
def testClusterScalingMultipleNodeTypes(self):
    """
    Run the scaler against three node types with jobs shaped like the small and the medium
    node type, and check that workers of those two types were used while no node of the
    large type was ever provisioned.
    """
    smallNode = Shape(20, 5, 10, 10, False)
    mediumNode = Shape(20, 10, 10, 10, False)
    largeNode = Shape(20, 20, 10, 10, False)

    numJobs = 100

    config = Config()

    # Make defaults dummy values
    config.defaultMemory = 1
    config.defaultCores = 1
    config.defaultDisk = 1

    # No preemptable nodes/jobs
    config.preemptableNodeTypes = []
    config.minPreemptableNodes = []
    config.maxPreemptableNodes = []  # No preemptable nodes

    # Make sure the node types don't have to be ordered
    config.nodeTypes = [largeNode, smallNode, mediumNode]
    config.minNodes = [0, 0, 0]
    config.maxNodes = [10, 10]  # test expansion of this list

    # Algorithm parameters
    config.targetTime = defaultTargetTime
    config.betaInertia = 0.1
    config.scaleInterval = 3

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    mock.start()

    try:
        # Add small jobs, then medium jobs. Plain loops are used because the calls are made
        # purely for their side effect.
        for _ in range(numJobs):
            mock.addJob(jobShape=smallNode)
        for _ in range(numJobs):
            mock.addJob(jobShape=mediumNode)

        # Add medium completed jobs
        for _ in range(1000):
            # NOTE(review): disk is taken from largeNode.cores, not largeNode.disk; this
            # looks like a typo but is kept as is -- confirm against Shape's fields.
            iJ = JobNode(jobStoreID=1,
                         requirements=dict(
                             memory=random.randrange(smallNode.memory, mediumNode.memory),
                             cores=mediumNode.cores,
                             disk=largeNode.cores,
                             preemptable=False),
                         command=None,
                         jobName='testClusterScaling', unitName='')
            clusterScaler.addCompletedJob(iJ, random.randrange(1, 10))

        while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
            logger.debug("%i nodes currently provisioned", mock.getNumberOfNodes())
            # Make sure there are no large nodes
            self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
            clusterScaler.check()
            time.sleep(0.5)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Make sure jobs ran on both the small and medium node types
    self.assertTrue(mock.totalJobs > 0)
    self.assertTrue(mock.maxWorkers[smallNode] > 0)
    self.assertTrue(mock.maxWorkers[mediumNode] > 0)

    self.assertEqual(mock.maxWorkers[largeNode], 0)
def test(self):
    """
    This is a front-to-back test of the "happy" path in a job store, i.e. covering things
    that occur in the day-to-day life of a job store. The purist might insist that this be
    split up into several cases and I agree wholeheartedly.
    """
    master = self.master

    # Test initial state
    #
    self.assertFalse(master.exists('foo'))
    self.assertRaises(NoSuchJobException, master.load, 'foo')

    # Create parent job and verify its existence/properties
    #
    masterRequirements = dict(memory=12, cores=34, disk=35, preemptable=True)
    jobNodeOnMaster = JobNode(command='master1',
                              requirements=masterRequirements,
                              jobName='test1', unitName='onMaster',
                              jobStoreID=None, predecessorNumber=0)
    jobOnMaster = master.create(jobNodeOnMaster)
    self.assertTrue(master.exists(jobOnMaster.jobStoreID))
    self.assertEqual(jobOnMaster.command, 'master1')
    self.assertEqual(jobOnMaster.memory, masterRequirements['memory'])
    self.assertEqual(jobOnMaster.cores, masterRequirements['cores'])
    self.assertEqual(jobOnMaster.disk, masterRequirements['disk'])
    self.assertEqual(jobOnMaster.preemptable, masterRequirements['preemptable'])
    self.assertEqual(jobOnMaster.jobName, 'test1')
    self.assertEqual(jobOnMaster.unitName, 'onMaster')
    self.assertEqual(jobOnMaster.stack, [])
    self.assertEqual(jobOnMaster.predecessorNumber, 0)
    self.assertEqual(jobOnMaster.predecessorsFinished, set())
    self.assertEqual(jobOnMaster.logJobStoreFileID, None)

    # Create a second instance of the job store, simulating a worker ...
    #
    worker = self._createJobStore()
    worker.resume()
    self.assertEqual(worker.config, self.config)
    self.assertIsNot(worker.config, self.config)
    # ... and load the parent job there.
    jobOnWorker = worker.load(jobOnMaster.jobStoreID)
    self.assertEqual(jobOnMaster, jobOnWorker)

    # Update state on job
    #
    # The following demonstrates the job update pattern, where files to be deleted are
    # referenced in "filesToDelete" array, which is persisted to disk first. If things go
    # wrong during the update, this list of files to delete is used to remove the
    # unneeded files
    jobOnWorker.filesToDelete = ['1', '2']
    worker.update(jobOnWorker)
    # Check jobs to delete persisted
    self.assertEqual(master.load(jobOnWorker.jobStoreID).filesToDelete, ['1', '2'])
    # Create children
    childRequirements1 = dict(memory=23, cores=45, disk=46, preemptable=True)
    jobNodeOnChild1 = JobNode(command='child1',
                              requirements=childRequirements1,
                              jobName='test2', unitName='onChild1',
                              jobStoreID=None)
    childRequirements2 = dict(memory=34, cores=56, disk=57, preemptable=False)
    jobNodeOnChild2 = JobNode(command='master1',
                              requirements=childRequirements2,
                              jobName='test3', unitName='onChild2',
                              jobStoreID=None)
    child1 = worker.create(jobNodeOnChild1)
    child2 = worker.create(jobNodeOnChild2)
    # Update parent
    jobOnWorker.stack.append((child1, child2))
    jobOnWorker.filesToDelete = []
    worker.update(jobOnWorker)

    # Check equivalence between master and worker
    #
    self.assertNotEqual(jobOnWorker, jobOnMaster)
    # Reload parent job on master
    jobOnMaster = master.load(jobOnMaster.jobStoreID)
    self.assertEqual(jobOnWorker, jobOnMaster)
    # Load children on master and check equivalence
    self.assertEqual(master.load(child1.jobStoreID), child1)
    self.assertEqual(master.load(child2.jobStoreID), child2)

    # Test changing and persisting job state across multiple jobs
    #
    childJobs = [worker.load(childNode.jobStoreID) for childNode in jobOnMaster.stack[-1]]
    for childJob in childJobs:
        childJob.logJobStoreFileID = str(uuid.uuid4())
        childJob.remainingRetryCount = 66
        self.assertNotEqual(childJob, master.load(childJob.jobStoreID))
    for childJob in childJobs:
        worker.update(childJob)
    for childJob in childJobs:
        self.assertEqual(master.load(childJob.jobStoreID), childJob)
        self.assertEqual(worker.load(childJob.jobStoreID), childJob)

    # Test job iterator - the results of the iterator are affected by eventual
    # consistency. We cannot guarantee all jobs will appear but we can assert that all
    # jobs that show up are a subset of all existing jobs. If we had deleted jobs before
    # this we would have to worry about ghost jobs appearing and this assertion would not
    # be valid
    self.assertTrue(set(childJobs + [jobOnMaster]) >= set(worker.jobs()))
    self.assertTrue(set(childJobs + [jobOnMaster]) >= set(master.jobs()))

    # Test job deletions
    #
    # First delete parent, this should have no effect on the children
    self.assertTrue(master.exists(jobOnMaster.jobStoreID))
    self.assertTrue(worker.exists(jobOnMaster.jobStoreID))
    master.delete(jobOnMaster.jobStoreID)
    self.assertFalse(master.exists(jobOnMaster.jobStoreID))
    self.assertFalse(worker.exists(jobOnMaster.jobStoreID))

    for childJob in childJobs:
        self.assertTrue(master.exists(childJob.jobStoreID))
        self.assertTrue(worker.exists(childJob.jobStoreID))
        master.delete(childJob.jobStoreID)
        self.assertFalse(master.exists(childJob.jobStoreID))
        self.assertFalse(worker.exists(childJob.jobStoreID))
        self.assertRaises(NoSuchJobException, worker.load, childJob.jobStoreID)
        self.assertRaises(NoSuchJobException, master.load, childJob.jobStoreID)

    # Opening a stream on a shared file that was never written must fail.
    with self.assertRaises(NoSuchFileException):
        with master.readSharedFileStream('missing') as _:
            pass

    # Test shared files: Write shared file on master, ...
    #
    with master.writeSharedFileStream('foo') as f:
        f.write('bar')
    # ... read that file on worker, ...
    with worker.readSharedFileStream('foo') as f:
        self.assertEqual('bar', f.read())
    # ... and read it again on master.
    with master.readSharedFileStream('foo') as f:
        self.assertEqual('bar', f.read())

    with master.writeSharedFileStream('nonEncrypted', isProtected=False) as f:
        f.write('bar')
    self.assertUrl(master.getSharedPublicUrl('nonEncrypted'))
    self.assertRaises(NoSuchFileException, master.getSharedPublicUrl, 'missing')

    # Test per-job files: Create empty file on master, ...
    #
    # First recreate job
    jobOnMaster = master.create(jobNodeOnMaster)
    fileOne = worker.getEmptyFileStoreID(jobOnMaster.jobStoreID)
    # Check file exists
    self.assertTrue(worker.fileExists(fileOne))
    self.assertTrue(master.fileExists(fileOne))
    # ... write to the file on worker, ...
    with worker.updateFileStream(fileOne) as f:
        f.write('one')
    # ... read the file as a stream on the master, ....
    with master.readFileStream(fileOne) as f:
        self.assertEqual(f.read(), 'one')

    # ... and copy it to a temporary physical file on the master.
    fh, path = tempfile.mkstemp()
    try:
        os.close(fh)
        tmpPath = path + '.read-only'
        master.readFile(fileOne, tmpPath)
        try:
            shutil.copyfile(tmpPath, path)
        finally:
            os.unlink(tmpPath)
        with open(path, 'r+') as f:
            self.assertEqual(f.read(), 'one')
            # Write a different string to the local file ...
            f.seek(0)
            f.truncate(0)
            f.write('two')
        # ... and create a second file from the local file.
        fileTwo = master.writeFile(path, jobOnMaster.jobStoreID)
        with worker.readFileStream(fileTwo) as f:
            self.assertEqual(f.read(), 'two')
        # Now update the first file from the local file ...
        master.updateFile(fileOne, path)
        with worker.readFileStream(fileOne) as f:
            self.assertEqual(f.read(), 'two')
    finally:
        os.unlink(path)
    # Create a third file to test the last remaining method.
    with worker.writeFileStream(jobOnMaster.jobStoreID) as (f, fileThree):
        f.write('three')
    with master.readFileStream(fileThree) as f:
        self.assertEqual(f.read(), 'three')
    # Delete a file explicitly but leave files for the implicit deletion through the parent
    worker.deleteFile(fileOne)

    # Check the file is gone
    #
    for store in worker, master:
        self.assertFalse(store.fileExists(fileOne))
        self.assertRaises(NoSuchFileException, store.readFile, fileOne, '')
        with self.assertRaises(NoSuchFileException):
            with store.readFileStream(fileOne) as _:
                pass

    # Test stats and logging
    #
    # The callback looks up 'stats' each time it is called, so rebinding 'stats' below
    # makes it collect into the fresh set.
    stats = set()

    def callback(f2):
        stats.add(f2.read())

    self.assertEqual(0, master.readStatsAndLogging(callback))
    self.assertEqual(set(), stats)
    worker.writeStatsAndLogging('1')
    self.assertEqual(1, master.readStatsAndLogging(callback))
    self.assertEqual({'1'}, stats)
    self.assertEqual(0, master.readStatsAndLogging(callback))
    worker.writeStatsAndLogging('1')
    worker.writeStatsAndLogging('2')
    stats = set()
    self.assertEqual(2, master.readStatsAndLogging(callback))
    self.assertEqual({'1', '2'}, stats)
    largeLogEntry = os.urandom(self._largeLogEntrySize())
    stats = set()
    worker.writeStatsAndLogging(largeLogEntry)
    self.assertEqual(1, master.readStatsAndLogging(callback))
    self.assertEqual({largeLogEntry}, stats)

    # test the readAll parameter
    self.assertEqual(4, master.readStatsAndLogging(callback, readAll=True))

    # Delete parent
    #
    master.delete(jobOnMaster.jobStoreID)
    self.assertFalse(master.exists(jobOnMaster.jobStoreID))
def _testClusterScaling(self, config, numJobs, numPreemptableJobs):
    """
    Test the ClusterScaler class with different patterns of job creation. Tests ascertain
    that autoscaling occurs and that all the jobs are run.

    :param config: configuration handed to the mock batch system/provisioner and the scaler
    :param numJobs: number of non-preemptable jobs to add
    :param numPreemptableJobs: number of preemptable jobs to add
    """
    # First do simple test of creating preemptable and non-preemptable jobs and check the
    # jobs are completed okay, then print the amount of worker time expended and the total
    # number of worker nodes used.

    logger.info("Creating dummy batch system and scalar")

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ClusterScaler(mock, mock, config)
    clusterScaler.start()
    try:
        # Add the jobs to complete. Explicit loops are used instead of map(): the calls are
        # made purely for their side effect, and a bare map() is lazy on Python 3 and would
        # add no jobs at all.
        logger.info("Creating test jobs")
        for _ in range(numJobs):
            mock.addJob()
        for _ in range(numPreemptableJobs):
            mock.addJob(preemptable=True)

        # Add some completed jobs
        for preemptable in (True, False):
            if (preemptable and numPreemptableJobs > 0) or (not preemptable and numJobs > 0):
                # Add 1000 random jobs
                for _ in range(1000):
                    x = mock.getNodeShape(preemptable)
                    # randrange(1, n) picks from the same values as choice(range(1, n))
                    # without materializing the whole range.
                    iJ = JobNode(jobStoreID=1,
                                 requirements=dict(memory=random.randrange(1, x.memory),
                                                   cores=random.randrange(1, x.cores),
                                                   disk=random.randrange(1, x.disk),
                                                   preemptable=preemptable),
                                 command=None,
                                 jobName='testClusterScaling', unitName='')
                    clusterScaler.addCompletedJob(iJ, random.randrange(1, x.wallTime))

        logger.info("Waiting for jobs to be processed")
        startTime = time.time()
        # Wait while the cluster chunks through the jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.info("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                        "preemptable queue size: %s, preemptable workers: %s" %
                        (mock.getNumberOfJobsIssued(preemptable=False),
                         mock.getNumberOfNodes(preemptable=False),
                         mock.getNumberOfJobsIssued(preemptable=True),
                         mock.getNumberOfNodes(preemptable=True)))
            clusterScaler.check()
            time.sleep(0.5)
        logger.info("We waited %s for cluster to finish" % (time.time() - startTime))
    finally:
        clusterScaler.shutdown()

    # Print some info about the autoscaling
    for i, bs in enumerate(mock.delegates):
        # The delegate at index 0 is reported as non-preemptable, any later one as preemptable.
        preemptable = bool(i)
        logger.info("Preemptable: %s, Total-jobs: %s: Max-workers: %s,"
                    " Total-worker-time: %s, Worker-time-per-job: %s" %
                    (preemptable, bs.totalJobs, bs.maxWorkers, bs.totalWorkerTime,
                     bs.totalWorkerTime / bs.totalJobs if bs.totalJobs > 0 else 0.0))