def testMaxNodes(self):
    """
    Set the scaler to be very aggressive, give it a ton of jobs, and
    make sure it doesn't go over maxNodes.
    """
    self.config.targetTime = 1
    self.config.betaInertia = 0.0
    self.config.maxNodes = [2, 3]
    scaler = ClusterScaler(self.provisioner, self.leader, self.config)
    jobShapes = [Shape(wallTime=3600,
                       cores=2,
                       memory=h2b('1G'),
                       disk=h2b('2G'),
                       preemptable=True)] * 1000
    jobShapes.extend([Shape(wallTime=3600,
                            cores=2,
                            memory=h2b('1G'),
                            disk=h2b('2G'),
                            preemptable=False)] * 1000)
    estimatedNodeCounts = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int))
    self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2)
    self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptable], 3)

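# A minimal sketch (not part of the original test) of the clamping behaviour the
# assertions above pin down, assuming each maxNodes entry caps the node type in
# the matching position: however many nodes the bin-packer would like, the
# per-type estimate never exceeds its cap. The helper below is hypothetical and
# for illustration only.
def _capped_estimate_sketch(uncapped_estimate: int, max_nodes_for_type: int) -> int:
    # e.g. _capped_estimate_sketch(1000, 2) == 2, matching the r3_8xlarge assertion
    return min(uncapped_estimate, max_nodes_for_type)
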
def testPathologicalCase(self):
    """Test a pathological case where only one node can be requested to fit months' worth of jobs.

    If the reservation is extended to fit a long job, and the bin-packer
    naively searches through all the reservation slices to find the first
    slice that fits, it will happily assign the first slot that fits the
    job, even if that slot occurs days in the future.
    """
    # Add one job that partially fills an r3.8xlarge for 1000 hours
    self.bpf.addJobShape(Shape(wallTime=3600000,
                               memory=h2b('10G'),
                               cores=0,
                               disk=h2b('10G'),
                               preemptable=False))
    for _ in range(500):
        # Add 500 CPU-hours worth of jobs that fill an r3.8xlarge
        self.bpf.addJobShape(Shape(wallTime=3600,
                                   memory=h2b('26G'),
                                   cores=32,
                                   disk=h2b('60G'),
                                   preemptable=False))
    # Hopefully we didn't assign just one node to cover all those jobs.
    self.assertNotEqual(self.bpf.getRequiredNodes(),
                        {r3_8xlarge: 1, c4_8xlarge_preemptable: 0})

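# Illustrative sketch only (a hypothetical helper, not Toil's bin-packer): the
# naive behaviour the docstring warns about is a plain first-fit scan over a
# node's time slices, which can place a short job days into the future instead
# of requesting another node.
def _naive_first_fit_sketch(slice_free_cores: list, cores_needed: int) -> int:
    """Return the index of the first slice with room, however far out it is."""
    for i, free in enumerate(slice_free_cores):
        if free >= cores_needed:
            return i
    return -1  # no slice fits; a new node would be needed
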
def testJobTooLargeForAllNodes(self):
    """
    If a job is too large for all node types, the scaler should print a
    warning, but definitely not crash.
    """
    # Takes more RAM than an r3.8xlarge
    largerThanR3 = Shape(wallTime=3600,
                         memory=h2b('360G'),
                         cores=32,
                         disk=h2b('600G'),
                         preemptable=False)
    self.bpf.addJobShape(largerThanR3)

def testLongRunningJobs(self):
    """
    Test that jobs with long run times (especially service jobs) are
    aggressively parallelized.

    This is important, because services are one case where the degree of
    parallelization really, really matters. If you have multiple services,
    they may all need to be running simultaneously before any real work can
    be done.

    Despite setting globalTargetTime=3600, this should launch 1000 t2.micros
    because each job's estimated runtime (30000 seconds) extends well beyond
    3600 seconds.
    """
    allocation = self.run1000JobsOnMicros(jobCores=1,
                                          jobMem=h2b('1G'),
                                          jobDisk=h2b('1G'),
                                          jobTime=30000,
                                          globalTargetTime=3600)
    self.assertEqual(allocation, {t2_micro: 1000})

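# A back-of-the-envelope sketch (illustration only, not used by the test) of the
# expected fan-out: a 30000-second job overruns the 3600-second target on its
# own, so no second job can share a single-core micro within the target, and
# 1000 jobs therefore need 1000 nodes. The helper name is hypothetical.
def _long_job_fanout_sketch(num_jobs: int = 1000, job_time: int = 30000,
                            target_time: int = 3600) -> int:
    import math
    jobs_per_node = max(1, target_time // job_time)  # floored at one job per node
    return math.ceil(num_jobs / jobs_per_node)       # == 1000 here
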
def testHighTargetTime(self):
    """
    Test that a high targetTime (3600 seconds) maximizes packing within the targetTime.

    Ideally, high targetTime means: Maximize packing within the targetTime
    after the cpu/disk/mem have been packed.

    Disk/cpu/mem packing is prioritized first, so we set job resource reqs so
    that each t2.micro (1 cpu/8G disk/1G RAM) can only run one job at a time
    with its resources.

    Each job is parametrized to take 300 seconds, so 12 of them should fit
    into each node's 3600 second window. 1000/12 = 83.33, so we expect 84 nodes.
    """
    allocation = self.run1000JobsOnMicros(jobCores=1,
                                          jobMem=h2b('1G'),
                                          jobDisk=h2b('1G'),
                                          jobTime=300,
                                          globalTargetTime=3600)
    self.assertEqual(allocation, {t2_micro: 84})

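# A worked check of the docstring's arithmetic (illustration only, not used by
# the test): twelve 300-second jobs fill a 3600-second window, and
# ceil(1000 / 12) == 84. The helper below is hypothetical.
def _high_target_time_expectation_sketch() -> int:
    import math
    jobs, window, job_time = 1000, 3600, 300
    return math.ceil(jobs / (window // job_time))  # ceil(1000 / 12) == 84
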
def testPreemptableDeficitResponse(self):
    """
    When a preemptable deficit was detected by a previous run of the loop,
    the scaler should add non-preemptable nodes to compensate in proportion
    to preemptableCompensation.
    """
    self.config.targetTime = 1
    self.config.betaInertia = 0.0
    self.config.maxNodes = [10, 10]
    # This should mean that one non-preemptable node is launched
    # for every two preemptable nodes "missing".
    self.config.preemptableCompensation = 0.5
    # In this case, we want to explicitly set up the config so that we can
    # have preemptable and non-preemptable nodes of the same type. That is
    # the only situation where preemptableCompensation applies.
    self.config.nodeTypes = [c4_8xlarge_preemptable, c4_8xlarge]
    self.provisioner.setAutoscaledNodeTypes([({t}, None) for t in self.config.nodeTypes])
    scaler = ClusterScaler(self.provisioner, self.leader, self.config)
    # Simulate a situation where a previous run caused a "deficit" of 5
    # preemptable nodes (e.g. a spot bid was lost)
    scaler.preemptableNodeDeficit[c4_8xlarge] = 5
    # Add a bunch of preemptable jobs (so the bin-packing estimate for the
    # non-preemptable node should still be 0)
    jobShapes = [Shape(wallTime=3600,
                       cores=2,
                       memory=h2b('1G'),
                       disk=h2b('2G'),
                       preemptable=True)] * 1000
    estimatedNodeCounts = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int))
    # We don't care about the estimated size of the preemptable nodes. All we
    # want to know is if we responded to the deficit properly:
    # preemptableCompensation * the deficit = 0.5 * 5 = 2.5, rounded up to 3.
    self.assertEqual(estimatedNodeCounts[self.provisioner.node_shapes_for_testing[1]], 3)

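# A worked check of the compensation arithmetic above (illustration only, not
# used by the test): a deficit of 5 preemptable nodes at
# preemptableCompensation=0.5 yields ceil(5 * 0.5) == 3 non-preemptable nodes.
# The helper below is hypothetical.
def _deficit_compensation_sketch(deficit: int = 5, compensation: float = 0.5) -> int:
    import math
    return math.ceil(deficit * compensation)  # ceil(2.5) == 3
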
def human2bytes(s: str) -> int:
    """
    Attempt to guess the string format based on the default symbol set and
    return the corresponding number of bytes as an integer.

    A ValueError is raised when the format cannot be recognized.
    """
    logger.warning('Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead.')
    return h2b(s)

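# A hedged usage sketch of the suggested replacement (assuming the conversion
# helper accepts the same human-readable size strings as this deprecated
# wrapper). The helper below is hypothetical and for illustration only.
def _human2bytes_usage_sketch() -> int:
    from toil.lib.conversions import human2bytes as _h2b
    return _h2b('2G')  # parse a human-readable size string into bytes
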
def testLowTargetTime(self):
    """
    Test that a low targetTime (0) parallelizes jobs aggressively (1000 queued
    jobs require 1000 nodes).

    Ideally, low targetTime means: Start quickly and maximize parallelization
    after the cpu/disk/mem have been packed.

    Disk/cpu/mem packing is prioritized first, so we set job resource reqs so
    that each t2.micro (1 cpu/8G disk/1G RAM) can only run one job at a time
    with its resources.

    Each job is parametrized to take 300 seconds, so (the minimum of) 1 of
    them should fit into each node's 0 second window, so we expect 1000 nodes.
    """
    allocation = self.run1000JobsOnMicros(jobCores=1,
                                          jobMem=h2b('1G'),
                                          jobDisk=h2b('1G'),
                                          jobTime=300,
                                          globalTargetTime=0)
    self.assertEqual(allocation, {t2_micro: 1000})

def testPackingOneShape(self):
    """Pack one shape and check that the resulting reservations look sane."""
    self.bpf.nodeReservations[c4_8xlarge_preemptable] = [NodeReservation(c4_8xlarge_preemptable)]
    self.bpf.addJobShape(Shape(wallTime=1000,
                               cores=2,
                               memory=h2b('1G'),
                               disk=h2b('2G'),
                               preemptable=True))
    self.assertEqual(self.bpf.nodeReservations[r3_8xlarge], [])
    self.assertEqual([x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptable]],
                     [[Shape(wallTime=1000,
                             memory=h2b('59G'),
                             cores=34,
                             disk=h2b('98G'),
                             preemptable=True),
                       Shape(wallTime=2600,
                             memory=h2b('60G'),
                             cores=36,
                             disk=h2b('100G'),
                             preemptable=True)]])

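# A worked check of the expected reservation shapes above (illustration only,
# not used by the test): the first 1000-second slice has the job's resources
# subtracted from the simplified c4.8xlarge shape, and the remaining
# 3600 - 1000 = 2600 seconds stay at full capacity. The helper is hypothetical.
def _reservation_arithmetic_sketch() -> tuple:
    node_mem_gib, node_cores, node_disk_gib, node_wall = 60, 36, 100, 3600
    job_mem_gib, job_cores, job_disk_gib, job_wall = 1, 2, 2, 1000
    used_slice = (node_mem_gib - job_mem_gib,    # 59G of memory left
                  node_cores - job_cores,        # 34 cores left
                  node_disk_gib - job_disk_gib)  # 98G of disk left
    free_slice_wall = node_wall - job_wall       # 2600 seconds at full capacity
    return used_slice, free_slice_wall
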
def testAddingInitialNode(self):
    """Pack one shape when no nodes are available and confirm that we fit one node properly."""
    self.bpf.addJobShape(Shape(wallTime=1000,
                               cores=2,
                               memory=h2b('1G'),
                               disk=h2b('2G'),
                               preemptable=True))
    self.assertEqual([x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptable]],
                     [[Shape(wallTime=1000,
                             memory=h2b('59G'),
                             cores=34,
                             disk=h2b('98G'),
                             preemptable=True),
                       Shape(wallTime=2600,
                             memory=h2b('60G'),
                             cores=36,
                             disk=h2b('100G'),
                             preemptable=True)]])

                                                   NodeInfo)
from toil.common import Config, defaultTargetTime
from toil.job import JobDescription
from toil.lib.conversions import human2bytes as h2b
from toil.provisioners import parse_node_types
from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape
from toil.provisioners.clusterScaler import (BinPackedFit,
                                             ClusterScaler,
                                             NodeReservation,
                                             ScalerThread)
from toil.provisioners.node import Node
from toil.test import ToilTest, slow, travis_test

logger = logging.getLogger(__name__)

# simplified c4.8xlarge (preemptable)
c4_8xlarge_preemptable = Shape(wallTime=3600,
                               memory=h2b('60G'),
                               cores=36,
                               disk=h2b('100G'),
                               preemptable=True)
# simplified c4.8xlarge (non-preemptable)
c4_8xlarge = Shape(wallTime=3600,
                   memory=h2b('60G'),
                   cores=36,
                   disk=h2b('100G'),
                   preemptable=False)
# simplified r3.8xlarge (non-preemptable)
r3_8xlarge = Shape(wallTime=3600,
                   memory=h2b('260G'),
                   cores=32,
                   disk=h2b('600G'),
                   preemptable=False)