def test_rate_limit_while_creating_job(self):
    # Make sure the rate limit cuts a user off.
    settings = util.settings(self.cook_url)
    if settings['rate-limit']['job-submission'] is None:
        pytest.skip("Can't test job submission rate limit without submission rate limit set.")
    if not settings['rate-limit']['job-submission']['enforce?']:
        pytest.skip("Enforcing must be on for test to run")
    user = self.user_factory.new_user()
    bucket_size = settings['rate-limit']['job-submission']['bucket-size']
    extra_size = replenishment_rate = settings['rate-limit']['job-submission']['tokens-replenished-per-minute']
    if extra_size < 100:
        extra_size = 100
    if bucket_size > 3000 or extra_size > 1000:
        pytest.skip("Job submission rate limit test would require making too many jobs to run the test.")
    with user:
        jobs_to_kill = []
        try:
            # First, empty most but not all of the token bucket.
            jobs1, resp1 = util.submit_jobs(self.cook_url, {}, bucket_size - 60)
            jobs_to_kill.extend(jobs1)
            self.assertEqual(resp1.status_code, 201)
            # Then submit extra_size + 60 more jobs to drive the bucket well below zero.
            jobs2, resp2 = util.submit_jobs(self.cook_url, {}, extra_size + 60)
            jobs_to_kill.extend(jobs2)
            self.assertEqual(resp2.status_code, 201)
            # And finally a request that gets cut off.
            jobs3, resp3 = util.submit_jobs(self.cook_url, {}, 10)
            self.assertEqual(resp3.status_code, 400)
            # The timestamp can change, so we should only match on the prefix.
            expected_prefix = f'User {user.name} is inserting too quickly. Not allowed to insert for'
            self.assertEqual(resp3.json()['error'][:len(expected_prefix)], expected_prefix)
            # Sleep long enough to earn back the deficit (roughly extra_size tokens, plus margin).
            time.sleep(70.0 * extra_size / replenishment_rate)
            jobs4, resp4 = util.submit_jobs(self.cook_url, {}, 10)
            jobs_to_kill.extend(jobs4)
            self.assertEqual(resp4.status_code, 201)
        finally:
            util.kill_jobs(self.cook_url, jobs_to_kill)
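
# Illustrative sketch (not part of the suite): the token-bucket arithmetic the
# test above relies on, assuming a standard token bucket that replenishes
# continuously and caps at bucket_size. After bucket_size + extra_size
# submissions the balance sits at -extra_size, and sleeping
# 70.0 * extra_size / replenishment_rate seconds earns back
# (70/60) * extra_size tokens, leaving the bucket positive again.
def token_balance(bucket_size, replenished_per_minute, submissions, sleep_seconds=0.0):
    """Return the bucket balance after a burst of submissions and an optional sleep."""
    balance = bucket_size - submissions
    # Tokens replenish continuously at replenished_per_minute per minute.
    balance += replenished_per_minute * sleep_seconds / 60.0
    # The bucket never holds more than bucket_size tokens.
    return min(balance, bucket_size)

# For example, with bucket_size=1000 and 100 tokens replenished per minute:
# token_balance(1000, 100, 1100) == -100, while
# token_balance(1000, 100, 1100, sleep_seconds=70.0 * 100 / 100) ≈ 16.7 > 0.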
def test_container_submit_no_image(self):
    """Test submitting a job with a port specification but no image."""
    settings_dict = util.settings(self.cook_url)
    if 'pools' not in settings_dict or 'default-containers' not in settings_dict['pools']:
        self.skipTest("Test requires default containers")
    JOB_PORT = 30030
    progress_file_env = util.retrieve_progress_file_env(type(self).cook_url)
    hostname_progress_cmd = util.progress_line(type(self).cook_url,
                                               50,  # Don't really care, we just need a val
                                               '$(hostname -I)',
                                               write_to_file=True)
    container = DockerContainer(port_mapping=[
        DockerPortMapping(host_port=0, container_port=JOB_PORT, protocol='tcp')
    ])
    uuid = self.client.submit(command=f'{hostname_progress_cmd} && nc -l -p {JOB_PORT} $(hostname -I)',
                              container=container,
                              env={progress_file_env: 'progress.txt'},
                              max_retries=5,
                              pool=util.default_submit_pool())
    addr = None
    try:
        util.wait_for_instance_with_progress(type(self).cook_url, str(uuid), 50)
        job = self.client.query(uuid)
        addr = job.instances[0].progress_message
        self.assertIsNotNone(addr)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect((addr, JOB_PORT))
            message = b"hello world!"
            self.assertEqual(sock.send(message), len(message))
    except Exception as e:
        if addr is not None:
            raise Exception(f"Could not connect to {addr}: {e}") from e
        else:
            raise
    finally:
        self.client.kill(uuid)
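
# Illustrative sketch (assumed helper, not part of util): the job reports its
# address via the progress message before `nc` is necessarily listening, so a
# more robust variant of the connect above could retry until the port accepts
# connections or a deadline passes.
import socket
import time

def wait_for_port(addr, port, timeout_seconds=30.0, poll_interval=1.0):
    """Poll until a TCP connect to (addr, port) succeeds or the deadline passes."""
    deadline = time.time() + timeout_seconds
    while True:
        try:
            # create_connection resolves the address and connects with a timeout.
            with socket.create_connection((addr, port), timeout=poll_interval):
                return True
        except OSError:
            # Connection refused or timed out; retry until the deadline.
            if time.time() > deadline:
                return False
            time.sleep(poll_interval)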
def trigger_preemption(self, pool):
    """
    Triggers preemption on the provided pool (which can be None) by doing the following:
    1. Choose a user, X
    2. Lower X's cpu share to 0.1 and cpu quota to 1.0
    3. Submit a job, J1, from X with 1.0 cpu and priority 99 (fills the cpu quota)
    4. Wait for J1 to start running
    5. Submit a job, J2, from X with 0.1 cpu and priority 100
    6. Wait until J1 is preempted (to make room for J2)
    """
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    all_job_uuids = []
    try:
        small_cpus = 0.1
        large_cpus = small_cpus * 10
        with admin:
            # Lower the user's cpu share and quota
            util.set_limit(self.cook_url, 'share', user.name, cpus=small_cpus, pool=pool)
            util.set_limit(self.cook_url, 'quota', user.name, cpus=large_cpus, pool=pool)
        with user:
            # Submit a large job that fills up the user's quota
            base_priority = 99
            command = 'sleep 600'
            uuid_large, _ = util.submit_job(self.cook_url, priority=base_priority,
                                            cpus=large_cpus, command=command, pool=pool)
            all_job_uuids.append(uuid_large)
            util.wait_for_running_instance(self.cook_url, uuid_large)

            # Submit a higher-priority job that should trigger preemption
            uuid_high_priority, _ = util.submit_job(self.cook_url, priority=base_priority + 1,
                                                    cpus=small_cpus, command=command,
                                                    name='higher_priority_job', pool=pool)
            all_job_uuids.append(uuid_high_priority)

            # Assert that the lower-priority job was preempted
            def low_priority_job():
                job = util.load_job(self.cook_url, uuid_large)
                one_hour_in_millis = 60 * 60 * 1000
                start = util.current_milli_time() - one_hour_in_millis
                end = util.current_milli_time()
                running = util.jobs(self.cook_url, user=user.name, state='running',
                                    start=start, end=end).json()
                waiting = util.jobs(self.cook_url, user=user.name, state='waiting',
                                    start=start, end=end).json()
                self.logger.info(f'Currently running jobs: {json.dumps(running, indent=2)}')
                self.logger.info(f'Currently waiting jobs: {json.dumps(waiting, indent=2)}')
                return job

            def job_was_preempted(job):
                for instance in job['instances']:
                    self.logger.debug(f'Checking if instance was preempted: {instance}')
                    if instance.get('reason_string') == 'Preempted by rebalancer':
                        return True
                self.logger.info(f'Job has not been preempted: {job}')
                return False

            max_wait_ms = util.settings(self.cook_url)['rebalancer']['interval-seconds'] * 1000 * 1.5
            self.logger.info(f'Waiting up to {max_wait_ms} milliseconds for preemption to happen')
            util.wait_until(low_priority_job, job_was_preempted,
                            max_wait_ms=max_wait_ms, wait_interval_ms=5000)
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
            util.reset_limit(self.cook_url, 'share', user.name, reason=self.current_name(), pool=pool)
            util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name(), pool=pool)
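
# Hypothetical usage (illustrative only, not from the original suite): because
# trigger_preemption is parameterized on pool, a pool-specific preemption test
# can simply delegate to it, just as test_preemption below does for the
# pool-less case. util.default_submit_pool() is the same helper used in
# test_container_submit_no_image above.
def test_preemption_in_default_pool(self):
    self.trigger_preemption(pool=util.default_submit_pool())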
def test_rate_limit_launching_jobs(self):
    settings = util.settings(self.cook_url)
    if settings['rate-limit']['job-launch'] is None:
        pytest.skip("Can't test job launch rate limit without launch rate limit set.")
    # Allow an environment variable to override the test user.
    name = os.getenv('COOK_LAUNCH_RATE_LIMIT_NAME')
    if name is not None:
        user = self.user_factory.user_class(name)
    else:
        user = self.user_factory.new_user()
    if not settings['rate-limit']['job-launch']['enforce?']:
        pytest.skip("Enforcing must be on for test to run")
    bucket_size = settings['rate-limit']['job-launch']['bucket-size']
    token_rate = settings['rate-limit']['job-launch']['tokens-replenished-per-minute']
    # In some environments, e.g., minimesos, we can only launch so many concurrent jobs.
    if token_rate < 5 or token_rate > 20:
        pytest.skip("Job launch rate limit test is only validated to reliably work correctly with certain token rates.")
    if bucket_size < 10 or bucket_size > 20:
        pytest.skip("Job launch rate limit test is only validated to reliably work correctly with certain token bucket sizes.")
    with user:
        job_uuids = []
        try:
            jobspec = {"command": "sleep 240", 'cpus': 0.03, 'mem': 32}
            self.logger.info(f'Submitting initial batch of {bucket_size - 1} jobs')
            initial_uuids, initial_response = util.submit_jobs(self.cook_url, jobspec, bucket_size - 1)
            job_uuids.extend(initial_uuids)
            self.assertEqual(201, initial_response.status_code, msg=initial_response.content)

            def submit_jobs():
                self.logger.info(f'Submitting subsequent batch of {bucket_size - 1} jobs')
                subsequent_uuids, subsequent_response = util.submit_jobs(self.cook_url, jobspec, bucket_size - 1)
                job_uuids.extend(subsequent_uuids)
                self.assertEqual(201, subsequent_response.status_code, msg=subsequent_response.content)

            def is_rate_limit_triggered(_):
                jobs1 = util.query_jobs(self.cook_url, True, uuid=job_uuids).json()
                waiting_jobs = [j for j in jobs1 if j['status'] == 'waiting']
                running_jobs = [j for j in jobs1 if j['status'] == 'running']
                self.logger.debug(f'There are {len(waiting_jobs)} waiting jobs')
                # We submitted just under two buckets. We should see a bucket's worth running,
                # plus a little replenishment, and no more; the rest should still be waiting.
                return (len(running_jobs) >= bucket_size
                        and len(running_jobs) < (bucket_size + token_rate / 2)
                        and len(waiting_jobs) > 0)

            util.wait_until(submit_jobs, is_rate_limit_triggered)
            jobs2 = util.query_jobs(self.cook_url, True, uuid=job_uuids).json()
            running_jobs = [j for j in jobs2 if j['status'] == 'running']
            self.assertEqual(len(running_jobs), bucket_size)
        finally:
            util.kill_jobs(self.cook_url, job_uuids)
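
# Illustrative arithmetic (assuming a standard token bucket governs launches,
# as in the submission-rate test above): a full bucket lets bucket_size jobs
# launch immediately, and while the predicate polls, replenishment adds
# token_rate tokens per minute. The strict upper bound of
# bucket_size + token_rate / 2 therefore tolerates roughly 30 seconds of
# polling before the check would spuriously fail.
def max_expected_running(bucket_size, token_rate, elapsed_seconds):
    """Upper bound on launched jobs after elapsed_seconds of replenishment."""
    return bucket_size + token_rate * elapsed_seconds / 60.0

# e.g. with bucket_size=15 and token_rate=10, max_expected_running(15, 10, 30.0)
# is 20.0, matching the bucket_size + token_rate / 2 bound used above.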
def test_preemption(self):
    # The body of this test is the pool-less case of trigger_preemption above,
    # so delegate to that helper rather than duplicating it.
    self.trigger_preemption(pool=None)