def wait_for_output_file(cook_url, job_uuid, name):
    """Waits for a file with the given name for the given job to exist"""

    def query():
        cp, _ = ls(job_uuid, cook_url, parse_json=False)
        return json.loads(stdout(cp)) if cp.returncode == 0 else []

    def predicate(entries):
        logging.debug(f'Job {job_uuid} has entries {entries}')
        return ls_entry_by_name(entries, name)

    response = util.wait_until(query, predicate)
    return response
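
# The tests below repeatedly rely on util.wait_until(query, predicate, max_wait_ms, wait_interval_ms),
# which polls `query` until `predicate` accepts the result or the timeout elapses. The function below is
# a minimal illustrative sketch of that polling pattern only; it is an assumption, not Cook's actual
# util.wait_until implementation (the real helper lives in the tests' util module).
def _wait_until_sketch(query, predicate, max_wait_ms=120000, wait_interval_ms=1000):
    """Polls `query` until `predicate(result)` is truthy, returning the final result."""
    deadline = time.time() + max_wait_ms / 1000.0
    while True:
        result = query()
        if predicate(result):
            return result
        if time.time() >= deadline:
            raise AssertionError(f'Timed out waiting for predicate; last result: {result}')
        time.sleep(wait_interval_ms / 1000.0)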
def test_dynamic_clusters(self):
    """
    Test that dynamic cluster configuration functionality is working.
    """
    docker_image = util.docker_image()
    container = {'type': 'docker', 'docker': {'image': docker_image}}
    admin = self.user_factory.admin()
    # Force all clusters to have state = deleted via the API
    clusters = [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                if cluster["state"] == "running"]
    with admin:
        self.logger.info(f'Clusters {clusters}')
        # First state = draining
        for cluster in clusters:
            cluster["state"] = "draining"
            cluster["state-locked?"] = True
            self.logger.info(f'Trying to update cluster {cluster}')
            data, resp = util.update_compute_cluster(self.cook_url, cluster)
            self.assertEqual(201, resp.status_code, resp.content)
        # Then state = deleted
        for cluster in clusters:
            cluster["state"] = "deleted"
            util.wait_until(lambda: util.update_compute_cluster(self.cook_url, cluster),
                            lambda x: 201 == x[1].status_code,
                            300000, 5000)
        # Create at least one new cluster with a unique test name (using one of the existing cluster's IP and cert)
        test_cluster_name = f'test_cluster_{round(time.time() * 1000)}'
        test_cluster = {
            "name": test_cluster_name,
            "state": "running",
            "base-path": clusters[0]["base-path"],
            "ca-cert": clusters[0]["ca-cert"],
            "template": clusters[0]["template"]
        }
        data, resp = util.create_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(201, resp.status_code, resp.content)
        # Test create cluster with duplicate name
        data, resp = util.create_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(422, resp.status_code, resp.content)
        self.assertEqual(f'Compute cluster with name {test_cluster_name} already exists',
                         data['error']['message'],
                         resp.content)

    # Check that a job schedules successfully
    command = "true"
    job_uuid, resp = util.submit_job(self.cook_url, command=command, container=container)
    self.assertEqual(201, resp.status_code, resp.content)
    instance = util.wait_for_instance(self.cook_url, job_uuid)
    message = repr(instance)
    self.assertIsNotNone(instance['compute-cluster'], message)
    instance_compute_cluster_name = instance['compute-cluster']['name']
    self.assertEqual(test_cluster["name"], instance_compute_cluster_name, instance['compute-cluster'])
    util.wait_for_instance(self.cook_url, job_uuid, status='success')
    running_clusters = [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                        if cluster["state"] == "running"]
    self.assertEqual(1, len(running_clusters), running_clusters)
    self.assertEqual(test_cluster["name"], running_clusters[0]["name"], running_clusters)

    with admin:
        # Delete test cluster
        # First state = draining
        test_cluster["state"] = "draining"
        data, resp = util.update_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(201, resp.status_code, resp.content)
        # Then state = deleted
        test_cluster["state"] = "deleted"
        util.wait_until(lambda: util.update_compute_cluster(self.cook_url, test_cluster),
                        lambda x: 201 == x[1].status_code,
                        300000, 5000)
        # Hard-delete the original non-test clusters
        for cluster in clusters:
            self.logger.info(f'Trying to delete cluster {cluster}')
            resp = util.delete_compute_cluster(self.cook_url, cluster)
            self.assertEqual(204, resp.status_code, resp.content)
        # Force give up leadership
        resp = util.shutdown_leader(self.cook_url, "test_dynamic_clusters")
        self.assertEqual(b'Accepted', resp)

    # Old clusters should be re-created
    # wait for cook to come up
    util.wait_until(lambda: [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                             if cluster["state"] == "running"],
                    lambda x: len(x) == len(clusters),
                    420000, 5000)
    # Check that a job schedules successfully
    command = "true"
    job_uuid, resp = util.submit_job(self.cook_url, command=command, container=container)
    self.assertEqual(201, resp.status_code, resp.content)
    util.wait_for_instance(self.cook_url, job_uuid, status='success')

    with admin:
        # Hard-delete test cluster
        resp = util.delete_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(204, resp.status_code, resp.content)
def test_checkpoint_locality(self):
    """
    Test that restored instances run in the same location as their checkpointed instances.
    """
    # Get the set of clusters that correspond to the pool under test and are running
    pool = util.default_submit_pool()
    clusters = util.compute_clusters(self.cook_url)
    running_clusters = [c for c in clusters['in-mem-configs']
                        if pool in c['cluster-definition']['config']['synthetic-pods']['pools']
                        and c['state'] == 'running']
    self.logger.info(f'Running clusters for pool {pool}: {running_clusters}')
    if len(running_clusters) == 0:
        self.skipTest(f'Requires at least 1 running compute cluster for pool {pool}')

    # Submit an initial canary job
    job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
    self.assertEqual(201, resp.status_code, resp.content)
    util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)

    # Submit a long-running job with checkpointing
    checkpoint_job_uuid, resp = util.submit_job(self.cook_url, pool=pool,
                                                command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                                                max_retries=5,
                                                checkpoint={'mode': 'auto'})
    self.assertEqual(201, resp.status_code, resp.content)
    try:
        # Wait for the job to be running
        checkpoint_instance = util.wait_for_instance(self.cook_url, checkpoint_job_uuid,
                                                     status='running', indent=None)
        checkpoint_instance_uuid = checkpoint_instance['task_id']
        checkpoint_location = next(c['location'] for c in running_clusters
                                   if c['name'] == checkpoint_instance['compute-cluster']['name'])
        admin = self.user_factory.admin()
        try:
            # Force all clusters in the instance's location to have state = draining
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = draining
                        cluster_update['state'] = 'draining'
                        cluster_update['state-locked?'] = True
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to draining: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

            # Kill the running checkpoint job instance
            util.kill_instance(self.cook_url, checkpoint_instance_uuid)

            # Submit another canary job
            job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
            self.assertEqual(201, resp.status_code, resp.content)

            cluster_locations = set(c['location'] for c in running_clusters)
            if len(cluster_locations) > 1:
                # The canary job should run in the non-draining location
                self.logger.info(f'There are > 1 cluster locations under test: {cluster_locations}')
                util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)
            else:
                self.logger.info(f'There is only 1 cluster location under test: {cluster_locations}')

            # The checkpoint job should be waiting
            util.wait_for_instance(self.cook_url, checkpoint_job_uuid, status='failed', indent=None)
            util.wait_for_job_in_statuses(self.cook_url, checkpoint_job_uuid, ['waiting'])
        finally:
            # Revert all clusters in the instance's location to state = running
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = running
                        cluster_update['state'] = 'running'
                        cluster_update['state-locked?'] = False
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to running: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

        # Wait for the checkpoint job to be running again, in the same location as before
        checkpoint_instance = util.wait_for_instance(self.cook_url, checkpoint_job_uuid,
                                                     status='running', indent=None)
        self.assertEqual(checkpoint_location,
                         next(c['location'] for c in running_clusters
                              if c['name'] == checkpoint_instance['compute-cluster']['name']))
    finally:
        # Kill the checkpoint job to not leave it running
        util.kill_jobs(self.cook_url, [checkpoint_job_uuid])
def trigger_preemption(self, pool):
    """
    Triggers preemption on the provided pool (which can be None) by doing the following:

    1. Choose a user, X
    2. Lower X's cpu share to 0.1 and cpu quota to 1.0
    3. Submit a job, J1, from X with 1.0 cpu and priority 99 (fills the cpu quota)
    4. Wait for J1 to start running
    5. Submit a job, J2, from X with 0.1 cpu and priority 100
    6. Wait until J1 is preempted (to make room for J2)
    """
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    all_job_uuids = []
    try:
        small_cpus = 0.1
        large_cpus = small_cpus * 10
        with admin:
            # Lower the user's cpu share and quota
            util.set_limit(self.cook_url, 'share', user.name, cpus=small_cpus, pool=pool)
            util.set_limit(self.cook_url, 'quota', user.name, cpus=large_cpus, pool=pool)
        with user:
            # Submit a large job that fills up the user's quota
            base_priority = 99
            command = 'sleep 600'
            uuid_large, _ = util.submit_job(self.cook_url, priority=base_priority,
                                            cpus=large_cpus, command=command, pool=pool)
            all_job_uuids.append(uuid_large)
            util.wait_for_running_instance(self.cook_url, uuid_large)

            # Submit a higher-priority job that should trigger preemption
            uuid_high_priority, _ = util.submit_job(self.cook_url, priority=base_priority + 1,
                                                    cpus=small_cpus, command=command,
                                                    name='higher_priority_job', pool=pool)
            all_job_uuids.append(uuid_high_priority)

            # Assert that the lower-priority job was preempted
            def low_priority_job():
                job = util.load_job(self.cook_url, uuid_large)
                one_hour_in_millis = 60 * 60 * 1000
                start = util.current_milli_time() - one_hour_in_millis
                end = util.current_milli_time()
                running = util.jobs(self.cook_url, user=user.name, state='running',
                                    start=start, end=end).json()
                waiting = util.jobs(self.cook_url, user=user.name, state='waiting',
                                    start=start, end=end).json()
                self.logger.info(f'Currently running jobs: {json.dumps(running, indent=2)}')
                self.logger.info(f'Currently waiting jobs: {json.dumps(waiting, indent=2)}')
                return job

            def job_was_preempted(job):
                for instance in job['instances']:
                    self.logger.debug(f'Checking if instance was preempted: {instance}')
                    if instance.get('reason_string') == 'Preempted by rebalancer':
                        return True
                self.logger.info(f'Job has not been preempted: {job}')
                return False

            max_wait_ms = util.settings(self.cook_url)['rebalancer']['interval-seconds'] * 1000 * 1.5
            self.logger.info(f'Waiting up to {max_wait_ms} milliseconds for preemption to happen')
            util.wait_until(low_priority_job, job_was_preempted,
                            max_wait_ms=max_wait_ms, wait_interval_ms=5000)
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
            util.reset_limit(self.cook_url, 'share', user.name, reason=self.current_name(), pool=pool)
            util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name(), pool=pool)
def test_rate_limit_launching_jobs(self):
    settings = util.settings(self.cook_url)
    if settings['rate-limit']['job-launch'] is None:
        pytest.skip("Can't test job launch rate limit without launch rate limit set.")

    # Allow an environmental variable override.
    name = os.getenv('COOK_LAUNCH_RATE_LIMIT_NAME')
    if name is not None:
        user = self.user_factory.user_class(name)
    else:
        user = self.user_factory.new_user()

    if not settings['rate-limit']['job-launch']['enforce?']:
        pytest.skip("Enforcing must be on for test to run")
    bucket_size = settings['rate-limit']['job-launch']['bucket-size']
    token_rate = settings['rate-limit']['job-launch']['tokens-replenished-per-minute']
    # In some environments, e.g., minimesos, we can only launch so many concurrent jobs.
    if token_rate < 5 or token_rate > 20:
        pytest.skip("Job launch rate limit test is only validated to reliably work correctly with certain token rates.")
    if bucket_size < 10 or bucket_size > 20:
        pytest.skip("Job launch rate limit test is only validated to reliably work correctly with certain token bucket sizes.")

    with user:
        job_uuids = []
        try:
            jobspec = {"command": "sleep 240", 'cpus': 0.03, 'mem': 32}

            self.logger.info(f'Submitting initial batch of {bucket_size - 1} jobs')
            initial_uuids, initial_response = util.submit_jobs(self.cook_url, jobspec, bucket_size - 1)
            job_uuids.extend(initial_uuids)
            self.assertEqual(201, initial_response.status_code, msg=initial_response.content)

            def submit_jobs():
                self.logger.info(f'Submitting subsequent batch of {bucket_size - 1} jobs')
                subsequent_uuids, subsequent_response = util.submit_jobs(self.cook_url, jobspec, bucket_size - 1)
                job_uuids.extend(subsequent_uuids)
                self.assertEqual(201, subsequent_response.status_code, msg=subsequent_response.content)

            def is_rate_limit_triggered(_):
                jobs1 = util.query_jobs(self.cook_url, True, uuid=job_uuids).json()
                waiting_jobs = [j for j in jobs1 if j['status'] == 'waiting']
                running_jobs = [j for j in jobs1 if j['status'] == 'running']
                self.logger.debug(f'There are {len(waiting_jobs)} waiting jobs')
                # We submitted just under two buckets. We should only see a bucket + some extra running. No more.
                return (len(running_jobs) >= bucket_size
                        and len(running_jobs) < (bucket_size + token_rate / 2)
                        and len(waiting_jobs) > 0)

            util.wait_until(submit_jobs, is_rate_limit_triggered)
            jobs2 = util.query_jobs(self.cook_url, True, uuid=job_uuids).json()
            running_jobs = [j for j in jobs2 if j['status'] == 'running']
            self.assertEqual(len(running_jobs), bucket_size)
        finally:
            util.kill_jobs(self.cook_url, job_uuids)
def test_preemption(self):
    """Test that a higher-priority job from the same user preempts a lower-priority one via the rebalancer."""
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    all_job_uuids = []
    try:
        small_cpus = 0.1
        large_cpus = small_cpus * 10
        with admin:
            # Lower the user's cpu share and quota
            util.set_limit(self.cook_url, 'share', user.name, cpus=small_cpus)
            util.set_limit(self.cook_url, 'quota', user.name, cpus=large_cpus)
        with user:
            # Submit a large job that fills up the user's quota
            base_priority = 99
            command = 'sleep 600'
            uuid_large, _ = util.submit_job(self.cook_url, priority=base_priority,
                                            cpus=large_cpus, command=command)
            all_job_uuids.append(uuid_large)
            util.wait_for_running_instance(self.cook_url, uuid_large)

            # Submit a higher-priority job that should trigger preemption
            uuid_high_priority, _ = util.submit_job(self.cook_url, priority=base_priority + 1,
                                                    cpus=small_cpus, command=command,
                                                    name='higher_priority_job')
            all_job_uuids.append(uuid_high_priority)

            # Assert that the lower-priority job was preempted
            def low_priority_job():
                job = util.load_job(self.cook_url, uuid_large)
                one_hour_in_millis = 60 * 60 * 1000
                start = util.current_milli_time() - one_hour_in_millis
                end = util.current_milli_time()
                running = util.jobs(self.cook_url, user=user.name, state='running',
                                    start=start, end=end).json()
                waiting = util.jobs(self.cook_url, user=user.name, state='waiting',
                                    start=start, end=end).json()
                self.logger.info(f'Currently running jobs: {json.dumps(running, indent=2)}')
                self.logger.info(f'Currently waiting jobs: {json.dumps(waiting, indent=2)}')
                return job

            def job_was_preempted(job):
                for instance in job['instances']:
                    self.logger.debug(f'Checking if instance was preempted: {instance}')
                    if instance.get('reason_string') == 'Preempted by rebalancer':
                        return True
                self.logger.info(f'Job has not been preempted: {job}')
                return False

            max_wait_ms = util.settings(self.cook_url)['rebalancer']['interval-seconds'] * 1000 * 1.5
            self.logger.info(f'Waiting up to {max_wait_ms} milliseconds for preemption to happen')
            util.wait_until(low_priority_job, job_was_preempted,
                            max_wait_ms=max_wait_ms, wait_interval_ms=5000)
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
            util.reset_limit(self.cook_url, 'share', user.name, reason=self.current_name())
            util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name())