def test_job_count_quota(self):
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    all_job_uuids = []
    try:
        # User with no quota can't submit jobs
        with admin:
            resp = util.set_limit(self.cook_url, 'quota', user.name, count=0)
            self.assertEqual(resp.status_code, 201, resp.text)
        with user:
            _, resp = util.submit_job(self.cook_url)
            self.assertEqual(resp.status_code, 422, msg=resp.text)

        # Reset user's quota back to default, then user can submit jobs again
        with admin:
            resp = util.reset_limit(self.cook_url, 'quota', user.name)
            self.assertEqual(resp.status_code, 204, resp.text)
        with user:
            job_uuid, resp = util.submit_job(self.cook_url)
            self.assertEqual(resp.status_code, 201, msg=resp.text)
            all_job_uuids.append(job_uuid)

        # Can't set negative quota
        with admin:
            resp = util.set_limit(self.cook_url, 'quota', user.name, count=-1)
            self.assertEqual(resp.status_code, 400, resp.text)
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids)
            util.reset_limit(self.cook_url, 'quota', user.name)
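# A minimal sketch of what the util.set_limit / util.reset_limit helpers used
# above might look like at the HTTP level. The payload shape (POST with a
# {user, quota, reason} body, DELETE keyed on user) is an assumption inferred
# from the call sites in these tests, not a verified copy of Cook's API.
import requests

def set_limit_sketch(cook_url, limit_type, user, reason='integration test', **resources):
    # limit_type is 'quota' or 'share'; resources are e.g. count=0 or cpus=0.25
    body = {'user': user, limit_type: resources, 'reason': reason}
    return requests.post(f'{cook_url}/{limit_type}', json=body)

def reset_limit_sketch(cook_url, limit_type, user, reason='integration test'):
    # Deleting the user-specific limit makes the default apply again
    return requests.delete(f'{cook_url}/{limit_type}', params={'user': user, 'reason': reason})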
def test_admin_cannot_impersonate(self):
    user1 = self.user_factory.new_user()
    job_uuids = []
    try:
        # admin can create jobs
        with self.admin:
            job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
            self.assertEqual(resp.status_code, 201, resp.text)
            job_uuids.append(job_uuid)
            util.reset_limit(self.cook_url, 'quota', user1.name, reason=self.current_name())

        # users can create jobs
        with user1:
            job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
            self.assertEqual(resp.status_code, 201, resp.text)
            job_uuids.append(job_uuid)

        # admin cannot impersonate others creating jobs (not an authorized impersonator)
        with self.admin.impersonating(user1):
            job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
            self.assertEqual(resp.status_code, 403, resp.text)
    finally:
        with self.admin:
            util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
def test_basic_submit(self):
    job_uuid_1, resp = util.submit_job(self.cook_url_1)
    self.assertEqual(resp.status_code, 201)
    job_uuid_2, resp = util.submit_job(self.cook_url_2)
    self.assertEqual(resp.status_code, 201)
    job = util.wait_for_job(self.cook_url_1, job_uuid_1, 'completed')
    self.assertEqual('success', job['instances'][0]['status'])
    job = util.wait_for_job(self.cook_url_2, job_uuid_2, 'completed')
    self.assertEqual('success', job['instances'][0]['status'])
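# util.wait_for_job is the polling workhorse of these tests. A minimal sketch,
# assuming GET /rawscheduler?job=<uuid> returns a list of job maps with a
# 'status' field (as the raw calls elsewhere in these tests suggest); the real
# helper likely has richer error reporting and backoff.
import time
import requests

def wait_for_job_sketch(cook_url, job_uuid, status, max_wait_ms=120000):
    deadline = time.time() + max_wait_ms / 1000.0
    while time.time() < deadline:
        job = requests.get(f'{cook_url}/rawscheduler', params={'job': job_uuid}).json()[0]
        if job['status'] == status:
            return job
        time.sleep(1)
    raise TimeoutError(f'Job {job_uuid} never reached status {status}')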
def test_disable_mea_culpa(self):
    job_uuid, resp = util.submit_job(self.cook_url, disable_mea_culpa_retries=True)
    self.assertEqual(201, resp.status_code)
    job = self.get_job(job_uuid)
    self.assertEqual(True, job['disable_mea_culpa_retries'])

    job_uuid, resp = util.submit_job(self.cook_url, disable_mea_culpa_retries=False)
    self.assertEqual(201, resp.status_code)
    job = self.get_job(job_uuid)
    self.assertEqual(False, job['disable_mea_culpa_retries'])
def test_job_cpu_quota(self):
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    all_job_uuids = []
    try:
        # User with no quota can't submit jobs
        with admin:
            resp = util.set_limit(self.cook_url, 'quota', user.name, cpus=0)
            self.assertEqual(resp.status_code, 201, resp.text)
        with user:
            _, resp = util.submit_job(self.cook_url)
            self.assertEqual(resp.status_code, 422, msg=resp.text)

        # User with tiny quota can't submit bigger jobs, but can submit tiny jobs
        with admin:
            resp = util.set_limit(self.cook_url, 'quota', user.name, cpus=0.25)
            self.assertEqual(resp.status_code, 201, resp.text)
        with user:
            _, resp = util.submit_job(self.cook_url, cpus=0.5)
            self.assertEqual(resp.status_code, 422, msg=resp.text)
            job_uuid, resp = util.submit_job(self.cook_url, cpus=0.25)
            self.assertEqual(resp.status_code, 201, msg=resp.text)
            all_job_uuids.append(job_uuid)

        # Reset user's quota back to default, then user can submit jobs again
        with admin:
            resp = util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name())
            self.assertEqual(resp.status_code, 204, resp.text)
        with user:
            job_uuid, resp = util.submit_job(self.cook_url)
            self.assertEqual(resp.status_code, 201, msg=resp.text)
            all_job_uuids.append(job_uuid)

        # Can't set negative quota
        with admin:
            resp = util.set_limit(self.cook_url, 'quota', user.name, cpus=-4)
            self.assertEqual(resp.status_code, 400, resp.text)
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
            util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name())
def test_expected_runtime_field(self):
    # Should support expected_runtime
    expected_runtime = 1
    job_uuid, resp = util.submit_job(self.cook_url, expected_runtime=expected_runtime)
    self.assertEqual(resp.status_code, 201)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', job['instances'][0]['status'])
    self.assertEqual(expected_runtime, job['expected_runtime'])

    # Should disallow expected_runtime > max_runtime
    expected_runtime = 2
    max_runtime = expected_runtime - 1
    job_uuid, resp = util.submit_job(self.cook_url, expected_runtime=expected_runtime, max_runtime=max_runtime)
    self.assertEqual(resp.status_code, 400)
def test_allow_partial(self):
    def absent_uuids(response):
        return [part for part in response.json()['error'].split() if util.is_valid_uuid(part)]

    job_uuid_1, resp = util.submit_job(self.cook_url)
    self.assertEqual(201, resp.status_code)
    job_uuid_2, resp = util.submit_job(self.cook_url)
    self.assertEqual(201, resp.status_code)

    # Only valid job uuids
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2])
    self.assertEqual(200, resp.status_code)

    # Mixed valid, invalid job uuids
    bogus_uuid = str(uuid.uuid4())
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid])
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='false')
    self.assertEqual(404, resp.status_code, resp.json())
    self.assertEqual([bogus_uuid], absent_uuids(resp))

    # Partial results with mixed valid, invalid job uuids.
    # Note: list.sort() returns None, so comparing its return values would always
    # pass vacuously; use sorted() to actually compare the uuid lists.
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='true')
    self.assertEqual(200, resp.status_code, resp.json())
    self.assertEqual(2, len(resp.json()))
    self.assertEqual(sorted([job_uuid_1, job_uuid_2]), sorted(job['uuid'] for job in resp.json()))

    # Only valid instance uuids
    job = util.wait_for_job(self.cook_url, job_uuid_1, 'completed')
    instance_uuid_1 = job['instances'][0]['task_id']
    job = util.wait_for_job(self.cook_url, job_uuid_2, 'completed')
    instance_uuid_2 = job['instances'][0]['task_id']
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2])
    self.assertEqual(200, resp.status_code)

    # Mixed valid, invalid instance uuids
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid])
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='false')
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))

    # Partial results with mixed valid, invalid instance uuids
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='true')
    self.assertEqual(200, resp.status_code)
    self.assertEqual(2, len(resp.json()))
    self.assertEqual(sorted([job_uuid_1, job_uuid_2]), sorted(job['uuid'] for job in resp.json()))
def test_cancel_job(self):
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 300')
    util.wait_for_job(self.cook_url, job_uuid, 'running')
    resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
    self.assertEqual(204, resp.status_code)
    job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
    self.assertEqual('failed', job['state'])
def test_user_total_usage(self):
    user = self.user_factory.new_user()
    with user:
        job_spec = {'cpus': 0.11, 'mem': 123, 'command': 'sleep 600'}
        pools, _ = util.active_pools(self.cook_url)
        job_uuids = []
        try:
            for pool in pools:
                job_uuid, resp = util.submit_job(self.cook_url, pool=pool['name'], **job_spec)
                self.assertEqual(201, resp.status_code, resp.text)
                job_uuids.append(job_uuid)
            util.wait_for_jobs(self.cook_url, job_uuids, 'running')
            resp = util.user_current_usage(self.cook_url, user=user.name, group_breakdown='true')
            self.assertEqual(resp.status_code, 200, resp.content)
            usage_data = resp.json()
            total_usage = usage_data['total_usage']
            self.assertEqual(job_spec['mem'] * len(job_uuids), total_usage['mem'], total_usage)
            self.assertEqual(job_spec['cpus'] * len(job_uuids), total_usage['cpus'], total_usage)
            self.assertEqual(len(job_uuids), total_usage['jobs'], total_usage)
        finally:
            util.kill_jobs(self.cook_url, job_uuids)
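# A minimal sketch of the current-usage query used above. The /usage endpoint
# path and its user / group_breakdown query parameters are assumptions inferred
# from the helper's call sites in these tests, not verified against util.py.
import requests

def user_current_usage_sketch(cook_url, user, group_breakdown='false'):
    # Expected response shape (per the assertions above):
    # {'total_usage': {'mem': ..., 'cpus': ..., 'gpus': ..., 'jobs': ...}, ...}
    return requests.get(f'{cook_url}/usage',
                        params={'user': user, 'group_breakdown': group_breakdown})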
def test_application_field(self):
    # Should support application
    application = {'name': 'foo-app', 'version': '0.1.0'}
    job_uuid, resp = util.submit_job(self.cook_url, application=application)
    self.assertEqual(resp.status_code, 201)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', job['instances'][0]['status'])
    self.assertEqual(application, job['application'])

    # Should require application name
    _, resp = util.submit_job(self.cook_url, application={'version': '0.1.0'})
    self.assertEqual(resp.status_code, 400)

    # Should require application version
    _, resp = util.submit_job(self.cook_url, application={'name': 'foo-app'})
    self.assertEqual(resp.status_code, 400)
def test_impersonated_job_delete(self):
    user1, user2 = self.user_factory.new_users(2)
    with user1:
        job_uuid, resp = util.submit_job(self.cook_url, command='sleep 60')
        self.assertEqual(resp.status_code, 201, resp.text)
    try:
        # authorized impersonator
        with self.poser:
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with self.poser.impersonating(user2):
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with self.poser.impersonating(user1):
            util.kill_jobs(self.cook_url, [job_uuid])
        # unauthorized impersonation attempts by arbitrary user
        with user2:
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with user2.impersonating(user2):
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with user2.impersonating(user1):
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        # unauthorized impersonation attempts by job owner
        with user1.impersonating(user2):
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with user1.impersonating(self.admin):
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid])
    finally:
        with self.admin:
            util.kill_jobs(self.cook_url, [job_uuid])
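# How an impersonated kill request might be formed at the HTTP level. Cook
# supports impersonation via a request header; the X-Cook-Impersonate header
# name is an assumption here, as is the idea that user.impersonating(...)
# simply attaches it to the session used by util.kill_jobs.
import requests

def kill_job_as_sketch(cook_url, job_uuid, auth, impersonated_user=None):
    headers = {}
    if impersonated_user is not None:
        headers['X-Cook-Impersonate'] = impersonated_user  # assumed header name
    # DELETE /rawscheduler?job=<uuid> is the kill path exercised elsewhere in these tests
    return requests.delete(f'{cook_url}/rawscheduler', params={'job': job_uuid},
                           auth=auth, headers=headers)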
def test_get_queue(self):
    bad_constraint = [["HOSTNAME", "EQUALS", "lol won't get scheduled"]]
    uuid, resp = util.submit_job(self.master_url, command='sleep 30', constraints=bad_constraint)
    self.assertEqual(201, resp.status_code, resp.content)
    try:
        slave_queue = util.session.get('%s/queue' % self.slave_url, allow_redirects=False)
        self.assertEqual(307, slave_queue.status_code)
        default_pool = util.default_pool(self.master_url)
        pool = default_pool or 'no-pool'
        self.logger.info(f'Checking the queue endpoint for pool {pool}')

        @retry(stop_max_delay=30000, wait_fixed=1000)  # Need to wait for a rank cycle
        def check_queue():
            master_queue = util.session.get(slave_queue.headers['Location'])
            self.assertEqual(200, master_queue.status_code, master_queue.content)
            pool_queue = master_queue.json()[pool]
            self.assertTrue(any([job['job/uuid'] == uuid for job in pool_queue]), pool_queue)

        check_queue()
    finally:
        util.kill_jobs(self.master_url, [uuid])
def test_cancel_instance(self):
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10', max_retries=2)
    job = util.wait_for_job(self.cook_url, job_uuid, 'running')
    task_id = job['instances'][0]['task_id']
    resp = util.session.delete('%s/rawscheduler?instance=%s' % (self.cook_url, task_id))
    self.assertEqual(204, resp.status_code)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', job['state'])
def test_multi_user_usage(self):
    users = self.user_factory.new_users(6)
    job_resources = {'cpus': 0.1, 'mem': 123}
    all_job_uuids = []
    pools, _ = util.all_pools(self.cook_url)
    try:
        # Start jobs for several users
        for i, user in enumerate(users):
            with user:
                for j in range(i):
                    job_uuid, resp = util.submit_job(self.cook_url, command='sleep 480',
                                                     max_retries=2, **job_resources)
                    self.assertEqual(resp.status_code, 201, resp.content)
                    all_job_uuids.append(job_uuid)
                    job = util.load_job(self.cook_url, job_uuid)
                    self.assertEqual(user.name, job['user'], job)
        # Don't query until the jobs are all running
        util.wait_for_jobs(self.cook_url, all_job_uuids, 'running')
        # Check the usage for each of our users
        for i, user in enumerate(users):
            with user:
                # Get the current usage
                resp = util.user_current_usage(self.cook_url, user=user.name)
                self.assertEqual(resp.status_code, 200, resp.content)
                usage_data = resp.json()
                # Check that the response structure looks as expected
                if pools:
                    self.assertEqual(list(usage_data.keys()), ['total_usage', 'pools'], usage_data)
                else:
                    self.assertEqual(list(usage_data.keys()), ['total_usage'], usage_data)
                self.assertEqual(len(usage_data['total_usage']), 4, usage_data)
                # Check that each user's usage is as expected
                self.assertEqual(usage_data['total_usage']['mem'], job_resources['mem'] * i, usage_data)
                self.assertEqual(usage_data['total_usage']['cpus'], job_resources['cpus'] * i, usage_data)
                self.assertEqual(usage_data['total_usage']['gpus'], 0, usage_data)
                self.assertEqual(usage_data['total_usage']['jobs'], i, usage_data)
    finally:
        for job_uuid in all_job_uuids:
            job = util.load_job(self.cook_url, job_uuid)
            for instance in job['instances']:
                if instance['status'] == 'failed':
                    mesos.dump_sandbox_files(util.session, instance, job)
        # Terminate all of the jobs
        if all_job_uuids:
            with self.user_factory.admin():
                util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
def test_max_runtime_exceeded(self):
    settings_timeout_interval_minutes = util.get_in(self.settings(), 'task-constraints', 'timeout-interval-minutes')
    # The job timeout needs to be a little more than twice the configured timeout
    # interval, to allow at least two runs of the lingering-task killer
    job_timeout_interval_seconds = (2 * settings_timeout_interval_minutes * 60) + 15
    job_uuid, resp = util.submit_job(self.cook_url, command='sleep %s' % job_timeout_interval_seconds,
                                     max_runtime=5000)
    self.assertEqual(201, resp.status_code)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed', job_timeout_interval_seconds * 1000)
    self.assertEqual(1, len(job['instances']))
    self.assertEqual('failed', job['instances'][0]['status'])
    self.assertEqual(2003, job['instances'][0]['reason_code'])
def test_change_retries(self):
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10')
    util.wait_for_job(self.cook_url, job_uuid, 'running')
    resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
    self.assertEqual(204, resp.status_code)
    job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
    self.assertEqual('failed', job['state'])
    resp = util.session.put('%s/retry' % self.cook_url, json={'retries': 2, 'jobs': [job_uuid]})
    self.assertEqual(201, resp.status_code, resp.text)
    job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
    self.assertEqual('waiting', job['status'])
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', job['state'])
def test_federated_query(self):
    # Submit to cluster #1
    job_uuid_1, resp = util.submit_job(self.cook_url_1)
    self.assertEqual(resp.status_code, 201)
    # Submit to cluster #2
    job_uuid_2, resp = util.submit_job(self.cook_url_2)
    self.assertEqual(resp.status_code, 201)
    # Ask for both jobs from cluster #1, expect to get the first
    resp = util.query_jobs(self.cook_url_1, job=[job_uuid_1, job_uuid_2], partial='true')
    self.assertEqual(200, resp.status_code, resp.json())
    self.assertEqual(1, len(resp.json()))
    self.assertEqual([job_uuid_1], [job['uuid'] for job in resp.json()])
    # Ask for both jobs from cluster #2, expect to get the second
    resp = util.query_jobs(self.cook_url_2, job=[job_uuid_1, job_uuid_2], partial='true')
    self.assertEqual(200, resp.status_code, resp.json())
    self.assertEqual(1, len(resp.json()))
    self.assertEqual([job_uuid_2], [job['uuid'] for job in resp.json()])
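# The federated pattern above generalizes: each cluster only knows its own
# jobs, so a client queries every cluster with partial='true' and merges the
# results. A minimal sketch using the same query parameters these tests
# exercise; auth handling is omitted and assumed.
import requests

def federated_query_sketch(cook_urls, job_uuids):
    jobs = []
    for cook_url in cook_urls:
        resp = requests.get(f'{cook_url}/rawscheduler',
                            params={'job': job_uuids, 'partial': 'true'})
        if resp.status_code == 200:
            jobs.extend(resp.json())
    return jobs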
def test_get_queue(self):
    job_uuid, resp = util.submit_job(self.master_url, constraints=[["HOSTNAME", "EQUALS", "can't schedule"]])
    self.assertEqual(201, resp.status_code, resp.content)
    slave_queue = util.session.get('%s/queue' % self.slave_url, allow_redirects=False)
    self.assertEqual(307, slave_queue.status_code)

    @retry(stop_max_delay=30000, wait_fixed=1000)  # Need to wait for a rank cycle
    def check_queue():
        master_queue = util.session.get(slave_queue.headers['Location'])
        self.assertEqual(200, master_queue.status_code, master_queue.content)
        self.assertTrue(any([job['job/uuid'] == job_uuid for job in master_queue.json()['normal']]))

    check_queue()
    util.session.delete('%s/rawscheduler?job=%s' % (self.master_url, job_uuid))
def test_constraints(self):
    state = util.get_mesos_state(self.mesos_url)
    hosts = [agent['hostname'] for agent in state['slaves']]
    bad_job_uuid, resp = util.submit_job(self.cook_url,
                                         constraints=[["HOSTNAME", "EQUALS", "lol won't get scheduled"]])
    self.assertEqual(resp.status_code, 201, resp.text)
    host_to_job_uuid = {}
    for hostname in hosts:
        constraints = [["HOSTNAME", "EQUALS", hostname]]
        job_uuid, resp = util.submit_job(self.cook_url, constraints=constraints)
        self.assertEqual(resp.status_code, 201, resp.text)
        host_to_job_uuid[hostname] = job_uuid
    for hostname, job_uuid in host_to_job_uuid.items():
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        hostname_constrained = job['instances'][0]['hostname']
        self.assertEqual(hostname, hostname_constrained)
        self.assertEqual([["HOSTNAME", "EQUALS", hostname]], job['constraints'])
    # This job would have been scheduled by now (jobs submitted after it have
    # already completed), but its unsatisfiable constraint keeps it waiting
    job = util.wait_for_job(self.cook_url, bad_job_uuid, 'waiting', max_delay=3000)
def test_job_delete_permission(self):
    user1, user2 = self.user_factory.new_users(2)
    with user1:
        job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30')
    try:
        self.assertEqual(resp.status_code, 201, resp.text)
        with user2:
            util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid])
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', job['state'])
    finally:
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid])
def test_self_impersonate(self):
    user1 = self.user_factory.new_user()
    job_uuids = []
    try:
        # normal user can self-impersonate
        with user1.impersonating(user1):
            job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
            self.assertEqual(resp.status_code, 201, resp.text)
            job_uuids.append(job_uuid)
        # admin can self-impersonate for admin endpoints,
        # i.e., self-impersonation is treated as a non-impersonated request
        with self.admin.impersonating(self.admin):
            resp = util.query_queue(self.cook_url)
            self.assertEqual(resp.status_code, 200, resp.text)
    finally:
        with self.admin:
            util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
def test_multi_user_usage(self):
    users = self.user_factory.new_users(6)
    job_resources = {'cpus': 0.1, 'mem': 123}
    all_job_uuids = []
    try:
        # Start jobs for several users
        for i, user in enumerate(users):
            with user:
                for j in range(i):
                    job_uuid, resp = util.submit_job(self.cook_url, command='sleep 480', **job_resources)
                    self.assertEqual(resp.status_code, 201, resp.content)
                    all_job_uuids.append(job_uuid)
        # Don't query until the jobs are all running
        util.wait_for_jobs(self.cook_url, all_job_uuids, 'running')
        # Check the usage for each of our users
        for i, user in enumerate(users):
            with user:
                # Get the current usage
                resp = util.user_current_usage(self.cook_url, user=user.name)
                self.assertEqual(resp.status_code, 200, resp.content)
                usage_data = resp.json()
                # Check that the response structure looks as expected
                self.assertEqual(list(usage_data.keys()), ['total_usage'], usage_data)
                self.assertEqual(len(usage_data['total_usage']), 4, usage_data)
                # Check that each user's usage is as expected
                self.assertEqual(usage_data['total_usage']['mem'], job_resources['mem'] * i, usage_data)
                self.assertEqual(usage_data['total_usage']['cpus'], job_resources['cpus'] * i, usage_data)
                self.assertEqual(usage_data['total_usage']['gpus'], 0, usage_data)
                self.assertEqual(usage_data['total_usage']['jobs'], i, usage_data)
    finally:
        # Terminate all of the jobs
        if all_job_uuids:
            with self.user_factory.admin():
                util.kill_jobs(self.cook_url, all_job_uuids)
def test_self_impersonate(self):
    user1 = self.user_factory.new_user()
    job_uuids = []
    try:
        # normal user can self-impersonate
        with user1.impersonating(user1):
            job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
            self.assertEqual(resp.status_code, 201, resp.text)
            job_uuids.append(job_uuid)
        # admin can self-impersonate for admin endpoints,
        # i.e., self-impersonation is treated as a non-impersonated request
        with self.admin.impersonating(self.admin):
            # The /queue endpoint redirects to the master, but we don't need to follow that.
            # As long as we don't get an auth error, we're good.
            resp = util.query_queue(self.cook_url, allow_redirects=False)
            self.assertIn(resp.status_code, [200, 307], resp.text)
    finally:
        with self.admin:
            util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
def test_job_delete_permission(self):
    user1, user2 = self.user_factory.new_users(2)
    with user1:
        job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30')
    try:
        self.assertEqual(resp.status_code, 201, resp.text)
        with user2:
            resp = util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
            self.assertEqual(f'You are not authorized to kill the following jobs: {job_uuid}',
                             resp.json()['error'])
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid])
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', job['state'])
    finally:
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
def test_group_delete_permission(self):
    user1, user2 = self.user_factory.new_users(2)
    with user1:
        group_spec = util.minimal_group()
        group_uuid = group_spec['uuid']
        job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30', group=group_uuid)
    try:
        self.assertEqual(resp.status_code, 201, resp.text)
        with user2:
            util.kill_groups(self.cook_url, [group_uuid], expected_status_code=403)
        with user1:
            util.kill_groups(self.cook_url, [group_uuid])
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', job['state'])
    finally:
        with user1:
            util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
def test_pool_scheduling(self):
    admin = self.user_factory.admin()
    user = self.user_factory.new_user()
    pools, _ = util.active_pools(self.cook_url)
    all_job_uuids = []
    try:
        default_pool = util.default_pool(self.cook_url)
        self.assertLess(1, len(pools))
        self.assertIsNotNone(default_pool)
        cpus = 0.1
        with admin:
            self.logger.info(f'Running tasks: {json.dumps(util.running_tasks(self.cook_url), indent=2)}')
            for pool in pools:
                # Lower the user's cpu quota on this pool
                pool_name = pool['name']
                quota_multiplier = 1 if pool_name == default_pool else 2
                util.set_limit(self.cook_url, 'quota', user.name, cpus=cpus * quota_multiplier,
                               pool=pool_name)
        with user:
            util.kill_running_and_waiting_jobs(self.cook_url, user.name)
            for pool in pools:
                pool_name = pool['name']
                # Submit a job that fills the user's quota on this pool
                quota = util.get_limit(self.cook_url, 'quota', user.name, pool_name).json()
                quota_cpus = quota['cpus']
                filling_job_uuid, _ = util.submit_job(self.cook_url, cpus=quota_cpus,
                                                      command='sleep 600', pool=pool_name)
                all_job_uuids.append(filling_job_uuid)
                instance = util.wait_for_running_instance(self.cook_url, filling_job_uuid)
                slave_pool = util.node_pool(instance['hostname'])
                self.assertEqual(pool_name, slave_pool)
                # Submit a job that should not get scheduled
                job_uuid, _ = util.submit_job(self.cook_url, cpus=cpus, command='ls', pool=pool_name)
                all_job_uuids.append(job_uuid)
                job = util.load_job(self.cook_url, job_uuid)
                self.assertEqual('waiting', job['status'])

                # Assert that the unscheduled reason and data are correct
                @retry(stop_max_delay=60000, wait_fixed=5000)
                def check_unscheduled_reason():
                    jobs, _ = util.unscheduled_jobs(self.cook_url, job_uuid)
                    self.logger.info(f'Unscheduled jobs: {jobs}')
                    self.assertEqual(job_uuid, jobs[0]['uuid'])
                    job_reasons = jobs[0]['reasons']
                    # Check the spot-in-queue reason
                    reason = next(r for r in job_reasons
                                  if r['reason'] == 'You have 1 other jobs ahead in the queue.')
                    self.assertEqual({'jobs': [filling_job_uuid]}, reason['data'])
                    # Check the exceeding-quota reason
                    reason = next(r for r in job_reasons
                                  if r['reason'] == reasons.JOB_WOULD_EXCEED_QUOTA)
                    self.assertEqual({'cpus': {'limit': quota_cpus, 'usage': quota_cpus + cpus}},
                                     reason['data'])

                check_unscheduled_reason()
    finally:
        with admin:
            util.kill_jobs(self.cook_url, all_job_uuids, assert_response=False)
            for pool in pools:
                util.reset_limit(self.cook_url, 'quota', user.name, reason=self.current_name(),
                                 pool=pool['name'])
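# Sketch of the unscheduled-jobs query driving check_unscheduled_reason above.
# The /unscheduled_jobs path and its job parameter are assumptions based on the
# helper name; the response shape (a list of {'uuid': ..., 'reasons': [...]})
# mirrors what the assertions expect.
import requests

def unscheduled_jobs_sketch(cook_url, job_uuid):
    resp = requests.get(f'{cook_url}/unscheduled_jobs', params={'job': job_uuid})
    return resp.json(), resp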
def test_failing_submit(self):
    job_uuid, resp = util.submit_job(self.cook_url, command='exit 1')
    self.assertEqual(201, resp.status_code)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual(1, len(job['instances']))
    self.assertEqual('failed', job['instances'][0]['status'])
def test_basic_submit(self):
    job_uuid, resp = util.submit_job(self.cook_url)
    self.assertEqual(resp.status_code, 201)
    job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', job['instances'][0]['status'])
    self.assertEqual(False, job['disable_mea_culpa_retries'])
def test_checkpoint_locality(self):
    """
    Test that restored instances run in the same location as their checkpointed instances.
    """
    # Get the set of clusters that correspond to the pool under test and are running
    pool = util.default_submit_pool()
    clusters = util.compute_clusters(self.cook_url)
    running_clusters = [c for c in clusters['in-mem-configs']
                        if pool in c['cluster-definition']['config']['synthetic-pods']['pools']
                        and c['state'] == 'running']
    self.logger.info(f'Running clusters for pool {pool}: {running_clusters}')
    if len(running_clusters) == 0:
        self.skipTest(f'Requires at least 1 running compute cluster for pool {pool}')

    # Submit an initial canary job
    job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
    self.assertEqual(201, resp.status_code, resp.content)
    util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)

    # Submit a long-running job with checkpointing
    checkpoint_job_uuid, resp = util.submit_job(self.cook_url, pool=pool,
                                                command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                                                max_retries=5, checkpoint={'mode': 'auto'})
    self.assertEqual(201, resp.status_code, resp.content)
    try:
        # Wait for the job to be running
        checkpoint_instance = util.wait_for_instance(self.cook_url, checkpoint_job_uuid,
                                                     status='running', indent=None)
        checkpoint_instance_uuid = checkpoint_instance['task_id']
        checkpoint_location = next(c['location'] for c in running_clusters
                                   if c['name'] == checkpoint_instance['compute-cluster']['name'])
        admin = self.user_factory.admin()
        try:
            # Force all clusters in the instance's location to have state = draining
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = draining
                        cluster_update['state'] = 'draining'
                        cluster_update['state-locked?'] = True
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to draining: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

            # Kill the running checkpoint job instance
            util.kill_instance(self.cook_url, checkpoint_instance_uuid)

            # Submit another canary job
            job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
            self.assertEqual(201, resp.status_code, resp.content)

            cluster_locations = set(c['location'] for c in running_clusters)
            if len(cluster_locations) > 1:
                # The canary job should run in the non-draining location
                self.logger.info(f'There are > 1 cluster locations under test: {cluster_locations}')
                util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)
            else:
                self.logger.info(f'There is only 1 cluster location under test: {cluster_locations}')

            # The checkpoint job should be waiting
            util.wait_for_instance(self.cook_url, checkpoint_job_uuid, status='failed', indent=None)
            util.wait_for_job_in_statuses(self.cook_url, checkpoint_job_uuid, ['waiting'])
        finally:
            # Revert all clusters in the instance's location to state = running
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = running
                        cluster_update['state'] = 'running'
                        cluster_update['state-locked?'] = False
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to running: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

        # Wait for the checkpoint job to be running again, in the same location as before
        checkpoint_instance = util.wait_for_instance(self.cook_url, checkpoint_job_uuid,
                                                     status='running', indent=None)
        self.assertEqual(checkpoint_location,
                         next(c['location'] for c in running_clusters
                              if c['name'] == checkpoint_instance['compute-cluster']['name']))
    finally:
        # Kill the checkpoint job to not leave it running
        util.kill_jobs(self.cook_url, [checkpoint_job_uuid])
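# util.wait_until, used repeatedly in the cluster-state updates above, pairs a
# value-producing thunk with a predicate. A minimal sketch; the default
# timeout/interval values are assumptions modeled on the explicit
# (300000, 5000) arguments passed elsewhere in these tests.
import time

def wait_until_sketch(producer, predicate, max_wait_ms=120000, wait_interval_ms=1000):
    deadline = time.time() + max_wait_ms / 1000.0
    while time.time() < deadline:
        value = producer()
        if predicate(value):
            return value
        time.sleep(wait_interval_ms / 1000.0)
    raise TimeoutError('Condition was not satisfied before the timeout elapsed')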
def test_dynamic_clusters(self):
    """
    Test that dynamic cluster configuration functionality is working.
    """
    docker_image = util.docker_image()
    container = {'type': 'docker', 'docker': {'image': docker_image}}
    admin = self.user_factory.admin()
    # Force all clusters to have state = deleted via the API
    clusters = [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                if cluster["state"] == "running"]
    with admin:
        self.logger.info(f'Clusters {clusters}')
        # First state = draining
        for cluster in clusters:
            cluster["state"] = "draining"
            cluster["state-locked?"] = True
            self.logger.info(f'Trying to update cluster {cluster}')
            data, resp = util.update_compute_cluster(self.cook_url, cluster)
            self.assertEqual(201, resp.status_code, resp.content)
        # Then state = deleted
        for cluster in clusters:
            cluster["state"] = "deleted"
            util.wait_until(lambda: util.update_compute_cluster(self.cook_url, cluster),
                            lambda x: 201 == x[1].status_code, 300000, 5000)
        # Create at least one new cluster with a unique test name (using one of the existing cluster's IP and cert)
        test_cluster_name = f'test_cluster_{round(time.time() * 1000)}'
        test_cluster = {
            "name": test_cluster_name,
            "state": "running",
            "base-path": clusters[0]["base-path"],
            "ca-cert": clusters[0]["ca-cert"],
            "template": clusters[0]["template"]
        }
        data, resp = util.create_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(201, resp.status_code, resp.content)
        # Test create cluster with duplicate name
        data, resp = util.create_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(422, resp.status_code, resp.content)
        self.assertEqual(f'Compute cluster with name {test_cluster_name} already exists',
                         data['error']['message'], resp.content)

    # Check that a job schedules successfully
    command = "true"
    job_uuid, resp = util.submit_job(self.cook_url, command=command, container=container)
    self.assertEqual(201, resp.status_code, resp.content)
    instance = util.wait_for_instance(self.cook_url, job_uuid)
    message = repr(instance)
    self.assertIsNotNone(instance['compute-cluster'], message)
    instance_compute_cluster_name = instance['compute-cluster']['name']
    self.assertEqual(test_cluster["name"], instance_compute_cluster_name, instance['compute-cluster'])
    util.wait_for_instance(self.cook_url, job_uuid, status='success')
    running_clusters = [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                        if cluster["state"] == "running"]
    self.assertEqual(1, len(running_clusters), running_clusters)
    self.assertEqual(test_cluster["name"], running_clusters[0]["name"], running_clusters)

    with admin:
        # Delete test cluster
        # First state = draining
        test_cluster["state"] = "draining"
        data, resp = util.update_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(201, resp.status_code, resp.content)
        # Then state = deleted
        test_cluster["state"] = "deleted"
        util.wait_until(lambda: util.update_compute_cluster(self.cook_url, test_cluster),
                        lambda x: 201 == x[1].status_code, 300000, 5000)
        # Hard-delete the original non-test clusters
        for cluster in clusters:
            self.logger.info(f'Trying to delete cluster {cluster}')
            resp = util.delete_compute_cluster(self.cook_url, cluster)
            self.assertEqual(204, resp.status_code, resp.content)
        # Force give up leadership
        resp = util.shutdown_leader(self.cook_url, "test_dynamic_clusters")
        self.assertEqual(b'Accepted', resp)

    # Old clusters should be re-created; wait for cook to come up
    util.wait_until(lambda: [cluster for cluster in util.compute_clusters(self.cook_url)['db-configs']
                             if cluster["state"] == "running"],
                    lambda x: len(x) == len(clusters), 420000, 5000)
    # Check that a job schedules successfully
    command = "true"
    job_uuid, resp = util.submit_job(self.cook_url, command=command, container=container)
    self.assertEqual(201, resp.status_code, resp.content)
    util.wait_for_instance(self.cook_url, job_uuid, status='success')

    with admin:
        # Hard-delete test cluster
        resp = util.delete_compute_cluster(self.cook_url, test_cluster)
        self.assertEqual(204, resp.status_code, resp.content)