Example #1
 def test_job_count_quota(self):
     admin = self.user_factory.admin()
     user = self.user_factory.new_user()
     all_job_uuids = []
     try:
         # User with no quota can't submit jobs
         with admin:
             resp = util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   count=0)
             self.assertEqual(resp.status_code, 201, resp.text)
         with user:
             _, resp = util.submit_job(self.cook_url)
             self.assertEqual(resp.status_code, 422, msg=resp.text)
         # Reset user's quota back to default, then user can submit jobs again
         with admin:
             resp = util.reset_limit(self.cook_url, 'quota', user.name)
             self.assertEqual(resp.status_code, 204, resp.text)
         with user:
             job_uuid, resp = util.submit_job(self.cook_url)
             self.assertEqual(resp.status_code, 201, msg=resp.text)
             all_job_uuids.append(job_uuid)
         # Can't set negative quota
         with admin:
             resp = util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   count=-1)
             self.assertEqual(resp.status_code, 400, resp.text)
     finally:
         with admin:
             util.kill_jobs(self.cook_url, all_job_uuids)
             util.reset_limit(self.cook_url, 'quota', user.name)
Example #2
 def test_admin_cannot_impersonate(self):
     user1 = self.user_factory.new_user()
     job_uuids = []
     try:
         # admin can create jobs
         with self.admin:
             job_uuid, resp = util.submit_job(self.cook_url,
                                              command='sleep 1')
             self.assertEqual(resp.status_code, 201, resp.text)
             job_uuids.append(job_uuid)
             util.reset_limit(self.cook_url,
                              'quota',
                              user1.name,
                              reason=self.current_name())
         # users can create jobs
         with user1:
             job_uuid, resp = util.submit_job(self.cook_url,
                                              command='sleep 1')
             self.assertEqual(resp.status_code, 201, resp.text)
             job_uuids.append(job_uuid)
         # admin cannot impersonate others creating jobs (not an authorized impersonator)
         with self.admin.impersonating(user1):
             job_uuid, resp = util.submit_job(self.cook_url,
                                              command='sleep 1')
             self.assertEqual(resp.status_code, 403, resp.text)
     finally:
         with self.admin:
             util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
Example #3
 def test_basic_submit(self):
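     # Submit a job to each of the two clusters and verify that both complete successfully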
     job_uuid_1, resp = util.submit_job(self.cook_url_1)
     self.assertEqual(resp.status_code, 201)
     job_uuid_2, resp = util.submit_job(self.cook_url_2)
     self.assertEqual(resp.status_code, 201)
     job = util.wait_for_job(self.cook_url_1, job_uuid_1, 'completed')
     self.assertEqual('success', job['instances'][0]['status'])
     job = util.wait_for_job(self.cook_url_2, job_uuid_2, 'completed')
     self.assertEqual('success', job['instances'][0]['status'])
Example #4
    def test_disable_mea_culpa(self):
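        # The disable_mea_culpa_retries flag should round-trip through submission and job retrieval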
        job_uuid, resp = util.submit_job(self.cook_url, disable_mea_culpa_retries=True)
        self.assertEqual(201, resp.status_code)
        job = self.get_job(job_uuid)
        self.assertEqual(True, job['disable_mea_culpa_retries'])

        job_uuid, resp = util.submit_job(self.cook_url, disable_mea_culpa_retries=False)
        self.assertEqual(201, resp.status_code)
        job = self.get_job(job_uuid)
        self.assertEqual(False, job['disable_mea_culpa_retries'])
Example #5
 def test_job_cpu_quota(self):
     admin = self.user_factory.admin()
     user = self.user_factory.new_user()
     all_job_uuids = []
     try:
         # User with no quota can't submit jobs
         with admin:
             resp = util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   cpus=0)
             self.assertEqual(resp.status_code, 201, resp.text)
         with user:
             _, resp = util.submit_job(self.cook_url)
             self.assertEqual(resp.status_code, 422, msg=resp.text)
         # User with tiny quota can't submit bigger jobs, but can submit tiny jobs
         with admin:
             resp = util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   cpus=0.25)
             self.assertEqual(resp.status_code, 201, resp.text)
         with user:
             _, resp = util.submit_job(self.cook_url, cpus=0.5)
             self.assertEqual(resp.status_code, 422, msg=resp.text)
             job_uuid, resp = util.submit_job(self.cook_url, cpus=0.25)
             self.assertEqual(resp.status_code, 201, msg=resp.text)
             all_job_uuids.append(job_uuid)
         # Reset user's quota back to default, then user can submit jobs again
         with admin:
             resp = util.reset_limit(self.cook_url,
                                     'quota',
                                     user.name,
                                     reason=self.current_name())
             self.assertEqual(resp.status_code, 204, resp.text)
         with user:
             job_uuid, resp = util.submit_job(self.cook_url)
             self.assertEqual(resp.status_code, 201, msg=resp.text)
             all_job_uuids.append(job_uuid)
         # Can't set negative quota
         with admin:
             resp = util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   cpus=-4)
             self.assertEqual(resp.status_code, 400, resp.text)
     finally:
         with admin:
             util.kill_jobs(self.cook_url,
                            all_job_uuids,
                            assert_response=False)
             util.reset_limit(self.cook_url,
                              'quota',
                              user.name,
                              reason=self.current_name())
Example #6
    def test_expected_runtime_field(self):
        # Should support expected_runtime
        expected_runtime = 1
        job_uuid, resp = util.submit_job(self.cook_url, expected_runtime=expected_runtime)
        self.assertEqual(resp.status_code, 201)
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('success', job['instances'][0]['status'])
        self.assertEqual(expected_runtime, job['expected_runtime'])

        # Should disallow expected_runtime > max_runtime
        expected_runtime = 2
        max_runtime = expected_runtime - 1
        job_uuid, resp = util.submit_job(self.cook_url, expected_runtime=expected_runtime, max_runtime=max_runtime)
        self.assertEqual(resp.status_code, 400)
Example #7
    def test_allow_partial(self):
        def absent_uuids(response):
            return [part for part in response.json()['error'].split() if util.is_valid_uuid(part)]

        job_uuid_1, resp = util.submit_job(self.cook_url)
        self.assertEqual(201, resp.status_code)
        job_uuid_2, resp = util.submit_job(self.cook_url)
        self.assertEqual(201, resp.status_code)

        # Only valid job uuids
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2])
        self.assertEqual(200, resp.status_code)

        # Mixed valid, invalid job uuids
        bogus_uuid = str(uuid.uuid4())
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid])
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='false')
        self.assertEqual(404, resp.status_code, resp.json())
        self.assertEqual([bogus_uuid], absent_uuids(resp))

        # Partial results with mixed valid, invalid job uuids
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='true')
        self.assertEqual(200, resp.status_code, resp.json())
        self.assertEqual(2, len(resp.json()))
        self.assertEqual(sorted([job_uuid_1, job_uuid_2]), sorted(job['uuid'] for job in resp.json()))

        # Only valid instance uuids
        job = util.wait_for_job(self.cook_url, job_uuid_1, 'completed')
        instance_uuid_1 = job['instances'][0]['task_id']
        job = util.wait_for_job(self.cook_url, job_uuid_2, 'completed')
        instance_uuid_2 = job['instances'][0]['task_id']
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2])
        self.assertEqual(200, resp.status_code)

        # Mixed valid, invalid instance uuids
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid])
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='false')
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))

        # Partial results with mixed valid, invalid instance uuids
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='true')
        self.assertEqual(200, resp.status_code)
        self.assertEqual(2, len(resp.json()))
        self.assertEqual(sorted([job_uuid_1, job_uuid_2]), sorted(job['uuid'] for job in resp.json()))
Example #8
 def test_cancel_job(self):
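     # Deleting a running job should leave it in the failed state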
     job_uuid, _ = util.submit_job(self.cook_url, command='sleep 300')
     util.wait_for_job(self.cook_url, job_uuid, 'running')
     resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
     self.assertEqual(204, resp.status_code)
     job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
     self.assertEqual('failed', job['state'])
Example #9
    def test_user_total_usage(self):
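        # A user's total usage should reflect one running job per active pool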
        user = self.user_factory.new_user()
        with user:
            job_spec = {'cpus': 0.11, 'mem': 123, 'command': 'sleep 600'}
            pools, _ = util.active_pools(self.cook_url)
            job_uuids = []
            try:
                for pool in pools:
                    job_uuid, resp = util.submit_job(self.cook_url,
                                                     pool=pool['name'],
                                                     **job_spec)
                    self.assertEqual(201, resp.status_code, resp.text)
                    job_uuids.append(job_uuid)

                util.wait_for_jobs(self.cook_url, job_uuids, 'running')
                resp = util.user_current_usage(self.cook_url,
                                               user=user.name,
                                               group_breakdown='true')
                self.assertEqual(resp.status_code, 200, resp.content)
                usage_data = resp.json()
                total_usage = usage_data['total_usage']

                self.assertEqual(job_spec['mem'] * len(job_uuids),
                                 total_usage['mem'], total_usage)
                self.assertEqual(job_spec['cpus'] * len(job_uuids),
                                 total_usage['cpus'], total_usage)
                self.assertEqual(len(job_uuids), total_usage['jobs'],
                                 total_usage)
            finally:
                util.kill_jobs(self.cook_url, job_uuids)
Example #10
    def test_application_field(self):
        # Should support application
        application = {'name': 'foo-app', 'version': '0.1.0'}
        job_uuid, resp = util.submit_job(self.cook_url, application=application)
        self.assertEqual(resp.status_code, 201)
        job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('success', job['instances'][0]['status'])
        self.assertEqual(application, job['application'])

        # Should require application name
        _, resp = util.submit_job(self.cook_url, application={'version': '0.1.0'})
        self.assertEqual(resp.status_code, 400)

        # Should require application version
        _, resp = util.submit_job(self.cook_url, application={'name': 'foo-app'})
        self.assertEqual(resp.status_code, 400)
Example #11
 def test_impersonated_job_delete(self):
     user1, user2 = self.user_factory.new_users(2)
     with user1:
         job_uuid, resp = util.submit_job(self.cook_url, command='sleep 60')
         self.assertEqual(resp.status_code, 201, resp.text)
     try:
         # authorized impersonator
         with self.poser:
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with self.poser.impersonating(user2):
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with self.poser.impersonating(user1):
             util.kill_jobs(self.cook_url, [job_uuid])
         # unauthorized impersonation attempts by arbitrary user
         with user2:
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with user2.impersonating(user2):
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with user2.impersonating(user1):
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         # unauthorized impersonation attempts by job owner
         with user1.impersonating(user2):
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with user1.impersonating(self.admin):
             util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid])
     finally:
         with self.admin:
             util.kill_jobs(self.cook_url, [job_uuid])
Example #12
    def test_get_queue(self):
        bad_constraint = [["HOSTNAME", "EQUALS", "lol won't get scheduled"]]
        uuid, resp = util.submit_job(self.master_url,
                                     command='sleep 30',
                                     constraints=bad_constraint)
        self.assertEqual(201, resp.status_code, resp.content)
        try:
            slave_queue = util.session.get('%s/queue' % self.slave_url,
                                           allow_redirects=False)
            self.assertEqual(307, slave_queue.status_code)
            default_pool = util.default_pool(self.master_url)
            pool = default_pool or 'no-pool'
            self.logger.info(f'Checking the queue endpoint for pool {pool}')

            @retry(stop_max_delay=30000,
                   wait_fixed=1000)  # Need to wait for a rank cycle
            def check_queue():
                master_queue = util.session.get(
                    slave_queue.headers['Location'])
                self.assertEqual(200, master_queue.status_code,
                                 master_queue.content)
                pool_queue = master_queue.json()[pool]
                self.assertTrue(
                    any([job['job/uuid'] == uuid for job in pool_queue]),
                    pool_queue)

            check_queue()
        finally:
            util.kill_jobs(self.master_url, [uuid])
Example #13
 def test_cancel_instance(self):
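     # Killing a single instance should let the job retry and ultimately succeed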
     job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10', max_retries=2)
     job = util.wait_for_job(self.cook_url, job_uuid, 'running')
     task_id = job['instances'][0]['task_id']
     resp = util.session.delete('%s/rawscheduler?instance=%s' % (self.cook_url, task_id))
     self.assertEqual(204, resp.status_code)
     job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     self.assertEqual('success', job['state'])
Example #14
 def test_multi_user_usage(self):
     users = self.user_factory.new_users(6)
     job_resources = {'cpus': 0.1, 'mem': 123}
     all_job_uuids = []
     pools, _ = util.all_pools(self.cook_url)
     try:
         # Start jobs for several users
         for i, user in enumerate(users):
             with user:
                 for j in range(i):
                     job_uuid, resp = util.submit_job(self.cook_url,
                                                      command='sleep 480',
                                                      max_retries=2,
                                                      **job_resources)
                     self.assertEqual(resp.status_code, 201, resp.content)
                     all_job_uuids.append(job_uuid)
                     job = util.load_job(self.cook_url, job_uuid)
                     self.assertEqual(user.name, job['user'], job)
         # Don't query until the jobs are all running
         util.wait_for_jobs(self.cook_url, all_job_uuids, 'running')
         # Check the usage for each of our users
         for i, user in enumerate(users):
             with user:
                 # Get the current usage
                 resp = util.user_current_usage(self.cook_url,
                                                user=user.name)
                 self.assertEqual(resp.status_code, 200, resp.content)
                 usage_data = resp.json()
                 # Check that the response structure looks as expected
                 if pools:
                     self.assertEqual(list(usage_data.keys()),
                                      ['total_usage', 'pools'], usage_data)
                 else:
                     self.assertEqual(list(usage_data.keys()),
                                      ['total_usage'], usage_data)
                 self.assertEqual(len(usage_data['total_usage']), 4,
                                  usage_data)
                 # Check that each user's usage is as expected
                 self.assertEqual(usage_data['total_usage']['mem'],
                                  job_resources['mem'] * i, usage_data)
                 self.assertEqual(usage_data['total_usage']['cpus'],
                                  job_resources['cpus'] * i, usage_data)
                 self.assertEqual(usage_data['total_usage']['gpus'], 0,
                                  usage_data)
                 self.assertEqual(usage_data['total_usage']['jobs'], i,
                                  usage_data)
     finally:
         for job_uuid in all_job_uuids:
             job = util.load_job(self.cook_url, job_uuid)
             for instance in job['instances']:
                 if instance['status'] == 'failed':
                     mesos.dump_sandbox_files(util.session, instance, job)
         # Terminate all of the jobs
         if all_job_uuids:
             with self.user_factory.admin():
                 util.kill_jobs(self.cook_url,
                                all_job_uuids,
                                assert_response=False)
Example #15
 def test_max_runtime_exceeded(self):
     settings_timeout_interval_minutes = util.get_in(self.settings(), 'task-constraints', 'timeout-interval-minutes')
     # the value needs to be a little more than 2 times settings_timeout_interval_minutes to allow
     # at least two runs of the lingering task killer
     job_timeout_interval_seconds = (2 * settings_timeout_interval_minutes * 60) + 15
     job_uuid, resp = util.submit_job(self.cook_url, command='sleep %s' % job_timeout_interval_seconds, max_runtime=5000)
     self.assertEqual(201, resp.status_code)
     job = util.wait_for_job(self.cook_url, job_uuid, 'completed', job_timeout_interval_seconds * 1000)
     self.assertEqual(1, len(job['instances']))
     self.assertEqual('failed', job['instances'][0]['status'])
     self.assertEqual(2003, job['instances'][0]['reason_code'])
Example #16
 def test_change_retries(self):
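     # After killing a job, raising its retry count should re-queue it and let it run to success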
     job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10')
     util.wait_for_job(self.cook_url, job_uuid, 'running')
     resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
     self.assertEqual(204, resp.status_code)
     job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
     self.assertEqual('failed', job['state'])
     resp = util.session.put('%s/retry' % self.cook_url, json={'retries': 2, 'jobs': [job_uuid]})
     self.assertEqual(201, resp.status_code, resp.text)
     job = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]
     self.assertEqual('waiting', job['status'])
     job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     self.assertEqual('success', job['state'])
Example #17
    def test_federated_query(self):
        # Submit to cluster #1
        job_uuid_1, resp = util.submit_job(self.cook_url_1)
        self.assertEqual(resp.status_code, 201)

        # Submit to cluster #2
        job_uuid_2, resp = util.submit_job(self.cook_url_2)
        self.assertEqual(resp.status_code, 201)

        # Ask for both jobs from cluster #1, expect to get the first
        resp = util.query_jobs(self.cook_url_1,
                               job=[job_uuid_1, job_uuid_2],
                               partial='true')
        self.assertEqual(200, resp.status_code, resp.json())
        self.assertEqual(1, len(resp.json()))
        self.assertEqual([job_uuid_1], [job['uuid'] for job in resp.json()])

        # Ask for both jobs from cluster #2, expect to get the second
        resp = util.query_jobs(self.cook_url_2,
                               job=[job_uuid_1, job_uuid_2],
                               partial='true')
        self.assertEqual(200, resp.status_code, resp.json())
        self.assertEqual(1, len(resp.json()))
        self.assertEqual([job_uuid_2], [job['uuid'] for job in resp.json()])
Example #18
    def test_get_queue(self):
        job_uuid, resp = util.submit_job(self.master_url, constraints=[["HOSTNAME",
                                                                        "EQUALS",
                                                                        "can't schedule"]])
        self.assertEqual(201, resp.status_code, resp.content)
        slave_queue = util.session.get('%s/queue' % self.slave_url, allow_redirects=False)
        self.assertEqual(307, slave_queue.status_code)

        @retry(stop_max_delay=30000, wait_fixed=1000) # Need to wait for a rank cycle
        def check_queue():
            master_queue = util.session.get(slave_queue.headers['Location'])
            self.assertEqual(200, master_queue.status_code, master_queue.content)
            self.assertTrue(any([job['job/uuid'] == job_uuid for job in master_queue.json()['normal']]))
        check_queue()
        util.session.delete('%s/rawscheduler?job=%s' % (self.master_url, job_uuid))
Example #19
    def test_constraints(self):
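        # Jobs constrained to a specific hostname should run only on that host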
        state = util.get_mesos_state(self.mesos_url)
        hosts = [agent['hostname'] for agent in state['slaves']]

        bad_job_uuid, resp = util.submit_job(
            self.cook_url,
            constraints=[["HOSTNAME", "EQUALS", "lol won't get scheduled"]])
        self.assertEqual(resp.status_code, 201, resp.text)
        
        host_to_job_uuid = {}
        for hostname in hosts:
            constraints = [["HOSTNAME", "EQUALS", hostname]]
            job_uuid, resp = util.submit_job(self.cook_url, constraints=constraints)
            self.assertEqual(resp.status_code, 201, resp.text)
            host_to_job_uuid[hostname] = job_uuid

        for hostname, job_uuid in host_to_job_uuid.items():
            job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
            hostname_constrained = job['instances'][0]['hostname']
            self.assertEqual(hostname, hostname_constrained)
            self.assertEqual([["HOSTNAME", "EQUALS", hostname]], job['constraints'])
        # By now this job would normally have been scheduled, since jobs submitted after it
        # have already completed; its impossible hostname constraint, however, keeps it waiting
        job = util.wait_for_job(self.cook_url, bad_job_uuid, 'waiting', max_delay=3000)
Example #20
 def test_job_delete_permission(self):
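     # Another user cannot delete the job (403); only the submitting user can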
     user1, user2 = self.user_factory.new_users(2)
     with user1:
         job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30')
     try:
         self.assertEqual(resp.status_code, 201, resp.text)
         with user2:
             util.kill_jobs(self.cook_url, [job_uuid],
                            expected_status_code=403)
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid])
         job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', job['state'])
     finally:
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid])
Example #21
 def test_self_impersonate(self):
     user1 = self.user_factory.new_user()
     job_uuids = []
     try:
         # normal user can self-impersonate
         with user1.impersonating(user1):
             job_uuid, resp = util.submit_job(self.cook_url, command='sleep 1')
             self.assertEqual(resp.status_code, 201, resp.text)
             job_uuids.append(job_uuid)
         # admin can self-impersonate for admin endpoints
         # i.e., self-impersonation is treated as a non-impersonated request
         with self.admin.impersonating(self.admin):
             resp = util.query_queue(self.cook_url)
             self.assertEqual(resp.status_code, 200, resp.text)
     finally:
         with self.admin:
             util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
Example #22
 def test_multi_user_usage(self):
     users = self.user_factory.new_users(6)
     job_resources = {'cpus': 0.1, 'mem': 123}
     all_job_uuids = []
     try:
         # Start jobs for several users
         for i, user in enumerate(users):
             with user:
                 for j in range(i):
                     job_uuid, resp = util.submit_job(self.cook_url,
                                                      command='sleep 480',
                                                      **job_resources)
                     self.assertEqual(resp.status_code, 201, resp.content)
                     all_job_uuids.append(job_uuid)
         # Don't query until the jobs are all running
         util.wait_for_jobs(self.cook_url, all_job_uuids, 'running')
         # Check the usage for each of our users
         for i, user in enumerate(users):
             with user:
                 # Get the current usage
                 resp = util.user_current_usage(self.cook_url,
                                                user=user.name)
                 self.assertEqual(resp.status_code, 200, resp.content)
                 usage_data = resp.json()
                 # Check that the response structure looks as expected
                 self.assertEqual(list(usage_data.keys()), ['total_usage'],
                                  usage_data)
                 self.assertEqual(len(usage_data['total_usage']), 4,
                                  usage_data)
                 # Check that each user's usage is as expected
                 self.assertEqual(usage_data['total_usage']['mem'],
                                  job_resources['mem'] * i, usage_data)
                 self.assertEqual(usage_data['total_usage']['cpus'],
                                  job_resources['cpus'] * i, usage_data)
                 self.assertEqual(usage_data['total_usage']['gpus'], 0,
                                  usage_data)
                 self.assertEqual(usage_data['total_usage']['jobs'], i,
                                  usage_data)
     finally:
         # Terminate all of the jobs
         if all_job_uuids:
             with self.user_factory.admin():
                 util.kill_jobs(self.cook_url, all_job_uuids)
Example #23
 def test_self_impersonate(self):
     user1 = self.user_factory.new_user()
     job_uuids = []
     try:
         # normal user can self-impersonate
         with user1.impersonating(user1):
             job_uuid, resp = util.submit_job(self.cook_url,
                                              command='sleep 1')
             self.assertEqual(resp.status_code, 201, resp.text)
             job_uuids.append(job_uuid)
         # admin can self-impersonate for admin endpoints
         # i.e., self-impersonation is treated as a non-impersonated request
         with self.admin.impersonating(self.admin):
             # The /queue endpoint redirects to the master, but we don't need to follow that.
             # As long as we don't get an auth error, we're good.
             resp = util.query_queue(self.cook_url, allow_redirects=False)
             self.assertIn(resp.status_code, [200, 307], resp.text)
     finally:
         with self.admin:
             util.kill_jobs(self.cook_url, [j for j in job_uuids if j])
Example #24
 def test_job_delete_permission(self):
     user1, user2 = self.user_factory.new_users(2)
     with user1:
         job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30')
     try:
         self.assertEqual(resp.status_code, 201, resp.text)
         with user2:
             resp = util.kill_jobs(self.cook_url, [job_uuid],
                                   expected_status_code=403)
             self.assertEqual(
                 f'You are not authorized to kill the following jobs: {job_uuid}',
                 resp.json()['error'])
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid])
         job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', job['state'])
     finally:
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid],
                            assert_response=False)
Example #25
 def test_group_delete_permission(self):
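     # Another user cannot kill the group (403); only the submitting user can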
     user1, user2 = self.user_factory.new_users(2)
     with user1:
         group_spec = util.minimal_group()
         group_uuid = group_spec['uuid']
         job_uuid, resp = util.submit_job(self.cook_url,
                                          command='sleep 30',
                                          group=group_uuid)
     try:
         self.assertEqual(resp.status_code, 201, resp.text)
         with user2:
             util.kill_groups(self.cook_url, [group_uuid],
                              expected_status_code=403)
         with user1:
             util.kill_groups(self.cook_url, [group_uuid])
         job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', job['state'])
     finally:
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid],
                            assert_response=False)
Example #26
    def test_pool_scheduling(self):
        admin = self.user_factory.admin()
        user = self.user_factory.new_user()
        pools, _ = util.active_pools(self.cook_url)
        all_job_uuids = []
        try:
            default_pool = util.default_pool(self.cook_url)
            self.assertLess(1, len(pools))
            self.assertIsNotNone(default_pool)

            cpus = 0.1
            with admin:
                self.logger.info(
                    f'Running tasks: {json.dumps(util.running_tasks(self.cook_url), indent=2)}'
                )
                for pool in pools:
                    # Lower the user's cpu quota on this pool
                    pool_name = pool['name']
                    quota_multiplier = 1 if pool_name == default_pool else 2
                    util.set_limit(self.cook_url,
                                   'quota',
                                   user.name,
                                   cpus=cpus * quota_multiplier,
                                   pool=pool_name)

            with user:
                util.kill_running_and_waiting_jobs(self.cook_url, user.name)
                for pool in pools:
                    pool_name = pool['name']

                    # Submit a job that fills the user's quota on this pool
                    quota = util.get_limit(self.cook_url, 'quota', user.name,
                                           pool_name).json()
                    quota_cpus = quota['cpus']
                    filling_job_uuid, _ = util.submit_job(self.cook_url,
                                                          cpus=quota_cpus,
                                                          command='sleep 600',
                                                          pool=pool_name)
                    all_job_uuids.append(filling_job_uuid)
                    instance = util.wait_for_running_instance(
                        self.cook_url, filling_job_uuid)
                    slave_pool = util.node_pool(instance['hostname'])
                    self.assertEqual(pool_name, slave_pool)

                    # Submit a job that should not get scheduled
                    job_uuid, _ = util.submit_job(self.cook_url,
                                                  cpus=cpus,
                                                  command='ls',
                                                  pool=pool_name)
                    all_job_uuids.append(job_uuid)
                    job = util.load_job(self.cook_url, job_uuid)
                    self.assertEqual('waiting', job['status'])

                    # Assert that the unscheduled reason and data are correct
                    @retry(stop_max_delay=60000, wait_fixed=5000)
                    def check_unscheduled_reason():
                        jobs, _ = util.unscheduled_jobs(
                            self.cook_url, job_uuid)
                        self.logger.info(f'Unscheduled jobs: {jobs}')
                        self.assertEqual(job_uuid, jobs[0]['uuid'])
                        job_reasons = jobs[0]['reasons']
                        # Check the spot-in-queue reason
                        reason = next(r for r in job_reasons if r['reason'] ==
                                      'You have 1 other jobs ahead in the '
                                      'queue.')
                        self.assertEqual({'jobs': [filling_job_uuid]},
                                         reason['data'])
                        # Check the exceeding-quota reason
                        reason = next(
                            r for r in job_reasons
                            if r['reason'] == reasons.JOB_WOULD_EXCEED_QUOTA)
                        self.assertEqual(
                            {
                                'cpus': {
                                    'limit': quota_cpus,
                                    'usage': quota_cpus + cpus
                                }
                            }, reason['data'])

                    check_unscheduled_reason()
        finally:
            with admin:
                util.kill_jobs(self.cook_url,
                               all_job_uuids,
                               assert_response=False)
                for pool in pools:
                    util.reset_limit(self.cook_url,
                                     'quota',
                                     user.name,
                                     reason=self.current_name(),
                                     pool=pool['name'])
Example #27
 def test_failing_submit(self):
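     # A command that exits non-zero should produce a single failed instance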
     job_uuid, resp = util.submit_job(self.cook_url, command='exit 1')
     self.assertEqual(201, resp.status_code)
     job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     self.assertEqual(1, len(job['instances']))
     self.assertEqual('failed', job['instances'][0]['status'])
Example #28
 def test_basic_submit(self):
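     # A default job submission should run to success, with mea culpa retries left enabled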
     job_uuid, resp = util.submit_job(self.cook_url)
     self.assertEqual(resp.status_code, 201)
     job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     self.assertEqual('success', job['instances'][0]['status'])
     self.assertEqual(False, job['disable_mea_culpa_retries'])
Example #29
    def test_checkpoint_locality(self):
        """
        Test that restored instances run in the same location as their checkpointed instances.
        """
        # Get the set of clusters that correspond to the pool under test and are running
        pool = util.default_submit_pool()
        clusters = util.compute_clusters(self.cook_url)
        running_clusters = [
            c for c in clusters['in-mem-configs']
            if pool in c['cluster-definition']['config']['synthetic-pods']
            ['pools'] and c['state'] == 'running'
        ]
        self.logger.info(
            f'Running clusters for pool {pool}: {running_clusters}')
        if len(running_clusters) == 0:
            self.skipTest(
                f'Requires at least 1 running compute cluster for pool {pool}')

        # Submit an initial canary job
        job_uuid, resp = util.submit_job(self.cook_url,
                                         pool=pool,
                                         command='true')
        self.assertEqual(201, resp.status_code, resp.content)
        util.wait_for_instance(self.cook_url,
                               job_uuid,
                               status='success',
                               indent=None)

        # Submit a long-running job with checkpointing
        checkpoint_job_uuid, resp = util.submit_job(
            self.cook_url,
            pool=pool,
            command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
            max_retries=5,
            checkpoint={'mode': 'auto'})
        self.assertEqual(201, resp.status_code, resp.content)

        try:
            # Wait for the job to be running
            checkpoint_instance = util.wait_for_instance(self.cook_url,
                                                         checkpoint_job_uuid,
                                                         status='running',
                                                         indent=None)
            checkpoint_instance_uuid = checkpoint_instance['task_id']
            checkpoint_location = next(
                c['location'] for c in running_clusters
                if c['name'] == checkpoint_instance['compute-cluster']['name'])

            admin = self.user_factory.admin()
            try:
                # Force all clusters in the instance's location to have state = draining
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            cluster_update = dict(cluster)
                            # Set state = draining
                            cluster_update['state'] = 'draining'
                            cluster_update['state-locked?'] = True
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to draining: {cluster_update}'
                            )
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Kill the running checkpoint job instance
                util.kill_instance(self.cook_url, checkpoint_instance_uuid)

                # Submit another canary job
                job_uuid, resp = util.submit_job(self.cook_url,
                                                 pool=pool,
                                                 command='true')
                self.assertEqual(201, resp.status_code, resp.content)

                cluster_locations = set(c['location']
                                        for c in running_clusters)
                if len(cluster_locations) > 1:
                    # The canary job should run in the non-draining location
                    self.logger.info(
                        f'There are > 1 cluster locations under test: {cluster_locations}'
                    )
                    util.wait_for_instance(self.cook_url,
                                           job_uuid,
                                           status='success',
                                           indent=None)
                else:
                    self.logger.info(
                        f'There is only 1 cluster location under test: {cluster_locations}'
                    )

                # The checkpoint job should be waiting
                util.wait_for_instance(self.cook_url,
                                       checkpoint_job_uuid,
                                       status='failed',
                                       indent=None)
                util.wait_for_job_in_statuses(self.cook_url,
                                              checkpoint_job_uuid, ['waiting'])
            finally:
                # Revert all clusters in the instance's location to state = running
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            cluster_update = dict(cluster)
                            # Set state = running
                            cluster_update['state'] = 'running'
                            cluster_update['state-locked?'] = False
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to running: {cluster_update}'
                            )
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Wait for the checkpoint job to be running again, in the same location as before
                checkpoint_instance = util.wait_for_instance(
                    self.cook_url,
                    checkpoint_job_uuid,
                    status='running',
                    indent=None)
                self.assertEqual(
                    checkpoint_location,
                    next(c['location'] for c in running_clusters if c['name']
                         == checkpoint_instance['compute-cluster']['name']))
        finally:
            # Kill the checkpoint job to not leave it running
            util.kill_jobs(self.cook_url, [checkpoint_job_uuid])
Example #30
    def test_dynamic_clusters(self):
        """
        Test that dynamic cluster configuration functionality is working.
        """
        docker_image = util.docker_image()
        container = {'type': 'docker', 'docker': {'image': docker_image}}
        admin = self.user_factory.admin()
        # Force all clusters to have state = deleted via the API
        clusters = [
            cluster
            for cluster in util.compute_clusters(self.cook_url)['db-configs']
            if cluster["state"] == "running"
        ]
        with admin:
            self.logger.info(f'Clusters {clusters}')
            # First state = draining
            for cluster in clusters:
                cluster["state"] = "draining"
                cluster["state-locked?"] = True
                self.logger.info(f'Trying to update cluster {cluster}')
                data, resp = util.update_compute_cluster(
                    self.cook_url, cluster)
                self.assertEqual(201, resp.status_code, resp.content)
            # Then state = deleted
            for cluster in clusters:
                cluster["state"] = "deleted"
                util.wait_until(
                    lambda: util.update_compute_cluster(
                        self.cook_url, cluster),
                    lambda x: 201 == x[1].status_code, 300000, 5000)
            # Create at least one new cluster with a unique test name (using one of the existing cluster's IP and cert)
            test_cluster_name = f'test_cluster_{round(time.time() * 1000)}'
            test_cluster = {
                "name": test_cluster_name,
                "state": "running",
                "base-path": clusters[0]["base-path"],
                "ca-cert": clusters[0]["ca-cert"],
                "template": clusters[0]["template"]
            }
            data, resp = util.create_compute_cluster(self.cook_url,
                                                     test_cluster)
            self.assertEqual(201, resp.status_code, resp.content)
            # Test create cluster with duplicate name
            data, resp = util.create_compute_cluster(self.cook_url,
                                                     test_cluster)
            self.assertEqual(422, resp.status_code, resp.content)
            self.assertEqual(
                f'Compute cluster with name {test_cluster_name} already exists',
                data['error']['message'], resp.content)

        # Check that a job schedules successfully
        command = "true"
        job_uuid, resp = util.submit_job(self.cook_url,
                                         command=command,
                                         container=container)
        self.assertEqual(201, resp.status_code, resp.content)
        instance = util.wait_for_instance(self.cook_url, job_uuid)
        message = repr(instance)
        self.assertIsNotNone(instance['compute-cluster'], message)
        instance_compute_cluster_name = instance['compute-cluster']['name']
        self.assertEqual(test_cluster["name"], instance_compute_cluster_name,
                         instance['compute-cluster'])
        util.wait_for_instance(self.cook_url, job_uuid, status='success')
        running_clusters = [
            cluster
            for cluster in util.compute_clusters(self.cook_url)['db-configs']
            if cluster["state"] == "running"
        ]
        self.assertEqual(1, len(running_clusters), running_clusters)
        self.assertEqual(test_cluster["name"], running_clusters[0]["name"],
                         running_clusters)

        with admin:
            # Delete test cluster
            # First state = draining
            test_cluster["state"] = "draining"
            data, resp = util.update_compute_cluster(self.cook_url,
                                                     test_cluster)
            self.assertEqual(201, resp.status_code, resp.content)
            # Then state = deleted
            test_cluster["state"] = "deleted"
            util.wait_until(
                lambda: util.update_compute_cluster(self.cook_url, test_cluster
                                                    ),
                lambda x: 201 == x[1].status_code, 300000, 5000)
            # Hard-delete the original non-test clusters
            for cluster in clusters:
                self.logger.info(f'Trying to delete cluster {cluster}')
                resp = util.delete_compute_cluster(self.cook_url, cluster)
                self.assertEqual(204, resp.status_code, resp.content)
            # Force give up leadership
            resp = util.shutdown_leader(self.cook_url, "test_dynamic_clusters")
            self.assertEqual(b'Accepted', resp)

        # Old clusters should be re-created
        # wait for cook to come up
        util.wait_until(
            lambda: [
                cluster for cluster in util.compute_clusters(self.cook_url)[
                    'db-configs'] if cluster["state"] == "running"
            ], lambda x: len(x) == len(clusters), 420000, 5000)
        # Check that a job schedules successfully
        command = "true"
        job_uuid, resp = util.submit_job(self.cook_url,
                                         command=command,
                                         container=container)
        self.assertEqual(201, resp.status_code, resp.content)
        util.wait_for_instance(self.cook_url, job_uuid, status='success')

        with admin:
            # Hard-delete test cluster
            resp = util.delete_compute_cluster(self.cook_url, test_cluster)
            self.assertEqual(204, resp.status_code, resp.content)