Exemplo n.º 1
0
 def test_cancel_job(self):
     """Delete a running job via the rawscheduler endpoint and expect it to be reported as failed."""
     uuid_of_job, _ = util.submit_job(self.cook_url, command='sleep 300')
     util.wait_for_job(self.cook_url, uuid_of_job, 'running')
     # The delete and the follow-up query both target the same per-job URL
     job_endpoint = '%s/rawscheduler?job=%s' % (self.cook_url, uuid_of_job)
     delete_resp = util.session.delete(job_endpoint)
     self.assertEqual(204, delete_resp.status_code)
     queried_job = util.session.get(job_endpoint).json()[0]
     self.assertEqual('failed', queried_job['state'])
Exemplo n.º 2
0
 def gpu_submit_helper(self, pool_name, gpu_count, gpu_model):
     """Submit a GPU job to the given pool and check that it completes successfully.

     The submitted command runs nvidia-smi and exits 0 only when both the number of
     attached GPUs and the number of output lines mentioning the expected model name
     equal ``gpu_count``. The job is always killed afterwards.

     :param pool_name: pool the job is submitted to
     :param gpu_count: number of GPUs requested, and the count the command checks for
     :param gpu_model: GPU model identifier; passed to the job as COOK_GPU_MODEL
     :raises Exception: wrapping any failure while waiting for or checking the job
     """
     # Remove the vendor prefix explicitly. str.lstrip('nvidia-') would strip any leading
     # run of the characters n, v, i, d, a and '-', turning e.g. 'nvidia-a100' into '100'.
     vendor_prefix = 'nvidia-'
     if gpu_model.startswith(vendor_prefix):
         model_suffix = gpu_model[len(vendor_prefix):]
     else:
         model_suffix = gpu_model
     query_model_name = model_suffix.replace('-', ' ').title()
     command = (
         '/usr/bin/nvidia-smi && /usr/bin/nvidia-smi -q > nvidia-smi-output && '
         f'cat nvidia-smi-output; expected_model="{query_model_name}"; '
         'num_gpus=$(grep "Attached GPUs" nvidia-smi-output | cut -d \':\' -f 2 | tr -d \'[:space:]\'); echo "num_gpus=$num_gpus"; '
         'num_expected_model=$(grep "$expected_model" nvidia-smi-output | wc -l); echo "num_expected_model=$num_expected_model"; '
         f'if [[ $num_gpus -eq {gpu_count} && $num_expected_model -eq {gpu_count} ]]; then exit 0; else exit 1; fi'
     )
     job_uuid = self.client.submit(command=command,
                                   cpus=0.5,
                                   mem=256.0,
                                   pool=pool_name,
                                   gpus=gpu_count,
                                   env={'COOK_GPU_MODEL': gpu_model},
                                   max_retries=5)
     try:
         util.wait_for_job(type(self).cook_url, job_uuid, 'completed')
         job = self.client.query(job_uuid)
         self.assertEqual(JobState.SUCCESS, job.state)
     except Exception as e:
         # Re-raise with the model and pool in the message; the original error stays chained
         raise Exception(
             f"Submitting job with GPU {gpu_model} to pool {pool_name} failed"
         ) from e
     finally:
         # Always kill the job, whether or not the checks above passed
         self.client.kill(job_uuid)
Exemplo n.º 3
0
 def test_cancel_instance(self):
     """Delete the first instance of a running job and expect the job to still finish successfully."""
     uuid_of_job, _ = util.submit_job(self.cook_url, command='sleep 10', max_retries=2)
     running_job = util.wait_for_job(self.cook_url, uuid_of_job, 'running')
     first_instance = running_job['instances'][0]
     instance_endpoint = '%s/rawscheduler?instance=%s' % (self.cook_url, first_instance['task_id'])
     delete_resp = util.session.delete(instance_endpoint)
     self.assertEqual(204, delete_resp.status_code)
     # The job was submitted with max_retries=2, and is expected to end up successful
     finished_job = util.wait_for_job(self.cook_url, uuid_of_job, 'completed')
     self.assertEqual('success', finished_job['state'])
Exemplo n.º 4
0
 def test_basic_submit(self):
     """Submit one default job to each of the two scheduler URLs and expect both to succeed."""
     submitted = []
     # Submit to both schedulers first ...
     for scheduler_url in (self.cook_url_1, self.cook_url_2):
         job_uuid, submit_resp = util.submit_job(scheduler_url)
         self.assertEqual(submit_resp.status_code, 201)
         submitted.append((scheduler_url, job_uuid))
     # ... then wait for each job in submission order
     for scheduler_url, job_uuid in submitted:
         finished_job = util.wait_for_job(scheduler_url, job_uuid, 'completed')
         self.assertEqual('success', finished_job['instances'][0]['status'])
Exemplo n.º 5
0
 def test_change_retries(self):
     """Delete a running job, raise its retries via the retry endpoint, and expect it to run to success."""
     uuid_of_job, _ = util.submit_job(self.cook_url, command='sleep 10')
     util.wait_for_job(self.cook_url, uuid_of_job, 'running')
     job_endpoint = '%s/rawscheduler?job=%s' % (self.cook_url, uuid_of_job)

     def fetch_job():
         # Query the scheduler for the current view of the job
         return util.session.get(job_endpoint).json()[0]

     delete_resp = util.session.delete(job_endpoint)
     self.assertEqual(204, delete_resp.status_code)
     self.assertEqual('failed', fetch_job()['state'])

     retry_payload = {'retries': 2, 'jobs': [uuid_of_job]}
     retry_resp = util.session.put('%s/retry' % self.cook_url, json=retry_payload)
     self.assertEqual(201, retry_resp.status_code, retry_resp.text)
     self.assertEqual('waiting', fetch_job()['status'])

     finished_job = util.wait_for_job(self.cook_url, uuid_of_job, 'completed')
     self.assertEqual('success', finished_job['state'])
Exemplo n.º 6
0
    def test_allow_partial(self):
        """Check how job and instance queries behave when some requested uuids do not exist.

        Without ``partial`` (or with ``partial='false'``) a query containing an unknown uuid
        is expected to return 404 and name the unknown uuid in its error message. With
        ``partial='true'`` it is expected to return 200 with only the known jobs.
        """
        def absent_uuids(response):
            # Pull every uuid-looking token out of the response's error message
            return [part for part in response.json()['error'].split() if util.is_valid_uuid(part)]

        job_uuid_1, resp = util.submit_job(self.cook_url)
        self.assertEqual(201, resp.status_code)
        job_uuid_2, resp = util.submit_job(self.cook_url)
        self.assertEqual(201, resp.status_code)
        # sorted() returns a new list; list.sort() returns None, which would make the
        # uuid comparisons below compare None with None and always pass
        expected_job_uuids = sorted([job_uuid_1, job_uuid_2])

        # Only valid job uuids
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2])
        self.assertEqual(200, resp.status_code)

        # Mixed valid, invalid job uuids
        bogus_uuid = str(uuid.uuid4())
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid])
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='false')
        self.assertEqual(404, resp.status_code, resp.json())
        self.assertEqual([bogus_uuid], absent_uuids(resp))

        # Partial results with mixed valid, invalid job uuids
        resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='true')
        self.assertEqual(200, resp.status_code, resp.json())
        self.assertEqual(2, len(resp.json()))
        self.assertEqual(expected_job_uuids, sorted(job['uuid'] for job in resp.json()))

        # Only valid instance uuids
        job = util.wait_for_job(self.cook_url, job_uuid_1, 'completed')
        instance_uuid_1 = job['instances'][0]['task_id']
        job = util.wait_for_job(self.cook_url, job_uuid_2, 'completed')
        instance_uuid_2 = job['instances'][0]['task_id']
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2])
        self.assertEqual(200, resp.status_code)

        # Mixed valid, invalid instance uuids
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid])
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='false')
        self.assertEqual(404, resp.status_code)
        self.assertEqual([bogus_uuid], absent_uuids(resp))

        # Partial results with mixed valid, invalid instance uuids
        resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='true')
        self.assertEqual(200, resp.status_code)
        self.assertEqual(2, len(resp.json()))
        self.assertEqual(expected_job_uuids, sorted(job['uuid'] for job in resp.json()))
Exemplo n.º 7
0
 def test_explicit_group(self):
     """Submit two jobs together with an explicit group spec and expect both to report that group."""
     group = self.minimal_group()
     first_job = util.minimal_job(group=group["uuid"])
     second_job = util.minimal_job(group=group["uuid"])
     payload = {'jobs': [first_job, second_job], 'groups': [group]}
     submit_resp = util.session.post('%s/rawscheduler' % self.cook_url, json=payload)
     self.assertEqual(submit_resp.status_code, 201)

     query_url = '%s/rawscheduler?job=%s&job=%s' % (self.cook_url, first_job['uuid'], second_job['uuid'])
     query_resp = util.session.get(query_url)
     self.assertEqual(200, query_resp.status_code)
     queried_jobs = query_resp.json()
     # Each returned job must list the submitted group as its first group
     for index in (0, 1):
         self.assertEqual(group['uuid'], queried_jobs[index]['groups'][0])

     for submitted_job in (first_job, second_job):
         util.wait_for_job(self.cook_url, submitted_job['uuid'], 'completed')
Exemplo n.º 8
0
 def test_max_runtime_exceeded(self):
     """Submit a job that sleeps past its max_runtime and expect a single failed instance with reason code 2003."""
     timeout_interval_minutes = util.get_in(self.settings(), 'task-constraints', 'timeout-interval-minutes')
     # Sleep a little longer than two timeout intervals so that the lingering
     # task killer gets at least two runs while the job is alive
     sleep_seconds = (2 * timeout_interval_minutes * 60) + 15
     sleep_command = 'sleep %s' % sleep_seconds
     job_uuid, submit_resp = util.submit_job(self.cook_url, command=sleep_command, max_runtime=5000)
     self.assertEqual(201, submit_resp.status_code)
     finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed', sleep_seconds * 1000)
     self.assertEqual(1, len(finished_job['instances']))
     only_instance = finished_job['instances'][0]
     self.assertEqual('failed', only_instance['status'])
     self.assertEqual(2003, only_instance['reason_code'])
Exemplo n.º 9
0
    def test_expected_runtime_field(self):
        """Check that expected_runtime is accepted and echoed back, and rejected when above max_runtime."""
        # A job carrying expected_runtime is accepted, succeeds, and reports the value back
        runtime_estimate = 1
        job_uuid, submit_resp = util.submit_job(self.cook_url, expected_runtime=runtime_estimate)
        self.assertEqual(submit_resp.status_code, 201)
        finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('success', finished_job['instances'][0]['status'])
        self.assertEqual(runtime_estimate, finished_job['expected_runtime'])

        # An expected_runtime larger than max_runtime must be rejected
        oversized_estimate = 2
        runtime_cap = oversized_estimate - 1
        _, rejected_resp = util.submit_job(self.cook_url,
                                           expected_runtime=oversized_estimate,
                                           max_runtime=runtime_cap)
        self.assertEqual(rejected_resp.status_code, 400)
Exemplo n.º 10
0
 def test_straggler_handling(self):
     """Submit a fast and a slow job in a group with quantile-deviation straggler handling.

     The fast job is expected to succeed, and the slow one to fail with reason code 2004.
     """
     quantile_deviation = {
         'type': 'quantile-deviation',
         'parameters': {'quantile': 0.5, 'multiplier': 2.0},
     }
     group = self.minimal_group(straggler_handling=quantile_deviation)
     quick_job = util.minimal_job(group=group["uuid"])
     slow_job = util.minimal_job(group=group["uuid"], command='sleep 120')
     payload = {'jobs': [quick_job, slow_job], 'groups': [group]}
     submit_resp = util.session.post('%s/rawscheduler' % self.cook_url, json=payload)
     self.assertEqual(submit_resp.status_code, 201)

     for submitted_job in (quick_job, slow_job):
         util.wait_for_job(self.cook_url, submitted_job['uuid'], 'completed')

     query_url = '%s/rawscheduler?job=%s&job=%s' % (self.cook_url, quick_job['uuid'], slow_job['uuid'])
     query_resp = util.session.get(query_url)
     self.assertEqual(200, query_resp.status_code)
     queried_jobs = query_resp.json()
     self.logger.debug('Loaded jobs %s', queried_jobs)
     self.assertEqual('success', queried_jobs[0]['state'])
     slow_result = queried_jobs[1]
     self.assertEqual('failed', slow_result['state'])
     self.assertEqual(2004, slow_result['instances'][0]['reason_code'])
Exemplo n.º 11
0
    def test_constraints(self):
        """Check that HOSTNAME EQUALS constraints pin jobs to hosts and that an unmatched constraint keeps a job waiting."""
        mesos_state = util.get_mesos_state(self.mesos_url)
        agent_hostnames = [agent['hostname'] for agent in mesos_state['slaves']]

        # Submitted first, with a hostname that is not expected to match any agent
        unmatched_constraints = [["HOSTNAME", "EQUALS", "lol won't get scheduled"]]
        bad_job_uuid, submit_resp = util.submit_job(self.cook_url, constraints=unmatched_constraints)
        self.assertEqual(submit_resp.status_code, 201, submit_resp.text)

        # One job per agent, each pinned to that agent's hostname
        host_to_job_uuid = {}
        for agent_hostname in agent_hostnames:
            pinned_uuid, submit_resp = util.submit_job(self.cook_url,
                                                       constraints=[["HOSTNAME", "EQUALS", agent_hostname]])
            self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
            host_to_job_uuid[agent_hostname] = pinned_uuid

        for agent_hostname, pinned_uuid in host_to_job_uuid.items():
            finished_job = util.wait_for_job(self.cook_url, pinned_uuid, 'completed')
            self.assertEqual(agent_hostname, finished_job['instances'][0]['hostname'])
            self.assertEqual([["HOSTNAME", "EQUALS", agent_hostname]], finished_job['constraints'])

        # Jobs submitted after the first one have completed by now, so it would have been
        # scheduled already were it not for its constraint; it should still be waiting
        util.wait_for_job(self.cook_url, bad_job_uuid, 'waiting', max_delay=3000)
Exemplo n.º 12
0
    def test_application_field(self):
        """Check that a complete application is accepted and echoed back, and an incomplete one is rejected."""
        # A complete application (name and version) is accepted and reported back on the job
        app_info = {'name': 'foo-app', 'version': '0.1.0'}
        job_uuid, submit_resp = util.submit_job(self.cook_url, application=app_info)
        self.assertEqual(submit_resp.status_code, 201)
        finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('success', finished_job['instances'][0]['status'])
        self.assertEqual(app_info, finished_job['application'])

        # Leaving out the name, and then the version, must each be rejected
        incomplete_applications = ({'version': '0.1.0'}, {'name': 'foo-app'})
        for incomplete in incomplete_applications:
            _, rejected_resp = util.submit_job(self.cook_url, application=incomplete)
            self.assertEqual(rejected_resp.status_code, 400)
Exemplo n.º 13
0
 def test_job_delete_permission(self):
     """Check that a job can be killed by its submitter but not by another user.

     A kill attempted as a second user is expected to return 403; the kill issued as
     the submitting user is then expected to leave the job completed in state 'failed'.
     """
     user1, user2 = self.user_factory.new_users(2)
     with user1:
         job_uuid, resp = util.submit_job(self.cook_url, command='sleep 30')
     try:
         self.assertEqual(resp.status_code, 201, resp.text)
         with user2:
             util.kill_jobs(self.cook_url, [job_uuid],
                            expected_status_code=403)
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid])
         job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', job['state'])
     finally:
         # Best-effort cleanup: skip the response assertion so that a failing cleanup
         # kill cannot replace the error raised by the checks above
         with user1:
             util.kill_jobs(self.cook_url, [job_uuid],
                            assert_response=False)
Exemplo n.º 14
0
 def test_job_delete_permission(self):
     """Check that a non-owner's kill is refused with 403 and an explanatory error, while the owner's kill works."""
     owner, other_user = self.user_factory.new_users(2)
     with owner:
         job_uuid, submit_resp = util.submit_job(self.cook_url, command='sleep 30')
     try:
         self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
         with other_user:
             denied_resp = util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
             expected_error = f'You are not authorized to kill the following jobs: {job_uuid}'
             self.assertEqual(expected_error, denied_resp.json()['error'])
         with owner:
             util.kill_jobs(self.cook_url, [job_uuid])
         finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', finished_job['state'])
     finally:
         # Cleanup kill as the owner, without asserting on the response
         with owner:
             util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
Exemplo n.º 15
0
 def test_group_delete_permission(self):
     """Check that a non-owner's group kill is refused with 403, while the owner's group kill fails the job."""
     owner, other_user = self.user_factory.new_users(2)
     with owner:
         group_uuid = util.minimal_group()['uuid']
         job_uuid, submit_resp = util.submit_job(self.cook_url, command='sleep 30', group=group_uuid)
     try:
         self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
         with other_user:
             util.kill_groups(self.cook_url, [group_uuid], expected_status_code=403)
         with owner:
             util.kill_groups(self.cook_url, [group_uuid])
         finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
         self.assertEqual('failed', finished_job['state'])
     finally:
         # Cleanup kill of the job itself as the owner, without asserting on the response
         with owner:
             util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
Exemplo n.º 16
0
    def test_get_job(self):
        """Submit a minimal job and check the fields and types of the completed job returned by the scheduler."""
        # schedule a job
        job_spec = util.minimal_job()
        resp = util.session.post('%s/rawscheduler' % self.cook_url, json={'jobs': [job_spec]})
        self.assertEqual(201, resp.status_code)

        # query for the same job & ensure the response has what it's supposed to have
        job = util.wait_for_job(self.cook_url, job_spec['uuid'], 'completed')

        # Fields that must be echoed back exactly as submitted
        for field in ('mem', 'max_retries', 'name', 'priority', 'uuid', 'cpus'):
            self.assertEqual(job_spec[field], job[field], field)

        # 9223372036854775807 is MAX_LONG(ish), the default value for max_runtime
        self.assertEqual(9223372036854775807, job['max_runtime'])
        self.assertEqual('success', job['state'])
        self.assertEqual('completed', job['status'])
        self.assertIsInstance(job['submit_time'], int)
        for key in ('labels', 'env', 'framework_id', 'ports', 'instances', 'uris', 'retries_remaining'):
            self.assertIn(key, job)

        instance = job['instances'][0]
        for key in ('executor_id', 'hostname', 'slave_id', 'ports', 'task_id'):
            self.assertIn(key, instance)
        for key in ('start_time', 'end_time'):
            self.assertIsInstance(instance[key], int, key)
        for key in ('preempted', 'backfilled'):
            self.assertIsInstance(instance[key], bool, key)
Exemplo n.º 17
0
 def test_show_running_job(self):
     """Submit a job through the CLI and expect `show` to exit with 0 while the job is running."""
     submit_cp, job_uuids = cli.submit('sleep 60', self.cook_url)
     self.assertEqual(0, submit_cp.returncode, submit_cp.stderr)
     first_uuid = job_uuids[0]
     util.wait_for_job(self.cook_url, first_uuid, 'running')
     show_cp = cli.show(job_uuids, self.cook_url)
     self.assertEqual(0, show_cp.returncode, show_cp.stderr)
Exemplo n.º 18
0
 def test_failing_submit(self):
     """Submit a job whose command is `exit 1` and expect exactly one instance, with status failed."""
     job_uuid, submit_resp = util.submit_job(self.cook_url, command='exit 1')
     self.assertEqual(201, submit_resp.status_code)
     finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     instances = finished_job['instances']
     self.assertEqual(1, len(instances))
     self.assertEqual('failed', instances[0]['status'])
Exemplo n.º 19
0
 def test_list_by_state(self):
     """Check that listing jobs filters by a single state and by combinations of states.

     Four jobs sharing one freshly generated name are submitted so that one is waiting,
     one running, one completed successfully and one completed as failed. The listing is
     then checked for each state, for 'all', and for two state combinations.
     """
     name = str(uuid.uuid4())

     def submit_named(command):
         # Submit a command under the shared name and return the new job's uuid
         cp, uuids = cli.submit(command,
                                self.cook_url,
                                submit_flags='--name %s' % name)
         self.assertEqual(0, cp.returncode, cp.stderr)
         return uuids[0]

     # waiting
     raw_job = {
         'command': 'ls',
         'name': name,
         'constraints': [['HOSTNAME', 'EQUALS', 'will not get scheduled']]
     }
     cp, uuids = cli.submit(stdin=cli.encode(json.dumps(raw_job)),
                            cook_url=self.cook_url,
                            submit_flags='--raw')
     # Check the exit code before touching uuids, so that a failed submit is reported
     # with the CLI's stderr instead of failing on the uuid lookup
     self.assertEqual(0, cp.returncode, cp.stderr)
     waiting_uuid = uuids[0]
     user = util.get_user(self.cook_url, waiting_uuid)

     def assert_listed(expected_uuids, *states):
         # List the shared-name jobs in the given states; exactly the expected uuids must come back
         cp, jobs = self.list_jobs(name, user, *states)
         self.assertEqual(0, cp.returncode, cp.stderr)
         self.assertEqual(len(expected_uuids), len(jobs))
         listed_uuids = [j['uuid'] for j in jobs]
         for expected_uuid in expected_uuids:
             self.assertIn(expected_uuid, listed_uuids)

     util.wait_for_job(self.cook_url, waiting_uuid, 'waiting')
     assert_listed([waiting_uuid], 'waiting')
     # running
     running_uuid = submit_named('sleep 60')
     util.wait_for_job(self.cook_url, running_uuid, 'running')
     assert_listed([running_uuid], 'running')
     # completed
     success_uuid = submit_named('ls')
     util.wait_for_job(self.cook_url, success_uuid, 'completed')
     assert_listed([success_uuid], 'completed')
     # success
     assert_listed([success_uuid], 'success')
     # failed
     failed_uuid = submit_named('exit 1')
     util.wait_for_job(self.cook_url, failed_uuid, 'completed')
     assert_listed([failed_uuid], 'failed')
     # all
     assert_listed([waiting_uuid, running_uuid, success_uuid, failed_uuid], 'all')
     # waiting+running
     assert_listed([waiting_uuid, running_uuid], 'waiting', 'running')
     # completed+waiting
     assert_listed([waiting_uuid, success_uuid, failed_uuid], 'completed', 'waiting')
Exemplo n.º 20
0
 def test_basic_submit(self):
     """Submit a default job and expect it to succeed with mea-culpa retries not disabled."""
     job_uuid, submit_resp = util.submit_job(self.cook_url)
     self.assertEqual(submit_resp.status_code, 201)
     finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
     first_instance = finished_job['instances'][0]
     self.assertEqual('success', first_instance['status'])
     self.assertEqual(False, finished_job['disable_mea_culpa_retries'])