예제 #1
0
파일: cli.py 프로젝트: kathryn-zhou/Cook
def wait_for_output_file(cook_url, job_uuid, name):
    """
    Poll the file listing of a job until an entry called `name` is found.

    Returns whatever util.wait_until returns once the predicate is satisfied.
    """
    def list_entries():
        # A non-zero exit code from `ls` is treated as an empty listing
        completed, _ = ls(job_uuid, cook_url, parse_json=False)
        if completed.returncode != 0:
            return []
        return json.loads(stdout(completed))

    def has_named_entry(entries):
        logging.debug(f'Job {job_uuid} has entries {entries}')
        return ls_entry_by_name(entries, name)

    return util.wait_until(list_entries, has_named_entry)
예제 #2
0
    def test_dynamic_clusters(self):
        """
        Exercise dynamic compute-cluster configuration end to end.

        The test drains and deletes every running cluster, creates a uniquely
        named test cluster from the first original cluster's settings, runs a
        job on it, drains and deletes the test cluster, hard-deletes the
        original clusters and asks the leader to shut down. It then waits for
        the original number of running clusters to reappear, runs another job,
        and hard-deletes the test cluster.
        """
        def running_db_clusters():
            # Entries under 'db-configs' whose state is "running"
            configs = util.compute_clusters(self.cook_url)['db-configs']
            return [config for config in configs
                    if config["state"] == "running"]

        def update_accepted(result):
            # result is the (data, response) pair from update_compute_cluster
            return 201 == result[1].status_code

        def submit_true_job():
            # Submit a trivial containerized job and require a 201 response
            uuid, submit_resp = util.submit_job(self.cook_url,
                                                command="true",
                                                container=container)
            self.assertEqual(201, submit_resp.status_code, submit_resp.content)
            return uuid

        container = {'type': 'docker', 'docker': {'image': util.docker_image()}}
        admin = self.user_factory.admin()
        # Snapshot of the clusters that are running before the test touches them
        clusters = running_db_clusters()

        with admin:
            self.logger.info(f'Clusters {clusters}')
            # Step one: move every cluster to draining and lock its state
            for cluster in clusters:
                cluster.update({"state": "draining", "state-locked?": True})
                self.logger.info(f'Trying to update cluster {cluster}')
                _, update_resp = util.update_compute_cluster(
                    self.cook_url, cluster)
                self.assertEqual(201, update_resp.status_code,
                                 update_resp.content)
            # Step two: retry the move to deleted until the API answers 201
            for cluster in clusters:
                cluster["state"] = "deleted"
                util.wait_until(
                    lambda: util.update_compute_cluster(
                        self.cook_url, cluster),
                    update_accepted, 300000, 5000)
            # Create a new cluster with a unique test name, borrowing the
            # base path, cert and template of the first original cluster
            test_cluster_name = f'test_cluster_{round(time.time() * 1000)}'
            test_cluster = {"name": test_cluster_name, "state": "running"}
            for key in ("base-path", "ca-cert", "template"):
                test_cluster[key] = clusters[0][key]
            _, create_resp = util.create_compute_cluster(self.cook_url,
                                                         test_cluster)
            self.assertEqual(201, create_resp.status_code, create_resp.content)
            # Creating a cluster with a duplicate name must be rejected
            dup_data, dup_resp = util.create_compute_cluster(self.cook_url,
                                                             test_cluster)
            self.assertEqual(422, dup_resp.status_code, dup_resp.content)
            self.assertEqual(
                f'Compute cluster with name {test_cluster_name} already exists',
                dup_data['error']['message'], dup_resp.content)

        # A job must now be scheduled on the test cluster and succeed
        job_uuid = submit_true_job()
        instance = util.wait_for_instance(self.cook_url, job_uuid)
        self.assertIsNotNone(instance['compute-cluster'], repr(instance))
        self.assertEqual(test_cluster["name"],
                         instance['compute-cluster']['name'],
                         instance['compute-cluster'])
        util.wait_for_instance(self.cook_url, job_uuid, status='success')
        # The test cluster must be the only running cluster at this point
        running_clusters = running_db_clusters()
        self.assertEqual(1, len(running_clusters), running_clusters)
        self.assertEqual(test_cluster["name"], running_clusters[0]["name"],
                         running_clusters)

        with admin:
            # Retire the test cluster: draining first, then deleted
            test_cluster["state"] = "draining"
            _, drain_resp = util.update_compute_cluster(self.cook_url,
                                                        test_cluster)
            self.assertEqual(201, drain_resp.status_code, drain_resp.content)
            test_cluster["state"] = "deleted"
            util.wait_until(
                lambda: util.update_compute_cluster(self.cook_url,
                                                    test_cluster),
                update_accepted, 300000, 5000)
            # Hard-delete the original non-test clusters
            for cluster in clusters:
                self.logger.info(f'Trying to delete cluster {cluster}')
                delete_resp = util.delete_compute_cluster(self.cook_url,
                                                          cluster)
                self.assertEqual(204, delete_resp.status_code,
                                 delete_resp.content)
            # Force the current leader to give up leadership
            shutdown_resp = util.shutdown_leader(self.cook_url,
                                                 "test_dynamic_clusters")
            self.assertEqual(b'Accepted', shutdown_resp)

        # Old clusters should be re-created; wait for cook to come back up
        # with as many running clusters as there were at the start
        expected_count = len(clusters)
        util.wait_until(running_db_clusters,
                        lambda found: len(found) == expected_count,
                        420000, 5000)
        # A job must schedule and succeed on the restored clusters
        job_uuid = submit_true_job()
        util.wait_for_instance(self.cook_url, job_uuid, status='success')

        with admin:
            # Hard-delete the test cluster
            final_resp = util.delete_compute_cluster(self.cook_url,
                                                     test_cluster)
            self.assertEqual(204, final_resp.status_code, final_resp.content)
예제 #3
0
    def test_checkpoint_locality(self):
        """
        Test that restored instances run in the same location as their checkpointed instances.

        Outline:
        1. Collect the running in-memory cluster configs that serve the default submit pool;
           skip the test when there are none.
        2. Run a canary job to completion, then submit a long-sleeping job with
           checkpoint mode 'auto' and wait for it to be running.
        3. Set every cluster in that instance's location to draining, kill the instance,
           and check that the job goes back to waiting.
        4. Set those clusters back to running and assert that the job's new running
           instance is in the same location as the original one.
        The checkpoint job is always killed at the end.
        """
        # Get the set of clusters that correspond to the pool under test and are running
        pool = util.default_submit_pool()
        clusters = util.compute_clusters(self.cook_url)
        running_clusters = [
            c for c in clusters['in-mem-configs']
            if pool in c['cluster-definition']['config']['synthetic-pods']
            ['pools'] and c['state'] == 'running'
        ]
        self.logger.info(
            f'Running clusters for pool {pool}: {running_clusters}')
        if len(running_clusters) == 0:
            self.skipTest(
                f'Requires at least 1 running compute cluster for pool {pool}')

        # Submit an initial canary job
        job_uuid, resp = util.submit_job(self.cook_url,
                                         pool=pool,
                                         command='true')
        self.assertEqual(201, resp.status_code, resp.content)
        util.wait_for_instance(self.cook_url,
                               job_uuid,
                               status='success',
                               indent=None)

        # Submit a long-running job with checkpointing
        checkpoint_job_uuid, resp = util.submit_job(
            self.cook_url,
            pool=pool,
            command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
            max_retries=5,
            checkpoint={'mode': 'auto'})
        self.assertEqual(201, resp.status_code, resp.content)

        try:
            # Wait for the job to be running
            checkpoint_instance = util.wait_for_instance(self.cook_url,
                                                         checkpoint_job_uuid,
                                                         status='running',
                                                         indent=None)
            checkpoint_instance_uuid = checkpoint_instance['task_id']
            # Location of the cluster the instance landed on; next() raises
            # StopIteration if no entry in running_clusters has that name
            checkpoint_location = next(
                c['location'] for c in running_clusters
                if c['name'] == checkpoint_instance['compute-cluster']['name'])

            admin = self.user_factory.admin()
            try:
                # Force all clusters in the instance's location to have state = draining
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            # Shallow copy, so the entries in running_clusters keep
                            # their original state and fields
                            cluster_update = dict(cluster)
                            # Set state = draining
                            cluster_update['state'] = 'draining'
                            cluster_update['state-locked?'] = True
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to draining: {cluster_update}'
                            )
                            # Retry the update until it returns 201 with a non-empty JSON body
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Kill the running checkpoint job instance
                util.kill_instance(self.cook_url, checkpoint_instance_uuid)

                # Submit another canary job
                job_uuid, resp = util.submit_job(self.cook_url,
                                                 pool=pool,
                                                 command='true')
                self.assertEqual(201, resp.status_code, resp.content)

                cluster_locations = set(c['location']
                                        for c in running_clusters)
                if len(cluster_locations) > 1:
                    # The canary job should run in the non-draining location
                    self.logger.info(
                        f'There are > 1 cluster locations under test: {cluster_locations}'
                    )
                    util.wait_for_instance(self.cook_url,
                                           job_uuid,
                                           status='success',
                                           indent=None)
                else:
                    # With a single location the canary is not waited on
                    self.logger.info(
                        f'There is only 1 cluster location under test: {cluster_locations}'
                    )

                # The killed instance should be reported as failed, and the
                # checkpoint job itself should be waiting
                util.wait_for_instance(self.cook_url,
                                       checkpoint_job_uuid,
                                       status='failed',
                                       indent=None)
                util.wait_for_job_in_statuses(self.cook_url,
                                              checkpoint_job_uuid, ['waiting'])
            finally:
                # NOTE: everything below runs even when the try body above failed,
                # including the final wait and location assertion
                # Revert all clusters in the instance's location to state = running
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            cluster_update = dict(cluster)
                            # Set state = running
                            cluster_update['state'] = 'running'
                            cluster_update['state-locked?'] = False
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to running: {cluster_update}'
                            )
                            # Retry the update until it returns 201 with a non-empty JSON body
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Wait for the checkpoint job to be running again, in the same location as before
                checkpoint_instance = util.wait_for_instance(
                    self.cook_url,
                    checkpoint_job_uuid,
                    status='running',
                    indent=None)
                self.assertEqual(
                    checkpoint_location,
                    next(c['location'] for c in running_clusters if c['name']
                         == checkpoint_instance['compute-cluster']['name']))
        finally:
            # Kill the checkpoint job to not leave it running
            util.kill_jobs(self.cook_url, [checkpoint_job_uuid])
예제 #4
0
    def trigger_preemption(self, pool):
        """
        Provoke a rebalancer preemption in the given pool (None is accepted).

        A new user gets a cpu share of 0.1 and a cpu quota of 1.0. That user
        submits a priority-99 job, J1, asking for 1.0 cpu, which fills the
        quota. Once J1 is running, the user submits J2 with 0.1 cpu and
        priority 100. The method then polls until an instance of J1 carries
        the reason 'Preempted by rebalancer'.

        On exit, whether or not preemption was observed, all submitted jobs
        are killed and the user's share and quota are reset.
        """
        admin = self.user_factory.admin()
        user = self.user_factory.new_user()
        all_job_uuids = []

        def low_priority_job():
            # Query for wait_until: reload J1, logging the user's running and
            # waiting jobs from the last hour along the way
            job = util.load_job(self.cook_url, uuid_large)
            one_hour_in_millis = 60 * 60 * 1000
            start = util.current_milli_time() - one_hour_in_millis
            end = util.current_milli_time()

            def jobs_in_state(state):
                return util.jobs(self.cook_url,
                                 user=user.name,
                                 state=state,
                                 start=start,
                                 end=end).json()

            running = jobs_in_state('running')
            waiting = jobs_in_state('waiting')
            self.logger.info(
                f'Currently running jobs: {json.dumps(running, indent=2)}'
            )
            self.logger.info(
                f'Currently waiting jobs: {json.dumps(waiting, indent=2)}'
            )
            return job

        def job_was_preempted(job):
            # Predicate for wait_until: True once any instance was preempted
            def preempted(instance):
                self.logger.debug(
                    f'Checking if instance was preempted: {instance}')
                return instance.get(
                    'reason_string') == 'Preempted by rebalancer'

            if any(preempted(instance) for instance in job['instances']):
                return True
            self.logger.info(f'Job has not been preempted: {job}')
            return False

        try:
            small_cpus = 0.1
            large_cpus = small_cpus * 10
            with admin:
                # Lower the user's cpu share first, then the cpu quota
                for limit_type, cpus in (('share', small_cpus),
                                         ('quota', large_cpus)):
                    util.set_limit(self.cook_url,
                                   limit_type,
                                   user.name,
                                   cpus=cpus,
                                   pool=pool)

            with user:
                base_priority = 99
                command = 'sleep 600'
                # J1: the large job that fills up the user's quota
                uuid_large, _ = util.submit_job(self.cook_url,
                                                priority=base_priority,
                                                cpus=large_cpus,
                                                command=command,
                                                pool=pool)
                all_job_uuids.append(uuid_large)
                util.wait_for_running_instance(self.cook_url, uuid_large)

                # J2: the higher-priority job that should trigger preemption
                uuid_high_priority, _ = util.submit_job(
                    self.cook_url,
                    priority=base_priority + 1,
                    cpus=small_cpus,
                    command=command,
                    name='higher_priority_job',
                    pool=pool)
                all_job_uuids.append(uuid_high_priority)

                # Allow one and a half rebalancer intervals for the preemption
                interval_seconds = util.settings(
                    self.cook_url)['rebalancer']['interval-seconds']
                max_wait_ms = interval_seconds * 1000 * 1.5
                self.logger.info(
                    f'Waiting up to {max_wait_ms} milliseconds for preemption to happen'
                )
                util.wait_until(low_priority_job,
                                job_was_preempted,
                                max_wait_ms=max_wait_ms,
                                wait_interval_ms=5000)
        finally:
            with admin:
                util.kill_jobs(self.cook_url,
                               all_job_uuids,
                               assert_response=False)
                # Undo the limit changes in the order they were applied
                for limit_type in ('share', 'quota'):
                    util.reset_limit(self.cook_url,
                                     limit_type,
                                     user.name,
                                     reason=self.current_name(),
                                     pool=pool)
예제 #5
0
    def test_rate_limit_launching_jobs(self):
        """
        Check that the job-launch rate limit holds back part of a burst of jobs.

        The test is skipped unless a job-launch rate limit is configured and
        enforced, with a token rate in [5, 20] and a bucket size in [10, 20].
        It submits batches of bucket-size - 1 jobs until at least a bucket's
        worth are running while some are still waiting, then asserts that
        exactly bucket-size jobs are running. All submitted jobs are killed
        at the end.
        """
        settings = util.settings(self.cook_url)
        launch_limit = settings['rate-limit']['job-launch']
        if launch_limit is None:
            pytest.skip(
                "Can't test job launch rate limit without launch rate limit set."
            )

        # Allow an environmental variable override.
        name = os.getenv('COOK_LAUNCH_RATE_LIMIT_NAME')
        user = (self.user_factory.new_user() if name is None
                else self.user_factory.user_class(name))

        if not launch_limit['enforce?']:
            pytest.skip("Enforcing must be on for test to run")
        bucket_size = launch_limit['bucket-size']
        token_rate = launch_limit['tokens-replenished-per-minute']
        # In some environments, e.g., minimesos, we can only launch so many concurrent jobs.
        if token_rate < 5 or token_rate > 20:
            pytest.skip(
                "Job launch rate limit test is only validated to reliably work correctly with certain token rates."
            )
        if bucket_size < 10 or bucket_size > 20:
            pytest.skip(
                "Job launch rate limit test is only validated to reliably work correctly with certain token bucket sizes."
            )

        def jobs_with_status(jobs, status):
            return [j for j in jobs if j['status'] == status]

        with user:
            job_uuids = []
            try:
                jobspec = {"command": "sleep 240", 'cpus': 0.03, 'mem': 32}

                def submit_batch():
                    # Submit one batch, remember its uuids, return the response
                    batch_uuids, batch_response = util.submit_jobs(
                        self.cook_url, jobspec, bucket_size - 1)
                    job_uuids.extend(batch_uuids)
                    return batch_response

                self.logger.info(
                    f'Submitting initial batch of {bucket_size-1} jobs')
                initial_response = submit_batch()
                self.assertEqual(201,
                                 initial_response.status_code,
                                 msg=initial_response.content)

                def submit_jobs():
                    # Query for wait_until: each poll submits another batch
                    self.logger.info(
                        f'Submitting subsequent batch of {bucket_size-1} jobs')
                    subsequent_response = submit_batch()
                    self.assertEqual(201,
                                     subsequent_response.status_code,
                                     msg=subsequent_response.content)

                def is_rate_limit_triggered(_):
                    listing = util.query_jobs(self.cook_url,
                                              True,
                                              uuid=job_uuids).json()
                    waiting_jobs = jobs_with_status(listing, 'waiting')
                    running_jobs = jobs_with_status(listing, 'running')
                    self.logger.debug(
                        f'There are {len(waiting_jobs)} waiting jobs')
                    # We submitted just under two buckets. We should only see a bucket + some extra running. No more.
                    running_count = len(running_jobs)
                    bucket_filled = running_count >= bucket_size
                    under_ceiling = running_count < (bucket_size +
                                                     token_rate / 2)
                    some_waiting = len(waiting_jobs) > 0
                    return bucket_filled and under_ceiling and some_waiting

                util.wait_until(submit_jobs, is_rate_limit_triggered)
                final_listing = util.query_jobs(self.cook_url, True,
                                                uuid=job_uuids).json()
                running_jobs = jobs_with_status(final_listing, 'running')
                self.assertEqual(len(running_jobs), bucket_size)
            finally:
                util.kill_jobs(self.cook_url, job_uuids)
예제 #6
0
    def test_preemption(self):
        """
        Check that the rebalancer preempts a lower-priority job of the same user.

        A new user is limited to a cpu share of 0.1 and a cpu quota of 1.0,
        and submits a priority-99 job that takes the full quota. After that
        job is running, a priority-100 job asking for 0.1 cpu is submitted
        and the test polls until an instance of the first job carries the
        reason 'Preempted by rebalancer'. Jobs are killed and the user's
        share and quota are reset in all cases.
        """
        admin = self.user_factory.admin()
        user = self.user_factory.new_user()
        all_job_uuids = []
        try:
            small_cpus = 0.1
            large_cpus = small_cpus * 10
            limits = (('share', small_cpus), ('quota', large_cpus))
            with admin:
                # Lower the user's cpu share and quota
                for limit_type, cpus in limits:
                    util.set_limit(self.cook_url,
                                   limit_type,
                                   user.name,
                                   cpus=cpus)

            with user:
                base_priority = 99
                command = 'sleep 600'

                # The large job fills up the user's quota
                uuid_large, _ = util.submit_job(self.cook_url,
                                                priority=base_priority,
                                                cpus=large_cpus,
                                                command=command)
                all_job_uuids.append(uuid_large)
                util.wait_for_running_instance(self.cook_url, uuid_large)

                # The higher-priority job should trigger preemption
                uuid_high_priority, _ = util.submit_job(
                    self.cook_url,
                    priority=base_priority + 1,
                    cpus=small_cpus,
                    command=command,
                    name='higher_priority_job')
                all_job_uuids.append(uuid_high_priority)

                def low_priority_job():
                    # Query for wait_until: reload the large job and log the
                    # user's running and waiting jobs from the last hour
                    job = util.load_job(self.cook_url, uuid_large)
                    one_hour_in_millis = 60 * 60 * 1000
                    start = util.current_milli_time() - one_hour_in_millis
                    end = util.current_milli_time()
                    window = {'start': start, 'end': end}
                    running = util.jobs(self.cook_url,
                                        user=user.name,
                                        state='running',
                                        **window).json()
                    waiting = util.jobs(self.cook_url,
                                        user=user.name,
                                        state='waiting',
                                        **window).json()
                    self.logger.info(
                        f'Currently running jobs: {json.dumps(running, indent=2)}'
                    )
                    self.logger.info(
                        f'Currently waiting jobs: {json.dumps(waiting, indent=2)}'
                    )
                    return job

                def job_was_preempted(job):
                    # Predicate for wait_until: stop at the first preempted instance
                    for instance in job['instances']:
                        self.logger.debug(
                            f'Checking if instance was preempted: {instance}')
                        if instance.get(
                                'reason_string') == 'Preempted by rebalancer':
                            break
                    else:
                        self.logger.info(f'Job has not been preempted: {job}')
                        return False
                    return True

                # Allow one and a half rebalancer intervals for the preemption
                rebalancer_settings = util.settings(
                    self.cook_url)['rebalancer']
                max_wait_ms = rebalancer_settings[
                    'interval-seconds'] * 1000 * 1.5
                self.logger.info(
                    f'Waiting up to {max_wait_ms} milliseconds for preemption to happen'
                )
                util.wait_until(low_priority_job,
                                job_was_preempted,
                                max_wait_ms=max_wait_ms,
                                wait_interval_ms=5000)
        finally:
            with admin:
                util.kill_jobs(self.cook_url,
                               all_job_uuids,
                               assert_response=False)
                # Reset share first, then quota
                for limit_type in ('share', 'quota'):
                    util.reset_limit(self.cook_url,
                                     limit_type,
                                     user.name,
                                     reason=self.current_name())