Example #1
    def test_container_port_submit(self):
        """Test submitting a job with a port specification."""
        JOB_PORT = 30030
        progress_file_env = util.retrieve_progress_file_env(
            type(self).cook_url)
        hostname_progress_cmd = util.progress_line(
            type(self).cook_url,
            50,  # Don't really care, we just need a val
            '$(hostname -I)',
            write_to_file=True)

        container = DockerContainer(util.docker_image(),
                                    port_mapping=[
                                        DockerPortMapping(
                                            host_port=0,
                                            container_port=JOB_PORT,
                                            protocol='tcp')
                                    ])
        uuid = self.client.submit(
            command=
            f'{hostname_progress_cmd} && nc -l -p {JOB_PORT} $(hostname -I)',
            container=container,
            env={progress_file_env: 'progress.txt'},
            max_retries=5,
            pool=util.default_submit_pool())

        addr = None
        try:
            util.wait_for_instance_with_progress(
                type(self).cook_url, str(uuid), 50)
            job = self.client.query(uuid)
            addr = job.instances[0].progress_message

            self.assertIsNotNone(addr)

            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.connect((addr, JOB_PORT))
                message = b"hello world!"

                self.assertEqual(sock.send(message), len(message))
        except Exception as e:
            if addr is not None:
                raise Exception(f"Could not connect to {addr}: {e}") from e
            else:
                raise e
        finally:
            self.client.kill(uuid)
Example #2
    def test_bulk_submit_explicit_none(self):
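        """Test bulk-submitting a job spec whose container is explicitly None."""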
        jobspecs = [
            {
                'command': 'echo "Hello World!"',
                'mem': 256.0,
                'container': None
            }
        ]
        uuids = self.client.submit_all(jobspecs,
                                       pool=util.default_submit_pool())
        try:
            jobs = self.client.query_all(uuids)

            self.assertEqual(jobs[0].uuid, uuids[0])
            self.assertEqual(jobs[0].command, jobspecs[0]['command'])
        finally:
            self.client.kill_all(uuids)
Example #3
    def test_instance_query(self):
        """Test that parsing an instance yielded from Cook works."""
        uuid = self.client.submit(command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                                  cpus=0.5,
                                  mem=1.0,
                                  max_retries=5,
                                  pool=util.default_submit_pool())

        try:
            util.wait_for_instance(type(self).cook_url, uuid)

            job = self.client.query(uuid)

            self.assertNotEqual(job.instances, [])
            self.assertIsNotNone(job.instances[0])
        finally:
            self.client.kill(uuid)
Example #4
    def test_kill(self):
        """Test that killing a job transitions it to the completed status."""
        uuid = self.client.submit(command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                                  cpus=0.5,
                                  mem=1.0,
                                  max_retries=5,
                                  pool=util.default_submit_pool())
        killed = False
        try:
            job = self.client.query(uuid)
            # Ensure the job is either waiting or running
            self.assertNotEqual(job.status, JobStatus.COMPLETED)
            self.client.kill(uuid)
            killed = True
            job = self.client.query(uuid)
            self.assertEqual(job.status, JobStatus.COMPLETED)
        finally:
            if not killed:
                self.client.kill(uuid)
Example #5
    def test_bulk_ops(self):
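        """Test bulk submit, query, and kill operations."""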
        jobspecs = [{
            'command': 'ls'
        }, {
            'command': 'echo "Hello World!"',
            'mem': 256.0
        }]
        uuids = self.client.submit_all(jobspecs,
                                       pool=util.default_submit_pool())
        try:
            jobs = self.client.query_all(uuids)

            self.assertEqual(jobs[0].uuid, uuids[0])
            self.assertEqual(jobs[0].command, jobspecs[0]['command'])

            self.assertEqual(jobs[1].uuid, uuids[1])
            self.assertEqual(jobs[1].command, jobspecs[1]['command'])
            self.assertEqual(jobs[1].mem, jobspecs[1]['mem'])
        finally:
            self.client.kill_all(uuids)
Example #6
def submit(command=None,
           cook_url=None,
           flags=None,
           submit_flags=None,
           stdin=None):
    """Submits one job via the CLI"""
    default_pool = util.default_submit_pool()
    if default_pool:
        message = f'Submitting explicitly to the {default_pool} pool (set as default)'
        if not submit_flags:
            submit_flags = f'--pool {default_pool}'
            logger.info(message)
        elif '--pool' not in submit_flags:
            submit_flags += f' --pool {default_pool}'
            logger.info(message)

    args = 'submit %s%s' % (submit_flags + ' ' if submit_flags else '',
                            command if command else '')
    cp = cli(args, cook_url, flags, stdin)
    uuids = [
        s for s in stdout(cp).split() if len(s) == 36 and util.is_valid_uuid(s)
    ]
    return cp, uuids
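For context, a minimal sketch of how a test might call this helper; it assumes `cli` returns a `subprocess.CompletedProcess` (as the `stdout(cp)` call above suggests), and `cook_url` stands in for the scheduler URL used by the tests:

# Hypothetical usage of the submit() helper above.
cp, uuids = submit(command='echo "hello"', cook_url=cook_url)
assert cp.returncode == 0  # assumes cp is a subprocess.CompletedProcess
assert len(uuids) == 1     # a single submission should echo back one job UUID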
Example #7
    def test_checkpoint_locality(self):
        """
        Test that restored instances run in the same location as their checkpointed instances.
        """
        # Get the set of clusters that correspond to the pool under test and are running
        pool = util.default_submit_pool()
        clusters = util.compute_clusters(self.cook_url)
        running_clusters = [
            c for c in clusters['in-mem-configs']
            if pool in c['cluster-definition']['config']['synthetic-pods']
            ['pools'] and c['state'] == 'running'
        ]
        self.logger.info(
            f'Running clusters for pool {pool}: {running_clusters}')
        if len(running_clusters) == 0:
            self.skipTest(
                f'Requires at least 1 running compute cluster for pool {pool}')

        # Submit an initial canary job
        job_uuid, resp = util.submit_job(self.cook_url,
                                         pool=pool,
                                         command='true')
        self.assertEqual(201, resp.status_code, resp.content)
        util.wait_for_instance(self.cook_url,
                               job_uuid,
                               status='success',
                               indent=None)

        # Submit a long-running job with checkpointing
        checkpoint_job_uuid, resp = util.submit_job(
            self.cook_url,
            pool=pool,
            command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
            max_retries=5,
            checkpoint={'mode': 'auto'})
        self.assertEqual(201, resp.status_code, resp.content)

        try:
            # Wait for the job to be running
            checkpoint_instance = util.wait_for_instance(self.cook_url,
                                                         checkpoint_job_uuid,
                                                         status='running',
                                                         indent=None)
            checkpoint_instance_uuid = checkpoint_instance['task_id']
            checkpoint_location = next(
                c['location'] for c in running_clusters
                if c['name'] == checkpoint_instance['compute-cluster']['name'])

            admin = self.user_factory.admin()
            try:
                # Force all clusters in the instance's location to have state = draining
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            cluster_update = dict(cluster)
                            # Set state = draining
                            cluster_update['state'] = 'draining'
                            cluster_update['state-locked?'] = True
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to draining: {cluster_update}'
                            )
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Kill the running checkpoint job instance
                util.kill_instance(self.cook_url, checkpoint_instance_uuid)

                # Submit another canary job
                job_uuid, resp = util.submit_job(self.cook_url,
                                                 pool=pool,
                                                 command='true')
                self.assertEqual(201, resp.status_code, resp.content)

                cluster_locations = set(c['location']
                                        for c in running_clusters)
                if len(cluster_locations) > 1:
                    # The canary job should run in the non-draining location
                    self.logger.info(
                        f'There are > 1 cluster locations under test: {cluster_locations}'
                    )
                    util.wait_for_instance(self.cook_url,
                                           job_uuid,
                                           status='success',
                                           indent=None)
                else:
                    self.logger.info(
                        f'There is only 1 cluster location under test: {cluster_locations}'
                    )

                # The checkpointed instance should fail, and the job should go back to waiting
                util.wait_for_instance(self.cook_url,
                                       checkpoint_job_uuid,
                                       status='failed',
                                       indent=None)
                util.wait_for_job_in_statuses(self.cook_url,
                                              checkpoint_job_uuid, ['waiting'])
            finally:
                # Revert all clusters in the instance's location to state = running
                with admin:
                    for cluster in running_clusters:
                        if cluster['location'] == checkpoint_location:
                            cluster_update = dict(cluster)
                            # Set state = running
                            cluster_update['state'] = 'running'
                            cluster_update['state-locked?'] = False
                            # The location, cluster-definition, and features fields cannot be sent in the update
                            cluster_update.pop('location', None)
                            cluster_update.pop('cluster-definition', None)
                            cluster_update.pop('features', None)
                            self.logger.info(
                                f'Trying to update cluster to running: {cluster_update}'
                            )
                            util.wait_until(
                                lambda: util.update_compute_cluster(
                                    self.cook_url, cluster_update)[1],
                                lambda response: response.status_code == 201
                                and len(response.json()) > 0)
                        else:
                            self.logger.info(
                                f'Not updating cluster - not in location {checkpoint_location}: {cluster}'
                            )

                # Wait for the checkpoint job to be running again, in the same location as before
                checkpoint_instance = util.wait_for_instance(
                    self.cook_url,
                    checkpoint_job_uuid,
                    status='running',
                    indent=None)
                self.assertEqual(
                    checkpoint_location,
                    next(c['location'] for c in running_clusters if c['name']
                         == checkpoint_instance['compute-cluster']['name']))
        finally:
            # Kill the checkpoint job to not leave it running
            util.kill_jobs(self.cook_url, [checkpoint_job_uuid])