def test_get_nonexistent_job(client: BatchClient):
    """Fetching a job id that was never created must produce HTTP 404."""
    try:
        client.get_job(1, 666)
    except aiohttp.ClientResponseError as err:
        # 404 is the expected outcome; any other HTTP error is a real failure
        if err.status != 404:
            raise
def test_delete_batch(client: BatchClient):
    """Deleting a batch makes its jobs unreachable (404 on lookup)."""
    batch_builder = client.create_batch()
    job = batch_builder.create_job(DOCKER_ROOT_IMAGE, ['sleep', '30'])
    batch = batch_builder.submit()
    batch.delete()

    # verify doesn't exist
    try:
        client.get_job(*job.id)
    except aiohttp.ClientResponseError as err:
        # only a 404 proves the job is gone; re-raise anything else
        if err.status != 404:
            raise
def test_get_job(client: BatchClient):
    """A job re-fetched by id must report the same (batch_id, job_id).

    Bug fix: the assert message previously referenced an undefined name
    ``status`` (a NameError would mask the real assertion failure); it now
    uses ``status2``, the status that was actually fetched.
    """
    b = client.create_batch()
    j = b.create_job(DOCKER_ROOT_IMAGE, ['true'])
    b = b.submit()

    j2 = client.get_job(*j.id)
    status2 = j2.status()
    assert (status2['batch_id'], status2['job_id']) == j.id, str(
        (status2, b.debug_info()))
class Test(unittest.TestCase):
    """End-to-end tests against a live batch deployment (v1 API).

    Each test talks to the service at BATCH_URL through a fresh
    BatchClient; batches and jobs run on the real cluster, so tests
    assert on the service's externally visible behavior.
    """

    def setUp(self):
        # raise_for_status surfaces HTTP errors as aiohttp.ClientResponseError;
        # 60s total timeout bounds each request.
        session = aiohttp.ClientSession(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60))
        self.client = BatchClient(session, url=os.environ.get('BATCH_URL'))

    def tearDown(self):
        self.client.close()

    def test_job(self):
        # Happy path: a trivial echo job succeeds with exit code 0
        # and its log matches the echoed text.
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code']['main'], 0, (status, j.log()))
        self.assertEqual(j.log()['main'], 'test\n', status)
        j.pod_status()
        self.assertTrue(j.is_complete())

    def test_attributes(self):
        # Attributes supplied at create_job are echoed back in job status.
        a = {
            'name': 'test_attributes',
            'foo': 'bar'
        }
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['true'], attributes=a)
        builder.submit()
        status = j.status()
        assert(status['attributes'] == a)

    def test_unsubmitted_state(self):
        # Job accessors must raise ValueError before the batch is submitted,
        # and adding jobs after submission must also fail.
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.pod_status()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('alpine', ['echo', 'test'])

    def test_list_batches(self):
        # A unique tag isolates this test's batches from those of prior runs.
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('alpine', ['sleep', '30'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('alpine', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, complete=None, success=None, attributes=None):
            batches = self.client.list_batches(complete=complete, success=success, attributes=attributes)
            # list_batches returns all batches for all prev run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        # b1 is still sleeping, b2 has finished successfully.
        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        # After cancellation b1 is complete but not successful.
        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_limit_offset(self):
        b1 = self.client.create_batch()
        for i in range(3):
            b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        # limit/offset paginate the job list: skip the first job,
        # return the next two (job ids are 1-based).
        s = b1.status(limit=2, offset=1)
        filtered_jobs = {j['job_id'] for j in s['jobs']}
        assert filtered_jobs == {2, 3}, s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(status['exit_code']['main'], 1)

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()
        # The log must be gone (404) once the batch is deleted.
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_nonexistent_job(self):
        # Only a 404 is acceptable for a job id that never existed.
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('alpine', ['false'])
        j2 = b.create_job('alpine', ['sleep', '1'])
        j3 = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter([j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count['Failed'] + state_count['Success']
        # Only j3 (still sleeping at cancel time) may have been cancelled.
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        # Exactly one job (j1: `false`) fails with a nonzero exit code.
        n_failed = sum([j['exit_code']['main'] > 0 for j in bstatus['jobs']
                        if j['state'] in ('Failed', 'Error')])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        # success: all jobs succeed
        b1 = self.client.create_batch()
        b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        # failure: one failing job fails the whole batch
        b2 = self.client.create_batch()
        b2.create_job('alpine', ['false'])
        b2.create_job('alpine', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        # running: batch with a live job is incomplete
        b3 = self.client.create_batch()
        b3.create_job('alpine', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s

        # cancelled
        b4 = self.client.create_batch()
        b4.create_job('alpine', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_callback(self):
        # Spin up a local HTTP endpoint and verify the service POSTs the
        # job's terminal status to the callback URL.
        app = Flask('test-client')

        d = {}

        @app.route('/test', methods=['POST'])
        def test():
            d['status'] = request.get_json()
            return Response(status=200)

        server = ServerThread(app)
        try:
            server.start()
            b = self.client.create_batch()
            j = b.create_job(
                'alpine',
                ['echo', 'test'],
                attributes={'foo': 'bar'},
                callback=server.url_for('/test'))
            b = b.submit()
            j.wait()

            # The callback arrives asynchronously after job completion.
            poll_until(lambda: 'status' in d)
            status = d['status']
            self.assertEqual(status['state'], 'Success')
            self.assertEqual(status['attributes'], {'foo': 'bar'})
        finally:
            server.shutdown()
            server.join()

    def test_log_after_failing_job(self):
        # Output emitted before a nonzero exit must still be retrievable.
        b = self.client.create_batch()
        j = b.create_job('alpine', ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(status['exit_code']['main'], 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        # Every API and UI endpoint must reject unauthenticated requests.
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0'),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log'),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/pod_status'),
            (requests.get, '/api/v1alpha/batches'),
            (requests.post, '/api/v1alpha/batches/create'),
            (requests.post, '/api/v1alpha/batches/0/jobs/create'),
            (requests.get, '/api/v1alpha/batches/0'),
            (requests.delete, '/api/v1alpha/batches/0'),
            (requests.patch, '/api/v1alpha/batches/0/close'),
            (requests.get, '/batches'),
            (requests.get, '/batches/0'),
            (requests.get, '/batches/0/jobs/0/log')]
        for f, url in endpoints:
            r = f(os.environ.get('BATCH_URL')+url)
            assert r.status_code == 401, r

    def test_bad_jwt_key(self):
        # A token signed with the wrong key must be rejected with 401.
        fname = pkg_resources.resource_filename(
            __name__, 'jwt-test-user.json')
        with open(fname) as f:
            userdata = json.loads(f.read())
        # Sign with a freshly generated (hence unrecognized) key.
        token = hj.JWTClient(hj.JWTClient.generate_key()).encode(userdata)
        session = aiohttp.ClientSession(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60))
        bc = BatchClient(session, url=os.environ.get('BATCH_URL'), token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('alpine', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            if e.status == 401:
                pass
            else:
                assert False, e
        finally:
            bc.close()

    def test_ui_batches(self):
        with open(os.environ['HAIL_TOKEN_FILE']) as f:
            token = f.read()
        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches',
                         cookies={'user': token})
        assert (r.status_code >= 200) and (r.status_code < 300)

    def test_ui_batch_and_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b = b.submit()
        status = j.wait()

        with open(os.environ['HAIL_TOKEN_FILE']) as f:
            token = f.read()

        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{b.id}',
                         cookies={'user': token})
        assert (r.status_code >= 200) and (r.status_code < 300)

        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{j.batch_id}/jobs/{j.job_id}/log',
                         cookies={'user': token})
        assert (r.status_code >= 200) and (r.status_code < 300)

        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{j.batch_id}/jobs/{j.job_id}/pod_status',
                         cookies={'user': token})
        assert (r.status_code >= 200) and (r.status_code < 300)
class BatchBackend(Backend):
    """
    Backend that executes pipelines on a Kubernetes cluster using `batch`.

    Examples
    --------
    >>> batch_backend = BatchBackend(_service='batch')
    >>> p = Pipeline(backend=batch_backend)

    Parameters
    ----------
    _service: :obj:`str`
        Name of the batch service the client connects to.
    """

    def __init__(self, _service='batch'):
        self._batch_client = BatchClient(_service=_service)

    def close(self):
        """Release the underlying batch client's resources."""
        self._batch_client.close()

    def _run(self, pipeline, dry_run, verbose, delete_scratch_on_exit):  # pylint: disable-msg=R0915
        """Translate the pipeline's task DAG into batch jobs and run it.

        Each task becomes one job; inter-task files travel through a
        per-run scratch directory in GCS (``remote_tmpdir``) mirrored at
        ``local_tmpdir`` inside each job's container. With ``dry_run`` the
        generated shell commands are printed instead of submitted. Raises
        PipelineException if any job exits nonzero.
        """
        start = time.time()

        bucket = self._batch_client.bucket
        subdir_name = 'pipeline-{}'.format(uuid.uuid4().hex[:12])

        remote_tmpdir = f'gs://{bucket}/pipeline/{subdir_name}'
        local_tmpdir = f'/io/pipeline/{subdir_name}'

        default_image = 'ubuntu:latest'

        attributes = pipeline.attributes
        if pipeline.name is not None:
            attributes['name'] = pipeline.name

        batch = self._batch_client.create_batch(attributes=attributes)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        task_to_job_mapping = {}
        jobs_to_command = {}
        commands = []

        # -e aborts on first error; -x traces commands when verbose.
        bash_flags = 'set -e' + ('x' if verbose else '') + '; '
        activate_service_account = 'gcloud -q auth activate-service-account ' \
            '--key-file=/gsa-key/privateKeyData'

        def copy_input(r):
            # (src, dst) pairs to stage a task's input into local_tmpdir.
            if isinstance(r, InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            # Task-produced files consumed by downstream tasks go to GCS scratch.
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            # Files the user asked to write out, one copy per output path.
            if isinstance(r, InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), dest) for dest in r._output_paths]

        # Inputs the user wants copied directly to external destinations are
        # handled by one dedicated gsutil job up front.
        write_external_inputs = [x for r in pipeline._input_resources for x in copy_external_output(r)]
        if write_external_inputs:
            def _cp(src, dst):
                return f'gsutil -m cp -R {src} {dst}'

            write_cmd = bash_flags + activate_service_account + ' && ' + \
                ' && '.join([_cp(*files) for files in write_external_inputs])

            if dry_run:
                commands.append(write_cmd)
            else:
                j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                     command=['/bin/bash', '-c', write_cmd],
                                     attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

        for task in pipeline._tasks:
            inputs = [x for r in task._inputs for x in copy_input(r)]

            outputs = [x for r in task._internal_outputs for x in copy_internal_output(r)]
            if outputs:
                # Internal outputs imply the GCS scratch dir was used and
                # may need cleanup at the end.
                used_remote_tmpdir = True
            outputs += [x for r in task._external_outputs for x in copy_external_output(r)]

            resource_defs = [r._declare(directory=local_tmpdir) for r in task._mentioned]

            if task._image is None:
                if verbose:
                    print(f"Using image '{default_image}' since no image was specified.")

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{task._uid}/; '
            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            task_command = [cmd.strip() for cmd in task._command]

            cmd = bash_flags + make_local_tmpdir + defs + " && ".join(task_command)
            if dry_run:
                commands.append(cmd)
                continue

            # Dependencies become parent jobs so batch enforces ordering.
            parents = [task_to_job_mapping[t] for t in task._dependencies]

            attributes = {'task_uid': task._uid}
            if task.name:
                attributes['name'] = task.name
            attributes.update(task.attributes)

            resources = {'requests': {}}
            if task._cpu:
                resources['requests']['cpu'] = task._cpu
            if task._memory:
                resources['requests']['memory'] = task._memory

            j = batch.create_job(image=task._image if task._image else default_image,
                                 command=['/bin/bash', '-c', cmd],
                                 parents=parents,
                                 attributes=attributes,
                                 resources=resources,
                                 input_files=inputs if len(inputs) > 0 else None,
                                 output_files=outputs if len(outputs) > 0 else None,
                                 pvc_size=task._storage)
            n_jobs_submitted += 1

            task_to_job_mapping[task] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return

        if delete_scratch_on_exit and used_remote_tmpdir:
            # Cleanup job depends on every submitted job and always runs,
            # even if some parents failed.
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
            j = batch.create_job(
                image='google/cloud-sdk:237.0.0-alpine',
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes={'name': 'remove_tmpdir'},
                always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        print(f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - start, 3)} seconds:')

        start = time.time()
        batch = batch.submit()
        print(f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in {round(time.time() - start, 3)} seconds:')

        # Re-key by job id: the remainder of this method only needs ids.
        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in {round(time.time() - start, 3)} seconds:')
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

        status = batch.wait()

        if status['state'] == 'success':
            print('Pipeline completed successfully!')
            return

        # Collect every job with at least one nonzero container exit code
        # and build a readable failure report.
        failed_jobs = [((j['batch_id'], j['job_id']), j['exit_code']) for j in status['jobs']
                       if 'exit_code' in j and any([ec != 0 for _, ec in j['exit_code'].items()])]

        fail_msg = ''
        for jid, ec in failed_jobs:
            ec = Job.exit_code(ec)
            job = self._batch_client.get_job(*jid)
            log = job.log()
            name = job.status()['attributes'].get('name', None)
            fail_msg += (
                f"Job {jid} failed with exit code {ec}:\n"
                f"  Task name:\t{name}\n"
                f"  Command:\t{jobs_to_command[jid]}\n"
                f"  Log:\t{log}\n")

        raise PipelineException(fail_msg)
class Test(unittest.TestCase):
    """End-to-end tests against a live batch deployment (v2-style client).

    Uses BatchClient('test') and deploy_config-resolved URLs; jobs run on
    the real service, so assertions target externally visible behavior.
    """

    def setUp(self):
        self.client = BatchClient('test')

    def tearDown(self):
        self.client.close()

    def test_job(self):
        # Happy path: echo job succeeds; both the flat exit_code field and
        # the per-container accessor must agree.
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])
        b = builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code'], 0, status)
        self.assertEqual(j._get_exit_code(status, 'main'), 0, (status, j.log()))
        self.assertEqual(j.log()['main'], 'test\n', status)

    def test_exit_code_duration(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['bash', '-c', 'exit 7'])
        b = builder.submit()
        status = j.wait()
        self.assertEqual(status['exit_code'], 7, status)
        assert isinstance(status['duration'], int)
        self.assertEqual(j._get_exit_code(status, 'main'), 7, status)

    def test_msec_mcpu(self):
        builder = self.client.create_batch()
        resources = {'cpu': '100m', 'memory': '375M'}
        # two jobs so the batch msec_mcpu computation is non-trivial
        builder.create_job('ubuntu:18.04', ['echo', 'foo'], resources=resources)
        builder.create_job('ubuntu:18.04', ['echo', 'bar'], resources=resources)
        b = builder.submit()

        batch = b.wait()
        assert batch['state'] == 'success', batch

        batch_msec_mcpu2 = 0
        for job in b.jobs():
            # I'm dying
            job = self.client.get_job(job['batch_id'], job['job_id'])
            job = job.status()

            # runs at 100mcpu
            job_msec_mcpu2 = 100 * max(
                job['status']['end_time'] - job['status']['start_time'],
                0)
            # greater than in case there are multiple attempts
            assert job['msec_mcpu'] >= job_msec_mcpu2, batch

            batch_msec_mcpu2 += job_msec_mcpu2

        assert batch['msec_mcpu'] == batch_msec_mcpu2, batch

    def test_attributes(self):
        a = {'name': 'test_attributes', 'foo': 'bar'}
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['true'], attributes=a)
        builder.submit()
        assert (j.attributes() == a)

    def test_garbage_image(self):
        # A nonexistent image must produce an Error state with no exit code
        # and a populated error for the main container.
        builder = self.client.create_batch()
        j = builder.create_job('dsafaaadsf', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_bad_command(self):
        # 'sleep 5' as a single argv element is not an executable — Error.
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['sleep 5'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_invalid_resource_requests(self):
        # Unsatisfiable memory request is rejected at submit time.
        builder = self.client.create_batch()
        resources = {'cpu': '1', 'memory': '250Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'resource requests.*unsatisfiable'):
            builder.submit()

        # Zero cpu is a malformed request.
        builder = self.client.create_batch()
        resources = {'cpu': '0', 'memory': '1Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'bad resource request.*cpu cannot be 0'):
            builder.submit()

    def test_out_of_memory(self):
        # Allocate ~400MB against a 10M limit to force an OOM kill.
        builder = self.client.create_batch()
        resources = {'cpu': '0.1', 'memory': '10M'}
        j = builder.create_job('python:3.6-slim-stretch',
                               ['python', '-c', 'x = "a" * 400 * 1000**2'],
                               resources=resources)
        builder.submit()
        status = j.wait()
        assert j._get_out_of_memory(status, 'main')

    def test_unsubmitted_state(self):
        # Job accessors raise ValueError before submission; create_job
        # raises after the batch is closed.
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('ubuntu:18.04', ['echo', 'test'])

    def test_list_batches(self):
        # A unique tag isolates this test's batches; queries use the
        # string query language (e.g. '!complete tag=...').
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('ubuntu:18.04', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('ubuntu:18.04', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, q=None):
            batches = self.client.list_batches(q)
            # list_batches returns all batches for all prev run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id})
        assert_batch_ids({b1.id, b2.id}, f'tag={tag}')

        b2.wait()

        assert_batch_ids({b1.id}, f'!complete tag={tag}')
        assert_batch_ids({b2.id}, f'complete tag={tag}')

        assert_batch_ids({b1.id}, f'!success tag={tag}')
        assert_batch_ids({b2.id}, f'success tag={tag}')

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, f'!success tag={tag}')
        assert_batch_ids({b2.id}, f'success tag={tag}')

        assert_batch_ids(set(), f'!complete tag={tag}')
        assert_batch_ids({b1.id, b2.id}, f'complete tag={tag}')

        assert_batch_ids({b2.id}, f'tag={tag} name=b2')

    def test_include_jobs(self):
        b1 = self.client.create_batch()
        for i in range(2):
            b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        s = b1.status()
        # By default batch status omits the per-job list.
        assert 'jobs' not in s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(j._get_exit_code(status, 'main'), 1)

    def test_running_job_log_and_status(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '300'])
        b = b.submit()
        while True:
            if j.status()['state'] == 'Running' or j.is_complete():
                break

        j.log()
        # FIXME after batch1 goes away, check running status
        b.cancel()

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        # The log must be gone (404) once the batch is deleted.
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_nonexistent_job(self):
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('ubuntu:18.04', ['false'])
        j2 = b.create_job('ubuntu:18.04', ['sleep', '1'])
        j3 = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        b.wait()
        # legacy_batch_status adapts the v2 status to the v1 shape.
        bstatus = legacy_batch_status(b)

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter(
            [j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count[
            'Failed'] + state_count['Success']
        # Only j3 (still sleeping at cancel time) may have been cancelled.
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([
            j['exit_code'] > 0 for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error')
        ])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        # success
        b1 = self.client.create_batch()
        b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        # failure: one failing job fails the whole batch
        b2 = self.client.create_batch()
        b2.create_job('ubuntu:18.04', ['false'])
        b2.create_job('ubuntu:18.04', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        # running (cancelled afterwards so it doesn't linger)
        b3 = self.client.create_batch()
        b3.create_job('ubuntu:18.04', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        # cancelled
        b4 = self.client.create_batch()
        b4.create_job('ubuntu:18.04', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(j._get_exit_code(status, 'main'), 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        # API endpoints reject unauthenticated requests with 401;
        # UI endpoints redirect (302) to the auth login page.
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.post, '/batches/0/cancel', 401),
            (requests.get, '/batches/0/jobs/0', 302)
        ]
        for f, url, expected in endpoints:
            full_url = deploy_config.url('batch', url)
            r = f(full_url, allow_redirects=False)
            assert r.status_code == expected, (full_url, r, expected)

    def test_bad_token(self):
        # A syntactically valid but unknown token must yield 401.
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient('test', _token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('ubuntu:18.04', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()

    def test_gcr_image(self):
        # Jobs can pull from the private GCR registry.
        builder = self.client.create_batch()
        j = builder.create_job(os.environ['HAIL_BASE_IMAGE'], ['echo', 'test'])
        b = builder.submit()
        status = j.wait()
        self.assertEqual(status['state'], 'Success', (status, j.log()))

    def test_service_account(self):
        # A job running under the ci-agent service account can talk to the
        # Kubernetes API in the batch pods namespace.
        b = self.client.create_batch()
        j = b.create_job(
            os.environ['CI_UTILS_IMAGE'],
            ['/bin/sh', '-c', 'kubectl get pods -l app=batch-driver'],
            service_account={
                'namespace': os.environ['HAIL_BATCH_PODS_NAMESPACE'],
                'name': 'ci-agent'
            })
        b.submit()
        status = j.wait()
        assert j._get_exit_code(status, 'main') == 0, status

    def test_port(self):
        # Requesting a port exposes HAIL_BATCH_WORKER_PORT/IP to the job.
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', [
            'bash',
            '-c',
            '''
echo $HAIL_BATCH_WORKER_PORT
echo $HAIL_BATCH_WORKER_IP
'''
        ], port=5000)
        b = builder.submit()
        batch = b.wait()
        print(j.log())
        assert batch['state'] == 'success', batch

    def test_client_max_size(self):
        # Several ~900KB job specs must be bunched to fit under the
        # server's request size limit.
        builder = self.client.create_batch()
        for i in range(4):
            builder.create_job('ubuntu:18.04',
                               ['echo', 'a' * (900 * 1024)])
        builder.submit()

    def test_restartable_insert(self):
        # Inject a failure on every third request; submission must retry
        # without duplicating jobs.
        i = 0

        def every_third_time():
            nonlocal i
            i += 1
            if i % 3 == 0:
                return True
            return False

        with FailureInjectingClientSession(every_third_time) as session:
            client = BatchClient('test', session=session)
            builder = client.create_batch()

            for _ in range(9):
                builder.create_job('ubuntu:18.04', ['echo', 'a'])

            b = builder.submit(max_bunch_size=1)
            b = self.client.get_batch(
                b.id
            )  # get a batch untainted by the FailureInjectingClientSession
            batch = b.wait()
            assert batch['state'] == 'success', batch
            assert len(list(b.jobs())) == 9

    def test_create_idempotence(self):
        # Creating a batch twice with the same token returns the same batch.
        builder = self.client.create_batch()
        builder.create_job('ubuntu:18.04', ['/bin/true'])

        batch_token = secrets.token_urlsafe(32)
        b = builder._create(batch_token=batch_token)
        b2 = builder._create(batch_token=batch_token)

        assert b.id == b2.id

    def test_batch_create_validation(self):
        # Malformed create payloads must be rejected with 400.
        bad_configs = [
            # unexpected field fleep
            {
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': 'baz',
                'fleep': 'quam'
            },
            # billing project None/missing
            {
                'billing_project': None,
                'n_jobs': 5,
                'token': 'baz'
            },
            {
                'n_jobs': 5,
                'token': 'baz'
            },
            # n_jobs None/missing
            {
                'billing_project': 'foo',
                'n_jobs': None,
                'token': 'baz'
            },
            {
                'billing_project': 'foo',
                'token': 'baz'
            },
            # n_jobs wrong type
            {
                'billing_project': 'foo',
                'n_jobs': '5',
                'token': 'baz'
            },
            # token None/missing
            {
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': None
            },
            {
                'billing_project': 'foo',
                'n_jobs': 5
            },
            # attribute key/value None
            {
                'attributes': {
                    'k': None
                },
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': 'baz'
            },
        ]
        url = deploy_config.url('batch', '/api/v1alpha/batches/create')
        headers = service_auth_headers(deploy_config, 'batch')
        for config in bad_configs:
            r = requests.post(url,
                              json=config,
                              allow_redirects=True,
                              headers=headers)
            assert r.status_code == 400, (config, r)
class Test(unittest.TestCase):
    """Integration tests for the batch v1 service using 'alpine' jobs.

    Each test creates batches through a fresh authenticated BatchClient and
    exercises job/batch lifecycle, listing, cancellation, deletion, logs, and
    API authorization behavior.
    """

    def setUp(self):
        self.client = BatchClient()

    def tearDown(self):
        self.client.close()

    def test_job(self):
        """A trivial echo job succeeds, exits 0, and exposes its log."""
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code']['main'], 0, (status, j.log()))
        self.assertEqual(j.log()['main'], 'test\n', status)
        j.pod_status()
        self.assertTrue(j.is_complete())

    def test_attributes(self):
        """Job attributes round-trip through the status endpoint."""
        a = {'name': 'test_attributes', 'foo': 'bar'}
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['true'], attributes=a)
        builder.submit()
        status = j.status()
        assert (status['attributes'] == a)

    def test_unsubmitted_state(self):
        """Job accessors raise ValueError before submission; builders are
        single-use after submit()."""
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])
        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.pod_status()
        with self.assertRaises(ValueError):
            j.wait()
        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('alpine', ['echo', 'test'])

    def test_list_batches(self):
        """list_batches filters by complete/success/attributes correctly."""
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('alpine', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('alpine', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, complete=None, success=None,
                             attributes=None):
            batches = self.client.list_batches(complete=complete,
                                               success=success,
                                               attributes=attributes)
            # list_batches returns all batches for all prev run tests, so
            # restrict the comparison to the two batches created here.
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id}, complete=True,
                         attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_limit_offset(self):
        """status(limit, offset) pages the batch's job list."""
        b1 = self.client.create_batch()
        for i in range(3):
            b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        s = b1.status(limit=2, offset=1)
        filtered_jobs = {j['job_id'] for j in s['jobs']}
        assert filtered_jobs == {2, 3}, s

    def test_fail(self):
        """A failing job reports a nonzero main exit code."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(status['exit_code']['main'], 1)

    def test_deleted_job_log(self):
        """Fetching the log of a job in a deleted batch yields 404."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        """Deleting a batch makes its jobs unfetchable (404)."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_cancel_batch(self):
        """Cancelling a batch drives its running job to Cancelled, with no
        log available afterwards."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_nonexistent_job(self):
        """Looking up a job that never existed yields 404."""
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_job(self):
        """get_job returns a handle whose status matches the original id."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        """A cancelled batch reports consistent per-job states: at most one
        job Cancelled, the rest complete, exactly one failed."""
        b = self.client.create_batch()
        j1 = b.create_job('alpine', ['false'])
        j2 = b.create_job('alpine', ['sleep', '1'])
        j3 = b.create_job('alpine', ['sleep', '30'])  # noqa: F841 cancelled below
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()
        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter(
            [j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count[
            'Failed'] + state_count['Success']
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([
            j['exit_code']['main'] > 0 for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error')
        ])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        """Batch status reflects success, failure, running, and cancelled."""
        b1 = self.client.create_batch()
        b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('alpine', ['false'])
        b2.create_job('alpine', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('alpine', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        b4 = self.client.create_batch()
        b4.create_job('alpine', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        """A failed job's log remains retrievable after completion."""
        b = self.client.create_batch()
        j = b.create_job('alpine', ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(status['exit_code']['main'], 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        """Unauthenticated API calls are rejected; UI routes redirect to
        auth/login."""
        deploy_config = get_deploy_config()
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/pod_status', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.get, '/batches/0/jobs/0/log', 302)
        ]
        for f, url, expected in endpoints:
            full_url = deploy_config.url('batch', url)
            # BUG FIX: the original asserted 401 for every endpoint and
            # followed redirects, so the 302 endpoints were mis-checked.
            # Compare against each endpoint's expected status without
            # following redirects (matches the batch2 test class).
            r = f(full_url, allow_redirects=False)
            assert r.status_code == expected, (full_url, r, expected)

    def test_bad_token(self):
        """A client with a random (invalid) token gets 401 on submit."""
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient(_token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('alpine', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()
class Test(unittest.TestCase):
    """Integration tests for the batch2 service using 'ubuntu:18.04' jobs."""

    def setUp(self):
        self.client = BatchClient()

    def tearDown(self):
        self.client.close()

    def test_job(self):
        """An echo job succeeds with exit code 0 and the expected log."""
        bb = self.client.create_batch()
        j = bb.create_job('ubuntu:18.04', ['echo', 'test'])
        b = bb.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(j._get_exit_code(status, 'main'), 0,
                         (status, j.log()))
        self.assertEqual(j.log()['main'], 'test\n', status)
        self.assertTrue(j.is_complete())

    def test_attributes(self):
        """Submitted attributes come back verbatim in the job status."""
        attrs = {'name': 'test_attributes', 'foo': 'bar'}
        bb = self.client.create_batch()
        j = bb.create_job('ubuntu:18.04', ['true'], attributes=attrs)
        bb.submit()
        assert j.status()['attributes'] == attrs

    def test_garbage_image(self):
        """A nonexistent image produces an Error state with no exit code."""
        bb = self.client.create_batch()
        j = bb.create_job('dsafaaadsf', ['echo', 'test'])
        bb.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_bad_command(self):
        """An unrunnable command produces an Error state with no exit code."""
        bb = self.client.create_batch()
        j = bb.create_job('ubuntu:18.04', ['sleep 5'])
        bb.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_invalid_resource_requests(self):
        """Unsatisfiable or zero resource requests are rejected at submit."""
        bb = self.client.create_batch()
        bb.create_job('ubuntu:18.04', ['true'],
                      resources={'cpu': '1', 'memory': '28Gi'})
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'resource requests.*unsatisfiable'):
            bb.submit()

        bb = self.client.create_batch()
        bb.create_job('ubuntu:18.04', ['true'],
                      resources={'cpu': '0', 'memory': '1Gi'})
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'bad resource request.*cpu cannot be 0'):
            bb.submit()

    def test_out_of_memory(self):
        """A job exceeding its memory request is flagged as out-of-memory."""
        bb = self.client.create_batch()
        j = bb.create_job('python:3.6-slim-stretch',
                          ['python', '-c', 'x = "a" * 400 * 1000**2'],
                          resources={'cpu': '0.1', 'memory': '10M'})
        bb.submit()
        status = j.wait()
        assert j._get_out_of_memory(status, 'main')

    def test_unsubmitted_state(self):
        """Job accessors raise ValueError pre-submit; builders are
        single-use after submit()."""
        bb = self.client.create_batch()
        j = bb.create_job('ubuntu:18.04', ['echo', 'test'])

        for access in (lambda: j.batch_id,
                       lambda: j.id,
                       j.status,
                       j.is_complete,
                       j.log,
                       j.wait):
            with self.assertRaises(ValueError):
                access()

        bb.submit()
        with self.assertRaises(ValueError):
            bb.create_job('ubuntu:18.04', ['echo', 'test'])

    def test_list_batches(self):
        """list_batches filters by complete/success/attributes correctly."""
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('ubuntu:18.04', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('ubuntu:18.04', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, complete=None, success=None,
                             attributes=None):
            found = self.client.list_batches(complete=complete,
                                             success=success,
                                             attributes=attributes)
            # list_batches returns all batches for all prev run tests, so
            # only compare against the two batches created here.
            actual = {b.id for b in found} & {b1.id, b2.id}
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id})
        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id}, complete=True,
                         attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_include_jobs(self):
        """status(include_jobs=False) omits the jobs list."""
        b1 = self.client.create_batch()
        for _ in range(2):
            b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        s = b1.status(include_jobs=False)
        assert 'jobs' not in s

    def test_fail(self):
        """A failing job reports main exit code 1."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(j._get_exit_code(status, 'main'), 1)

    def test_running_job_log_and_status(self):
        """The log endpoint is reachable while a job is still running."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '300'])
        b = b.submit()

        while True:
            if j.status()['state'] == 'Running' or j.is_complete():
                break

        j.log()
        # FIXME after batch1 goes away, check running status
        b.cancel()

    def test_deleted_job_log(self):
        """Fetching the log of a job in a deleted batch yields 404."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        """Deleting a batch makes its jobs unfetchable (404)."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise

    def test_cancel_batch(self):
        """Cancelling a batch drives its running job to Cancelled, with no
        log available afterwards."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise

    def test_get_nonexistent_job(self):
        """Looking up a job that never existed yields 404."""
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise

    def test_get_job(self):
        """get_job returns a handle whose status matches the original id."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['true'])
        b.submit()

        fetched = self.client.get_job(*j.id)
        fetched_status = fetched.status()
        assert (fetched_status['batch_id'],
                fetched_status['job_id']) == j.id

    def test_batch(self):
        """A cancelled batch reports consistent per-job states: at most one
        job Cancelled, the rest complete, exactly one failed."""
        b = self.client.create_batch()
        j1 = b.create_job('ubuntu:18.04', ['false'])
        j2 = b.create_job('ubuntu:18.04', ['sleep', '1'])
        j3 = b.create_job('ubuntu:18.04', ['sleep', '30'])  # noqa: F841
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter(j['state'] for j in bstatus['jobs'])
        n_cancelled = state_count['Cancelled']
        n_complete = (state_count['Error'] + state_count['Failed']
                      + state_count['Success'])
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum(
            Job._get_exit_code(j, 'main') > 0
            for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error'))
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        """Batch status reflects success, failure, running, and cancelled."""
        b1 = self.client.create_batch()
        b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('ubuntu:18.04', ['false'])
        b2.create_job('ubuntu:18.04', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('ubuntu:18.04', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        b4 = self.client.create_batch()
        b4.create_job('ubuntu:18.04', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        """A failed job's log remains retrievable after completion."""
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04',
                         ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(j._get_exit_code(status, 'main'), 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        """Unauthenticated calls get the per-endpoint expected status; UI
        routes redirect to auth/login (redirects are not followed)."""
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.post, '/batches/0/cancel', 401),
            (requests.get, '/batches/0/jobs/0', 302)
        ]
        for method, url, expected in endpoints:
            full_url = deploy_config.url('batch2', url)
            r = method(full_url, allow_redirects=False)
            assert r.status_code == expected, (full_url, r, expected)

    def test_bad_token(self):
        """A client with a random (invalid) token gets 401 on submit."""
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient(_token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('ubuntu:18.04', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()

    def test_gcr_image(self):
        """A job using the project's GCR-hosted base image runs to Success."""
        bb = self.client.create_batch()
        j = bb.create_job(os.environ['HAIL_BASE_IMAGE'], ['echo', 'test'])
        b = bb.submit()
        status = j.wait()
        self.assertEqual(status['state'], 'Success', (status, j.log()))

    def test_service_account(self):
        """A job granted the ci-agent service account can query the k8s API."""
        b = self.client.create_batch()
        j = b.create_job(
            os.environ['CI_UTILS_IMAGE'],
            ['/bin/sh', '-c', 'kubectl get pods -l app=batch2-driver'],
            service_account={
                'namespace': os.environ['HAIL_BATCH_PODS_NAMESPACE'],
                'name': 'ci-agent'
            })
        b.submit()
        status = j.wait()
        assert j._get_exit_code(status, 'main') == 0, status