Example #1
def main(args):
    if not args:
        parser().print_help()
        sys.exit(0)
    jmp = {
        'billing': billing,
        'list': list_batches,
        'delete': delete,
        'get': get,
        'cancel': cancel,
        'log': log,
        'job': job,
        'wait': wait
    }

    args, pass_through_args = parser().parse_known_args(args=args)

    # hailctl batch doesn't create batches
    client = BatchClient(None)

    try:
        if args.module == 'billing':
            from .billing import cli  # pylint: disable=import-outside-toplevel
            cli.main(args, pass_through_args, client)
            return

        jmp[args.module].main(args, pass_through_args, client)
    finally:
        client.close()
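The dispatch table above assumes each subcommand module exposes a main(args, pass_through_args, client) entry point. A minimal sketch of such a module, purely illustrative (the --limit flag and the exact list_batches signature are assumptions, not Hail's actual API):

def init_parser(parser):
    # hypothetical hook for registering this subcommand's flags
    parser.add_argument('--limit', type=int, default=50)


def main(args, pass_through_args, client):
    # `client` is the shared BatchClient created by the top-level main(),
    # which also closes it in its `finally` block
    for batch in client.list_batches(None):
        print(batch.id)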
Example #2
File: cli.py Project: mpilo24a/hail
def main(args):
    if not args:
        parser().print_help()
        sys.exit(0)
    jmp = {
        'list': list_batches,
        'delete': delete,
        'get': get,
        'cancel': cancel,
        'log': log,
        'pod_status': pod_status,
        'wait': wait
    }

    args, pass_through_args = parser().parse_known_args(args=args)

    session = aiohttp.ClientSession(
        raise_for_status=True,
        timeout=aiohttp.ClientTimeout(total=60))
    client = BatchClient(session, url=args.master_url)

    try:
        jmp[args.module].main(args, pass_through_args, client)
    finally:
        client.close()
Example #3
def client():
    session = aiohttp.ClientSession(
        raise_for_status=True,
        timeout=aiohttp.ClientTimeout(total=60))
    client = BatchClient(session, url=os.environ.get('BATCH_URL'))
    yield client
    client.close()
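This generator reads as a pytest fixture: everything before the yield is setup, the yielded client is injected into each test, and close() runs at teardown. A hedged usage sketch, assuming pytest (the decorator and test name are illustrative):

import pytest


@pytest.fixture
def client():
    session = aiohttp.ClientSession(
        raise_for_status=True,
        timeout=aiohttp.ClientTimeout(total=60))
    client = BatchClient(session, url=os.environ.get('BATCH_URL'))
    yield client  # the test body runs here
    client.close()  # teardown


def test_submit_smoke(client):
    # pytest injects the fixture above by matching the parameter name
    b = client.create_batch()
    b.create_job('alpine', ['true'])
    b.submit()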
Example #4
def test_bad_token(self):
    token = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode('ascii')
    bc = BatchClient(_token=token, _service='batch2')
    try:
        b = bc.create_batch()
        j = b.create_job('ubuntu:18.04', ['false'])
        b.submit()
        assert False, j
    except aiohttp.ClientResponseError as e:
        assert e.status == 401, e
    finally:
        bc.close()
Example #5
def test_bad_jwt_key(self):
    fname = pkg_resources.resource_filename(__name__, 'jwt-test-user.json')
    with open(fname) as f:
        userdata = json.loads(f.read())
    token = hj.JWTClient(hj.JWTClient.generate_key()).encode(userdata)
    session = aiohttp.ClientSession(
        raise_for_status=True, timeout=aiohttp.ClientTimeout(total=60))
    bc = BatchClient(session, url=os.environ.get('BATCH_URL'), token=token)
    try:
        b = bc.create_batch()
        j = b.create_job('alpine', ['false'])
        b.submit()
        assert False, j
    except aiohttp.ClientResponseError as e:
        if e.status == 401:
            pass
        else:
            assert False, e
    finally:
        bc.close()
Example #6
def main(args):
    if not args:
        parser().print_help()
        sys.exit(0)
    jmp = {
        'list': list_batches,
        'delete': delete,
        'get': get,
        'cancel': cancel,
        'log': log,
        'pod_status': pod_status,
        'wait': wait
    }

    args, pass_through_args = parser().parse_known_args(args=args)

    client = BatchClient()

    try:
        jmp[args.module].main(args, pass_through_args, client)
    finally:
        client.close()
Example #7
def main(args):
    if not args:
        parser().print_help()
        sys.exit(0)
    jmp = {
        'list': list_batches,
        'delete': delete,
        'get': get,
        'cancel': cancel,
        'log': log,
        'job': job,
        'wait': wait
    }

    args, pass_through_args = parser().parse_known_args(args=args)

    # hailctl batch doesn't create batches
    client = BatchClient(None)

    try:
        jmp[args.module].main(args, pass_through_args, client)
    finally:
        client.close()
Example #8
class Test(unittest.TestCase):
    def setUp(self):
        self.client = BatchClient('test')

    def tearDown(self):
        self.client.close()

    def test_job(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])
        b = builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code'], 0, status)
        self.assertEqual(j._get_exit_code(status, 'main'), 0,
                         (status, j.log()))

        self.assertEqual(j.log()['main'], 'test\n', status)

    def test_exit_code_duration(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['bash', '-c', 'exit 7'])
        b = builder.submit()
        status = j.wait()
        self.assertEqual(status['exit_code'], 7, status)
        assert isinstance(status['duration'], int)
        self.assertEqual(j._get_exit_code(status, 'main'), 7, status)

    def test_msec_mcpu(self):
        builder = self.client.create_batch()
        resources = {'cpu': '100m', 'memory': '375M'}
        # two jobs so the batch msec_mcpu computation is non-trivial
        builder.create_job('ubuntu:18.04', ['echo', 'foo'],
                           resources=resources)
        builder.create_job('ubuntu:18.04', ['echo', 'bar'],
                           resources=resources)
        b = builder.submit()

        batch = b.wait()
        assert batch['state'] == 'success', batch

        batch_msec_mcpu2 = 0
        for job in b.jobs():
            # refetch each job through the client to get its full status record
            job = self.client.get_job(job['batch_id'], job['job_id'])
            job = job.status()

            # runs at 100mcpu
            job_msec_mcpu2 = 100 * max(
                job['status']['end_time'] - job['status']['start_time'], 0)
            # greater than in case there are multiple attempts
            assert job['msec_mcpu'] >= job_msec_mcpu2, batch

            batch_msec_mcpu2 += job_msec_mcpu2

        assert batch['msec_mcpu'] == batch_msec_mcpu2, batch

    def test_attributes(self):
        a = {'name': 'test_attributes', 'foo': 'bar'}
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['true'], attributes=a)
        builder.submit()
        assert (j.attributes() == a)

    def test_garbage_image(self):
        builder = self.client.create_batch()
        j = builder.create_job('dsafaaadsf', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_bad_command(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['sleep 5'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_invalid_resource_requests(self):
        builder = self.client.create_batch()
        resources = {'cpu': '1', 'memory': '250Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'resource requests.*unsatisfiable'):
            builder.submit()

        builder = self.client.create_batch()
        resources = {'cpu': '0', 'memory': '1Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'bad resource request.*cpu cannot be 0'):
            builder.submit()

    def test_out_of_memory(self):
        builder = self.client.create_batch()
        resources = {'cpu': '0.1', 'memory': '10M'}
        j = builder.create_job('python:3.6-slim-stretch',
                               ['python', '-c', 'x = "a" * 400 * 1000**2'],
                               resources=resources)
        builder.submit()
        status = j.wait()
        assert j._get_out_of_memory(status, 'main')

    def test_unsubmitted_state(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('ubuntu:18.04', ['echo', 'test'])

    def test_list_batches(self):
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('ubuntu:18.04', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('ubuntu:18.04', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, q=None):
            batches = self.client.list_batches(q)
            # list_batches also returns batches from previously run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id})

        assert_batch_ids({b1.id, b2.id}, f'tag={tag}')

        b2.wait()

        assert_batch_ids({b1.id}, f'!complete tag={tag}')
        assert_batch_ids({b2.id}, f'complete tag={tag}')

        assert_batch_ids({b1.id}, f'!success tag={tag}')
        assert_batch_ids({b2.id}, f'success tag={tag}')

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, f'!success tag={tag}')
        assert_batch_ids({b2.id}, f'success tag={tag}')

        assert_batch_ids(set(), f'!complete tag={tag}')
        assert_batch_ids({b1.id, b2.id}, f'complete tag={tag}')

        assert_batch_ids({b2.id}, f'tag={tag} name=b2')

    def test_include_jobs(self):
        b1 = self.client.create_batch()
        for i in range(2):
            b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        s = b1.status()
        assert 'jobs' not in s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(j._get_exit_code(status, 'main'), 1)

    def test_running_job_log_and_status(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '300'])
        b = b.submit()

        while True:
            if j.status()['state'] == 'Running' or j.is_complete():
                break
            time.sleep(0.5)  # throttle polling; assumes `import time` at module top

        j.log()
        # FIXME after batch1 goes away, check running status
        b.cancel()

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_nonexistent_job(self):
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('ubuntu:18.04', ['false'])
        j2 = b.create_job('ubuntu:18.04', ['sleep', '1'])
        j3 = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        b.wait()
        bstatus = legacy_batch_status(b)

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter(
            [j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count[
            'Failed'] + state_count['Success']
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([
            j['exit_code'] > 0 for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error')
        ])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        b1 = self.client.create_batch()
        b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('ubuntu:18.04', ['false'])
        b2.create_job('ubuntu:18.04', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('ubuntu:18.04', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        b4 = self.client.create_batch()
        b4.create_job('ubuntu:18.04', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04',
                         ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(j._get_exit_code(status, 'main'), 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.post, '/batches/0/cancel', 401),
            (requests.get, '/batches/0/jobs/0', 302)
        ]
        for f, url, expected in endpoints:
            full_url = deploy_config.url('batch', url)
            r = f(full_url, allow_redirects=False)
            assert r.status_code == expected, (full_url, r, expected)

    def test_bad_token(self):
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient('test', _token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('ubuntu:18.04', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()

    def test_gcr_image(self):
        builder = self.client.create_batch()
        j = builder.create_job(os.environ['HAIL_BASE_IMAGE'], ['echo', 'test'])
        b = builder.submit()
        status = j.wait()

        self.assertEqual(status['state'], 'Success', (status, j.log()))

    def test_service_account(self):
        b = self.client.create_batch()
        j = b.create_job(
            os.environ['CI_UTILS_IMAGE'],
            ['/bin/sh', '-c', 'kubectl get pods -l app=batch-driver'],
            service_account={
                'namespace': os.environ['HAIL_BATCH_PODS_NAMESPACE'],
                'name': 'ci-agent'
            })
        b.submit()
        status = j.wait()
        assert j._get_exit_code(status, 'main') == 0, status

    def test_port(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', [
            'bash', '-c', '''
echo $HAIL_BATCH_WORKER_PORT
echo $HAIL_BATCH_WORKER_IP
'''
        ],
                               port=5000)
        b = builder.submit()
        batch = b.wait()
        print(j.log())
        assert batch['state'] == 'success', batch

    def test_client_max_size(self):
        builder = self.client.create_batch()
        for i in range(4):
            builder.create_job('ubuntu:18.04', ['echo', 'a' * (900 * 1024)])
        builder.submit()

    def test_restartable_insert(self):
        i = 0

        def every_third_time():
            nonlocal i
            i += 1
            if i % 3 == 0:
                return True
            return False

        with FailureInjectingClientSession(every_third_time) as session:
            client = BatchClient('test', session=session)
            builder = client.create_batch()

            for _ in range(9):
                builder.create_job('ubuntu:18.04', ['echo', 'a'])

            b = builder.submit(max_bunch_size=1)
            # get a batch untainted by the FailureInjectingClientSession
            b = self.client.get_batch(b.id)
            batch = b.wait()
            assert batch['state'] == 'success', batch
            assert len(list(b.jobs())) == 9

    def test_create_idempotence(self):
        builder = self.client.create_batch()
        builder.create_job('ubuntu:18.04', ['/bin/true'])
        batch_token = secrets.token_urlsafe(32)
        b = builder._create(batch_token=batch_token)
        b2 = builder._create(batch_token=batch_token)
        assert b.id == b2.id

    def test_batch_create_validation(self):
        bad_configs = [
            # unexpected field fleep
            {
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': 'baz',
                'fleep': 'quam'
            },
            # billing project None/missing
            {
                'billing_project': None,
                'n_jobs': 5,
                'token': 'baz'
            },
            {
                'n_jobs': 5,
                'token': 'baz'
            },
            # n_jobs None/missing
            {
                'billing_project': 'foo',
                'n_jobs': None,
                'token': 'baz'
            },
            {
                'billing_project': 'foo',
                'token': 'baz'
            },
            # n_jobs wrong type
            {
                'billing_project': 'foo',
                'n_jobs': '5',
                'token': 'baz'
            },
            # token None/missing
            {
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': None
            },
            {
                'billing_project': 'foo',
                'n_jobs': 5
            },
            # attribute key/value None
            {
                'attributes': {
                    'k': None
                },
                'billing_project': 'foo',
                'n_jobs': 5,
                'token': 'baz'
            },
        ]
        url = deploy_config.url('batch', '/api/v1alpha/batches/create')
        headers = service_auth_headers(deploy_config, 'batch')
        for config in bad_configs:
            r = requests.post(url,
                              json=config,
                              allow_redirects=True,
                              headers=headers)
            assert r.status_code == 400, (config, r)
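test_restartable_insert above relies on FailureInjectingClientSession, which this excerpt does not include. A hedged sketch of one plausible implementation (the real Hail helper may differ): it raises a retryable 503 whenever the predicate fires, forcing submit(max_bunch_size=1) to re-send bunches, while the batch token (see test_create_idempotence) keeps retried creates idempotent:

class FailureInjectingClientSession(aiohttp.ClientSession):
    # Hedged sketch only; not the helper actually used by the test suite.
    def __init__(self, should_fail, **kwargs):
        super().__init__(raise_for_status=True, **kwargs)
        self.should_fail = should_fail

    def __enter__(self):
        # the test wraps the session in a plain `with` block
        return self

    def __exit__(self, *exc_info):
        # cleanup elided in this sketch
        return False

    async def _request(self, method, url, **kwargs):
        if self.should_fail():
            # 503 is transient, so the client's retry loop re-sends the bunch
            raise aiohttp.ClientResponseError(
                None, (), status=503, message='injected failure')
        return await super()._request(method, url, **kwargs)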
Example #9
class Test(unittest.TestCase):
    def setUp(self):
        self.client = BatchClient()

    def tearDown(self):
        self.client.close()

    def test_job(self):
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code']['main'], 0, (status, j.log()))

        self.assertEqual(j.log()['main'], 'test\n', status)
        j.pod_status()

        self.assertTrue(j.is_complete())

    def test_attributes(self):
        a = {'name': 'test_attributes', 'foo': 'bar'}
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['true'], attributes=a)
        builder.submit()
        status = j.status()
        assert (status['attributes'] == a)

    def test_unsubmitted_state(self):
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.pod_status()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('alpine', ['echo', 'test'])

    def test_list_batches(self):
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('alpine', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('alpine', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected,
                             complete=None,
                             success=None,
                             attributes=None):
            batches = self.client.list_batches(complete=complete,
                                               success=success,
                                               attributes=attributes)
            # list_batches also returns batches from previously run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id},
                         complete=True,
                         attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_limit_offset(self):
        b1 = self.client.create_batch()
        for i in range(3):
            b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        s = b1.status(limit=2, offset=1)
        filtered_jobs = {j['job_id'] for j in s['jobs']}
        assert filtered_jobs == {2, 3}, s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(status['exit_code']['main'], 1)

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify doesn't exist
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # cancelled job has no log
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_nonexistent_job(self):
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                raise

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('alpine', ['false'])
        j2 = b.create_job('alpine', ['sleep', '1'])
        j3 = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter(
            [j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count[
            'Failed'] + state_count['Success']
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([
            j['exit_code']['main'] > 0 for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error')
        ])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        b1 = self.client.create_batch()
        b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('alpine', ['false'])
        b2.create_job('alpine', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('alpine', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        b4 = self.client.create_batch()
        b4.create_job('alpine', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(status['exit_code']['main'], 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        deploy_config = get_deploy_config()
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/pod_status', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.get, '/batches/0/jobs/0/log', 302)
        ]
        for f, url, expected in endpoints:
            r = f(deploy_config.url('batch', url), allow_redirects=False)
            assert r.status_code == expected, (url, r, expected)

    def test_bad_token(self):
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient(_token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('alpine', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()
Example #10
def client():
    client = BatchClient('test')
    yield client
    client.close()
Example #11
class BatchBackend(Backend):
    """
    Backend that executes pipelines on a Kubernetes cluster using `batch`.

    Examples
    --------

    >>> batch_backend = BatchBackend(billing_project='test')
    >>> p = Pipeline(backend=batch_backend)

    Parameters
    ----------
    billing_project: :obj:`str`
        Name of billing project to use.
    """
    def __init__(self, billing_project):
        self._batch_client = BatchClient(billing_project)

    def close(self):
        self._batch_client.close()

    def _run(
            self,
            pipeline,
            dry_run,
            verbose,
            delete_scratch_on_exit,
            wait=True,
            open=False,
            batch_submit_args=None):  # pylint: disable-msg=too-many-statements
        build_dag_start = time.time()

        bucket = self._batch_client.bucket
        subdir_name = 'pipeline-{}'.format(uuid.uuid4().hex[:12])

        remote_tmpdir = f'gs://{bucket}/pipeline/{subdir_name}'
        local_tmpdir = f'/io/pipeline/{subdir_name}'

        default_image = 'ubuntu:latest'

        attributes = pipeline.attributes
        if pipeline.name is not None:
            attributes['name'] = pipeline.name

        batch = self._batch_client.create_batch(attributes=attributes)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        task_to_job_mapping = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '') + '; '

        activate_service_account = 'gcloud -q auth activate-service-account ' \
                                   '--key-file=/gsa-key/key.json'

        def copy_input(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

        write_external_inputs = [
            x for r in pipeline._input_resources
            for x in copy_external_output(r)
        ]
        if write_external_inputs:

            def _cp(src, dst):
                return f'gsutil -m cp -R {src} {dst}'

            write_cmd = bash_flags + activate_service_account + ' && ' + \
                ' && '.join([_cp(*files) for files in write_external_inputs])

            if dry_run:
                commands.append(write_cmd)
            else:
                j = batch.create_job(
                    image='google/cloud-sdk:237.0.0-alpine',
                    command=['/bin/bash', '-c', write_cmd],
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

        for task in pipeline._tasks:
            inputs = [x for r in task._inputs for x in copy_input(r)]

            outputs = [
                x for r in task._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in task._external_outputs
                for x in copy_external_output(r)
            ]

            resource_defs = [
                r._declare(directory=local_tmpdir) for r in task._mentioned
            ]

            if task._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{task._uid}/; '
            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            task_command = [cmd.strip() for cmd in task._command]

            cmd = bash_flags + make_local_tmpdir + defs + " && ".join(
                task_command)
            if dry_run:
                commands.append(cmd)
                continue

            parents = [task_to_job_mapping[t] for t in task._dependencies]

            attributes = task.attributes
            if task.name:
                attributes['name'] = task.name

            resources = {}
            if task._cpu:
                resources['cpu'] = task._cpu
            if task._memory:
                resources['memory'] = task._memory

            j = batch.create_job(
                image=task._image if task._image else default_image,
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                pvc_size=task._storage,
                always_run=task._always_run)
            n_jobs_submitted += 1

            task_to_job_mapping[task] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
            j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                 command=['/bin/bash', '-c', cmd],
                                 parents=parents,
                                 attributes={'name': 'remove_tmpdir'},
                                 always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

        submit_batch_start = time.time()
        batch = batch.submit(**(batch_submit_args or {}))

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{batch.id}')
        print(f'Submitted batch {batch.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {batch.id}...')
            status = batch.wait()
            print(f'Batch {batch.id} complete: {status["state"]}')
        return batch
Example #12
class ServiceBackend(Backend):
    """Backend that executes batches on Hail's Batch Service on Google Cloud.

    Examples
    --------

    >>> service_backend = ServiceBackend('test')
    >>> b = Batch(backend=service_backend)
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close()

    If the Hail configuration parameter batch/billing_project was previously set
    with ``hailctl config set``, then one may elide the billing_project
    parameter.

    >>> service_backend = ServiceBackend()
    >>> b = Batch(backend=service_backend)
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close()

    Parameters
    ----------
    billing_project: :obj:`str`
        Name of billing project to use.
    """
    def __init__(self, billing_project=None):
        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                f'the billing_project parameter of ServiceBackend must be set '
                f'or run `hailctl config set batch/billing_project '
                f'YOUR_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project)

    def close(self):
        """
        Close the connection with the Batch Service.

        Notes
        -----
        This method should be called after executing your batches at the
        end of your script.
        """
        self._batch_client.close()

    def _run(self,
             batch,
             dry_run,
             verbose,
             delete_scratch_on_exit,
             wait=True,
             open=False,
             disable_progress_bar=False
             ):  # pylint: disable-msg=too-many-statements
        """
        Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.Batch.run`
        and pass :class:`.ServiceBackend` specific arguments as key-word arguments.

        Parameters
        ----------
        batch: :class:`.Batch`
            Batch to execute.
        dry_run: :obj:`bool`
            If `True`, don't execute code.
        verbose: :obj:`bool`
            If `True`, print debugging output.
        delete_scratch_on_exit: :obj:`bool`
            If `True`, delete temporary directories with intermediate files.
        wait: :obj:`bool`, optional
            If `True`, wait for the batch to finish executing before returning.
        open: :obj:`bool`, optional
            If `True`, open the UI page for the batch.
        disable_progress_bar: :obj:`bool`, optional
            If `True`, disable the progress bar.
        """
        build_dag_start = time.time()

        bucket = self._batch_client.bucket
        subdir_name = 'batch-{}'.format(uuid.uuid4().hex[:12])

        remote_tmpdir = f'gs://{bucket}/batch/{subdir_name}'
        local_tmpdir = f'/io/batch/{subdir_name}'

        default_image = 'ubuntu:latest'

        attributes = copy.deepcopy(batch.attributes)
        if batch.name is not None:
            attributes['name'] = batch.name

        bc_batch = self._batch_client.create_batch(attributes=attributes)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        job_to_client_job_mapping = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '') + '; '

        activate_service_account = 'gcloud -q auth activate-service-account ' \
                                   '--key-file=/gsa-key/key.json'

        def copy_input(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, JobResourceFile)
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(r, JobResourceFile)
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, JobResourceFile)
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

        write_external_inputs = [
            x for r in batch._input_resources for x in copy_external_output(r)
        ]
        if write_external_inputs:

            def _cp(src, dst):
                return f'gsutil -m cp -R {src} {dst}'

            write_cmd = bash_flags + activate_service_account + ' && ' + \
                ' && '.join([_cp(*files) for files in write_external_inputs])

            if dry_run:
                commands.append(write_cmd)
            else:
                j = bc_batch.create_job(
                    image='google/cloud-sdk:237.0.0-alpine',
                    command=['/bin/bash', '-c', write_cmd],
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

        for job in batch._jobs:
            inputs = [x for r in job._inputs for x in copy_input(r)]

            outputs = [
                x for r in job._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in job._external_outputs
                for x in copy_external_output(r)
            ]

            env_vars = {
                r._uid: r._get_path(local_tmpdir)
                for r in job._mentioned
            }

            if job._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._uid}/; '
            job_command = [cmd.strip() for cmd in job._command]

            cmd = bash_flags + make_local_tmpdir + " && ".join(job_command)

            if dry_run:
                commands.append(cmd)
                continue

            parents = [job_to_client_job_mapping[j] for j in job._dependencies]

            attributes = copy.deepcopy(job.attributes)
            if job.name:
                attributes['name'] = job.name

            resources = {}
            if job._cpu:
                resources['cpu'] = job._cpu
            if job._memory:
                resources['memory'] = job._memory

            j = bc_batch.create_job(
                image=job._image if job._image else default_image,
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                pvc_size=job._storage,
                always_run=job._always_run,
                timeout=job._timeout,
                env=env_vars)

            n_jobs_submitted += 1

            job_to_client_job_mapping[job] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
            j = bc_batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                    command=['/bin/bash', '-c', cmd],
                                    parents=parents,
                                    attributes={'name': 'remove_tmpdir'},
                                    always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

        submit_batch_start = time.time()
        bc_batch = bc_batch.submit(disable_progress_bar=disable_progress_bar)

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {bc_batch.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{bc_batch.id}')
        print(f'Submitted batch {bc_batch.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {bc_batch.id}...')
            status = bc_batch.wait()
            print(f'batch {bc_batch.id} complete: {status["state"]}')
        return bc_batch
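An end-to-end usage sketch for the backend above, hedged (the billing project name is illustrative; wait and other backend-specific keywords reach _run through Batch.run):

from hailtop.batch import Batch

backend = ServiceBackend(billing_project='my-project')
b = Batch(backend=backend, name='example')
j = b.new_job(name='hello')
j.command('echo hello')
b.run(wait=True)  # backend keywords are forwarded to ServiceBackend._run
backend.close()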
Example #13
def client():
    client = BatchClient(_service='batch2')
    yield client
    client.close()
Example #14
class ServiceBackend(Backend[bc.Batch]):
    """Backend that executes batches on Hail's Batch Service on Google Cloud.

    Examples
    --------

    >>> service_backend = ServiceBackend(billing_project='my-billing-account', remote_tmpdir='gs://my-bucket/temporary-files/') # doctest: +SKIP
    >>> b = Batch(backend=service_backend) # doctest: +SKIP
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close() # doctest: +SKIP

    If the Hail configuration parameters batch/billing_project and
    batch/remote_tmpdir were previously set with ``hailctl config set``, then
    one may elide the `billing_project` and `remote_tmpdir` parameters.

    >>> service_backend = ServiceBackend()
    >>> b = Batch(backend=service_backend)
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close()


    Parameters
    ----------
    billing_project:
        Name of billing project to use.
    bucket:
        Name of bucket to use. Should not include the ``gs://`` prefix. Cannot be used with
        `remote_tmpdir`. Temporary data will be stored in the "/batch" folder of this
        bucket. This argument is deprecated. Use `remote_tmpdir` instead.
    remote_tmpdir:
        Temporary data will be stored in this cloud storage folder. Cannot be used with deprecated
        argument `bucket`. Paths should start with one of gs://, hail-az://, or s3://.
    google_project:
        If specified, the project to use when authenticating with Google
        Storage. Google Storage is used to transfer serialized values between
        this computer and the cloud machines that execute Python jobs.
    token:
        The authorization token to pass to the batch client.
        Should only be set for user delegation purposes.
    """
    def __init__(self,
                 *args,
                 billing_project: Optional[str] = None,
                 bucket: Optional[str] = None,
                 remote_tmpdir: Optional[str] = None,
                 google_project: Optional[str] = None,
                 token: Optional[str] = None):
        if len(args) > 2:
            raise TypeError(
                f'ServiceBackend() takes 2 positional arguments but {len(args)} were given'
            )
        if len(args) >= 1:
            if billing_project is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'billing_project\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.'
            )
            billing_project = args[0]
        if len(args) >= 2:
            if bucket is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'bucket\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.'
            )
            bucket = args[1]

        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                'the billing_project parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/billing_project '
                'MY_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project, _token=token)

        user_config = get_user_config()

        if bucket is not None:
            warnings.warn(
                'Use of deprecated argument \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.'
            )

        if remote_tmpdir is not None and bucket is not None:
            raise ValueError(
                'Cannot specify both \'remote_tmpdir\' and \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.'
            )

        if bucket is None and remote_tmpdir is None:
            remote_tmpdir = user_config.get('batch',
                                            'remote_tmpdir',
                                            fallback=None)

        if remote_tmpdir is None:
            if bucket is None:
                bucket = user_config.get('batch', 'bucket', fallback=None)
                warnings.warn(
                    'Using deprecated configuration setting \'batch/bucket\'. Run `hailctl config set batch/remote_tmpdir` '
                    'to set the default for \'remote_tmpdir\' instead.')
            if bucket is None:
                raise ValueError(
                    'The \'remote_tmpdir\' parameter of ServiceBackend must be set. '
                    'Run `hailctl config set batch/remote_tmpdir REMOTE_TMPDIR`'
                )
            if 'gs://' in bucket:
                raise ValueError(
                    'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                    'Use the remote_tmpdir parameter to specify a path.')
            remote_tmpdir = f'gs://{bucket}/batch'
        else:
            schemes = {'gs', 'hail-az'}
            found_scheme = any(
                remote_tmpdir.startswith(f'{scheme}://') for scheme in schemes)
            if not found_scheme:
                raise ValueError(
                    f'remote_tmpdir must be a storage uri path like gs://bucket/folder. Possible schemes include {schemes}'
                )
        if remote_tmpdir[-1] != '/':
            remote_tmpdir += '/'
        self.remote_tmpdir = remote_tmpdir

        gcs_kwargs = {'project': google_project}
        self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file',
                                           gcs_kwargs=gcs_kwargs)

    @property
    def _fs(self):
        return self.__fs

    def _close(self):
        if hasattr(self, '_batch_client'):
            self._batch_client.close()
        async_to_blocking(self._fs.close())

    def _run(
        self,
        batch: 'batch.Batch',
        dry_run: bool,
        verbose: bool,
        delete_scratch_on_exit: bool,
        wait: bool = True,
        open: bool = False,
        disable_progress_bar: bool = False,
        callback: Optional[str] = None,
        token: Optional[str] = None,
        **backend_kwargs
    ) -> bc.Batch:  # pylint: disable-msg=too-many-statements
        """Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`
        and pass :class:`.ServiceBackend` specific arguments as key-word arguments.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        wait:
            If `True`, wait for the batch to finish executing before returning.
        open:
            If `True`, open the UI page for the batch.
        disable_progress_bar:
            If `True`, disable the progress bar.
        callback:
            If not `None`, a URL that will receive at most one POST request
            after the entire batch completes.
        token:
            If not `None`, a string used for idempotency of batch submission.
        """
        return async_to_blocking(
            self._async_run(batch, dry_run, verbose, delete_scratch_on_exit,
                            wait, open, disable_progress_bar, callback, token,
                            **backend_kwargs))

    async def _async_run(
            self,
            batch: 'batch.Batch',
            dry_run: bool,
            verbose: bool,
            delete_scratch_on_exit: bool,
            wait: bool = True,
            open: bool = False,
            disable_progress_bar: bool = False,
            callback: Optional[str] = None,
            token: Optional[str] = None,
            **backend_kwargs):  # pylint: disable-msg=too-many-statements
        if backend_kwargs:
            raise ValueError(
                f'ServiceBackend does not support any of these keywords: {backend_kwargs}'
            )

        build_dag_start = time.time()

        uid = uuid.uuid4().hex[:6]
        batch_remote_tmpdir = f'{self.remote_tmpdir}{uid}'
        local_tmpdir = f'/io/batch/{uid}'

        default_image = 'ubuntu:20.04'

        attributes = copy.deepcopy(batch.attributes)
        if batch.name is not None:
            attributes['name'] = batch.name

        bc_batch = self._batch_client.create_batch(
            attributes=attributes,
            callback=callback,
            token=token,
            cancel_after_n_failures=batch._cancel_after_n_failures)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        job_to_client_job_mapping: Dict[_job.Job, bc.Job] = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '')

        def copy_input(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(batch_remote_tmpdir),
                     r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir),
                     r._get_path(batch_remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
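                # Resource groups read in as inputs (no source job) are exposed
                # to commands as '<group path>.<member name>' via local symlinks.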
                for name, irf in r._resources.items():
                    src = irf._get_path(local_tmpdir)
                    dest = f'{r._get_path(local_tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_external_inputs = [
            x for r in batch._input_resources for x in copy_external_output(r)
        ]
        if write_external_inputs:
            transfers_bytes = orjson.dumps([{
                "from": src,
                "to": dest
            } for src, dest in write_external_inputs])
            transfers = transfers_bytes.decode('utf-8')
            write_cmd = [
                'python3', '-m', 'hailtop.aiotools.copy', 'null', transfers
            ]
            if dry_run:
                commands.append(' '.join(shq(x) for x in write_cmd))
            else:
                j = bc_batch.create_job(
                    image=HAIL_GENETICS_HAIL_IMAGE,
                    command=write_cmd,
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = ' '.join(shq(x) for x in write_cmd)
                n_jobs_submitted += 1

        pyjobs = [j for j in batch._jobs if isinstance(j, _job.PythonJob)]
        for job in pyjobs:
            if job._image is None:
                version = sys.version_info
                if version.major != 3 or version.minor not in (6, 7, 8):
                    raise BatchException(
                        f"You must specify 'image' for Python jobs if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})"
                    )
                job._image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
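                # e.g. Python 3.7 -> 'hailgenetics/python-dill:3.7-slim'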

        with tqdm(total=len(batch._jobs),
                  desc='upload code',
                  disable=disable_progress_bar) as pbar:

            async def compile_job(job):
                used_remote_tmpdir = await job._compile(local_tmpdir,
                                                        batch_remote_tmpdir,
                                                        dry_run=dry_run)
                pbar.update(1)
                return used_remote_tmpdir

            used_remote_tmpdir_results = await bounded_gather(
                *[functools.partial(compile_job, j) for j in batch._jobs],
                parallelism=150)
            used_remote_tmpdir |= any(used_remote_tmpdir_results)
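            # Any job that staged files to the remote tmpdir marks it used,
            # so the cleanup job added below knows there is something to delete.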

        for job in tqdm(batch._jobs,
                        desc='create job objects',
                        disable=disable_progress_bar):
            inputs = [x for r in job._inputs for x in copy_input(r)]

            outputs = [
                x for r in job._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in job._external_outputs
                for x in copy_external_output(r)
            ]

            symlinks = [
                x for r in job._mentioned
                for x in symlink_input_resource_group(r)
            ]

            if job._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._dirname}'

            job_command = [cmd.strip() for cmd in job._wrapper_code]
            prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
            cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''

            user_code = '\n\n'.join(job._user_code) if job._user_code else None

            if dry_run:
                formatted_command = f'''
================================================================================
# Job {job._job_id}{f": {job.name}" if job.name else ''}

--------------------------------------------------------------------------------
## USER CODE
--------------------------------------------------------------------------------
{user_code}

--------------------------------------------------------------------------------
## COMMAND
--------------------------------------------------------------------------------
{cmd}
================================================================================
'''
                commands.append(formatted_command)
                continue

            parents = [job_to_client_job_mapping[j] for j in job._dependencies]

            attributes = copy.deepcopy(
                job.attributes) if job.attributes else {}
            if job.name:
                attributes['name'] = job.name

            resources: Dict[str, Any] = {}
            if job._cpu:
                resources['cpu'] = job._cpu
            if job._memory:
                resources['memory'] = job._memory
            if job._storage:
                resources['storage'] = job._storage
            if job._machine_type:
                resources['machine_type'] = job._machine_type
            if job._preemptible is not None:
                resources['preemptible'] = job._preemptible

            image = job._image if job._image else default_image
            image_ref = parse_docker_image_reference(image)
            if image_ref.hosted_in('dockerhub') and image_ref.name(
            ) not in HAIL_GENETICS_IMAGES:
                warnings.warn(f'Using an image {image} from Docker Hub. '
                              f'Jobs may fail due to Docker Hub rate limits.')

            env = {**job._env, 'BATCH_TMPDIR': local_tmpdir}

            j = bc_batch.create_job(
                image=image,
                command=[
                    job._shell if job._shell else DEFAULT_SHELL, '-c', cmd
                ],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                always_run=job._always_run,
                timeout=job._timeout,
                cloudfuse=job._cloudfuse if len(job._cloudfuse) > 0 else None,
                env=env,
                requester_pays_project=batch.requester_pays_project,
                mount_tokens=True,
                user_code=user_code)

            n_jobs_submitted += 1

            job_to_client_job_mapping[job] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = ['python3', '-m', 'hailtop.aiotools.delete',
                      batch_remote_tmpdir]
            j = bc_batch.create_job(image=HAIL_GENETICS_HAIL_IMAGE,
                                    command=rm_cmd,
                                    parents=parents,
                                    attributes={'name': 'remove_tmpdir'},
                                    always_run=True)
            # record the cleanup command itself, not the last job's `cmd`
            jobs_to_command[j] = ' '.join(shq(x) for x in rm_cmd)
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

        submit_batch_start = time.time()
        batch_handle = bc_batch.submit(
            disable_progress_bar=disable_progress_bar)

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {batch_handle.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')
            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{batch_handle.id}')
        print(f'Submitted batch {batch_handle.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {batch_handle.id}...')
            status = batch_handle.wait()
            print(f'batch {batch_handle.id} complete: {status["state"]}')
        return batch_handle
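
For context, here is a minimal sketch of how this backend is normally driven. The entry points (ServiceBackend, Batch, new_job, run) follow the surrounding code; the billing project, bucket path, and job contents are illustrative assumptions.

import hailtop.batch as hb

# Assumes a valid billing project and a writable remote tmpdir (hypothetical values).
backend = hb.ServiceBackend(billing_project='my-project',
                            remote_tmpdir='gs://my-bucket/tmp/')
b = hb.Batch(backend=backend, name='example')
j = b.new_job(name='hello')
j.command('echo hello')
b.run(wait=True)  # Batch.run dispatches to ServiceBackend._run above
backend.close()
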
Exemplo n.º 15
0
class Test(unittest.TestCase):
    def setUp(self):
        self.client = BatchClient()

    def tearDown(self):
        self.client.close()

    def test_job(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])
        b = builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(j._get_exit_code(status, 'main'), 0,
                         (status, j.log()))

        self.assertEqual(j.log()['main'], 'test\n', status)

        self.assertTrue(j.is_complete())

    def test_attributes(self):
        a = {'name': 'test_attributes', 'foo': 'bar'}
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['true'], attributes=a)
        builder.submit()
        status = j.status()
        assert status['attributes'] == a

    def test_garbage_image(self):
        builder = self.client.create_batch()
        j = builder.create_job('dsafaaadsf', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_bad_command(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['sleep 5'])
        builder.submit()
        status = j.wait()
        assert j._get_exit_codes(status) == {'main': None}, status
        assert j._get_error(status, 'main') is not None
        assert status['state'] == 'Error', status

    def test_invalid_resource_requests(self):
        builder = self.client.create_batch()
        resources = {'cpu': '1', 'memory': '28Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'resource requests.*unsatisfiable'):
            builder.submit()

        builder = self.client.create_batch()
        resources = {'cpu': '0', 'memory': '1Gi'}
        builder.create_job('ubuntu:18.04', ['true'], resources=resources)
        with self.assertRaisesRegex(aiohttp.client.ClientResponseError,
                                    'bad resource request.*cpu cannot be 0'):
            builder.submit()

    def test_out_of_memory(self):
        builder = self.client.create_batch()
        resources = {'cpu': '0.1', 'memory': '10M'}
        j = builder.create_job('python:3.6-slim-stretch',
                               ['python', '-c', 'x = "a" * 400 * 1000**2'],
                               resources=resources)
        builder.submit()
        status = j.wait()
        assert j._get_out_of_memory(status, 'main')

    def test_unsubmitted_state(self):
        builder = self.client.create_batch()
        j = builder.create_job('ubuntu:18.04', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('ubuntu:18.04', ['echo', 'test'])

    def test_list_batches(self):
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('ubuntu:18.04', ['sleep', '3600'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('ubuntu:18.04', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected,
                             complete=None,
                             success=None,
                             attributes=None):
            batches = self.client.list_batches(complete=complete,
                                               success=success,
                                               attributes=attributes)
            # list_batches returns batches from all previously-run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id})

        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id},
                         complete=True,
                         attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_include_jobs(self):
        b1 = self.client.create_batch()
        for i in range(2):
            b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        s = b1.status(include_jobs=False)
        assert 'jobs' not in s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(j._get_exit_code(status, 'main'), 1)

    def test_running_job_log_and_status(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '300'])
        b = b.submit()

        while True:
            if j.status()['state'] == 'Running' or j.is_complete():
                break
            time.sleep(0.5)  # back off between polls instead of busy-waiting

        j.log()
        # FIXME after batch1 goes away, check running status
        b.cancel()

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify the job is gone
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise
        else:
            self.fail('expected a 404 for a job in a deleted batch')

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # a cancelled job has no log; anything other than a 404 is unexpected
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise

    def test_get_nonexistent_job(self):
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise
        else:
            self.fail('expected a 404 for a nonexistent job')

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('ubuntu:18.04', ['false'])
        j2 = b.create_job('ubuntu:18.04', ['sleep', '1'])
        j3 = b.create_job('ubuntu:18.04', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter([j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count['Failed'] + state_count['Success']
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([
            Job._get_exit_code(j, 'main') > 0 for j in bstatus['jobs']
            if j['state'] in ('Failed', 'Error')
        ])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        b1 = self.client.create_batch()
        b1.create_job('ubuntu:18.04', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('ubuntu:18.04', ['false'])
        b2.create_job('ubuntu:18.04', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('ubuntu:18.04', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s
        b3.cancel()

        b4 = self.client.create_batch()
        b4.create_job('ubuntu:18.04', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_log_after_failing_job(self):
        b = self.client.create_batch()
        j = b.create_job('ubuntu:18.04',
                         ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(j._get_exit_code(status, 'main'), 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0', 401),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log', 401),
            (requests.get, '/api/v1alpha/batches', 401),
            (requests.post, '/api/v1alpha/batches/create', 401),
            (requests.post, '/api/v1alpha/batches/0/jobs/create', 401),
            (requests.get, '/api/v1alpha/batches/0', 401),
            (requests.delete, '/api/v1alpha/batches/0', 401),
            (requests.patch, '/api/v1alpha/batches/0/close', 401),
            # redirect to auth/login
            (requests.get, '/batches', 302),
            (requests.get, '/batches/0', 302),
            (requests.post, '/batches/0/cancel', 401),
            (requests.get, '/batches/0/jobs/0', 302)
        ]
        for f, url, expected in endpoints:
            full_url = deploy_config.url('batch2', url)
            r = f(full_url, allow_redirects=False)
            assert r.status_code == expected, (full_url, r, expected)

    def test_bad_token(self):
        token = base64.urlsafe_b64encode(
            secrets.token_bytes(32)).decode('ascii')
        bc = BatchClient(_token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('ubuntu:18.04', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            assert e.status == 401, e
        finally:
            bc.close()

    def test_gcr_image(self):
        builder = self.client.create_batch()
        j = builder.create_job(os.environ['HAIL_BASE_IMAGE'], ['echo', 'test'])
        b = builder.submit()
        status = j.wait()

        self.assertEqual(status['state'], 'Success', (status, j.log()))

    def test_service_account(self):
        b = self.client.create_batch()
        j = b.create_job(
            os.environ['CI_UTILS_IMAGE'],
            ['/bin/sh', '-c', 'kubectl get pods -l app=batch2-driver'],
            service_account={
                'namespace': os.environ['HAIL_BATCH_PODS_NAMESPACE'],
                'name': 'ci-agent'
            })
        b.submit()
        status = j.wait()
        assert j._get_exit_code(status, 'main') == 0, status
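
The tests above repeat one pattern several times: call an endpoint for a deleted or nonexistent resource and accept only a 404. A small helper along these lines could capture it; assert_not_found is a hypothetical sketch, not part of the Batch client API.

def assert_not_found(f, *args, **kwargs):
    # Succeed only if f(...) raises a 404; re-raise any other HTTP error.
    try:
        f(*args, **kwargs)
    except aiohttp.ClientResponseError as e:
        assert e.status == 404, e
    else:
        assert False, 'expected a 404 response'
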
Exemplo n.º 16
0
class BatchBackend(Backend):
    """
    Backend that executes pipelines on a Kubernetes cluster using `batch`.

    Examples
    --------

    >>> batch_backend = BatchBackend()
    >>> p = Pipeline(backend=batch_backend)

    Parameters
    ----------
    _service: :obj:`str`
        Name of the batch service to connect to; defaults to ``'batch'``.
    """

    def __init__(self, _service='batch'):
        self._batch_client = BatchClient(_service=_service)

    def close(self):
        self._batch_client.close()

    def _run(self, pipeline, dry_run, verbose, delete_scratch_on_exit):  # pylint: disable-msg=R0915
        start = time.time()

        bucket = self._batch_client.bucket
        subdir_name = 'pipeline-{}'.format(uuid.uuid4().hex[:12])

        remote_tmpdir = f'gs://{bucket}/pipeline/{subdir_name}'
        local_tmpdir = f'/io/pipeline/{subdir_name}'

        default_image = 'ubuntu:latest'

        attributes = pipeline.attributes
        if pipeline.name is not None:
            attributes['name'] = pipeline.name

        batch = self._batch_client.create_batch(attributes=attributes)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        task_to_job_mapping = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '') + '; '

        activate_service_account = 'gcloud -q auth activate-service-account ' \
                                   '--key-file=/gsa-key/privateKeyData'

        def copy_input(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), dest) for dest in r._output_paths]

        write_external_inputs = [x for r in pipeline._input_resources for x in copy_external_output(r)]
        if write_external_inputs:
            def _cp(src, dst):
                return f'gsutil -m cp -R {src} {dst}'

            write_cmd = bash_flags + activate_service_account + ' && ' + \
                ' && '.join([_cp(*files) for files in write_external_inputs])

            if dry_run:
                commands.append(write_cmd)
            else:
                j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                     command=['/bin/bash', '-c', write_cmd],
                                     attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

        for task in pipeline._tasks:
            inputs = [x for r in task._inputs for x in copy_input(r)]

            outputs = [x for r in task._internal_outputs for x in copy_internal_output(r)]
            if outputs:
                used_remote_tmpdir = True
            outputs += [x for r in task._external_outputs for x in copy_external_output(r)]

            resource_defs = [r._declare(directory=local_tmpdir) for r in task._mentioned]

            if task._image is None:
                if verbose:
                    print(f"Using image '{default_image}' since no image was specified.")

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{task._uid}/; '
            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            task_command = [cmd.strip() for cmd in task._command]

            cmd = bash_flags + make_local_tmpdir + defs + " && ".join(task_command)
            if dry_run:
                commands.append(cmd)
                continue

            parents = [task_to_job_mapping[t] for t in task._dependencies]

            attributes = {'task_uid': task._uid}
            if task.name:
                attributes['name'] = task.name
            attributes.update(task.attributes)

            resources = {'requests': {}}
            if task._cpu:
                resources['requests']['cpu'] = task._cpu
            if task._memory:
                resources['requests']['memory'] = task._memory

            j = batch.create_job(image=task._image if task._image else default_image,
                                 command=['/bin/bash', '-c', cmd],
                                 parents=parents,
                                 attributes=attributes,
                                 resources=resources,
                                 input_files=inputs if len(inputs) > 0 else None,
                                 output_files=outputs if len(outputs) > 0 else None,
                                 pvc_size=task._storage)
            n_jobs_submitted += 1

            task_to_job_mapping[task] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
            j = batch.create_job(
                image='google/cloud-sdk:237.0.0-alpine',
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes={'name': 'remove_tmpdir'},
                always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        print(f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - start, 3)} seconds.')
        start = time.time()
        batch = batch.submit()
        print(f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in {round(time.time() - start, 3)} seconds.')

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

        status = batch.wait()

        if status['state'] == 'success':
            print('Pipeline completed successfully!')
            return

        failed_jobs = [
            ((j['batch_id'], j['job_id']), j['exit_code'])
            for j in status['jobs']
            if 'exit_code' in j and any(ec != 0 for _, ec in j['exit_code'].items())
        ]

        fail_msg = ''
        for jid, ec in failed_jobs:
            ec = Job.exit_code(ec)
            job = self._batch_client.get_job(*jid)
            log = job.log()
            name = job.status()['attributes'].get('name', None)
            fail_msg += (
                f"Job {jid} failed with exit code {ec}:\n"
                f"  Task name:\t{name}\n"
                f"  Command:\t{jobs_to_command[jid]}\n"
                f"  Log:\t{log}\n")

        raise PipelineException(fail_msg)
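
A minimal usage sketch for this backend, assuming the Pipeline API from the surrounding project; the task body and names are illustrative.

backend = BatchBackend()
p = Pipeline(backend=backend)
t = p.new_task(name='hello')
t.command('echo hello')
p.run(verbose=True)  # dispatches to BatchBackend._run above
backend.close()
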
Exemplo n.º 17
0
class Test(unittest.TestCase):
    def setUp(self):
        session = aiohttp.ClientSession(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60))
        self.client = BatchClient(session, url=os.environ.get('BATCH_URL'))

    def tearDown(self):
        self.client.close()

    def test_job(self):
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])
        builder.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status, (status, j.log()))
        self.assertEqual(status['state'], 'Success', (status, j.log()))
        self.assertEqual(status['exit_code']['main'], 0, (status, j.log()))

        self.assertEqual(j.log()['main'], 'test\n', status)
        j.pod_status()

        self.assertTrue(j.is_complete())

    def test_attributes(self):
        a = {
            'name': 'test_attributes',
            'foo': 'bar'
        }
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['true'], attributes=a)
        builder.submit()
        status = j.status()
        assert status['attributes'] == a

    def test_unsubmitted_state(self):
        builder = self.client.create_batch()
        j = builder.create_job('alpine', ['echo', 'test'])

        with self.assertRaises(ValueError):
            j.batch_id
        with self.assertRaises(ValueError):
            j.id
        with self.assertRaises(ValueError):
            j.status()
        with self.assertRaises(ValueError):
            j.is_complete()
        with self.assertRaises(ValueError):
            j.log()
        with self.assertRaises(ValueError):
            j.pod_status()
        with self.assertRaises(ValueError):
            j.wait()

        builder.submit()
        with self.assertRaises(ValueError):
            builder.create_job('alpine', ['echo', 'test'])

    def test_list_batches(self):
        tag = secrets.token_urlsafe(64)
        b1 = self.client.create_batch(attributes={'tag': tag, 'name': 'b1'})
        b1.create_job('alpine', ['sleep', '30'])
        b1 = b1.submit()

        b2 = self.client.create_batch(attributes={'tag': tag, 'name': 'b2'})
        b2.create_job('alpine', ['echo', 'test'])
        b2 = b2.submit()

        def assert_batch_ids(expected, complete=None, success=None, attributes=None):
            batches = self.client.list_batches(complete=complete, success=success, attributes=attributes)
            # list_batches returns batches from all previously-run tests
            actual = set([b.id for b in batches]).intersection({b1.id, b2.id})
            self.assertEqual(actual, expected)

        assert_batch_ids({b1.id, b2.id}, attributes={'tag': tag})

        b2.wait()

        assert_batch_ids({b1.id}, complete=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        b1.cancel()
        b1.wait()

        assert_batch_ids({b1.id}, success=False, attributes={'tag': tag})
        assert_batch_ids({b2.id}, success=True, attributes={'tag': tag})

        assert_batch_ids(set(), complete=False, attributes={'tag': tag})
        assert_batch_ids({b1.id, b2.id}, complete=True, attributes={'tag': tag})

        assert_batch_ids({b2.id}, attributes={'tag': tag, 'name': 'b2'})

    def test_limit_offset(self):
        b1 = self.client.create_batch()
        for i in range(3):
            b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        s = b1.status(limit=2, offset=1)
        filtered_jobs = {j['job_id'] for j in s['jobs']}
        assert filtered_jobs == {2, 3}, s

    def test_fail(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['false'])
        b.submit()
        status = j.wait()
        self.assertEqual(status['exit_code']['main'], 1)

    def test_deleted_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['echo', 'test'])
        b = b.submit()
        j.wait()
        b.delete()

        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status == 404:
                pass
            else:
                self.assertTrue(False, f"batch should have deleted log {e}")

    def test_delete_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()
        b.delete()

        # verify the job is gone
        try:
            self.client.get_job(*j.id)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise
        else:
            self.fail('expected a 404 for a job in a deleted batch')

    def test_cancel_batch(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        status = j.status()
        assert status['state'] in ('Ready', 'Running'), status

        b.cancel()

        status = j.wait()
        assert status['state'] == 'Cancelled', status
        assert 'log' not in status, status

        # a cancelled job has no log; anything other than a 404 is unexpected
        try:
            j.log()
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise

    def test_get_nonexistent_job(self):
        try:
            self.client.get_job(1, 666)
        except aiohttp.ClientResponseError as e:
            if e.status != 404:
                raise
        else:
            self.fail('expected a 404 for a nonexistent job')

    def test_get_job(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b.submit()

        j2 = self.client.get_job(*j.id)
        status2 = j2.status()
        assert (status2['batch_id'], status2['job_id']) == j.id

    def test_batch(self):
        b = self.client.create_batch()
        j1 = b.create_job('alpine', ['false'])
        j2 = b.create_job('alpine', ['sleep', '1'])
        j3 = b.create_job('alpine', ['sleep', '30'])
        b = b.submit()

        j1.wait()
        j2.wait()
        b.cancel()
        bstatus = b.wait()

        assert len(bstatus['jobs']) == 3, bstatus
        state_count = collections.Counter([j['state'] for j in bstatus['jobs']])
        n_cancelled = state_count['Cancelled']
        n_complete = state_count['Error'] + state_count['Failed'] + state_count['Success']
        assert n_cancelled <= 1, bstatus
        assert n_cancelled + n_complete == 3, bstatus

        n_failed = sum([j['exit_code']['main'] > 0 for j in bstatus['jobs'] if j['state'] in ('Failed', 'Error')])
        assert n_failed == 1, bstatus

    def test_batch_status(self):
        b1 = self.client.create_batch()
        b1.create_job('alpine', ['true'])
        b1 = b1.submit()
        b1.wait()
        b1s = b1.status()
        assert b1s['complete'] and b1s['state'] == 'success', b1s

        b2 = self.client.create_batch()
        b2.create_job('alpine', ['false'])
        b2.create_job('alpine', ['true'])
        b2 = b2.submit()
        b2.wait()
        b2s = b2.status()
        assert b2s['complete'] and b2s['state'] == 'failure', b2s

        b3 = self.client.create_batch()
        b3.create_job('alpine', ['sleep', '30'])
        b3 = b3.submit()
        b3s = b3.status()
        assert not b3s['complete'] and b3s['state'] == 'running', b3s

        b4 = self.client.create_batch()
        b4.create_job('alpine', ['sleep', '30'])
        b4 = b4.submit()
        b4.cancel()
        b4.wait()
        b4s = b4.status()
        assert b4s['complete'] and b4s['state'] == 'cancelled', b4s

    def test_callback(self):
        app = Flask('test-client')

        d = {}

        @app.route('/test', methods=['POST'])
        def test():
            d['status'] = request.get_json()
            return Response(status=200)

        server = ServerThread(app)
        try:
            server.start()
            b = self.client.create_batch()
            j = b.create_job(
                'alpine',
                ['echo', 'test'],
                attributes={'foo': 'bar'},
                callback=server.url_for('/test'))
            b = b.submit()
            j.wait()

            poll_until(lambda: 'status' in d)
            status = d['status']
            self.assertEqual(status['state'], 'Success')
            self.assertEqual(status['attributes'], {'foo': 'bar'})
        finally:
            server.shutdown()
            server.join()

    def test_log_after_failing_job(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['/bin/sh', '-c', 'echo test; exit 127'])
        b.submit()
        status = j.wait()
        self.assertTrue('attributes' not in status)
        self.assertEqual(status['state'], 'Failed')
        self.assertEqual(status['exit_code']['main'], 127)

        self.assertEqual(j.log()['main'], 'test\n')

        self.assertTrue(j.is_complete())

    def test_authorized_users_only(self):
        endpoints = [
            (requests.get, '/api/v1alpha/batches/0/jobs/0'),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/log'),
            (requests.get, '/api/v1alpha/batches/0/jobs/0/pod_status'),
            (requests.get, '/api/v1alpha/batches'),
            (requests.post, '/api/v1alpha/batches/create'),
            (requests.post, '/api/v1alpha/batches/0/jobs/create'),
            (requests.get, '/api/v1alpha/batches/0'),
            (requests.delete, '/api/v1alpha/batches/0'),
            (requests.patch, '/api/v1alpha/batches/0/close'),
            (requests.get, '/batches'),
            (requests.get, '/batches/0'),
            (requests.get, '/batches/0/jobs/0/log')]
        for f, url in endpoints:
            r = f(os.environ.get('BATCH_URL') + url)
            assert r.status_code == 401, r

    def test_bad_jwt_key(self):
        fname = pkg_resources.resource_filename(
            __name__,
            'jwt-test-user.json')
        with open(fname) as f:
            userdata = json.loads(f.read())
        token = hj.JWTClient(hj.JWTClient.generate_key()).encode(userdata)
        session = aiohttp.ClientSession(
            raise_for_status=True,
            timeout=aiohttp.ClientTimeout(total=60))
        bc = BatchClient(session, url=os.environ.get('BATCH_URL'), token=token)
        try:
            b = bc.create_batch()
            j = b.create_job('alpine', ['false'])
            b.submit()
            assert False, j
        except aiohttp.ClientResponseError as e:
            if e.status == 401:
                pass
            else:
                assert False, e
        finally:
            bc.close()

    def test_ui_batches(self):
        with open(os.environ['HAIL_TOKEN_FILE']) as f:
            token = f.read()
        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches',
                         cookies={'user': token})
        assert 200 <= r.status_code < 300

    def test_ui_batch_and_job_log(self):
        b = self.client.create_batch()
        j = b.create_job('alpine', ['true'])
        b = b.submit()
        status = j.wait()

        with open(os.environ['HAIL_TOKEN_FILE']) as f:
            token = f.read()

        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{b.id}',
                         cookies={'user': token})
        assert 200 <= r.status_code < 300

        # just check successful response
        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{j.batch_id}/jobs/{j.job_id}/log',
                         cookies={'user': token})
        assert 200 <= r.status_code < 300

        r = requests.get(f'{os.environ.get("BATCH_URL")}/batches/{j.batch_id}/jobs/{j.job_id}/pod_status',
                         cookies={'user': token})
        assert 200 <= r.status_code < 300
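
poll_until, used in test_callback above, is assumed to be a simple polling helper along these lines (a sketch, not necessarily the project's actual implementation):

import time

def poll_until(p, timeout=60, interval=0.5):
    # Re-check p() every `interval` seconds until it is truthy,
    # failing if `timeout` seconds elapse first.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if p():
            return
        time.sleep(interval)
    raise TimeoutError('condition not met before timeout')
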
Exemplo n.º 18
0
class ServiceBackend(Backend):
    """Backend that executes batches on Hail's Batch Service on Google Cloud.

    Examples
    --------

    >>> service_backend = ServiceBackend('my-billing-account', 'my-bucket') # doctest: +SKIP
    >>> b = Batch(backend=service_backend) # doctest: +SKIP
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close() # doctest: +SKIP

    If the Hail configuration parameters batch/billing_project and
    batch/bucket were previously set with ``hailctl config set``, then
    one may elide the `billing_project` and `bucket` parameters.

    >>> service_backend = ServiceBackend()
    >>> b = Batch(backend=service_backend)
    >>> b.run() # doctest: +SKIP
    >>> service_backend.close()

    Parameters
    ----------
    billing_project:
        Name of billing project to use.
    bucket:
        Name of bucket to use.  Should not include the ``gs://``
        prefix.
    """

    def __init__(self,
                 billing_project: Optional[str] = None,
                 bucket: Optional[str] = None):
        if billing_project is None:
            billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
        if billing_project is None:
            raise ValueError(
                'the billing_project parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/billing_project '
                'MY_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project)

        if bucket is None:
            bucket = get_user_config().get('batch', 'bucket', fallback=None)
        if bucket is None:
            raise ValueError(
                'the bucket parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/bucket '
                'MY_BUCKET`')
        self._bucket_name = bucket

    def close(self):
        """
        Close the connection with the Batch Service.

        Notes
        -----
        This method should be called after executing your batches at the
        end of your script.
        """
        self._batch_client.close()

    def _run(self,
             batch: 'batch.Batch',
             dry_run: bool,
             verbose: bool,
             delete_scratch_on_exit: bool,
             wait: bool = True,
             open: bool = False,
             disable_progress_bar: bool = False,
             callback: Optional[str] = None,
             token: Optional[str] = None,
             **backend_kwargs):  # pylint: disable-msg=too-many-statements
        """Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`
        and pass :class:`.ServiceBackend` specific arguments as key-word arguments.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        wait:
            If `True`, wait for the batch to finish executing before returning.
        open:
            If `True`, open the UI page for the batch.
        disable_progress_bar:
            If `True`, disable the progress bar.
        callback:
            If not `None`, a URL that will receive at most one POST request
            after the entire batch completes.
        token:
            If not `None`, a string used for idempotency of batch submission.
        """

        if backend_kwargs:
            raise ValueError(f'ServiceBackend does not support any of these keywords: {backend_kwargs}')

        build_dag_start = time.time()

        uid = uuid.uuid4().hex[:6]
        remote_tmpdir = f'gs://{self._bucket_name}/batch/{uid}'
        local_tmpdir = f'/io/batch/{uid}'

        default_image = 'ubuntu:18.04'

        attributes = copy.deepcopy(batch.attributes)
        if batch.name is not None:
            attributes['name'] = batch.name

        bc_batch = self._batch_client.create_batch(attributes=attributes, callback=callback,
                                                   token=token, cancel_after_n_failures=batch._cancel_after_n_failures)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        job_to_client_job_mapping: Dict[_job.Job, bc.Job] = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '')

        activate_service_account = 'gcloud -q auth activate-service-account ' \
                                   '--key-file=/gsa-key/key.json'

        def copy_input(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir), dest) for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(local_tmpdir)
                    dest = f'{r._get_path(local_tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_external_inputs = [x for r in batch._input_resources for x in copy_external_output(r)]
        if write_external_inputs:
            def _cp(src, dst):
                return f'gsutil -m cp -R {shq(src)} {shq(dst)}'

            write_cmd = f'''
{bash_flags}
{activate_service_account}
{' && '.join([_cp(*files) for files in write_external_inputs])}
'''

            if dry_run:
                commands.append(write_cmd)
            else:
                j = bc_batch.create_job(image='gcr.io/google.com/cloudsdktool/cloud-sdk:310.0.0-alpine',
                                        command=['/bin/bash', '-c', write_cmd],
                                        attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

        for job in batch._jobs:
            if isinstance(job, _job.PythonJob):
                if job._image is None:
                    version = sys.version_info
                    if version.major != 3 or version.minor not in (6, 7, 8):
                        raise BatchException(
                            f"You must specify 'image' for Python jobs if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})")
                    job._image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
                job._compile(local_tmpdir, remote_tmpdir)

            inputs = [x for r in job._inputs for x in copy_input(r)]

            outputs = [x for r in job._internal_outputs for x in copy_internal_output(r)]
            if outputs:
                used_remote_tmpdir = True
            outputs += [x for r in job._external_outputs for x in copy_external_output(r)]

            symlinks = [x for r in job._mentioned for x in symlink_input_resource_group(r)]

            env_vars = {
                **job._env,
                **{r._uid: r._get_path(local_tmpdir) for r in job._mentioned}}

            if job._image is None:
                if verbose:
                    print(f"Using image '{default_image}' since no image was specified.")

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._job_id}'

            job_command = [cmd.strip() for cmd in job._command]

            prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
            cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''

            if dry_run:
                commands.append(cmd)
                continue

            parents = [job_to_client_job_mapping[j] for j in job._dependencies]

            attributes = copy.deepcopy(job.attributes) if job.attributes else dict()
            if job.name:
                attributes['name'] = job.name

            resources: Dict[str, Any] = {}
            if job._cpu:
                resources['cpu'] = job._cpu
            if job._memory:
                resources['memory'] = job._memory
            if job._storage:
                resources['storage'] = job._storage
            if job._machine_type:
                resources['machine_type'] = job._machine_type
            if job._preemptible is not None:
                resources['preemptible'] = job._preemptible

            image = job._image if job._image else default_image
            image_ref = parse_docker_image_reference(image)
            if not is_google_registry_domain(image_ref.domain) and image_ref.name() not in HAIL_GENETICS_IMAGES:
                warnings.warn(f'Using an image {image} not in GCR. '
                              f'Jobs may fail due to Docker Hub rate limits.')

            j = bc_batch.create_job(image=image,
                                    command=[job._shell if job._shell else self._DEFAULT_SHELL, '-c', cmd],
                                    parents=parents,
                                    attributes=attributes,
                                    resources=resources,
                                    input_files=inputs if len(inputs) > 0 else None,
                                    output_files=outputs if len(outputs) > 0 else None,
                                    always_run=job._always_run,
                                    timeout=job._timeout,
                                    gcsfuse=job._gcsfuse if len(job._gcsfuse) > 0 else None,
                                    env=env_vars,
                                    requester_pays_project=batch.requester_pays_project,
                                    mount_tokens=True)

            n_jobs_submitted += 1

            job_to_client_job_mapping[job] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = f'''
{bash_flags}
{activate_service_account}
{rm_cmd}
'''
            j = bc_batch.create_job(
                image='gcr.io/google.com/cloudsdktool/cloud-sdk:310.0.0-alpine',
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes={'name': 'remove_tmpdir'},
                always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        if verbose:
            print(f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.')

        submit_batch_start = time.time()
        bc_batch = bc_batch.submit(disable_progress_bar=disable_progress_bar)

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(f'Submitted batch {bc_batch.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:')
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{bc_batch.id}')
        print(f'Submitted batch {bc_batch.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {bc_batch.id}...')
            status = bc_batch.wait()
            print(f'batch {bc_batch.id} complete: {status["state"]}')
        return bc_batch
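
For reference, the command wrapper built above encloses each user command in a brace group before chaining with '&&', so the chain applies to whole multi-line commands rather than only their last lines. A self-contained illustration (all values hypothetical):

bash_flags = 'set -ex'
make_local_tmpdir = 'mkdir -p /io/batch/abc123/1'
symlinks = []
job_command = ['echo first', 'echo second\necho third']
prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''
print(cmd)  # the exact script shape handed to the job's shell with '-c'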