def __init__(self, billing_project: str = None, bucket: str = None, *,
             deploy_config=None, skip_logging_configuration: bool = False):
    """Initialize the backend, resolving the billing project and bucket.

    Both settings are resolved in priority order: explicit argument, then
    the user config (`hailctl config set`), then an environment variable.
    A ValueError is raised when neither source provides a value.
    """
    # Billing project: argument -> batch/billing_project config -> env var.
    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        billing_project = os.environ.get('HAIL_BILLING_PROJECT')
    if billing_project is None:
        raise ValueError(
            "No billing project. Call 'init_service' with the billing "
            "project, set the HAIL_BILLING_PROJECT environment variable, "
            "or run 'hailctl config set batch/billing_project "
            "MY_BILLING_PROJECT'")
    self._billing_project = billing_project

    # Bucket: argument -> batch/bucket config -> HAIL_BUCKET env var.
    if bucket is None:
        bucket = get_user_config().get('batch', 'bucket', fallback=None)
    if bucket is None:
        bucket = os.environ.get('HAIL_BUCKET')
    if bucket is None:
        raise ValueError(
            'the bucket parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/bucket '
            'MY_BUCKET`')
    self._bucket = bucket

    # Filesystem is created lazily; None until first use.
    self._fs = None
    self._logger = PythonOnlyLogger(skip_logging_configuration)
    self.socket = ServiceSocket(deploy_config=deploy_config)
def test_input_dependency_directory(client):
    """A tail job must see a whole directory uploaded by its parent job."""
    remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
    batch = client.create_batch()

    # Head writes two files into /io/test and exports the directory.
    head = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c',
                 'mkdir -p /io/test/; echo head1 > /io/test/data1 ; echo head2 > /io/test/data2'],
        output_files=[('/io/test', f'{remote_tmpdir}test')],
    )
    # Tail re-imports the directory and concatenates the files.
    tail = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c', 'cat /io/test/data1; cat /io/test/data2'],
        input_files=[(f'{remote_tmpdir}test', '/io/test')],
        parents=[head],
    )

    batch = batch.submit()
    tail.wait()

    head_status = head.status()
    assert head._get_exit_code(head_status, 'main') == 0, str((head_status, batch.debug_info()))
    tail_log = tail.log()
    assert tail_log['main'] == 'head1\nhead2\n', str((tail_log, batch.debug_info()))
def main(args):
    """Entry point for `hailctl config` subcommands.

    Supported modules: ``config-location`` (print the config file path),
    ``set``/``unset``/``get`` on a parameter of the form ``section/key``
    or a bare global ``key``. Exits the process in every branch.

    Fixes: corrected the user-facing typo "Paramters" -> "Parameters" and
    dropped the redundant f-prefix on the placeholder-free message string.
    """
    if not args:
        parser().print_help()
        sys.exit(0)
    args = parser().parse_args(args=args)
    config_file = get_user_config_path()
    if args.module == 'config-location':
        print(config_file)
        sys.exit(0)

    config = get_user_config()
    # "section/key" addresses a section; a bare "key" is a global parameter.
    path = args.parameter.split('/')
    if len(path) == 1:
        section = 'global'
        key = path[0]
    elif len(path) == 2:
        section = path[0]
        key = path[1]
    else:
        print('''
Parameters must contain at most one slash separating the configuration
section from the configuration parameter, for example:
"batch/billing_project".

Parameters may also have no slashes, indicating the parameter is a global
parameter, for example: "email".

A parameter with more than one slash is invalid, for example:
"batch/billing/project".
'''.lstrip('\n'), file=sys.stderr)
        sys.exit(1)

    if args.module == 'set':
        # Unknown parameters fall back to a no-op validator.
        validation_func, msg = validations.get(tuple(path), (lambda x: True, ''))
        if not validation_func(args.value):
            print(f"Error: bad value {args.value!r} for parameter {args.parameter!r} {msg}",
                  file=sys.stderr)
            sys.exit(1)
        if section not in config:
            config[section] = dict()
        config[section][key] = args.value
        with open(config_file, 'w') as f:
            config.write(f)
        sys.exit(0)
    if args.module == 'unset':
        if section in config and key in config[section]:
            del config[section][key]
            with open(config_file, 'w') as f:
                config.write(f)
        sys.exit(0)
    if args.module == 'get':
        if section in config and key in config[section]:
            print(config[section][key])
        sys.exit(0)
    print(f'bad module name: {args.module}')
    sys.exit(1)
def test_can_use_google_credentials(client):
    """A batch job should be able to use Hail with mounted GCS credentials."""
    token = os.environ["HAIL_TOKEN"]
    attempt_token = secrets.token_urlsafe(5)
    bucket_name = get_user_config().get('batch', 'bucket')
    builder = client.create_batch()
    # Write and read a small range table from GCS inside the job.
    script = f'''import hail as hl
location = "gs://{ bucket_name }/{ token }/{ attempt_token }/test_can_use_hailctl_auth.t"
hl.utils.range_table(10).write(location)
hl.read_table(location).show()
'''
    j = builder.create_job(
        os.environ['HAIL_HAIL_BASE_IMAGE'],
        ['/bin/bash', '-c', f'python3 -c >out 2>err \'{script}\'; cat out err'])
    builder.submit()
    status = j.wait()
    assert status['state'] == 'Success', f'{j.log(), status}'
    # NOTE(review): column alignment reconstructed from hail's show() output —
    # confirm widths against the actual table rendering.
    expected_log = '''+-------+
|   idx |
+-------+
| int32 |
+-------+
|     0 |
|     1 |
|     2 |
|     3 |
|     4 |
|     5 |
|     6 |
|     7 |
|     8 |
|     9 |
+-------+
'''
    log = j.log()
    assert expected_log in log['main'], f'{j.log(), status}'
def setUp(self):
    """Create the backend and seed GCS with the test fixture files."""
    self.backend = ServiceBackend()
    self.bucket_name = get_user_config().get('batch', 'bucket')
    self.gcs_input_dir = f'gs://{self.bucket_name}/batch-tests/resources'

    # Unique per-run output directory so tests never collide.
    token = uuid.uuid4()
    self.gcs_output_path = f'/batch-tests/{token}'
    self.gcs_output_dir = f'gs://{self.bucket_name}{self.gcs_output_path}'

    # Inside the cluster a service-account key is mounted; otherwise fall
    # back to application-default credentials.
    in_cluster_key_file = '/test-gsa-key/key.json'
    if os.path.exists(in_cluster_key_file):
        credentials = google.oauth2.service_account.Credentials.from_service_account_file(
            in_cluster_key_file)
    else:
        credentials = None
    gcs_client = google.cloud.storage.Client(project='hail-vdc', credentials=credentials)
    bucket = gcs_client.bucket(self.bucket_name)

    # Upload fixtures only when missing (names deliberately include spaces
    # and parentheses to exercise URL escaping).
    if not bucket.blob('batch-tests/resources/hello.txt').exists():
        bucket.blob('batch-tests/resources/hello.txt').upload_from_string('hello world')
    if not bucket.blob('batch-tests/resources/hello spaces.txt').exists():
        bucket.blob('batch-tests/resources/hello spaces.txt').upload_from_string('hello')
    if not bucket.blob('batch-tests/resources/hello (foo) spaces.txt').exists():
        bucket.blob('batch-tests/resources/hello (foo) spaces.txt').upload_from_string('hello')
def setUp(self):
    """Point the memory-service client at a unique GCS test path."""
    bucket_name = get_user_config().get('batch', 'bucket')
    token = uuid.uuid4()
    self.test_path = f'gs://{bucket_name}/memory-tests/{token}'
    self.fs = GCS(concurrent.futures.ThreadPoolExecutor(),
                  project=os.environ['PROJECT'])
    self.client = BlockingMemoryClient(fs=self.fs)
    # Files created during a test; cleaned up in tearDown.
    self.temp_files = set()
def setUp(self):
    """Point the memory-service client at a unique GCS test path."""
    bucket_name = get_user_config().get('batch', 'bucket')
    token = uuid.uuid4()
    self.test_path = f'gs://{bucket_name}/memory-tests/{token}'
    self.fs = GoogleStorageAsyncFS(project=PROJECT)
    self.client = BlockingMemoryClient(fs=self.fs)
    # Files created during a test; cleaned up in tearDown.
    self.temp_files = set()
def setUp(self):
    """Create the backend and seed the remote tmpdir with fixture files.

    Supports both GCS (gs://) and Azure (hail-az://) remote tmpdirs.
    """
    self.backend = ServiceBackend()

    remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
    if not remote_tmpdir.endswith('/'):
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir

    # Derive the "bucket" label from the scheme-specific URI shape.
    if remote_tmpdir.startswith('gs://'):
        self.bucket = re.fullmatch(
            'gs://(?P<bucket_name>[^/]+).*', remote_tmpdir).groupdict()['bucket_name']
    else:
        assert remote_tmpdir.startswith('hail-az://')
        storage_account, container_name = re.fullmatch(
            'hail-az://(?P<storage_account>[^/]+)/(?P<container_name>[^/]+).*',
            remote_tmpdir).groups()
        self.bucket = f'{storage_account}/{container_name}'

    self.cloud_input_dir = f'{self.remote_tmpdir}batch-tests/resources'

    # Unique per-run output directory so tests never collide.
    token = uuid.uuid4()
    self.cloud_output_path = f'/batch-tests/{token}'
    self.cloud_output_dir = f'{self.remote_tmpdir}{self.cloud_output_path}'

    # Use the in-cluster service-account key when mounted; None falls back
    # to ambient credentials.
    in_cluster_key_file = '/test-gsa-key/key.json'
    if not os.path.exists(in_cluster_key_file):
        in_cluster_key_file = None

    router_fs = RouterAsyncFS(
        'gs',
        gcs_kwargs={'project': 'hail-vdc', 'credentials_file': in_cluster_key_file},
        azure_kwargs={'credential_file': in_cluster_key_file})

    def sync_exists(url):
        # Blocking wrapper around the async filesystem's exists().
        return async_to_blocking(router_fs.exists(url))

    def sync_write(url, data):
        # Blocking wrapper around the async filesystem's write().
        return async_to_blocking(router_fs.write(url, data))

    # Upload fixtures only when missing (names deliberately include spaces
    # and parentheses to exercise URL escaping).
    if not sync_exists(f'{self.remote_tmpdir}batch-tests/resources/hello.txt'):
        sync_write(f'{self.remote_tmpdir}batch-tests/resources/hello.txt', b'hello world')
    if not sync_exists(f'{self.remote_tmpdir}batch-tests/resources/hello spaces.txt'):
        sync_write(f'{self.remote_tmpdir}batch-tests/resources/hello spaces.txt', b'hello')
    if not sync_exists(f'{self.remote_tmpdir}batch-tests/resources/hello (foo) spaces.txt'):
        sync_write(f'{self.remote_tmpdir}batch-tests/resources/hello (foo) spaces.txt', b'hello')
def setUp(self):
    """Point the memory-service client at a unique remote-tmpdir test path."""
    remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
    token = uuid.uuid4()
    self.test_path = f'{remote_tmpdir}memory-tests/{token}'
    self.fs = RouterAsyncFS(
        'gs', filesystems=[aiogoogle.GoogleStorageAsyncFS(project=PROJECT)])
    self.client = BlockingMemoryClient(fs=self.fs)
    # Files created during a test; cleaned up in tearDown.
    self.temp_files = set()
def __init__(self, billing_project: str = None, bucket: str = None):
    """Resolve billing project and bucket (argument, then user config).

    Raises ValueError when either setting cannot be resolved.
    """
    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'MY_BILLING_PROJECT`')
    self._batch_client = BatchClient(billing_project)

    if bucket is None:
        bucket = get_user_config().get('batch', 'bucket', fallback=None)
    if bucket is None:
        raise ValueError(
            'the bucket parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/bucket '
            'MY_BUCKET`')
    self._bucket_name = bucket
def test_cant_submit_to_default_with_other_ns_creds(client: BatchClient):
    """Submitting to the default namespace with another namespace's
    credentials must fail (unless we actually are in default)."""
    remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
    script = f'''import hailtop.batch as hb
backend = hb.ServiceBackend("test", remote_tmpdir="{remote_tmpdir}")
b = hb.Batch(backend=backend)
j = b.new_bash_job()
j.command("echo hi")
b.run()
backend.close()
'''

    # Attempt 1: wipe the deploy config so the job falls back to the domain.
    builder = client.create_batch()
    j = builder.create_job(
        os.environ['HAIL_HAIL_BASE_IMAGE'],
        [
            '/bin/bash',
            '-c',
            f'''
hailctl config set domain {DOMAIN}
rm /deploy-config/deploy-config.json
python3 -c \'{script}\'''',
        ],
        mount_tokens=True,
    )
    b = builder.submit()
    status = j.wait()
    if NAMESPACE == 'default':
        assert status['state'] == 'Success', str((status, b.debug_info()))
    else:
        assert status['state'] == 'Failed', str((status, b.debug_info()))
        assert "Please log in" in j.log()['main'], (str(j.log()['main']), status)

    # Attempt 2: rewrite the deploy config to point at default explicitly.
    builder = client.create_batch()
    j = builder.create_job(
        os.environ['HAIL_HAIL_BASE_IMAGE'],
        [
            '/bin/bash',
            '-c',
            f'''
jq '.default_namespace = "default"' /deploy-config/deploy-config.json > tmp.json
mv tmp.json /deploy-config/deploy-config.json
python3 -c \'{script}\'''',
        ],
        mount_tokens=True,
    )
    b = builder.submit()
    status = j.wait()
    if NAMESPACE == 'default':
        assert status['state'] == 'Success', str((status, b.debug_info()))
    else:
        assert status['state'] == 'Failed', str((status, b.debug_info()))
        job_log = j.log()
        assert "Please log in" in job_log['main'], str((job_log, b.debug_info()))
def __init__(self, billing_project=None):
    """Resolve the billing project (argument, then user config) and open
    a batch client.

    Raises ValueError when no billing project can be resolved.

    Fix: the error-message literals carried an ``f`` prefix but contain no
    placeholders (ruff F541); the prefixes are removed — runtime text is
    byte-identical.
    """
    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'YOUR_BILLING_PROJECT`')
    self._batch_client = BatchClient(billing_project)
def __init__(self, billing_project: str = None, bucket: str = None, *,
             deploy_config=None, skip_logging_configuration=None,
             disable_progress_bar: bool = True):
    """Resolve billing project and bucket, then build the batch client.

    ``skip_logging_configuration`` is accepted for interface compatibility
    and intentionally ignored.
    """
    del skip_logging_configuration  # accepted but unused

    # Billing project: argument -> batch/billing_project config -> env var.
    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        billing_project = os.environ.get('HAIL_BILLING_PROJECT')
    if billing_project is None:
        raise ValueError(
            "No billing project. Call 'init_service' with the billing "
            "project, set the HAIL_BILLING_PROJECT environment variable, "
            "or run 'hailctl config set batch/billing_project "
            "MY_BILLING_PROJECT'"
        )

    # Bucket: argument -> batch/bucket config -> HAIL_BUCKET env var.
    if bucket is None:
        bucket = get_user_config().get('batch', 'bucket', fallback=None)
    if bucket is None:
        bucket = os.environ.get('HAIL_BUCKET')
    if bucket is None:
        raise ValueError(
            'the bucket parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/bucket '
            'MY_BUCKET`'
        )

    self.billing_project = billing_project
    self.bucket = bucket
    self._fs = GoogleCloudStorageFS()
    deploy_config = deploy_config or get_deploy_config()
    self.bc = hb.BatchClient(self.billing_project)
    self.async_bc = self.bc._async_client
    self.disable_progress_bar = disable_progress_bar
    self.batch_attributes: Dict[str, str] = dict()
def test_input_dependency_directory(client):
    """A tail job must see a directory of files uploaded by its parent."""
    bucket_name = get_user_config().get('batch', 'bucket')
    batch = client.create_batch()
    head = batch.create_job(
        'ubuntu:18.04',
        command=['/bin/sh', '-c',
                 'mkdir -p /io/test/; echo head1 > /io/test/data1 ; echo head2 > /io/test/data2'],
        output_files=[('/io/test/', f'gs://{bucket_name}')])
    tail = batch.create_job(
        'ubuntu:18.04',
        command=['/bin/sh', '-c', 'cat /io/test/data1; cat /io/test/data2'],
        input_files=[(f'gs://{bucket_name}/test', '/io/')],
        parents=[head])
    batch.submit()
    tail.wait()
    assert head._get_exit_code(head.status(), 'main') == 0, head._status
    assert tail.log()['main'] == 'head1\nhead2\n', tail.status()
def check_for_running_batches():
    """Fixture: after the test session, wait for submitted batches to stop
    running (polling with exponential backoff)."""
    yield
    billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    with BatchClient(billing_project=billing_project) as bc:
        for batch_id in submitted_batch_ids:
            b = bc.get_batch(batch_id)
            delay = 0.1
            while b.status()['state'] == 'running':
                print(f'batch {b.id} is still running')
                delay = sync_sleep_and_backoff(delay)
def test_submit_batch_in_job(client: BatchClient):
    """A job running inside Batch must itself be able to submit a batch."""
    builder = client.create_batch()
    bucket_name = get_user_config().get('batch', 'bucket')
    script = f'''import hailtop.batch as hb
backend = hb.ServiceBackend("test", "{bucket_name}")
b = hb.Batch(backend=backend)
j = b.new_bash_job()
j.command("echo hi")
b.run()
backend.close()
'''
    j = builder.create_job(
        os.environ['HAIL_HAIL_BASE_IMAGE'],
        ['/bin/bash', '-c', f'''python3 -c \'{script}\''''],
        mount_tokens=True,
    )
    b = builder.submit()
    status = j.wait()
    assert status['state'] == 'Success', str((status, b.debug_info()))
def __init__(self, billing_project: str = None, deploy_config=None,
             skip_logging_configuration=False):
    """Resolve the billing project and configure the query-service client.

    Billing project resolution: argument -> user config -> env var;
    raises ValueError when none is available.
    """
    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        billing_project = os.environ.get('HAIL_BILLING_PROJECT')
    if billing_project is None:
        raise ValueError(
            "No billing project. Call 'init_service' with the billing "
            "project, set the HAIL_BILLING_PROJECT environment variable, "
            "or run 'hailctl config set batch/billing_project "
            "MY_BILLING_PROJECT'")
    self._billing_project = billing_project

    if not deploy_config:
        deploy_config = get_deploy_config()
    self.url = deploy_config.base_url('query')
    self.headers = service_auth_headers(deploy_config, 'query')
    # Filesystem is created lazily; None until first use.
    self._fs = None
    self._logger = PythonOnlyLogger(skip_logging_configuration)
async def create(*, billing_project: Optional[str] = None,
                 batch_client: Optional[aiohb.BatchClient] = None,
                 skip_logging_configuration: Optional[bool] = None,
                 disable_progress_bar: bool = True,
                 remote_tmpdir: Optional[str] = None,
                 flags: Optional[Dict[str, str]] = None):
    """Asynchronously construct a ServiceBackend.

    ``skip_logging_configuration`` is accepted for interface compatibility
    and intentionally ignored. An existing async batch client may be
    supplied; otherwise one is created for the resolved billing project.
    """
    del skip_logging_configuration  # accepted but unused

    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            "No billing project. Call 'init_service' with the billing "
            "project or run 'hailctl config set batch/billing_project "
            "MY_BILLING_PROJECT'")

    async_fs = RouterAsyncFS('file')
    sync_fs = RouterFS(async_fs)
    if batch_client is None:
        batch_client = await aiohb.BatchClient.create(billing_project)
    bc = hb.BatchClient.from_async(batch_client)

    batch_attributes: Dict[str, str] = dict()
    # Per-version local cache of reference genome data.
    user_local_reference_cache_dir = Path(get_user_local_cache_dir(), 'references', version())
    os.makedirs(user_local_reference_cache_dir, exist_ok=True)
    remote_tmpdir = get_remote_tmpdir('ServiceBackend', remote_tmpdir=remote_tmpdir)

    return ServiceBackend(
        billing_project=billing_project,
        sync_fs=sync_fs,
        async_fs=async_fs,
        bc=bc,
        disable_progress_bar=disable_progress_bar,
        batch_attributes=batch_attributes,
        user_local_reference_cache_dir=user_local_reference_cache_dir,
        remote_tmpdir=remote_tmpdir,
        flags=flags or {},
    )
def test_input_dependency_wildcard(client):
    """Exercise per-file input/output transfer between a head and tail job."""
    bucket_name = get_user_config().get('batch', 'bucket')
    batch = client.create_batch()
    head = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c', 'echo head1 > /io/data1 ; echo head2 > /io/data2'],
        output_files=[('/io/data1', f'gs://{bucket_name}/data1'),
                      ('/io/data2', f'gs://{bucket_name}/data2')],
    )
    tail = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c', 'cat /io/data1 ; cat /io/data2'],
        input_files=[(f'gs://{bucket_name}/data1', '/io/data1'),
                     (f'gs://{bucket_name}/data2', '/io/data2')],
        parents=[head],
    )
    batch = batch.submit()
    tail.wait()
    head_status = head.status()
    # NOTE(review): this asserts the head's *input* step exited non-zero
    # while the tail still produces the expected output — confirm this is
    # the intended wildcard-rejection semantics and not a flipped operator.
    assert head._get_exit_code(head_status, 'input') != 0, str((head_status, batch.debug_info()))
    tail_log = tail.log()
    assert tail_log['main'] == 'head1\nhead2\n', str((tail_log, batch.debug_info()))
def test_input_dependency(client):
    """A tail job must see files uploaded to GCS by its parent job.

    Fix: the tail assertion message used ``str(tail.log(), tail.status())``,
    which is the two-argument ``str(object, encoding)`` form and raises
    ``TypeError: str() argument 'encoding' must be str`` whenever the
    assertion fails, masking the real failure. It now builds the message
    from a tuple: ``str((tail.log(), tail.status()))``.
    """
    bucket_name = get_user_config().get('batch', 'bucket')
    batch = client.create_batch()
    head = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c', 'echo head1 > /io/data1; echo head2 > /io/data2'],
        output_files=[('/io/data1', f'gs://{bucket_name}/data1'),
                      ('/io/data2', f'gs://{bucket_name}/data2')])
    tail = batch.create_job(
        DOCKER_ROOT_IMAGE,
        command=['/bin/sh', '-c', 'cat /io/data1; cat /io/data2'],
        input_files=[(f'gs://{bucket_name}/data1', '/io/data1'),
                     (f'gs://{bucket_name}/data2', '/io/data2')],
        parents=[head])
    batch.submit()
    tail.wait()
    assert head._get_exit_code(head.status(), 'main') == 0, str(head._status)
    assert tail.log()['main'] == 'head1\nhead2\n', str((tail.log(), tail.status()))
def __init__(self, *args,
             billing_project: Optional[str] = None,
             bucket: Optional[str] = None,
             remote_tmpdir: Optional[str] = None,
             google_project: Optional[str] = None,
             token: str = None):
    """Configure the backend, supporting (deprecated) positional
    billing_project/bucket arguments and a gs:// remote tmpdir.

    Raises TypeError on bad positional usage and ValueError on
    missing/invalid bucket or remote_tmpdir configuration.
    """
    # Legacy positional arguments: at most (billing_project, bucket).
    if len(args) > 2:
        raise TypeError(
            f'ServiceBackend() takes 2 positional arguments but {len(args)} were given')
    if len(args) >= 1:
        if billing_project is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'billing_project\'')
        warnings.warn(
            'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.')
        billing_project = args[0]
    if len(args) >= 2:
        if bucket is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'bucket\'')
        warnings.warn(
            'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.')
        bucket = args[1]

    if remote_tmpdir is not None and bucket is not None:
        raise ValueError('Cannot specify both remote_tmpdir and bucket in ServiceBackend()')

    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'MY_BILLING_PROJECT`')
    self._batch_client = BatchClient(billing_project, _token=token)
    self.__fs: AsyncFS = RouterAsyncFS('file', [
        LocalAsyncFS(ThreadPoolExecutor()),
        GoogleStorageAsyncFS(project=google_project)
    ])

    if remote_tmpdir is None:
        # Derive the tmpdir from the configured bucket name.
        if bucket is None:
            bucket = get_user_config().get('batch', 'bucket', fallback=None)
        if bucket is None:
            raise ValueError(
                'either the bucket or remote_tmpdir parameter of ServiceBackend '
                'must be set or run `hailctl config set batch/bucket MY_BUCKET`')
        if 'gs://' in bucket:
            raise ValueError(
                'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                'Use the remote_tmpdir parameter to specify a path.')
        remote_tmpdir = f'gs://{bucket}/batch'
    else:
        if not remote_tmpdir.startswith('gs://'):
            raise ValueError(
                'remote_tmpdir must be a google storage path like gs://bucket/folder')
    if remote_tmpdir[-1] != '/':
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir
def __init__(self, *args,
             billing_project: Optional[str] = None,
             bucket: Optional[str] = None,
             remote_tmpdir: Optional[str] = None,
             google_project: Optional[str] = None,
             token: Optional[str] = None):
    """Configure the backend, supporting (deprecated) positional
    billing_project/bucket arguments and gs:// or hail-az:// tmpdirs.

    Raises TypeError on bad positional usage and ValueError on
    missing/invalid remote_tmpdir configuration.
    """
    # Legacy positional arguments: at most (billing_project, bucket).
    if len(args) > 2:
        raise TypeError(
            f'ServiceBackend() takes 2 positional arguments but {len(args)} were given')
    if len(args) >= 1:
        if billing_project is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'billing_project\'')
        warnings.warn(
            'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.')
        billing_project = args[0]
    if len(args) >= 2:
        if bucket is not None:
            raise TypeError(
                'ServiceBackend() got multiple values for argument \'bucket\'')
        warnings.warn(
            'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.')
        bucket = args[1]

    if billing_project is None:
        billing_project = get_user_config().get('batch', 'billing_project', fallback=None)
    if billing_project is None:
        raise ValueError(
            'the billing_project parameter of ServiceBackend must be set '
            'or run `hailctl config set batch/billing_project '
            'MY_BILLING_PROJECT`')
    self._batch_client = BatchClient(billing_project, _token=token)

    user_config = get_user_config()

    if bucket is not None:
        warnings.warn(
            'Use of deprecated argument \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.')
    if remote_tmpdir is not None and bucket is not None:
        raise ValueError(
            'Cannot specify both \'remote_tmpdir\' and \'bucket\' in ServiceBackend(). Specify \'remote_tmpdir\' as a keyword argument instead.')
    if bucket is None and remote_tmpdir is None:
        remote_tmpdir = user_config.get('batch', 'remote_tmpdir', fallback=None)

    if remote_tmpdir is None:
        # Fall back to the deprecated bucket setting.
        if bucket is None:
            bucket = user_config.get('batch', 'bucket', fallback=None)
            warnings.warn(
                'Using deprecated configuration setting \'batch/bucket\'. Run `hailctl config set batch/remote_tmpdir` '
                'to set the default for \'remote_tmpdir\' instead.')
        if bucket is None:
            raise ValueError(
                'The \'remote_tmpdir\' parameter of ServiceBackend must be set. '
                'Run `hailctl config set batch/remote_tmpdir REMOTE_TMPDIR`')
        if 'gs://' in bucket:
            raise ValueError(
                'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                'Use the remote_tmpdir parameter to specify a path.')
        remote_tmpdir = f'gs://{bucket}/batch'
    else:
        # Accept any supported cloud-storage scheme.
        schemes = {'gs', 'hail-az'}
        found_scheme = any(remote_tmpdir.startswith(f'{scheme}://') for scheme in schemes)
        if not found_scheme:
            raise ValueError(
                f'remote_tmpdir must be a storage uri path like gs://bucket/folder. Possible schemes include {schemes}')
    if remote_tmpdir[-1] != '/':
        remote_tmpdir += '/'
    self.remote_tmpdir = remote_tmpdir

    gcs_kwargs = {'project': google_project}
    self.__fs: AsyncFS = RouterAsyncFS(default_scheme='file', gcs_kwargs=gcs_kwargs)