Example #1
0
async def filesystem(request):
    """Yield ``(semaphore, fs, base)`` for the filesystem named by ``request.param``.

    Recognized params: ``'file'``, anything ending in ``'gs'``, and
    ``'router/...'`` variants.  A unique temporary directory is created for
    the test, recursively removed afterwards, and verified to be gone.
    """
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        param = request.param
        if param.startswith('router/'):
            fs = RouterAsyncFS('file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()])
        else:
            # plain (non-router) filesystems
            fs = LocalAsyncFS(thread_pool) if param == 'file' else GoogleStorageAsyncFS()
        async with fs:
            if param.endswith('file'):
                base = f'/tmp/{token}/'
            else:
                assert param.endswith('gs')
                base = f"gs://{os.environ['HAIL_TEST_BUCKET']}/tmp/{token}/"

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #2
0
async def router_filesystem(request):
    """Yield ``(semaphore, fs, bases)`` with one temp dir per scheme.

    ``bases`` maps ``'file'`` and ``'gs'`` to fresh per-test temporary
    directories; both trees are removed concurrently on teardown and
    asserted to be gone.
    """
    import functools  # local import: used only for teardown thunks

    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            bucket = os.environ['HAIL_TEST_BUCKET']
            gs_base = f'gs://{bucket}/tmp/{token}/'

            bases = {
                'file': file_base,
                'gs': gs_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                # bounded_gather2 expects callables that *produce* awaitables,
                # not already-created coroutines; passing bare coroutines
                # hands it the wrong type.  Wrap each rmtree in a partial.
                await bounded_gather2(sema,
                                      functools.partial(fs.rmtree, sema, file_base),
                                      functools.partial(fs.rmtree, sema, gs_base))

            assert not await fs.isdir(file_base)
            assert not await fs.isdir(gs_base)
Example #3
0
async def gs_filesystem(request):
    """Yield ``(semaphore, fs, base)`` rooted under ``HAIL_TEST_STORAGE_URI``.

    ``request.param`` selects either a plain GoogleStorageAsyncFS or a
    RouterAsyncFS wrapping local + GCS filesystems.  The per-test directory
    is removed on teardown and verified gone.
    """
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[LocalAsyncFS(thread_pool),
                                            GoogleStorageAsyncFS()])
        else:
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        async with fs:
            test_storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            # only GCS storage URIs are supported by this fixture
            assert test_storage_uri.startswith('gs://')
            base = f'{test_storage_uri}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #4
0
async def copy_test_specs():
    """Run every copy-test configuration against a local filesystem.

    Each configuration gets a fresh ``/tmp/<token>/`` tree with ``src/`` and
    ``dest/`` subdirectories; the spec result is recorded under
    ``config['result']`` and the tree is removed afterwards.

    :return: the list of configurations annotated with their results.
    """
    test_specs = []

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS(
                'file', filesystems=[LocalAsyncFS(thread_pool)]) as fs:
            for config in copy_test_configurations():
                token = secrets.token_hex(16)
                base = f'/tmp/{token}/'
                src_base = f'{base}src/'
                dest_base = f'{base}dest/'

                for directory in (base, src_base, dest_base):
                    await fs.mkdir(directory)
                # an empty sentinel file guarantees dest_base exists
                async with await fs.create(f'{dest_base}keep'):
                    pass

                sema = asyncio.Semaphore(50)
                async with sema:
                    config['result'] = await run_test_spec(
                        sema, fs, config, src_base, dest_base)
                    test_specs.append(config)

                    await fs.rmtree(sema, base)
                    assert not await fs.isdir(base)

    return test_specs
Example #5
0
 def _fs(self) -> AsyncFS:
     """Return the filesystem, honoring the deprecated project override.

     When a deprecated project was supplied, lazily build and cache a
     RouterAsyncFS bound to that project; otherwise defer to the backend's
     filesystem.
     """
     if self._DEPRECATED_project is None:
         return self._backend._fs
     if self._DEPRECATED_fs is None:
         self._DEPRECATED_fs = RouterAsyncFS('file', [
             LocalAsyncFS(ThreadPoolExecutor()),
             GoogleStorageAsyncFS(project=self._DEPRECATED_project)
         ])
     return self._DEPRECATED_fs
Example #6
0
async def filesystem(
        request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    """Yield ``(semaphore, fs, base)`` for the scheme named by ``request.param``.

    Recognized params: ``'file'``, ``'*gs'``, ``'*s3'``, ``'*hail-az'``, and
    ``'router/...'`` variants.  A unique temporary directory is created for
    the test, recursively removed afterwards, and verified to be gone.
    """
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        param = request.param
        fs: AsyncFS
        if param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(thread_pool),
                                   GoogleStorageAsyncFS(),
                                   S3AsyncFS(thread_pool),
                                   AzureAsyncFS()
                               ])
        elif param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert param.endswith('hail-az')
            fs = AzureAsyncFS()

        async with fs:
            # derive a per-scheme base directory for this test
            if param.endswith('file'):
                base = f'/tmp/{token}/'
            elif param.endswith('gs'):
                base = f"gs://{os.environ['HAIL_TEST_GCS_BUCKET']}/tmp/{token}/"
            elif param.endswith('s3'):
                base = f"s3://{os.environ['HAIL_TEST_S3_BUCKET']}/tmp/{token}/"
            else:
                assert param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #7
0
async def local_filesystem(request):
    """Yield ``(semaphore, fs, base)``: a LocalAsyncFS plus a fresh /tmp dir.

    The directory is recursively removed after the test and asserted gone.
    """
    base = f'/tmp/{secret_alnum_string()}/'

    with ThreadPoolExecutor() as thread_pool:
        async with LocalAsyncFS(thread_pool) as fs:
            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #8
0
async def router_filesystem(
    request
) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, Dict[str, str]]]:
    """Yield ``(semaphore, fs, bases)``: a RouterAsyncFS plus per-scheme dirs.

    ``bases`` maps ``'file'``, ``'gs'``, ``'s3'`` and ``'hail-az'`` to fresh
    per-test temporary directories; all four trees are removed concurrently
    on teardown and verified to be gone.
    """
    token = secrets.token_hex(16)

    with ThreadPoolExecutor() as thread_pool:
        async with RouterAsyncFS('file',
                                 filesystems=[
                                     LocalAsyncFS(thread_pool),
                                     GoogleStorageAsyncFS(),
                                     S3AsyncFS(thread_pool),
                                     AzureAsyncFS()
                                 ]) as fs:
            file_base = f'/tmp/{token}/'
            await fs.mkdir(file_base)

            gs_base = f"gs://{os.environ['HAIL_TEST_GCS_BUCKET']}/tmp/{token}/"
            s3_base = f"s3://{os.environ['HAIL_TEST_S3_BUCKET']}/tmp/{token}/"

            azure_account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
            azure_container = os.environ['HAIL_TEST_AZURE_CONTAINER']
            azure_base = f'hail-az://{azure_account}/{azure_container}/tmp/{token}/'

            all_bases = (file_base, gs_base, s3_base, azure_base)
            bases = {
                'file': file_base,
                'gs': gs_base,
                's3': s3_base,
                'hail-az': azure_base
            }

            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, bases)
                # remove all four trees concurrently, bounded by sema
                await bounded_gather2(
                    sema,
                    *(functools.partial(fs.rmtree, sema, b) for b in all_bases))

            for b in all_bases:
                assert not await fs.isdir(b)
Example #9
0
    def __init__(self,
                 tmp_dir: str = '/tmp/',
                 gsa_key_file: Optional[str] = None,
                 extra_docker_run_flags: Optional[str] = None):
        """Configure a local execution backend.

        :param tmp_dir: scratch directory; trailing slashes are stripped.
        :param gsa_key_file: service-account key file to mount at
            /gsa-key/key.json; falls back to $HAIL_BATCH_GSA_KEY_FILE.
        :param extra_docker_run_flags: extra ``docker run`` flags; falls back
            to $HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS.
        """
        self._tmp_dir = tmp_dir.rstrip('/')

        # explicit argument wins over the environment variable
        flags = extra_docker_run_flags
        if flags is None:
            flags = os.environ.get('HAIL_BATCH_EXTRA_DOCKER_RUN_FLAGS')
        if flags is None:
            flags = ''

        if gsa_key_file is None:
            gsa_key_file = os.environ.get('HAIL_BATCH_GSA_KEY_FILE')
        if gsa_key_file is not None:
            flags = f'{flags} -v {gsa_key_file}:/gsa-key/key.json'

        self._extra_docker_run_flags = flags
        self.__fs: AsyncFS = LocalAsyncFS(ThreadPoolExecutor())
Example #10
0
    def __init__(self,
                 *args,
                 billing_project: Optional[str] = None,
                 bucket: Optional[str] = None,
                 remote_tmpdir: Optional[str] = None,
                 google_project: Optional[str] = None,
                 token: Optional[str] = None):
        """Backend that submits work to the Batch service.

        :param billing_project: Batch billing project; falls back to the
            ``batch/billing_project`` user-config entry.
        :param bucket: deprecated; a bucket *name* (not a path) used to
            derive the default remote temporary directory.
        :param remote_tmpdir: ``gs://`` URL for intermediate files; mutually
            exclusive with ``bucket``.
        :param google_project: project billed for GCS requests.
        :param token: Batch auth token.

        ``billing_project`` and ``bucket`` may also be given positionally
        for backward compatibility, but that form is deprecated.
        """
        # --- deprecated positional-argument handling ---
        if len(args) > 2:
            raise TypeError(
                f'ServiceBackend() takes 2 positional arguments but {len(args)} were given'
            )
        if len(args) >= 1:
            if billing_project is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'billing_project\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'billing_project\' in ServiceBackend(). Specify \'billing_project\' as a keyword argument instead.'
            )
            billing_project = args[0]
        if len(args) >= 2:
            if bucket is not None:
                raise TypeError(
                    'ServiceBackend() got multiple values for argument \'bucket\''
                )
            warnings.warn(
                'Use of deprecated positional argument \'bucket\' in ServiceBackend(). Specify \'bucket\' as a keyword argument instead.'
            )
            bucket = args[1]

        if remote_tmpdir is not None and bucket is not None:
            raise ValueError(
                'Cannot specify both remote_tmpdir and bucket in ServiceBackend()'
            )

        # --- billing project: keyword arg, then user config, else error ---
        if billing_project is None:
            billing_project = get_user_config().get('batch',
                                                    'billing_project',
                                                    fallback=None)
        if billing_project is None:
            raise ValueError(
                'the billing_project parameter of ServiceBackend must be set '
                'or run `hailctl config set batch/billing_project '
                'MY_BILLING_PROJECT`')
        self._batch_client = BatchClient(billing_project, _token=token)
        self.__fs: AsyncFS = RouterAsyncFS('file', [
            LocalAsyncFS(ThreadPoolExecutor()),
            GoogleStorageAsyncFS(project=google_project)
        ])
        # --- remote temp dir: explicit URL, or derived from a bucket name ---
        if remote_tmpdir is None:
            if bucket is None:
                bucket = get_user_config().get('batch',
                                               'bucket',
                                               fallback=None)
            if bucket is None:
                raise ValueError(
                    'either the bucket or remote_tmpdir parameter of ServiceBackend '
                    'must be set or run `hailctl config set batch/bucket MY_BUCKET`'
                )
            if 'gs://' in bucket:
                raise ValueError(
                    'The bucket parameter to ServiceBackend() should be a bucket name, not a path. '
                    'Use the remote_tmpdir parameter to specify a path.')
            remote_tmpdir = f'gs://{bucket}/batch'
        else:
            if not remote_tmpdir.startswith('gs://'):
                raise ValueError(
                    'remote_tmpdir must be a google storage path like gs://bucket/folder'
                )
        # normalize to a trailing slash so paths can be appended directly
        if remote_tmpdir[-1] != '/':
            remote_tmpdir += '/'
        self.remote_tmpdir = remote_tmpdir
Example #11
0
async def parallel_file_exists_async(fpaths: List[str],
                                     parallelism: int = 750
                                     ) -> Dict[str, bool]:
    """
    Check whether a large number of files exist.

    Created for use with hail Batch jobs.
    Normal `file_exists` function is very slow when checking a large number of files.

    :param fpaths: List of file paths to check. Files can be in local or Google cloud storage.
    :param parallelism: Integer that sets parallelism of file existence checking task. Default is 750.
    :return: Dictionary of file paths (str) and whether the file exists (boolean).
    """
    async def async_file_exists(fs: AsyncFS, fpath: str) -> bool:
        """
        Determine file existence.

        Hail Tables/MatrixTables (.ht/.mt) are directories, so their
        _SUCCESS marker file is what actually gets stat'ed.

        :param fs: AsyncFS object.
        :param fpath: Path to file to check.
        :return: Whether file exists.
        """
        if os.path.splitext(fpath)[1] in (".ht", ".mt"):
            fpath += "/_SUCCESS"
        try:
            await fs.statfile(fpath)
        except FileNotFoundError:
            return False
        return True

    with tqdm(total=len(fpaths),
              desc="check files for existence",
              disable=False) as pbar:
        with ThreadPoolExecutor() as thread_pool:
            async with RouterAsyncFS("file",
                                     filesystems=[
                                         LocalAsyncFS(thread_pool),
                                         GoogleStorageAsyncFS()
                                     ]) as fs:

                def existence_thunk(fpath: str) -> Callable:
                    """
                    Create a thunk that checks one file and advances the progress bar.

                    Delaying coroutine creation keeps at most `parallelism`
                    coroutines live at a time.

                    :param fpath: Path to file to check.
                    :return: Zero-argument coroutine function performing the check.
                    """
                    async def check():
                        exists = await async_file_exists(fs, fpath)
                        pbar.update(1)
                        return exists

                    return check

                file_existence = await bounded_gather(
                    *(existence_thunk(fpath) for fpath in fpaths),
                    parallelism=parallelism)
    return dict(zip(fpaths, file_existence))