示例#1
0
class Tests(unittest.TestCase):
    """Integration tests for ``BlockingMemoryClient`` on top of Google Storage.

    Every test works below a unique ``memory-tests/<uuid>`` prefix of the
    configured batch ``remote_tmpdir``; :meth:`tearDown` removes that prefix.
    """

    def setUp(self):
        """Create a unique scratch prefix, a ``gs`` filesystem, and a client."""
        remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
        token = uuid.uuid4()
        # The prefix is appended without a separator, so `remote_tmpdir` is
        # assumed to already end with '/'.
        self.test_path = f'{remote_tmpdir}memory-tests/{token}'
        self.fs = RouterAsyncFS(
            'gs',
            filesystems=[aiogoogle.GoogleStorageAsyncFS(project=PROJECT)])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()

    def tearDown(self):
        """Delete the scratch prefix and close the client.

        The client is closed even when deleting the remote files raises, so
        a storage error does not leave the client open.
        """
        try:
            async_to_blocking(self.fs.rmtree(None, self.test_path))
        finally:
            self.client.close()

    async def add_temp_file_from_string(self, name: str, str_value: bytes):
        """Write `str_value` to ``<test_path>/<name>`` and return that URL.

        The file is written directly through ``self.fs``, not through the
        memory client.
        """
        handle = f'{self.test_path}/{name}'

        async with await self.fs.create(handle) as f:
            await f.write(str_value)

        return handle

    def test_non_existent(self):
        """Looking up a file that was never written returns ``None`` every time."""
        missing = f'{self.test_path}/nonexistent'
        for _ in range(3):
            self.assertIsNone(self.client._get_file_if_exists(missing))

    def test_small_write_around(self):
        """Files written directly to storage become readable through the client."""
        async def read(url):
            async with await self.fs.open(url) as f:
                return await f.read()

        cases = [('empty_file', b''), ('null', b'\0'),
                 ('small', b'hello world')]
        for file, data in cases:
            handle = async_to_blocking(
                self.add_temp_file_from_string(file, data))
            expected = async_to_blocking(read(handle))
            self.assertEqual(expected, data)
            # The client may answer None at first, so look the file up once
            # and retry up to ten more times before comparing.
            # NOTE(review): presumably the cache is filled asynchronously
            # after the first miss -- confirm against the memory service.
            cached = None
            for _ in range(11):
                cached = self.client._get_file_if_exists(handle)
                if cached is not None:
                    break
            self.assertEqual(cached, expected)

    def test_small_write_through(self):
        """Data written through the client is immediately readable through it."""
        cases = [('empty_file2', b''), ('null2', b'\0'),
                 ('small2', b'hello world')]
        for file, data in cases:
            filename = f'{self.test_path}/{file}'
            self.client.write_file(filename, data)
            cached = self.client._get_file_if_exists(filename)
            self.assertEqual(cached, data)
示例#2
0
class BatchPoolExecutor:
    """An executor which executes Python functions in the cloud.

    :class:`.concurrent.futures.ProcessPoolExecutor` and
    :class:`.concurrent.futures.ThreadPoolExecutor` enable the use of all the
    computer cores available on a single computer. :class:`.BatchPoolExecutor`
    enables the use of an effectively arbitrary number of cloud computer cores.

    Functions provided to :meth:`.submit` are serialized using `dill
    <https://dill.readthedocs.io/en/latest/dill.html>`__, sent to a Python
    docker container in the cloud, deserialized, and executed. The results are
    serialized and returned to the machine from which :meth:`.submit` was
    called. The Python version in the docker container will share a major and
    minor version with the local process. The `image` parameter overrides this
    behavior.

    When used as a context manager (the ``with`` syntax), the executor will wait
    for all jobs to finish before finishing the ``with`` statement. This
    behavior can be controlled by the `wait_on_exit` parameter.

    This class creates a folder ``batch-pool-executor`` at the root of the
    bucket specified by the `backend`. This folder can be safely deleted after
    all jobs have completed.

    Examples
    --------

    Add ``3`` to ``6`` on a machine in the cloud and send the result back to
    this machine:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     future_nine = bpe.submit(lambda: 3 + 6)
    >>> future_nine.result()  # doctest: +SKIP
    9

    :meth:`.map` facilitates the common case of executing a function on many
    values in parallel:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     list(bpe.map(lambda x: x * 3, range(4)))
    [0, 3, 6, 9]

    Parameters
    ----------
    name:
        A name for the executor. Executors produce many batches and each batch
        will include this name as a prefix.
    backend:
        Backend used to execute the jobs. Must be a :class:`.ServiceBackend`.
    image:
        The name of a Docker image used for each submitted job. The image must
        include Python 3.7 or later and must have the ``dill`` Python package
        installed. If you intend to use ``numpy``, ensure that OpenBLAS is also
        installed. If unspecified, an image with a matching Python version and
        ``numpy``, ``scipy``, and ``sklearn`` installed is used.
    cpus_per_job:
        The number of CPU cores to allocate to each job. The default value is
        ``1``. The parameter is passed unaltered to :meth:`.Job.cpu`. This
        parameter's value is used to set several environment variables
        instructing BLAS and LAPACK to limit core use.
    wait_on_exit:
        If ``True`` or unspecified, wait for all jobs to complete when exiting a
        context. If ``False``, do not wait. This option has no effect if this
        executor is not used with the ``with`` syntax.
    cleanup_bucket:
        If ``True`` or unspecified, delete all temporary files in the cloud
        storage bucket when this executor fully shuts down. If Python crashes
        before the executor is shut down, the files will not be deleted.
    project:
        If specified, the project to use when authenticating with Google
        Storage. Google Storage is used to transfer serialized values between
        this computer and the cloud machines that execute jobs.
    """
    def __init__(self,
                 *,
                 name: Optional[str] = None,
                 backend: Optional[ServiceBackend] = None,
                 image: Optional[str] = None,
                 cpus_per_job: Optional[Union[int, str]] = None,
                 wait_on_exit: bool = True,
                 cleanup_bucket: bool = True,
                 project: Optional[str] = None):
        self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
        self.backend = backend or ServiceBackend()
        if not isinstance(self.backend, ServiceBackend):
            raise ValueError(
                f'BatchPoolExecutor is not compatible with {type(backend)}')
        self.batches: List[Batch] = []
        self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
        self.inputs = self.directory + 'inputs/'
        self.outputs = self.directory + 'outputs/'
        self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
        self.futures: List[BatchPoolFuture] = []
        self.finished_future_count = 0
        self._shutdown = False
        version = sys.version_info
        if image is None:
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise ValueError(
                    f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
                )
            self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
        else:
            self.image = image
        self.cpus_per_job = cpus_per_job
        self.cleanup_bucket = cleanup_bucket
        self.wait_on_exit = wait_on_exit

    def __enter__(self):
        """Enter a ``with`` block; the executor itself is the context value."""
        executor = self
        return executor

    def map(self,
            fn: Callable,
            *iterables: Iterable[Any],
            timeout: Optional[Union[int, float]] = None,
            chunksize: int = 1):
        """Call `fn` on cloud machines with arguments from `iterables`.

        This function returns a generator which will produce each result in the
        same order as the `iterables`, only blocking if the result is not yet
        ready. You can convert the generator to a list with :class:`list`.

        If no `iterables` are given there is nothing to call `fn` with: no job
        is submitted and the returned generator is empty.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x: x, range(4)))
        [0, 1, 2, 3]

        Call a function with two parameters, on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x, y: x + y,
        ...                  ["white", "cat", "best"],
        ...                  ["house", "dog", "friend"]))
        ['whitehouse', 'catdog', 'bestfriend']

        Generate products of random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(random_product, range(4)))
        [24.440006386777277, 23.325755364428026, 23.920184804993806, 25.47912882125101]

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            The `iterables` are zipped together and each tuple is used as
            arguments to `fn`. See the second example for more detail. It is not
            possible to pass keyword arguments. Each element of `iterables` must
            have the same length.
        timeout:
            This is roughly a timeout on how long we wait on each function
            call. Specifically, each call to :meth:`iterator.__next__` on the
            returned generator waits on the corresponding
            :class:`.BatchPoolFuture` with this `timeout`.
        chunksize:
            The number of tasks to schedule in the same docker container. Docker
            containers take about 5 seconds to start. Ideally, each task should
            take an order of magnitude more time than start-up time. You can
            make the chunksize larger to reduce parallelism but increase the
            amount of meaningful work done per-container.
        """
        if not iterables:
            # Mirror ``zip()`` of nothing: there are no argument tuples, so
            # produce no results instead of failing further down.
            return (result for result in ())

        # Submission happens eagerly here; only result retrieval is lazy.
        agen = async_to_blocking(
            self.async_map(fn, iterables, timeout=timeout,
                           chunksize=chunksize))

        def generator_from_async_generator(aiter):
            # Drive the async generator one item at a time from blocking code.
            try:
                while True:
                    yield async_to_blocking(aiter.__anext__())
            except StopAsyncIteration:
                return

        return generator_from_async_generator(agen.__aiter__())

    async def async_map(self,
                        fn: Callable,
                        iterables: Iterable[Iterable[Any]],
                        timeout: Optional[Union[int, float]] = None,
                        chunksize: int = 1):
        """Asyncio compatible version of :meth:`.map`.

        Unlike :meth:`.map`, the argument iterables are passed as a single
        collection rather than as separate positional arguments.

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            A collection of iterables which are zipped together; each tuple is
            used as the arguments of one call to `fn`.
        timeout:
            Passed to :meth:`.BatchPoolFuture.async_result` each time a result
            is awaited.
        chunksize:
            When greater than ``1``, the arguments are grouped with
            ``partition`` and `fn` is wrapped with ``chunk`` so that one job
            handles a whole group; the per-group results are flattened again.

        Returns
        -------
        An asynchronous generator producing the results in argument order. It
        is empty, and no job is submitted, when `iterables` is empty.
        """
        if not iterables:
            # zip() of no iterables is empty, so nothing is submitted below
            # and the async generator returned at the end yields nothing.
            # Falling through (instead of returning a plain iterator) keeps
            # the return value usable with ``async for``.
            iterables = ()
        elif chunksize > 1:
            list_per_argument = [list(x) for x in iterables]
            n = len(list_per_argument[0])
            assert all(n == len(x) for x in list_per_argument)
            n_chunks = (n + chunksize - 1) // chunksize
            iterables_chunks = [
                list(partition(n_chunks, x)) for x in list_per_argument
            ]
            # `c` rather than `chunk`, to avoid shadowing the `chunk` helper
            # that is applied to `fn` right below.
            iterables_chunks = [c for c in iterables_chunks if len(c) > 0]
            fn = chunk(fn)
            iterables = iterables_chunks

        submit_tasks = [
            asyncio.ensure_future(self.async_submit(fn, *arguments))
            for arguments in zip(*iterables)
        ]
        try:
            bp_futures = [await t for t in submit_tasks]
        except BaseException:
            # One submission failed (or we were cancelled): undo the others.
            for t in submit_tasks:
                if not t.done():
                    t.cancel()
                elif not t.cancelled() and not t.exception():
                    # Only a task that completed successfully holds a future;
                    # Task.exception() itself raises on a cancelled task, so
                    # cancellation is checked first.
                    await t.result().async_cancel()
            raise

        async def async_result_or_cancel_all(future):
            try:
                return await future.async_result(timeout=timeout)
            except:
                # A failed or timed-out result cancels every job of this map;
                # errors while cancelling are collected, not raised, so the
                # original exception is the one that propagates.
                await asyncio.gather(
                    *[bp_fut.async_cancel() for bp_fut in bp_futures],
                    return_exceptions=True)
                raise

        if chunksize > 1:
            # Each job returned the results of a whole group; flatten them.
            return (val for future in bp_futures
                    for val in await async_result_or_cancel_all(future))

        return (await async_result_or_cancel_all(future)
                for future in bp_futures)

    def submit(self, fn: Callable, *args: Any,
               **kwargs: Any) -> 'BatchPoolFuture':
        """Call `fn` on a cloud machine with all remaining arguments and keyword arguments.

        The function, any objects it references, the arguments, and the keyword
        arguments will be serialized and sent to the cloud machine. Python
        modules are not serialized, so you must ensure that any needed Python
        modules and packages are already present in the underlying Docker
        image. For more details see the `image` argument to
        :class:`.BatchPoolExecutor`.

        This function does not return the function's output; it returns a
        :class:`.BatchPoolFuture` whose :meth:`.BatchPoolFuture.result` method
        can be used to access the value.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x: x, 4)
        ...     future.result()
        4

        Call a function with two arguments and one keyword argument, on the
        cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x, y, z: x + y + z,
        ...                         "poly", "ethyl", z="ene")
        ...     future.result()
        'polyethylene'

        Generate a product of two random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(random_product, 1)
        ...     future.result()
        23.325755364428026

        Parameters
        ----------
        fn:
            The function to execute.
        args:
            Arguments for the function.
        kwargs:
            Keyword arguments for the function.
        """
        # Blocking front end: the real work happens in `async_submit`.
        submission = self.async_submit(fn, *args, **kwargs)
        return async_to_blocking(submission)

    async def async_submit(self, unapplied: Callable, *args: Any,
                           **kwargs: Any) -> 'BatchPoolFuture':
        """Asyncio compatible version of :meth:`BatchPoolExecutor.submit`.

        Creates a new batch with a single job that unpickles the function,
        calls it, and pickles the outcome to the executor's remote ``outputs``
        folder. The batch is started without waiting for it to finish.

        Parameters
        ----------
        unapplied:
            The function to execute.
        args:
            Positional arguments for the function.
        kwargs:
            Keyword arguments for the function.

        Returns
        -------
        A :class:`.BatchPoolFuture` for the submitted job.

        Raises
        ------
        RuntimeError
            If the executor has already been shut down.
        """
        if self._shutdown:
            raise RuntimeError('BatchPoolExecutor has already been shutdown.')

        # Not every callable has a __name__ (e.g. functools.partial objects).
        try:
            name = unapplied.__name__
        except AttributeError:
            name = '<anonymous>'
        # A random suffix keeps the remote input/output paths of separate
        # submissions of the same function apart.
        name = f'{name}-{secret_alnum_string(4)}'
        batch = Batch(name=self.name + '-' + name,
                      backend=self.backend,
                      default_image=self.image)
        self.batches.append(batch)
        j = batch.new_job(name)

        # Bind the arguments now so the job only has to call the unpickled
        # object with no arguments.
        pipe = BytesIO()
        dill.dump(functools.partial(unapplied, *args, **kwargs),
                  pipe,
                  recurse=True)
        pipe.seek(0)
        pickledfun_remote = self.inputs + f'{name}/pickledfun'
        await self.fs.write(pickledfun_remote, pipe.getvalue())
        pickledfun_local = batch.read_input(pickledfun_remote)

        # Limit numerical libraries to the job's CPU allocation (at least one
        # thread; a single thread when no allocation was requested).
        thread_limit = "1"
        if self.cpus_per_job:
            j.cpu(self.cpus_per_job)
            thread_limit = str(
                int(max(1.0, cpu_spec_to_float(self.cpus_per_job))))
        j.env("OMP_NUM_THREADS", thread_limit)
        j.env("OPENBLAS_NUM_THREADS", thread_limit)
        j.env("MKL_NUM_THREADS", thread_limit)
        j.env("VECLIB_MAXIMUM_THREADS", thread_limit)
        j.env("NUMEXPR_NUM_THREADS", thread_limit)

        j.command('set -ex')
        # The job writes a pickled 2-tuple to its output file:
        # (return value, None) on success, or
        # (exception, formatted traceback lines) if the function raised.
        j.command(f'''python3 -c "
import base64
import dill
import traceback
with open(\\"{j.ofile}\\", \\"wb\\") as out:
    try:
        with open(\\"{pickledfun_local}\\", \\"rb\\") as f:
            dill.dump((dill.load(f)(), None), out, recurse=True)
    except Exception as e:
        print(\\"BatchPoolExecutor encountered an exception:\\")
        traceback.print_exc()
        dill.dump((e, traceback.format_exception(type(e), e, e.__traceback__)), out, recurse=True)
"''')
        output_gcs = self.outputs + f'{name}/output'
        batch.write_output(j.ofile, output_gcs)
        backend_batch = batch.run(wait=False,
                                  disable_progress_bar=True)._async_batch
        try:
            # NOTE(review): job id 1 is presumably the single job created
            # above -- confirm against the low-level batch client.
            return BatchPoolFuture(
                self, backend_batch,
                low_level_batch_client.Job.submitted_job(backend_batch, 1),
                output_gcs)
        except:
            # The batch is already running; do not leave it orphaned if the
            # future that tracks it could not be created.
            await backend_batch.cancel()
            raise

    def __exit__(self, exc_type: Optional[Type[BaseException]],
                 exc_value: Optional[BaseException],
                 traceback: Optional[TracebackType]):
        """Shut the executor down when leaving a ``with`` block.

        Whether this waits for outstanding jobs is decided by the
        `wait_on_exit` setting. The exception arguments are ignored and
        nothing is returned, so an exception raised inside the block
        propagates.
        """
        should_wait = self.wait_on_exit
        self.shutdown(wait=should_wait)

    def _add_future(self, f):
        """Register `f` as one of the futures tracked by this executor."""
        tracked = self.futures
        tracked.append(f)

    def _finish_future(self):
        """Record that one more tracked future has finished.

        Once the executor has been shut down, the future that brings the
        finished count up to the number of tracked futures triggers cleanup.
        """
        self.finished_future_count += 1
        if not self._shutdown:
            # Cleanup is deferred until shutdown has been requested.
            return
        if self.finished_future_count == len(self.futures):
            self._cleanup()

    def shutdown(self, wait: bool = True):
        """Allow temporary resources to be cleaned up.

        Until shutdown is called, some temporary cloud storage files will
        persist. After shutdown has been called *and* all outstanding jobs have
        completed, these files will be deleted.

        Parameters
        ----------
        wait:
            If true, wait for all jobs to complete before returning from this
            method.
        """
        async def settle(future):
            # Only completion matters here: a job that failed must not
            # prevent the executor from shutting down.
            try:
                await future.async_result()
            except Exception:
                pass

        if wait:
            pending = [settle(future) for future in self.futures]
            async_to_blocking(asyncio.gather(*pending))

        # If every future has already finished, nothing will call
        # `_finish_future` again, so clean up right away; otherwise the last
        # future to finish does it once `_shutdown` is set.
        all_finished = self.finished_future_count == len(self.futures)
        if all_finished:
            self._cleanup()
        self._shutdown = True

    def _cleanup(self):
        """Delete the temporary remote files and close the filesystem and backend.

        The remote directory is only deleted when `cleanup_bucket` is set; the
        filesystem and the backend are always closed, even if deleting the
        files fails.

        Safe to call more than once: only the first call does any work. Both
        :meth:`shutdown` and :meth:`_finish_future` may decide that cleanup
        is due, and without this guard a second call would operate on
        resources that are already closed.
        """
        # The flag is created lazily here (hence getattr) so this method does
        # not depend on any other method having initialised it.
        if getattr(self, '_cleaned_up', False):
            return
        # Mark first: even if a step below fails, cleanup is not retried
        # against resources that may already be closed.
        self._cleaned_up = True
        try:
            if self.cleanup_bucket:
                async_to_blocking(self.fs.rmtree(None, self.directory))
        finally:
            try:
                async_to_blocking(self.fs.close())
            finally:
                self.backend.close()