class Tests(unittest.TestCase):
    """Checks for ``BlockingMemoryClient`` using files under a unique remote path."""

    def setUp(self):
        """Build the filesystem and client, and pick a fresh remote test directory."""
        tmpdir = get_user_config().get('batch', 'remote_tmpdir')
        # A random token keeps concurrent test runs from sharing a directory.
        self.test_path = f'{tmpdir}memory-tests/{uuid.uuid4()}'
        google_fs = aiogoogle.GoogleStorageAsyncFS(project=PROJECT)
        self.fs = RouterAsyncFS('gs', filesystems=[google_fs])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()

    def tearDown(self):
        """Remove everything written under the test directory, then close the client."""
        removal = self.fs.rmtree(None, self.test_path)
        async_to_blocking(removal)
        self.client.close()

    async def add_temp_file_from_string(self, name: str, str_value: bytes):
        """Write `str_value` to ``<test_path>/<name>`` and return that URL."""
        url = f'{self.test_path}/{name}'
        async with await self.fs.create(url) as writer:
            await writer.write(str_value)
        return url

    def test_non_existent(self):
        """Looking up a file that was never written returns ``None`` every time."""
        missing = f'{self.test_path}/nonexistent'
        for _attempt in range(3):
            self.assertIsNone(self.client._get_file_if_exists(missing))

    def test_small_write_around(self):
        """Files written directly to the filesystem become readable via the client."""

        async def slurp(url):
            async with await self.fs.open(url) as reader:
                return await reader.read()

        payloads = (
            ('empty_file', b''),
            ('null', b'\0'),
            ('small', b'hello world'),
        )
        for basename, contents in payloads:
            url = async_to_blocking(
                self.add_temp_file_from_string(basename, contents))
            on_disk = async_to_blocking(slurp(url))
            self.assertEqual(on_disk, contents)

            # One initial lookup plus up to ten retries; presumably the client
            # may need a few requests before it can serve the file.
            from_client = None
            for _attempt in range(11):
                from_client = self.client._get_file_if_exists(url)
                if from_client is not None:
                    break
            self.assertEqual(from_client, on_disk)

    def test_small_write_through(self):
        """Files written through the client are immediately readable from it."""
        payloads = {
            'empty_file2': b'',
            'null2': b'\0',
            'small2': b'hello world',
        }
        for basename, contents in payloads.items():
            url = f'{self.test_path}/{basename}'
            self.client.write_file(url, contents)
            self.assertEqual(self.client._get_file_if_exists(url), contents)
class BatchPoolExecutor:
    """An executor which executes Python functions in the cloud.

    :class:`.concurrent.futures.ProcessPoolExecutor` and
    :class:`.concurrent.futures.ThreadPoolExecutor` enable the use of all the
    computer cores available on a single computer. :class:`.BatchPoolExecutor`
    enables the use of an effectively arbitrary number of cloud computer cores.

    Functions provided to :meth:`.submit` are serialized using `dill
    <https://dill.readthedocs.io/en/latest/dill.html>`__, sent to a Python
    docker container in the cloud, deserialized, and executed. The results are
    serialized and returned to the machine from which :meth:`.submit` was
    called. The Python version in the docker container will share a major and
    minor version with the local process. The `image` parameter overrides this
    behavior.

    When used as a context manager (the ``with`` syntax), the executor will wait
    for all jobs to finish before finishing the ``with`` statement. This
    behavior can be controlled by the `wait_on_exit` parameter.

    This class creates a folder ``batch-pool-executor`` at the root of the
    bucket specified by the `backend`. This folder can be safely deleted after
    all jobs have completed.

    Examples
    --------

    Add ``3`` to ``6`` on a machine in the cloud and send the result back to
    this machine:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     future_nine = bpe.submit(lambda: 3 + 6)
    >>> future_nine.result()  # doctest: +SKIP
    9

    :meth:`.map` facilitates the common case of executing a function on many
    values in parallel:

    >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
    ...     list(bpe.map(lambda x: x * 3, range(4)))
    [0, 3, 6, 9]

    Parameters
    ----------
    name:
        A name for the executor. Executors produce many batches and each batch
        will include this name as a prefix.
    backend:
        Backend used to execute the jobs. Must be a :class:`.ServiceBackend`.
    image:
        The name of a Docker image used for each submitted job. The image must
        include Python 3.7 or later and must have the ``dill`` Python package
        installed. If you intend to use ``numpy``, ensure that OpenBLAS is also
        installed. If unspecified, an image with a matching Python version and
        ``numpy``, ``scipy``, and ``sklearn`` installed is used.
    cpus_per_job:
        The number of CPU cores to allocate to each job. The default value is
        ``1``. The parameter is passed unaltered to :meth:`.Job.cpu`. This
        parameter's value is used to set several environment variables
        instructing BLAS and LAPACK to limit core use.
    wait_on_exit:
        If ``True`` or unspecified, wait for all jobs to complete when exiting a
        context. If ``False``, do not wait. This option has no effect if this
        executor is not used with the ``with`` syntax.
    cleanup_bucket:
        If ``True`` or unspecified, delete all temporary files in the cloud
        storage bucket when this executor fully shuts down. If Python crashes
        before the executor is shutdown, the files will not be deleted.
    project:
        If specified, the project to use when authenticating with Google
        Storage. Google Storage is used to transfer serialized values between
        this computer and the cloud machines that execute jobs.

    Raises
    ------
    ValueError
        If `backend` is not a :class:`.ServiceBackend`, or if `image` is not
        given and the local Python version is not 3.6, 3.7, or 3.8.
    """

    def __init__(self, *,
                 name: Optional[str] = None,
                 backend: Optional[ServiceBackend] = None,
                 image: Optional[str] = None,
                 cpus_per_job: Optional[Union[int, str]] = None,
                 wait_on_exit: bool = True,
                 cleanup_bucket: bool = True,
                 project: Optional[str] = None):
        self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
        self.backend = backend or ServiceBackend()
        if not isinstance(self.backend, ServiceBackend):
            raise ValueError(
                f'BatchPoolExecutor is not compatible with {type(backend)}')
        self.batches: List[Batch] = []
        # All temporary files live under one per-executor directory so that
        # cleanup is a single recursive delete.
        self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
        self.inputs = self.directory + 'inputs/'
        self.outputs = self.directory + 'outputs/'
        self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
        self.futures: List[BatchPoolFuture] = []
        self.finished_future_count = 0
        self._shutdown = False
        # Guards _cleanup so temporary resources are released at most once.
        self._cleaned_up = False
        version = sys.version_info
        if image is None:
            if version.major != 3 or version.minor not in (6, 7, 8):
                raise ValueError(
                    f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
                )
            self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
        else:
            self.image = image
        self.cpus_per_job = cpus_per_job
        self.cleanup_bucket = cleanup_bucket
        self.wait_on_exit = wait_on_exit

    def __enter__(self):
        return self

    def map(self,
            fn: Callable,
            *iterables: Iterable[Any],
            timeout: Optional[Union[int, float]] = None,
            chunksize: int = 1):
        """Call `fn` on cloud machines with arguments from `iterables`.

        This function returns a generator which will produce each result in the
        same order as the `iterables`, only blocking if the result is not yet
        ready. You can convert the generator to a list with :class:`.list`.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x: x, range(4)))
        [0, 1, 2, 3]

        Call a function with two parameters, on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(lambda x, y: x + y,
        ...                  ["white", "cat", "best"],
        ...                  ["house", "dog", "friend"]))
        ["whitehouse", "catdog", "bestfriend"]

        Generate products of random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     list(bpe.map(random_product, range(4)))
        [24.440006386777277,
         23.325755364428026,
         23.920184804993806,
         25.47912882125101]

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            The `iterables` are zipped together and each tuple is used as
            arguments to `fn`. See the second example for more detail. It is not
            possible to pass keyword arguments. Each element of `iterables` must
            have the same length.
        timeout:
            This is roughly a timeout on how long we wait on each function
            call. Specifically, each call to the returned generator's
            :class:`.BatchPoolFuture`
            :meth:`.iterator.__next__` invokes :meth:`.BatchPoolFuture.result` with this
            `timeout`.
        chunksize:
            The number of tasks to schedule in the same docker container. Docker
            containers take about 5 seconds to start. Ideally, each task should
            take an order of magnitude more time than start-up time. You can
            make the chunksize larger to reduce parallelism but increase the
            amount of meaningful work done per-container.

        Returns
        -------
        A generator over the results. When no `iterables` are given the
        generator is empty.
        """
        agen = async_to_blocking(
            self.async_map(fn, iterables, timeout=timeout, chunksize=chunksize))

        def generator_from_async_generator(aiter):
            # Drive the async generator one item at a time so results are only
            # waited for when the caller asks for them.
            try:
                while True:
                    yield async_to_blocking(aiter.__anext__())
            except StopAsyncIteration:
                return

        return generator_from_async_generator(agen.__aiter__())

    async def async_map(self,
                        fn: Callable,
                        iterables: Iterable[Iterable[Any]],
                        timeout: Optional[Union[int, float]] = None,
                        chunksize: int = 1):
        """Asyncio compatible version of :meth:`.map`.

        Parameters
        ----------
        fn:
            The function to execute.
        iterables:
            A collection of iterables which are zipped together; each resulting
            tuple is used as the arguments of one call to `fn`.
        timeout:
            Passed to :meth:`.BatchPoolFuture.async_result` for each result.
        chunksize:
            The number of tasks to schedule in the same docker container.

        Returns
        -------
        An asynchronous generator producing the results in submission order.
        If retrieving any result fails, every submitted future is cancelled and
        the error is re-raised.
        """
        if not iterables:
            # Return an *async* generator here as well: map() calls __aiter__
            # on the result, and async callers iterate with ``async for``.
            async def no_results():
                for never in ():
                    yield never

            return no_results()

        if chunksize > 1:
            list_per_argument = [list(x) for x in iterables]
            n = len(list_per_argument[0])
            assert all(n == len(x) for x in list_per_argument)
            n_chunks = (n + chunksize - 1) // chunksize
            iterables_chunks = [
                list(partition(n_chunks, x)) for x in list_per_argument
            ]
            iterables_chunks = [
                chunks for chunks in iterables_chunks if len(chunks) > 0
            ]
            # `chunk` wraps fn so that one job evaluates a whole chunk of
            # argument tuples.
            fn = chunk(fn)
            iterables = iterables_chunks

        submit_tasks = [
            asyncio.ensure_future(self.async_submit(fn, *arguments))
            for arguments in zip(*iterables)
        ]
        try:
            bp_futures = [await t for t in submit_tasks]
        except BaseException:
            # Deliberately includes cancellation: whatever interrupted the
            # submissions, do not leave jobs running in the cloud.
            for t in submit_tasks:
                if not t.done():
                    t.cancel()
                elif not t.cancelled() and t.exception() is None:
                    # t.exception() raises CancelledError on a cancelled task,
                    # so cancelled() must be checked first.
                    await t.result().async_cancel()
            raise

        async def async_result_or_cancel_all(future):
            try:
                return await future.async_result(timeout=timeout)
            except BaseException:
                # One failure aborts the whole map; cancel the sibling jobs.
                await asyncio.gather(
                    *[bp_fut.async_cancel() for bp_fut in bp_futures],
                    return_exceptions=True)
                raise

        if chunksize > 1:
            # Each future yields a list of values (one per chunked call);
            # flatten them back into a single stream.
            return (val
                    for future in bp_futures
                    for val in await async_result_or_cancel_all(future))

        return (await async_result_or_cancel_all(future)
                for future in bp_futures)

    def submit(self,
               fn: Callable,
               *args: Any,
               **kwargs: Any
               ) -> 'BatchPoolFuture':
        """Call `fn` on a cloud machine with all remaining arguments and keyword arguments.

        The function, any objects it references, the arguments, and the keyword
        arguments will be serialized to the cloud machine. Python modules are
        not serialized, so you must ensure any needed Python modules and
        packages are already present in the underlying Docker image. For more
        details see the `image` argument to :class:`.BatchPoolExecutor`

        This function does not return the function's output, it returns a
        :class:`.BatchPoolFuture` whose :meth:`.BatchPoolFuture.result` method
        can be used to access the value.

        Examples
        --------

        Do nothing, but on the cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x: x, 4)
        ...     future.result()
        4

        Call a function with two arguments and one keyword argument, on the
        cloud:

        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(lambda x, y, z: x + y + z,
        ...                         "poly", "ethyl", z="ene")
        ...     future.result()
        "polyethylene"

        Generate a product of two random matrices, on the cloud:

        >>> def random_product(seed):
        ...     np.random.seed(seed)
        ...     w = np.random.rand(1, 100)
        ...     u = np.random.rand(100, 1)
        ...     return float(w @ u)
        >>> with BatchPoolExecutor() as bpe:  # doctest: +SKIP
        ...     future = bpe.submit(random_product, 1)
        ...     future.result()
        23.325755364428026

        Parameters
        ----------
        fn:
            The function to execute.
        args:
            Arguments for the function.
        kwargs:
            Keyword arguments for the function.

        Raises
        ------
        RuntimeError
            If the executor has already been shut down.
        """
        return async_to_blocking(self.async_submit(fn, *args, **kwargs))

    async def async_submit(self,
                           unapplied: Callable,
                           *args: Any,
                           **kwargs: Any
                           ) -> 'BatchPoolFuture':
        """Asyncio compatible version of :meth:`BatchPoolExecutor.submit`.

        Raises
        ------
        RuntimeError
            If the executor has already been shut down.
        """
        if self._shutdown:
            raise RuntimeError('BatchPoolExecutor has already been shutdown.')

        try:
            name = unapplied.__name__
        except AttributeError:
            # e.g. functools.partial objects have no __name__.
            name = '<anonymous>'
        # The random suffix keeps the input/output paths of separate
        # submissions of the same function apart.
        name = f'{name}-{secret_alnum_string(4)}'
        batch = Batch(name=self.name + '-' + name,
                      backend=self.backend,
                      default_image=self.image)
        self.batches.append(batch)
        j = batch.new_job(name)

        # Serialize the fully-applied call so the remote side only has to
        # invoke it with no arguments.
        pipe = BytesIO()
        dill.dump(functools.partial(unapplied, *args, **kwargs), pipe, recurse=True)
        pipe.seek(0)
        pickledfun_remote = self.inputs + f'{name}/pickledfun'
        await self.fs.write(pickledfun_remote, pipe.getvalue())
        pickledfun_local = batch.read_input(pickledfun_remote)

        thread_limit = "1"
        if self.cpus_per_job:
            j.cpu(self.cpus_per_job)
            thread_limit = str(
                int(max(1.0, cpu_spec_to_float(self.cpus_per_job))))
        # Keep numerical libraries from using more threads than the job has
        # cores allocated.
        j.env("OMP_NUM_THREADS", thread_limit)
        j.env("OPENBLAS_NUM_THREADS", thread_limit)
        j.env("MKL_NUM_THREADS", thread_limit)
        j.env("VECLIB_MAXIMUM_THREADS", thread_limit)
        j.env("NUMEXPR_NUM_THREADS", thread_limit)

        j.command('set -ex')
        # The remote script writes a pair to the output file: (value, None) on
        # success, or (exception, formatted traceback) on failure.
        j.command(f'''python3 -c "
import base64
import dill
import traceback
with open(\\"{j.ofile}\\", \\"wb\\") as out:
    try:
        with open(\\"{pickledfun_local}\\", \\"rb\\") as f:
            dill.dump((dill.load(f)(), None), out, recurse=True)
    except Exception as e:
        print(\\"BatchPoolExecutor encountered an exception:\\")
        traceback.print_exc()
        dill.dump((e, traceback.format_exception(type(e), e, e.__traceback__)), out, recurse=True)
"''')
        output_gcs = self.outputs + f'{name}/output'
        batch.write_output(j.ofile, output_gcs)
        backend_batch = batch.run(wait=False,
                                  disable_progress_bar=True)._async_batch
        try:
            return BatchPoolFuture(
                self,
                backend_batch,
                low_level_batch_client.Job.submitted_job(backend_batch, 1),
                output_gcs)
        except BaseException:
            # The batch is already running; do not leave it orphaned if the
            # future could not be constructed.
            await backend_batch.cancel()
            raise

    def __exit__(self,
                 exc_type: Optional[Type[BaseException]],
                 exc_value: Optional[BaseException],
                 traceback: Optional[TracebackType]):
        self.shutdown(wait=self.wait_on_exit)

    def _add_future(self, f):
        """Record a future created by this executor."""
        self.futures.append(f)

    def _finish_future(self):
        """Note that one future has finished; clean up if it was the last one
        and shutdown has already been requested."""
        self.finished_future_count += 1
        if self._shutdown and self.finished_future_count == len(self.futures):
            self._cleanup()

    def shutdown(self, wait: bool = True):
        """Allow temporary resources to be cleaned up.

        Until shutdown is called, some temporary cloud storage files will
        persist. After shutdown has been called *and* all outstanding jobs have
        completed, these files will be deleted.

        Calling this method more than once is safe; resources are released only
        once.

        Parameters
        ----------
        wait:
            If true, wait for all jobs to complete before returning from this
            method.
        """
        if wait:
            async def ignore_exceptions(f):
                # Only completion matters here; a job's failure is reported to
                # whoever asks that future for its result.
                try:
                    await f.async_result()
                except Exception:
                    pass
            async_to_blocking(
                asyncio.gather(*[ignore_exceptions(f) for f in self.futures]))
        if self.finished_future_count == len(self.futures):
            self._cleanup()
        self._shutdown = True

    def _cleanup(self):
        """Delete temporary files (if requested) and close the filesystem and
        backend. Does nothing after the first call."""
        if self._cleaned_up:
            return
        self._cleaned_up = True
        try:
            if self.cleanup_bucket:
                async_to_blocking(self.fs.rmtree(None, self.directory))
        finally:
            # Release the filesystem and backend even if deletion failed.
            try:
                async_to_blocking(self.fs.close())
            finally:
                self.backend.close()