def test_map_reduce_url(self):
    # Runs a map_reduce() job whose input is TEST_FILES_URLS and checks that
    # the reduced value equals the class-level 'cos_result_to_compare'.
    print('Testing map_reduce() over URLs...')
    executor = FunctionExecutor(config=CONFIG)
    executor.map_reduce(
        TestMethods.my_map_function_url,
        TEST_FILES_URLS,
        TestMethods.my_reduce_function,
    )
    reduced = executor.get_result()
    self.assertEqual(reduced, self.__class__.cos_result_to_compare)
def pywren_inside_pywren_map_function(x):
    """Run a nested map over range(x) with an identity function.

    A new FunctionExecutor is created inside the call, the identity function
    is mapped over ``range(x)``, and the executor's collected result is
    returned.
    """
    def _identity(x):
        return x

    nested_executor = FunctionExecutor()
    nested_executor.map(_identity, range(x))
    nested_result = nested_executor.get_result()
    return nested_result
def pywren_return_futures_map_function3(x):
    """Launch two nested maps over range(x) and return both sets of futures.

    The futures are returned without waiting for them; the two return values
    of ``map()`` are combined with ``+`` in launch order.
    """
    def _increment(x):
        return x + 1

    nested_executor = FunctionExecutor()
    first_batch = nested_executor.map(_increment, range(x))
    second_batch = nested_executor.map(_increment, range(x))
    return first_batch + second_batch
def test_map_reduce(self):
    # Maps simple_map_function over four [a, b] pairs, reduces the partial
    # results with simple_reduce_function and expects a total of 20.
    print('Testing map_reduce()...')
    pairs = [[1, 1], [2, 2], [3, 3], [4, 4]]
    executor = FunctionExecutor(config=CONFIG)
    executor.map_reduce(
        TestMethods.simple_map_function,
        pairs,
        TestMethods.simple_reduce_function,
    )
    total = executor.get_result()
    self.assertEqual(total, 20)
def test_map_reduce_obj_bucket(self):
    # Runs map_reduce() with a '<backend>://<bucket>/<PREFIX>/' location as
    # the input and compares the reduced value with the class-level
    # 'cos_result_to_compare'.
    print('Testing map_reduce() over a bucket...')
    backend = STORAGE_CONFIG['backend']
    location = ''.join(
        [backend, '://', STORAGE_CONFIG['bucket'], '/', PREFIX, '/']
    )
    executor = FunctionExecutor(config=CONFIG)
    executor.map_reduce(
        TestMethods.my_map_function_obj,
        location,
        TestMethods.my_reduce_function,
    )
    reduced = executor.get_result()
    self.assertEqual(reduced, self.__class__.cos_result_to_compare)
def test_storage_handler(self):
    # Builds one [key, bucket] pair per test key and runs a map_reduce() with
    # my_map_function_storage as the mapper; the reduced value must equal the
    # class-level 'cos_result_to_compare'.
    print('Testing "storage" function arg...')
    key_bucket_pairs = []
    for test_key in TestUtils.list_test_keys():
        key_bucket_pairs.append([test_key, STORAGE_CONFIG['bucket']])

    executor = FunctionExecutor(config=CONFIG)
    executor.map_reduce(
        TestMethods.my_map_function_storage,
        key_bucket_pairs,
        TestMethods.my_reduce_function,
    )
    reduced = executor.get_result()
    self.assertEqual(reduced, self.__class__.cos_result_to_compare)
def test_map_reduce_obj_key(self):
    # Runs map_reduce() over a list of '<backend>://<bucket>/<key>' strings,
    # one per test key, and compares the reduced value with the class-level
    # 'cos_result_to_compare'.
    print('Testing map_reduce() over object keys...')
    backend = STORAGE_CONFIG['backend']
    bucket = STORAGE_CONFIG['bucket']
    object_paths = [
        ''.join((backend, '://', bucket, '/', test_key))
        for test_key in TestUtils.list_test_keys()
    ]
    executor = FunctionExecutor(config=CONFIG)
    executor.map_reduce(
        TestMethods.my_map_function_obj,
        object_paths,
        TestMethods.my_reduce_function,
    )
    reduced = executor.get_result()
    self.assertEqual(reduced, self.__class__.cos_result_to_compare)
def __init__(self, processes=None, initializer=None, initargs=(),
             maxtasksperchild=None, context=None):
    '''
    Set up the pool state and the FunctionExecutor that backs it.

    processes: optional worker count, forwarded to FunctionExecutor as
        `workers`. When omitted, the count is read back from
        `self._executor.invoker.workers`.
    initializer: stored on the instance; must be callable or None.
    initargs: when truthy it is unpacked with `**` into the
        FunctionExecutor constructor, so it has to be a mapping of keyword
        arguments.
    maxtasksperchild: stored on the instance only.
    context: multiprocessing context; defaults to `get_context()`.

    Raises ValueError if `processes` is given and lower than 1, and
    TypeError if `initializer` is neither None nor callable.
    '''
    # Validate everything up front so a bad argument never leaves a
    # freshly created executor behind.
    if processes is not None and processes < 1:
        raise ValueError("Number of processes must be at least 1")
    if initializer is not None and not callable(initializer):
        raise TypeError('initializer must be a callable')

    self._ctx = context or get_context()
    # NOTE: self._setup_queues() is intentionally not called here.
    self._taskqueue = queue.Queue()
    self._cache = {}
    self._state = RUN
    self._maxtasksperchild = maxtasksperchild
    self._initializer = initializer
    self._initargs = initargs

    # An empty/falsy initargs contributes no keyword arguments.
    executor_kwargs = self._initargs if self._initargs else {}
    if processes is not None:
        self._executor = FunctionExecutor(workers=processes,
                                          **executor_kwargs)
        self._processes = processes
    else:
        self._executor = FunctionExecutor(**executor_kwargs)
        self._processes = self._executor.invoker.workers

    self._pool = []
def test_cloudobject(self):
    # Runs a map_reduce() whose mapper is my_cloudobject_put and whose reducer
    # is my_cloudobject_get over a '<backend>://<bucket>/<PREFIX>/' location,
    # using the executor as a context manager.
    print('Testing cloudobjects...')
    backend = STORAGE_CONFIG['backend']
    location = ''.join(
        [backend, '://', STORAGE_CONFIG['bucket'], '/', PREFIX, '/']
    )
    with FunctionExecutor(config=CONFIG) as executor:
        executor.map_reduce(
            TestMethods.my_cloudobject_put,
            location,
            TestMethods.my_cloudobject_get,
        )
        reduced = executor.get_result()
        self.assertEqual(reduced, self.__class__.cos_result_to_compare)
class Popen(object):
    '''
    Process launcher that runs the process target through a FunctionExecutor.

    The future returned by `call_async()` is kept in `self.sentinel` and is
    what `poll()`, `wait()` and `terminate()` operate on.
    '''
    method = 'cloud'

    def __init__(self, process_obj):
        '''Create the executor and immediately launch `process_obj`.'''
        util._flush_std_streams()
        self.returncode = None
        self._executor = FunctionExecutor()
        self._launch(process_obj)

    def duplicate_for_child(self, fd):
        '''Return `fd` unchanged.'''
        return fd

    def poll(self, flag=ALWAYS):
        '''
        Refresh and return `self.returncode`.

        While no return code is recorded, waits on the sentinel with
        `return_when=flag`, then records 0 when the sentinel is ready or
        done and 1 when it reports an error (the error check runs last, so
        it wins). Returns None if neither condition holds yet.
        '''
        if self.returncode is None:
            self._executor.wait([self.sentinel], return_when=flag)
            if self.sentinel.ready or self.sentinel.done:
                self.returncode = 0
            if self.sentinel.error:
                self.returncode = 1
        return self.returncode

    def wait(self, timeout=None):
        '''
        Wait for the launched call and return its return code.

        Returns None when the executor's `wait()` gives a falsy result for
        the given `timeout`.
        '''
        if self.returncode is None:
            wait = self._executor.wait
            if not wait([self.sentinel], timeout=timeout):
                return None
            # This shouldn't block if wait() returned successfully.
            return self.poll(ALWAYS if timeout == 0.0 else ALL_COMPLETED)
        return self.returncode

    def terminate(self):
        '''Try to cancel the launched call if it has not finished yet.'''
        if self.returncode is None:
            try:
                self.sentinel.cancel()
            except NotImplementedError:
                # Best effort: the sentinel may not support cancellation.
                pass

    @staticmethod
    def _pack_call_args(target, args, kwargs):
        '''
        Build the single data argument handed to `call_async()`.

        Without keyword arguments the positional values are passed as a
        list. With keyword arguments a name->value dict is passed instead;
        positional values are bound to their parameter names through the
        target's signature so that nothing is lost.

        NOTE(review): a target that declares *args/**kwargs yields nested
        entries when both kinds of arguments are given -- confirm the
        executor accepts that shape before relying on it.
        '''
        if not kwargs:
            return [*args]
        if not args:
            return dict(kwargs)
        import inspect
        bound = inspect.signature(target).bind(*args, **kwargs)
        return dict(bound.arguments)

    def _launch(self, process_obj):
        '''Submit the process target asynchronously and keep its future.'''
        # Unpacking the kwargs dict with '*' would forward its keys, not its
        # values, so the arguments are packed explicitly.
        fn_args = self._pack_call_args(process_obj._target,
                                       process_obj._args,
                                       process_obj._kwargs)
        self.sentinel = self._executor.call_async(process_obj._target, fn_args)
def test_chunks_bucket(self):
    # Runs the same bucket-wide map_reduce() twice, first with a chunk size of
    # 1 MiB and then with chunk_n=2, checking both the reduced value and the
    # number of futures returned (8 and 11 respectively).
    print('Testing chunks on a bucket...')
    backend = STORAGE_CONFIG['backend']
    location = ''.join(
        [backend, '://', STORAGE_CONFIG['bucket'], '/', PREFIX, '/']
    )

    scenarios = (
        ({'chunk_size': 1 * 1024**2}, 8),
        ({'chunk_n': 2}, 11),
    )
    for chunk_options, expected_futures in scenarios:
        executor = FunctionExecutor(config=CONFIG)
        futures = executor.map_reduce(
            TestMethods.my_map_function_obj,
            location,
            TestMethods.my_reduce_function,
            **chunk_options
        )
        reduced = executor.get_result(futures)
        self.assertEqual(reduced, self.__class__.cos_result_to_compare)
        self.assertEqual(len(futures), expected_futures)
def test_multiple_executions(self):
    # Reuses a single executor for several map() calls and checks what
    # get_result() returns with and without an explicit list of futures.
    print('Testing multiple executions...')
    executor = FunctionExecutor(config=CONFIG)

    # Two maps followed by one get_result(): both result sets are expected.
    executor.map(TestMethods.simple_map_function, [[1, 1], [2, 2]])
    executor.map(TestMethods.simple_map_function, [[3, 3], [4, 4]])
    combined = executor.get_result()
    self.assertEqual(combined, [2, 4, 6, 8])

    # A later map on the same executor: only its own results are expected.
    executor.map(TestMethods.simple_map_function, [[1, 1], [2, 2]])
    latest = executor.get_result()
    self.assertEqual(latest, [2, 4])

    # Results requested per future list through the 'fs' argument.
    first_futures = executor.map(TestMethods.simple_map_function,
                                 [[1, 1], [2, 2]])
    first_result = executor.get_result(fs=first_futures)
    second_futures = executor.map(TestMethods.simple_map_function,
                                  [[3, 3], [4, 4]])
    second_result = executor.get_result(fs=second_futures)
    self.assertEqual(first_result, [2, 4])
    self.assertEqual(second_result, [6, 8])
class Pool(object):
    '''
    Class which supports an async version of applying functions to arguments.

    Work submitted through `apply_async()` and `_map_async()` is dispatched
    to a FunctionExecutor (`self._executor`) instead of local worker
    processes.
    '''
    _wrap_exception = True

    def Process(self, *args, **kwds):
        '''Create a process object through the pool's context.'''
        return self._ctx.Process(*args, **kwds)

    def __init__(self, processes=None, initializer=None, initargs=(),
                 maxtasksperchild=None, context=None):
        '''
        Set up the pool state and the FunctionExecutor that backs it.

        processes: optional worker count, forwarded to FunctionExecutor as
            `workers`. When omitted, the count is read back from
            `self._executor.invoker.workers`.
        initializer: stored on the instance; must be callable or None.
        initargs: when truthy it is unpacked with `**` into the
            FunctionExecutor constructor, so it has to be a mapping of
            keyword arguments.
        maxtasksperchild: stored on the instance only.
        context: multiprocessing context; defaults to `get_context()`.

        Raises ValueError if `processes` is given and lower than 1, and
        TypeError if `initializer` is neither None nor callable.
        '''
        # Validate everything up front so a bad argument never leaves a
        # freshly created executor behind.
        if processes is not None and processes < 1:
            raise ValueError("Number of processes must be at least 1")
        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        self._ctx = context or get_context()
        self._taskqueue = queue.Queue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        # An empty/falsy initargs contributes no keyword arguments.
        executor_kwargs = self._initargs if self._initargs else {}
        if processes is not None:
            self._executor = FunctionExecutor(workers=processes,
                                              **executor_kwargs)
            self._processes = processes
        else:
            self._executor = FunctionExecutor(**executor_kwargs)
            self._processes = self._executor.invoker.workers

        self._pool = []
        # NOTE: this constructor intentionally does not call
        # _setup_queues() or _repopulate_pool(), does not start the
        # worker/task/result handler threads and does not register a
        # finalizer for _terminate_pool(). The corresponding helpers below
        # are therefore not driven by anything in this class.

    def _join_exited_workers(self):
        """Cleanup after any worker processes which have exited due to reaching
        their specified lifetime.  Returns True if any workers were cleaned up.
        """
        cleaned = False
        for i in reversed(range(len(self._pool))):
            worker = self._pool[i]
            if worker.exitcode is not None:
                # worker exited
                util.debug('cleaning up worker %d' % i)
                worker.join()
                cleaned = True
                del self._pool[i]
        return cleaned

    def _repopulate_pool(self):
        """Bring the number of pool processes up to the specified number,
        for use after reaping workers which have exited.
        """
        for i in range(self._processes - len(self._pool)):
            w = self.Process(target=worker,
                             args=(self._inqueue, self._outqueue,
                                   self._initializer,
                                   self._initargs, self._maxtasksperchild,
                                   self._wrap_exception)
                             )
            self._pool.append(w)
            w.name = w.name.replace('Process', 'PoolWorker')
            w.daemon = True
            w.start()
            util.debug('added worker')

    def _maintain_pool(self):
        """Clean up any exited workers and start replacements for them.
        """
        if self._join_exited_workers():
            self._repopulate_pool()

    def _setup_queues(self):
        '''Create the in/out queues and cache their fast put/get callables.'''
        self._inqueue = self._ctx.SimpleQueue()
        self._outqueue = self._ctx.SimpleQueue()
        self._quick_put = self._inqueue._writer.send
        self._quick_get = self._outqueue._reader.recv

    def apply(self, func, args=(), kwds={}):
        '''
        Equivalent of `func(*args, **kwds)`.
        '''
        assert self._state == RUN
        return self.apply_async(func, args, kwds).get()

    def map(self, func, iterable, chunksize=None):
        '''
        Apply `func` to each element in `iterable`, collecting the results
        in a list that is returned.
        '''
        return self._map_async(func, iterable, mapstar, chunksize).get()

    def starmap(self, func, iterable, chunksize=None):
        '''
        Like `map()` method but the elements of the `iterable` are expected to
        be iterables as well and will be unpacked as arguments. Hence
        `func` and (a, b) becomes func(a, b).
        '''
        return self._map_async(func, iterable, starmapstar, chunksize).get()

    def starmap_async(self, func, iterable, chunksize=None, callback=None,
                      error_callback=None):
        '''
        Asynchronous version of `starmap()` method.
        '''
        return self._map_async(func, iterable, starmapstar, chunksize,
                               callback, error_callback)

    def _guarded_task_generation(self, result_job, func, iterable):
        '''Provides a generator of tasks for imap and imap_unordered with
        appropriate handling for iterables which throw exceptions during
        iteration.'''
        try:
            i = -1
            for i, x in enumerate(iterable):
                yield (result_job, i, func, (x,), {})
        except Exception as e:
            yield (result_job, i+1, _helper_reraises_exception, (e,), {})

    def imap(self, func, iterable, chunksize=1):
        '''
        Equivalent of `map()` -- can be MUCH slower than `Pool.map()`.
        '''
        # NOTE(review): tasks are only placed on self._taskqueue here and no
        # consumer of that queue is started by this class as written --
        # confirm that results are ever delivered to the iterator.
        if self._state != RUN:
            raise ValueError("Pool not running")
        if chunksize == 1:
            result = IMapIterator(self._cache)
            self._taskqueue.put(
                (
                    self._guarded_task_generation(result._job, func, iterable),
                    result._set_length
                ))
            return result
        else:
            assert chunksize > 1
            task_batches = Pool._get_tasks(func, iterable, chunksize)
            result = IMapIterator(self._cache)
            self._taskqueue.put(
                (
                    self._guarded_task_generation(result._job,
                                                  mapstar,
                                                  task_batches),
                    result._set_length
                ))
            return (item for chunk in result for item in chunk)

    def imap_unordered(self, func, iterable, chunksize=1):
        '''
        Like `imap()` method but ordering of results is arbitrary.
        '''
        # NOTE(review): same caveat as imap() regarding self._taskqueue.
        if self._state != RUN:
            raise ValueError("Pool not running")
        if chunksize == 1:
            result = IMapUnorderedIterator(self._cache)
            self._taskqueue.put(
                (
                    self._guarded_task_generation(result._job, func, iterable),
                    result._set_length
                ))
            return result
        else:
            assert chunksize > 1
            task_batches = Pool._get_tasks(func, iterable, chunksize)
            result = IMapUnorderedIterator(self._cache)
            self._taskqueue.put(
                (
                    self._guarded_task_generation(result._job,
                                                  mapstar,
                                                  task_batches),
                    result._set_length
                ))
            return (item for chunk in result for item in chunk)

    @staticmethod
    def _pack_call_args(func, args, kwds):
        '''
        Build the single data argument handed to `call_async()`.

        Without keyword arguments the positional values are passed as a
        list. With keyword arguments a name->value dict is passed instead;
        positional values are bound to their parameter names through the
        function's signature so that nothing is lost.

        NOTE(review): a function that declares *args/**kwargs yields nested
        entries when both kinds of arguments are given -- confirm the
        executor accepts that shape before relying on it.
        '''
        if not kwds:
            return [*args]
        if not args:
            return dict(kwds)
        import inspect
        bound = inspect.signature(func).bind(*args, **kwds)
        return dict(bound.arguments)

    def apply_async(self, func, args=(), kwds={}, callback=None,
                    error_callback=None):
        '''
        Asynchronous version of `apply()` method.
        '''
        if self._state != RUN:
            raise ValueError("Pool not running")

        # Unpacking `kwds` with '*' would forward its keys, not its values,
        # so the arguments are packed explicitly.
        call_data = self._pack_call_args(func, args, kwds)
        futures = self._executor.call_async(func, call_data)

        result = ApplyResult(self._executor, [futures], callback,
                             error_callback)

        return result

    def map_async(self, func, iterable, chunksize=None, callback=None,
                  error_callback=None):
        '''
        Asynchronous version of `map()` method.
        '''
        return self._map_async(func, iterable, mapstar, chunksize, callback,
                               error_callback)

    def _map_async(self, func, iterable, mapper, chunksize=None,
                   callback=None, error_callback=None):
        '''
        Helper function to implement map, starmap and their async counterparts.

        `mapper` and `chunksize` are accepted but not used: the iterable is
        handed to the executor's `map()` as a whole.
        '''
        if self._state != RUN:
            raise ValueError("Pool not running")
        # Iterables without a length are materialized before submission.
        if not hasattr(iterable, '__len__'):
            iterable = list(iterable)

        futures = self._executor.map(func, iterable)

        result = MapResult(self._executor, futures, callback, error_callback)

        return result

    @staticmethod
    def _handle_workers(pool):
        thread = threading.current_thread()

        # Keep maintaining workers until the cache gets drained, unless the pool
        # is terminated.
        while thread._state == RUN or (pool._cache and thread._state != TERMINATE):
            pool._maintain_pool()
            time.sleep(0.1)
        # send sentinel to stop workers
        pool._taskqueue.put(None)
        util.debug('worker handler exiting')

    @staticmethod
    def _handle_tasks(taskqueue, put, outqueue, pool, cache):
        thread = threading.current_thread()

        for taskseq, set_length in iter(taskqueue.get, None):
            task = None
            try:
                # iterating taskseq cannot fail
                for task in taskseq:
                    if thread._state:
                        util.debug('task handler found thread._state != RUN')
                        break
                    try:
                        put(task)
                    except Exception as e:
                        job, idx = task[:2]
                        try:
                            cache[job]._set(idx, (False, e))
                        except KeyError:
                            pass
                else:
                    if set_length:
                        util.debug('doing set_length()')
                        idx = task[1] if task else -1
                        set_length(idx + 1)
                    continue
                break
            finally:
                task = taskseq = job = None
        else:
            util.debug('task handler got sentinel')

        try:
            # tell result handler to finish when cache is empty
            util.debug('task handler sending sentinel to result handler')
            outqueue.put(None)

            # tell workers there is no more work
            util.debug('task handler sending sentinel to workers')
            for p in pool:
                put(None)
        except OSError:
            util.debug('task handler got OSError when sending sentinels')

        util.debug('task handler exiting')

    @staticmethod
    def _handle_results(outqueue, get, cache):
        thread = threading.current_thread()

        while 1:
            try:
                task = get()
            except (OSError, EOFError):
                util.debug('result handler got EOFError/OSError -- exiting')
                return

            if thread._state:
                assert thread._state == TERMINATE
                util.debug('result handler found thread._state=TERMINATE')
                break

            if task is None:
                util.debug('result handler got sentinel')
                break

            job, i, obj = task
            try:
                cache[job]._set(i, obj)
            except KeyError:
                pass
            task = job = obj = None

        while cache and thread._state != TERMINATE:
            try:
                task = get()
            except (OSError, EOFError):
                util.debug('result handler got EOFError/OSError -- exiting')
                return

            if task is None:
                util.debug('result handler ignoring extra sentinel')
                continue
            job, i, obj = task
            try:
                cache[job]._set(i, obj)
            except KeyError:
                pass
            task = job = obj = None

        if hasattr(outqueue, '_reader'):
            util.debug('ensuring that outqueue is not full')
            # If we don't make room available in outqueue then
            # attempts to add the sentinel (None) to outqueue may
            # block.  There is guaranteed to be no more than 2 sentinels.
            try:
                for i in range(10):
                    if not outqueue._reader.poll():
                        break
                    get()
            except (OSError, EOFError):
                pass

        util.debug('result handler exiting: len(cache)=%s, thread._state=%s',
                   len(cache), thread._state)

    @staticmethod
    def _get_tasks(func, it, size):
        '''Yield `(func, chunk)` pairs with chunks of at most `size` items.'''
        it = iter(it)
        while 1:
            x = tuple(itertools.islice(it, size))
            if not x:
                return
            yield (func, x)

    def __reduce__(self):
        raise NotImplementedError(
            'pool objects cannot be passed between processes or pickled'
        )

    def close(self):
        '''Move a running pool to the CLOSE state.'''
        util.debug('closing pool')
        if self._state == RUN:
            self._state = CLOSE

    def terminate(self):
        '''Mark the pool as terminated and clean the executor.'''
        util.debug('terminating pool')
        self._state = TERMINATE
        self._executor.clean()

    def join(self):
        '''Check that the pool was closed or terminated; nothing is joined.'''
        util.debug('joining pool')
        assert self._state in (CLOSE, TERMINATE)

    @staticmethod
    def _help_stuff_finish(inqueue, task_handler, size):
        # task_handler may be blocked trying to put items on inqueue
        util.debug('removing tasks from inqueue until task handler finished')
        while task_handler.is_alive() and inqueue._reader.poll():
            inqueue._reader.recv()
            time.sleep(0)

    @classmethod
    def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,
                        worker_handler, task_handler, result_handler, cache):
        # this is guaranteed to only be called once
        util.debug('finalizing pool')

        worker_handler._state = TERMINATE
        task_handler._state = TERMINATE

        util.debug('helping task handler/workers to finish')
        cls._help_stuff_finish(inqueue, task_handler, len(pool))

        assert result_handler.is_alive() or len(cache) == 0

        result_handler._state = TERMINATE
        outqueue.put(None)                  # sentinel

        # We must wait for the worker handler to exit before terminating
        # workers because we don't want workers to be restarted behind our back.
        util.debug('joining worker handler')
        if threading.current_thread() is not worker_handler:
            worker_handler.join()

        # Terminate workers which haven't already finished.
        if pool and hasattr(pool[0], 'terminate'):
            util.debug('terminating workers')
            for p in pool:
                if p.exitcode is None:
                    p.terminate()

        util.debug('joining task handler')
        if threading.current_thread() is not task_handler:
            task_handler.join()

        util.debug('joining result handler')
        if threading.current_thread() is not result_handler:
            result_handler.join()

        if pool and hasattr(pool[0], 'terminate'):
            util.debug('joining pool workers')
            for p in pool:
                if p.is_alive():
                    # worker has not yet exited
                    util.debug('cleaning up worker %d' % p.pid)
                    p.join()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.terminate()
"""
Simple Cloudbutton example using the map method of the raw API.

The map() call launches one map function for each entry in 'iterdata'.
The script then prints the results of all invocations, obtained through
cb_exc.get_result(), and cleans the executor.
"""
from cloudbutton.engine.executor import FunctionExecutor


def my_map_function(id, x):
    """Print the received activation id and return x plus 7."""
    # NOTE(review): 'id' is not present in 'iterdata'; presumably the
    # framework supplies it -- keep the parameter name as is.
    message = "I'm activation number {}".format(id)
    print(message)
    return x + 7


if __name__ == "__main__":
    iterdata = [1, 2, 3, 4]
    cb_exc = FunctionExecutor()
    cb_exc.map(my_map_function, iterdata)
    results = cb_exc.get_result()
    print(results)
    cb_exc.clean()
def my_map_function(x):
    """Return x plus 7."""
    return x + 7


def my_reduce_function(results):
    """Add up all map results, starting from 0, and return the total."""
    accumulated = 0
    for partial in results:
        accumulated += partial
    return accumulated


if __name__ == "__main__":
    # By default the reducer will be launched within a Cloud Function
    # when the local PyWren have all the results from the mappers.
    cb_exc = FunctionExecutor()
    cb_exc.map_reduce(my_map_function, iterdata, my_reduce_function)
    default_result = cb_exc.get_result()
    print(default_result)

    # Set 'reducer_wait_local=True' to wait for the results locally.
    cb_exc = FunctionExecutor()
    cb_exc.map_reduce(my_map_function, iterdata, my_reduce_function,
                      reducer_wait_local=True)
    local_wait_result = cb_exc.get_result()
    print(local_wait_result)
def pywren_return_futures_map_function1(x):
    """Launch a nested map over range(x) and return what map() returned.

    The nested call is not waited for; the value returned by ``map()`` is
    handed back to the caller as is.
    """
    def _increment(x):
        return x + 1

    nested_executor = FunctionExecutor()
    pending = nested_executor.map(_increment, range(x))
    return pending
"""
Simple Cloudbutton example using the call_async method of the raw API.

A single function invocation is launched with the value 3 and its result
is printed.
"""
from cloudbutton.engine.executor import FunctionExecutor


def my_function(x):
    """Return x plus 7."""
    return x + 7


if __name__ == '__main__':
    cb_exc = FunctionExecutor()
    cb_exc.call_async(my_function, 3)
    outcome = cb_exc.get_result()
    print(outcome)
def pywren_return_futures_map_function2(x):
    """Launch one nested async call with x + 5 and return what it returned.

    The nested call is not waited for; the value returned by
    ``call_async()`` is handed back to the caller as is.
    """
    def _increment(x):
        return x + 1

    nested_executor = FunctionExecutor()
    pending = nested_executor.call_async(_increment, x + 5)
    return pending
def test_call_async(self):
    # Checks call_async() with three kinds of input data: a plain string,
    # a list of values and a dict keyed by parameter name. Each case uses
    # its own executor.
    print('Testing call_async()...')
    scenarios = (
        (TestMethods.hello_world, "", "Hello World!"),
        (TestMethods.simple_map_function, [4, 6], 10),
        (TestMethods.simple_map_function, {'x': 2, 'y': 8}, 10),
    )
    for function, data, expected in scenarios:
        executor = FunctionExecutor(config=CONFIG)
        executor.call_async(function, data)
        outcome = executor.get_result()
        self.assertEqual(outcome, expected)
def test_internal_executions(self):
    # Exercises functions that create their own executor while running:
    # one returning nested results and three returning nested futures.
    print('Testing internal executions...')
    outer = FunctionExecutor(config=CONFIG)
    outer.map(TestMethods.pywren_inside_pywren_map_function, range(1, 11))
    nested_results = outer.get_result()
    self.assertEqual(nested_results, [list(range(i)) for i in range(1, 11)])

    # Functions that hand back futures: only completion is checked.
    futures_functions = (
        TestMethods.pywren_return_futures_map_function1,
        TestMethods.pywren_return_futures_map_function2,
    )
    for futures_function in futures_functions:
        executor = FunctionExecutor(config=CONFIG)
        executor.call_async(futures_function, 3)
        executor.get_result()

    # The last variant waits explicitly before collecting the result.
    executor = FunctionExecutor(config=CONFIG)
    executor.call_async(TestMethods.pywren_return_futures_map_function3, 3)
    executor.wait()
    executor.get_result()
def __init__(self, process_obj):
    """Create a FunctionExecutor and immediately launch `process_obj`.

    `returncode` starts as None; the launch itself is delegated to
    `self._launch()`.
    """
    # Flush the standard streams before anything is launched.
    util._flush_std_streams()
    self.returncode = None
    self._executor = FunctionExecutor()
    self._launch(process_obj)
def test_map(self):
    # Checks map() with several kinds of input: a list of lists (with the
    # default executor and with workers=1), a set, a range and a list of
    # dicts. Each case uses its own executor.
    print('Testing map()...')
    pairs = [[1, 1], [2, 2], [3, 3], [4, 4]]
    scenarios = (
        ({}, TestMethods.simple_map_function, pairs, [2, 4, 6, 8]),
        ({'workers': 1}, TestMethods.simple_map_function, pairs,
         [2, 4, 6, 8]),
        ({}, TestMethods.hello_world, set(range(2)), ['Hello World!'] * 2),
        ({}, TestMethods.hello_world, range(2), ['Hello World!'] * 2),
        ({}, TestMethods.simple_map_function,
         [{'x': 2, 'y': 8}, {'x': 2, 'y': 8}], [10, 10]),
    )
    for executor_options, function, data, expected in scenarios:
        executor = FunctionExecutor(config=CONFIG, **executor_options)
        executor.map(function, data)
        outcome = executor.get_result()
        self.assertEqual(outcome, expected)