def benchmark(workers, memory, loopcount, matn):
    """Run ``compute_flops`` on ``workers`` parallel functions and report timing.

    Every invocation receives the same ``(loopcount, matn)`` argument tuple.
    The measured wall-clock time spans the ``map`` call and the retrieval of
    all results.

    Returns:
        dict with the keys ``start_time``, ``total_time``, ``est_flops``,
        ``worker_stats`` (the ``stats`` attribute of every future) and
        ``results``.
    """
    task_args = [(loopcount, matn) for _ in range(workers)]
    executor = FunctionExecutor(runtime_memory=memory)

    started_at = time.time()
    futures = executor.map(compute_flops, task_args)
    results = executor.get_result()
    finished_at = time.time()

    stats_per_worker = [future.stats for future in futures]
    elapsed = finished_at - started_at
    print("Total time:", round(elapsed, 3))

    # Operation-count estimate used for the throughput figure.
    est_flops = workers * 2 * loopcount * matn**3
    gflops = est_flops / 1e9 / elapsed
    print('Estimated GFLOPS:', round(gflops, 4))

    return {
        'start_time': started_at,
        'total_time': elapsed,
        'est_flops': est_flops,
        'worker_stats': stats_per_worker,
        'results': results
    }
def __init__(self, processes=None, initializer=None, initargs=None,
             maxtasksperchild=None, context=None):
    """Create a pool backed by a Lithops ``FunctionExecutor``.

    Args:
        processes: Number of workers requested from the executor. Must be
            at least 1 when given; when omitted, the value is read from
            ``self._executor.invoker.workers``.
        initializer: Optional callable, stored on the instance.
        initargs: Arguments stored next to ``initializer``; ``None`` is
            replaced by ``()``.
        maxtasksperchild: Stored on the instance unchanged.
        context: Accepted but not used by this constructor.

    Raises:
        ValueError: If ``processes`` is given and lower than 1.
        TypeError: If ``initializer`` is given but is not callable.
    """
    # Validate everything up front so an invalid call fails before a
    # FunctionExecutor is instantiated.
    if processes is not None and processes < 1:
        raise ValueError("Number of processes must be at least 1")
    if initializer is not None and not callable(initializer):
        raise TypeError('initializer must be a callable')

    if initargs is None:
        initargs = ()

    self._taskqueue = queue.Queue()
    self._cache = {}
    self._state = RUN
    self._maxtasksperchild = maxtasksperchild
    self._initializer = initializer
    self._initargs = initargs
    self._remote_logger = None

    # Executor options come from the multiprocessing configuration.
    lithops_conf = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)

    if processes is not None:
        self._processes = processes
        self._executor = FunctionExecutor(workers=processes, **lithops_conf)
    else:
        self._executor = FunctionExecutor(**lithops_conf)
        self._processes = self._executor.invoker.workers
def __init__(self, group=None, target=None, name=None, args=None,
             kwargs=None, *, daemon=None):
    """Prepare a process object whose target is run through Lithops.

    Args:
        group: Must be ``None``; process grouping is not implemented.
        target: Callable to execute.
        name: Optional process name; defaults to ``<ClassName>-<identity>``.
        args: Positional arguments for ``target`` (stored as a tuple).
        kwargs: Keyword arguments for ``target`` (stored as a new dict).
        daemon: Optional daemon flag, applied through the ``daemon``
            attribute when not ``None``.
    """
    assert group is None, 'process grouping is not implemented'
    count = next(_process_counter)

    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}

    self._config = {}
    self._identity = count
    self._parent_pid = os.getpid()
    self._target = target
    self._args = tuple(args)
    self._kwargs = dict(kwargs)
    self._name = name or (type(self).__name__ + '-' + str(self._identity))

    # `_forked` has to exist before `daemon` is assigned: the daemon setter
    # of this class asserts on `self._forked`, so assigning it first would
    # raise AttributeError whenever a daemon value is passed in.
    self._forked = False
    if daemon is not None:
        self.daemon = daemon

    lithops_config = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)
    self._executor = FunctionExecutor(**lithops_config)
    self._sentinel = object()
    self._remote_logger = None
    self._redis = util.get_redis_client()
def read(backend, bucket_name, number, keylist_raw, read_times):
    """Read objects from ``bucket_name`` in parallel and collect throughput data.

    Args:
        backend: Passed to ``FunctionExecutor`` as ``backend``.
        bucket_name: Bucket the objects are read from.
        number: Number of read tasks; ``0`` means one task per entry of
            ``keylist_raw``, otherwise keys are reused cyclically.
        keylist_raw: Keys of the objects to read.
        read_times: How many times each task reads its object.

    Returns:
        dict with ``start_time``, ``total_time``, ``worker_stats`` and
        ``results`` (one dict per task, produced by ``read_object``).
    """
    blocksize = 1024 * 1024

    def read_object(key_name, storage):
        """Stream one object ``read_times`` times, hashing every chunk."""
        digest = hashlib.md5()
        bytes_read = 0
        print(key_name)

        started = time.time()
        for _ in range(read_times):
            stream = storage.get_object(bucket_name, key_name, stream=True)
            try:
                while True:
                    chunk = stream.read(blocksize)
                    if len(chunk) <= 0:
                        break
                    bytes_read += len(chunk)
                    digest.update(chunk)
            except Exception as e:
                # Best effort: report the failure and go on with the next pass.
                print(e)
        finished = time.time()

        mb_rate = bytes_read / (finished - started) / 1e6
        print('MB Rate: ' + str(mb_rate))

        return {
            'start_time': started,
            'end_time': finished,
            'mb_rate': mb_rate,
            'bytes_read': bytes_read
        }

    if number != 0:
        keynames = [keylist_raw[idx % len(keylist_raw)] for idx in range(number)]
    else:
        keynames = keylist_raw

    executor = FunctionExecutor(backend=backend, runtime_memory=1024)

    t_begin = time.time()
    futures = executor.map(read_object, keynames)
    results = executor.get_result()
    t_end = time.time()

    return {
        'start_time': t_begin,
        'total_time': t_end - t_begin,
        'worker_stats': [future.stats for future in futures],
        'results': results
    }
def __init__(self, processes=None, initializer=None, initargs=(),
             maxtasksperchild=None, context=None):
    """Create a pool backed by a Lithops ``FunctionExecutor``.

    Args:
        processes: Number of workers requested from the executor. Must be
            at least 1 when given; when omitted, the value is read from
            ``self._executor.invoker.workers``.
        initializer: Optional callable, stored on the instance.
        initargs: When truthy, unpacked as keyword arguments into the
            ``FunctionExecutor`` constructor; also stored on the instance.
        maxtasksperchild: Stored on the instance unchanged.
        context: Context object; ``get_context()`` is used when falsy.

    Raises:
        ValueError: If ``processes`` is given and lower than 1.
        TypeError: If ``initializer`` is given but is not callable.
    """
    # Validate everything up front so an invalid call fails before a
    # FunctionExecutor is instantiated.
    if processes is not None and processes < 1:
        raise ValueError("Number of processes must be at least 1")
    if initializer is not None and not callable(initializer):
        raise TypeError('initializer must be a callable')

    self._ctx = context or get_context()
    # Note: the inter-process queues (_setup_queues) are not created here.
    self._taskqueue = queue.Queue()
    self._cache = {}
    self._state = RUN
    self._maxtasksperchild = maxtasksperchild
    self._initializer = initializer
    self._initargs = initargs

    # A falsy `initargs` (such as the default `()`) means "no extra options".
    executor_options = self._initargs if self._initargs else {}

    if processes is not None:
        self._executor = FunctionExecutor(workers=processes, **executor_options)
        self._processes = processes
    else:
        self._executor = FunctionExecutor(**executor_options)
        self._processes = self._executor.invoker.workers

    self._pool = []
def get_conn(self, lithops_executor_config):
    """
    Initializes Lithops executor.

    :param lithops_executor_config: keyword arguments for ``FunctionExecutor``.
        The mapping is copied, so the caller's object is left unmodified.
    :return: a ``FunctionExecutor`` built from those arguments with
        ``log_level`` forced to ``'DEBUG'`` and ``config`` set to
        ``self.lithops_config``.
    """
    # Work on a copy: the forced entries must not leak into the caller's dict.
    executor_kwargs = dict(lithops_executor_config)
    executor_kwargs['log_level'] = 'DEBUG'
    executor_kwargs['config'] = self.lithops_config

    return FunctionExecutor(**executor_kwargs)
def validate_command(prefix, image):
    """Validate every partition under ``prefix`` and then the summary object.

    Runs ``validate_records`` once per key listed under ``prefix + '/'``.
    On the first unsuccessful partition the failure and its ``stderr`` are
    printed and the function returns. Otherwise ``validate_summaries`` is run
    on ``prefix + summary_postfix`` and its outcome is printed.
    """
    storage_client = Storage()

    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        key_list = storage_client.list_keys(bucket, prefix + '/')

        record_futures = fexec.map(validate_records,
                                   key_list,
                                   extra_args=[bucket, prefix],
                                   include_modules=['util'])
        record_results = fexec.get_result(fs=record_futures)

        # Stop at the first partition that did not validate.
        for position, outcome in enumerate(record_results):
            if outcome['success']:
                continue
            print(f'Failed to validate partition: {key_list[position]}')
            print(outcome['stderr'])
            return

        summary_futures = fexec.map(validate_summaries,
                                    [prefix + summary_postfix],
                                    extra_args=[bucket],
                                    include_modules=['util'])
        summary_results = fexec.get_result(fs=summary_futures)

        # An empty string from the summary check means nothing to report.
        if summary_results[0] != '':
            print(summary_results)
        else:
            print('Success!')
def write(backend, bucket_name, mb_per_file, number, key_prefix):
    """Write ``number`` generated objects to ``bucket_name`` in parallel.

    Args:
        backend: Passed to ``FunctionExecutor`` as ``backend``.
        bucket_name: Destination bucket.
        mb_per_file: Object size; multiplied by ``1024**2`` to get bytes.
        number: How many objects to write.
        key_prefix: Prefix of every generated key.

    Returns:
        dict with ``start_time``, ``total_time``, ``worker_stats``,
        ``bucket_name``, ``keynames`` and ``results``.
    """

    def write_object(key_name, storage):
        """Upload one generated object and report the achieved rate."""
        bytes_n = mb_per_file * 1024**2
        payload = RandomDataGenerator(bytes_n)
        print(key_name)

        started = time.time()
        storage.put_object(bucket_name, key_name, payload)
        finished = time.time()

        mb_rate = bytes_n / (finished - started) / 1e6
        print('MB Rate: ' + str(mb_rate))

        return {
            'start_time': started,
            'end_time': finished,
            'mb_rate': mb_rate
        }

    # One random, upper-case hexadecimal key per object.
    keynames = []
    for _ in range(number):
        keynames.append(key_prefix + str(uuid.uuid4().hex.upper()))

    executor = FunctionExecutor(backend=backend, runtime_memory=1024)

    t_begin = time.time()
    futures = executor.map(write_object, keynames)
    results = executor.get_result()
    t_end = time.time()

    return {
        'start_time': t_begin,
        'total_time': t_end - t_begin,
        'worker_stats': [future.stats for future in futures],
        'bucket_name': bucket_name,
        'keynames': keynames,
        'results': results
    }
class Popen(object):
    """Process launcher that runs the target through a ``FunctionExecutor``.

    ``self.sentinel`` holds the object returned by ``call_async`` and is used
    to wait for, inspect and cancel the remote call.
    """
    method = 'cloud'

    def __init__(self, process_obj):
        """Flush the standard streams, create an executor and launch the target."""
        util._flush_std_streams()
        self.returncode = None
        self._executor = FunctionExecutor()
        self._launch(process_obj)

    def duplicate_for_child(self, fd):
        """Return ``fd`` unchanged."""
        return fd

    def poll(self, flag=ALWAYS):
        """Update and return ``returncode``.

        While no return code is known, the executor is asked to wait on the
        sentinel with ``return_when=flag``. The code becomes 0 when the
        sentinel reports ``ready`` or ``done`` and 1 when it reports
        ``error`` (the error check wins because it is evaluated last).
        """
        if self.returncode is None:
            self._executor.wait([self.sentinel], return_when=flag)
            if self.sentinel.ready or self.sentinel.done:
                self.returncode = 0
            if self.sentinel.error:
                self.returncode = 1
        return self.returncode

    def wait(self, timeout=None):
        """Wait for the remote call and return its return code.

        Returns ``None`` when the executor's ``wait`` gives back a falsy value.
        """
        if self.returncode is None:
            wait = self._executor.wait
            # NOTE(review): relies on executor.wait() returning something
            # falsy when nothing finished in time -- confirm against the
            # executor API.
            if not wait([self.sentinel], timeout=timeout):
                return None
            # This shouldn't block if wait() returned successfully.
            return self.poll(ALWAYS if timeout == 0.0 else ALL_COMPLETED)
        return self.returncode

    def terminate(self):
        """Cancel the remote call if it has not produced a return code yet."""
        if self.returncode is None:
            try:
                self.sentinel.cancel()
            except NotImplementedError:
                # Cancelling is optional; ignore when it is not supported.
                pass

    def _launch(self, process_obj):
        """Submit ``process_obj._target`` with its arguments to the executor.

        Without keyword arguments the positional arguments are sent as a
        list. With keyword arguments everything is bound to the target's
        parameter names and sent as a ``{name: value}`` dict. Unpacking the
        kwargs dict into a list would only send its *keys*, as positional
        values.

        Raises:
            TypeError: If the arguments do not match the target's signature
                (only checked when keyword arguments are present).
        """
        positional = tuple(process_obj._args)
        keywords = dict(process_obj._kwargs)

        if keywords:
            import inspect
            # NOTE(review): assumes call_async accepts a dict keyed by
            # parameter name for keyword-style invocation -- confirm.
            bound = inspect.signature(process_obj._target).bind(
                *positional, **keywords)
            fn_args = dict(bound.arguments)
        else:
            fn_args = list(positional)

        self.sentinel = self._executor.call_async(process_obj._target, fn_args)
class FuturesList(list):
    """List of futures that can chain further calls and wait on itself.

    The ``executor`` and ``config`` attributes are not assigned by this
    class; whoever builds the list may set them. When no executor is
    attached, one is created on demand from ``config``.
    """

    def _create_executor(self):
        """Attach a ``FunctionExecutor`` built from ``config`` if none is set."""
        # getattr(): neither attribute is guaranteed to exist on a list that
        # was built without them, and a missing one must not raise here.
        if not getattr(self, 'executor', None):
            from lithops import FunctionExecutor
            self.executor = FunctionExecutor(
                config=getattr(self, 'config', None))

    def _extend_futures(self, fs):
        """Replace the current futures by ``fs`` while remembering all of them.

        The futures currently in the list get ``_produce_output = False``.
        ``alt_list`` accumulates every future ever held (old and new), and
        the list itself ends up containing only ``fs``.
        """
        # Materialize once: `fs` is consumed twice below, which would lose
        # the items of a one-shot iterator.
        fs = list(fs)

        for fut in self:
            fut._produce_output = False

        if not hasattr(self, 'alt_list'):
            self.alt_list = []
            self.alt_list.extend(self)
        self.alt_list.extend(fs)

        self.clear()
        self.extend(fs)

    def map(self, map_function, sync=False, **kwargs):
        """Run ``map_function`` over the current futures; returns ``self``.

        With ``sync=True`` the current futures are waited for first.
        """
        self._create_executor()
        if sync:
            self.executor.wait(self)
        fs = self.executor.map(map_function, self, **kwargs)
        self._extend_futures(fs)
        return self

    def map_reduce(self, map_function, reduce_function, sync=False, **kwargs):
        """Run a map-reduce over the current futures; returns ``self``.

        With ``sync=True`` the current futures are waited for first.
        """
        self._create_executor()
        if sync:
            self.executor.wait(self)
        fs = self.executor.map_reduce(map_function, self, reduce_function, **kwargs)
        self._extend_futures(fs)
        return self

    def wait(self, **kwargs):
        """Wait on all tracked futures (``alt_list`` when present, else ``self``)."""
        self._create_executor()
        fs_tt = self.alt_list if hasattr(self, 'alt_list') else self
        return self.executor.wait(fs_tt, **kwargs)

    def get_result(self, **kwargs):
        """Get the results of all tracked futures via the executor."""
        self._create_executor()
        fs_tt = self.alt_list if hasattr(self, 'alt_list') else self
        return self.executor.get_result(fs_tt, **kwargs)

    def __reduce__(self):
        """Drop the executor reference, then reduce as usual.

        The executor is reset on this very object, so it is not part of the
        reduced state; it is re-created on demand afterwards.
        """
        self.executor = None
        return super().__reduce__()
def generate_command(number, prefix, partitions, image):
    """Generate ``partitions`` partitions and verify what was uploaded.

    ``generate_records`` is run once per partition index. Afterwards the
    objects under ``prefix + '/'`` are listed and checked: there must be
    exactly ``partitions`` of them, each of size ``record_size * number``.

    Raises:
        AssertionError: If the number of objects or any object size differs
            from the expected value.
    """
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        futures = fexec.map(generate_records,
                            range(partitions),
                            extra_args=[number, prefix],
                            include_modules=['util'])
        # Block until every partition has been generated.
        fexec.get_result(fs=futures)

    partition_size = record_size * number

    # Check if all files have been uploaded
    partition_list = Storage().list_objects(bucket, prefix + '/')
    assert len(partition_list) == partitions, \
        f'partition_list: {len(partition_list)}; partitions: {partitions}'

    for info in partition_list:
        assert info['Size'] == partition_size, \
            f'partition size: {partition_size} \ninfo: {info}'

    print('Done!')
def _create_executor(self):
    """Attach a ``FunctionExecutor`` built from ``config`` if none is set.

    Missing ``executor``/``config`` attributes are treated as "not set"
    instead of raising ``AttributeError``.
    """
    if getattr(self, 'executor', None):
        return

    # Imported here rather than at module level, as in the original code.
    from lithops import FunctionExecutor
    self.executor = FunctionExecutor(config=getattr(self, 'config', None))
class CloudProcess:
    """Process-like object whose target is executed through a Lithops executor."""

    def __init__(self, group=None, target=None, name=None, args=None,
                 kwargs=None, *, daemon=None):
        """Prepare the process; nothing is executed until ``start()``.

        Args:
            group: Must be ``None``; process grouping is not implemented.
            target: Callable to execute.
            name: Optional name; defaults to ``<ClassName>-<identity>``.
            args: Positional arguments for ``target`` (stored as a tuple).
            kwargs: Keyword arguments for ``target`` (stored as a new dict).
            daemon: Optional daemon flag, applied when not ``None``.
        """
        assert group is None, 'process grouping is not implemented'
        count = next(_process_counter)

        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}

        self._config = {}
        self._identity = count
        self._parent_pid = os.getpid()
        self._target = target
        self._args = tuple(args)
        self._kwargs = dict(kwargs)
        self._name = name or (type(self).__name__ + '-' + str(self._identity))

        # `_forked` has to exist before `daemon` is assigned: the daemon
        # setter asserts on it, so the previous ordering raised
        # AttributeError whenever a daemon value was passed in.
        self._forked = False
        if daemon is not None:
            self.daemon = daemon

        lithops_config = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)
        self._executor = FunctionExecutor(**lithops_config)
        self._sentinel = object()
        self._remote_logger = None
        self._redis = util.get_redis_client()

    def run(self):
        """
        Method to be run in sub-process; can be overridden in sub-class
        """
        if self._target:
            self._target(*self._args, **self._kwargs)

    def start(self):
        """
        Start child process

        The target is wrapped in a ``CloudWorker`` and submitted with
        ``call_async``; ``run()`` is not involved. Afterwards the target and
        its arguments are deleted from the instance.
        """
        assert not self._forked, 'cannot start a process twice'
        assert self._parent_pid == os.getpid(), \
            'can only start a process object created by current process'

        cloud_worker = CloudWorker(self._target)

        if mp_config.get_parameter(mp_config.STREAM_STDOUT):
            # The executor id doubles as the name of the log stream.
            stream = self._executor.executor_id
            logger.debug('Log streaming enabled, stream name: {}'.format(stream))
            self._remote_logger = util.RemoteLoggingFeed(stream)
            self._remote_logger.start()
            cloud_worker.log_stream = stream

        extra_env = mp_config.get_parameter(mp_config.ENV_VARS)

        self._executor.call_async(cloud_worker,
                                  {'args': self._args, 'kwargs': self._kwargs},
                                  extra_env=extra_env)
        del self._target, self._args, self._kwargs
        self._forked = True

    def terminate(self):
        """
        Terminate process; sends SIGTERM signal or uses TerminateProcess()
        """
        raise NotImplementedError()

    def join(self, timeout=None):
        """
        Wait until child process terminates

        ``timeout`` is accepted but not used. Any exception raised while
        waiting is propagated after the remote logger (if any) was stopped.
        """
        assert self._parent_pid == os.getpid(), 'can only join a child process'
        assert self._forked, 'can only join a started process'

        try:
            self._executor.wait()
        finally:
            # Stop the log feed whether or not waiting succeeded.
            if self._remote_logger:
                self._remote_logger.stop()

    def is_alive(self):
        """
        Return whether process is alive
        """
        raise NotImplementedError()

    @property
    def name(self):
        """Name of the process."""
        return self._name

    @name.setter
    def name(self, name):
        assert isinstance(name, str), 'name must be a string'
        self._name = name

    @property
    def daemon(self):
        """
        Return whether process is a daemon
        """
        return self._config.get('daemon', False)

    @daemon.setter
    def daemon(self, daemonic):
        """
        Set whether process is a daemon
        """
        assert not self._forked, 'process has already started'
        self._config['daemon'] = daemonic

    @property
    def authkey(self):
        """Authorization key; raises ``KeyError`` when none was set."""
        return self._config['authkey']

    @authkey.setter
    def authkey(self, authkey):
        """
        Set authorization key of process
        """
        self._config['authkey'] = authkey

    @property
    def exitcode(self):
        """
        Return exit code of process or `None` if it has yet to stop
        """
        raise NotImplementedError()

    @property
    def ident(self):
        """
        Return identifier (PID) of process or `None` if it has yet to start
        """
        raise NotImplementedError()

    pid = ident

    @property
    def sentinel(self):
        """
        Return a file descriptor (Unix) or handle (Windows) suitable for
        waiting for process termination.
        """
        try:
            return self._sentinel
        except AttributeError:
            raise ValueError("process not started")
BUCKET_NAME = 'lithops-sample-data'  # change-me


def my_function(obj_id, storage):
    """Print the cloud object reference and return its content as text."""
    print(obj_id)
    raw = storage.get_cloudobject(obj_id)
    return raw.decode()


if __name__ == '__main__':
    # Round 1: upload through a standalone Storage client.
    obj_key = 'cloudobject1.txt'
    storage = Storage()
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec = FunctionExecutor()
    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())

    # Round 2: same flow, but using the storage client owned by the executor.
    obj_key = 'cloudobject2.txt'
    storage = fexec.storage
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())
def delete_temp_data(bucket_name, keynames):
    """Delete ``keynames`` from ``bucket_name`` using an executor's storage client."""
    executor = FunctionExecutor(runtime_memory=1024)
    print('Deleting temp files...')
    storage_client = executor.storage
    storage_client.delete_objects(bucket_name, keynames)
    print('Done!')
class Pool(object):
    """
    Class which supports an async version of applying functions to arguments.

    Work submitted through `apply_async()` and `_map_async()` is handed to the
    `FunctionExecutor` stored in `self._executor`. The queue/thread based
    helpers (`_handle_workers`, `_handle_tasks`, `_handle_results`,
    `_terminate_pool`, ...) are defined here as well, but nothing in this
    class starts them.
    """
    _wrap_exception = True

    def Process(self, *args, **kwds):
        """ Create a process object through the pool's context (`self._ctx`). """
        return self._ctx.Process(*args, **kwds)

    def __init__(self, processes=None, initializer=None, initargs=(), maxtasksperchild=None, context=None):
        """
        Create the pool and its `FunctionExecutor`.

        `processes`, when given, must be >= 1 and is passed to the executor as
        `workers`; otherwise `self._processes` is read from
        `self._executor.invoker.workers`. A truthy `initargs` is unpacked as
        keyword arguments into the `FunctionExecutor` constructor.

        Raises ValueError for `processes < 1` and TypeError for a
        non-callable `initializer`.
        """
        self._ctx = context or get_context()
        # Queue setup is disabled, so `_inqueue`/`_outqueue` are not created here.
        # self._setup_queues()
        self._taskqueue = queue.Queue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        if processes is not None and processes < 1:
            raise ValueError("Number of processes must be at least 1")

        if processes is not None:
            if self._initargs:
                self._executor = FunctionExecutor(workers=processes, **self._initargs)
            else:
                self._executor = FunctionExecutor(workers=processes)
            self._processes = processes
        else:
            if self._initargs:
                self._executor = FunctionExecutor(**self._initargs)
            else:
                self._executor = FunctionExecutor()
            self._processes = self._executor.invoker.workers

        # NOTE(review): this check runs only after the executor was created;
        # consider validating `initializer` before instantiating it.
        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        self._pool = []

    def _join_exited_workers(self):
        """
        Cleanup after any worker processes which have exited due to reaching
        their specified lifetime. Returns True if any workers were cleaned up.
        """
        cleaned = False
        # Walk backwards so deleting an entry does not shift pending indices.
        for i in reversed(range(len(self._pool))):
            worker = self._pool[i]
            if worker.exitcode is not None:
                # worker exited
                util.debug('cleaning up worker %d' % i)
                worker.join()
                cleaned = True
                del self._pool[i]
        return cleaned

    def _repopulate_pool(self):
        """
        Bring the number of pool processes up to the specified number, for use
        after reaping workers which have exited.
        """
        # NOTE(review): uses `self._inqueue`/`self._outqueue`, which only
        # `_setup_queues()` assigns, and that call is commented out in
        # `__init__`. `worker` must be defined at module level (not visible here).
        for i in range(self._processes - len(self._pool)):
            w = self.Process(target=worker, args=(self._inqueue, self._outqueue, self._initializer, self._initargs, self._maxtasksperchild, self._wrap_exception) )
            self._pool.append(w)
            w.name = w.name.replace('Process', 'PoolWorker')
            w.daemon = True
            w.start()
            util.debug('added worker')

    def _maintain_pool(self):
        """
        Clean up any exited workers and start replacements for them.
        """
        if self._join_exited_workers():
            self._repopulate_pool()

    def _setup_queues(self):
        """ Create the in/out `SimpleQueue`s and shortcuts to their send/recv. """
        self._inqueue = self._ctx.SimpleQueue()
        self._outqueue = self._ctx.SimpleQueue()
        self._quick_put = self._inqueue._writer.send
        self._quick_get = self._outqueue._reader.recv

    def apply(self, func, args=(), kwds={}):
        """
        Equivalent of `func(*args, **kwds)`.
        """
        assert self._state == RUN
        # With keyword arguments only, `args` becomes an empty dict.
        # NOTE(review): `apply_async()` does not forward `kwds` to the
        # executor, so keyword arguments appear to be dropped -- confirm.
        if kwds and not args:
            args = {}
        return self.apply_async(func, args, kwds).get()

    def map(self, func, iterable, chunksize=None):
        """
        Apply `func` to each element in `iterable`, collecting the results
        in a list that is returned.
        """
        return self._map_async(func, iterable, mapstar, chunksize).get()

    def starmap(self, func, iterable, chunksize=None):
        """
        Like `map()` method but the elements of the `iterable` are expected to
        be iterables as well and will be unpacked as arguments. Hence
        `func` and (a, b) becomes func(a, b).
        """
        return self._map_async(func, iterable, starmapstar, chunksize).get()

    def starmap_async(self, func, iterable, chunksize=None, callback=None, error_callback=None):
        """
        Asynchronous version of `starmap()` method.
        """
        return self._map_async(func, iterable, starmapstar, chunksize, callback, error_callback)

    def _guarded_task_generation(self, result_job, func, iterable):
        """
        Provides a generator of tasks for imap and imap_unordered with
        appropriate handling for iterables which throw exceptions during
        iteration.
        """
        try:
            # -1 so that `i + 1` is 0 when the very first item fails.
            i = -1
            for i, x in enumerate(iterable):
                yield (result_job, i, func, (x,), {})
        except Exception as e:
            # Emit a task that re-raises the iteration error.
            yield (result_job, i + 1, _helper_reraises_exception, (e,), {})

    def imap(self, func, iterable, chunksize=1):
        """
        Equivalent of `map()` -- can be MUCH slower than `Pool.map()`.
        """
        # NOTE(review): tasks are only put on `self._taskqueue`; no code in
        # this class is seen consuming that queue -- confirm imap is functional.
        if self._state != RUN:
            raise ValueError("Pool not running")
        if chunksize == 1:
            result = IMapIterator(self._cache)
            self._taskqueue.put( ( self._guarded_task_generation(result._job, func, iterable), result._set_length ))
            return result
        else:
            assert chunksize > 1
            task_batches = Pool._get_tasks(func, iterable, chunksize)
            result = IMapIterator(self._cache)
            self._taskqueue.put( ( self._guarded_task_generation(result._job, mapstar, task_batches), result._set_length ))
            # Flatten the per-chunk results back into single items.
            return (item for chunk in result for item in chunk)

    def imap_unordered(self, func, iterable, chunksize=1):
        """
        Like `imap()` method but ordering of results is arbitrary.
        """
        if self._state != RUN:
            raise ValueError("Pool not running")
        if chunksize == 1:
            result = IMapUnorderedIterator(self._cache)
            self._taskqueue.put( ( self._guarded_task_generation(result._job, func, iterable), result._set_length ))
            return result
        else:
            assert chunksize > 1
            task_batches = Pool._get_tasks(func, iterable, chunksize)
            result = IMapUnorderedIterator(self._cache)
            self._taskqueue.put( ( self._guarded_task_generation(result._job, mapstar, task_batches), result._set_length ))
            # Flatten the per-chunk results back into single items.
            return (item for chunk in result for item in chunk)

    def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
        """
        Asynchronous version of `apply()` method.

        Only `args` is sent to the executor (as `data`); `kwds` is not used.
        """
        if self._state != RUN:
            raise ValueError("Pool not running")

        futures = self._executor.call_async(func, data=args)

        result = ApplyResult(self._executor, [futures], callback, error_callback)

        return result

    def map_async(self, func, iterable, chunksize=None, callback=None, error_callback=None):
        """
        Asynchronous version of `map()` method.
        """
        return self._map_async(func, iterable, mapstar, chunksize, callback, error_callback)

    def _map_async(self, func, iterable, mapper, chunksize=None, callback=None, error_callback=None):
        """
        Helper function to implement map, starmap and their async counterparts.

        `mapper` and `chunksize` are accepted but not used: the iterable is
        passed to the executor's `map()` as is.
        """
        if self._state != RUN:
            raise ValueError("Pool not running")
        # Materialize iterables without a length (e.g. generators).
        if not hasattr(iterable, '__len__'):
            iterable = list(iterable)

        futures = self._executor.map(func, iterable)

        result = MapResult(self._executor, futures, callback, error_callback)

        return result

    @staticmethod
    def _handle_workers(pool):
        """ Thread body: keep the pool populated, then post a sentinel to the task queue. """
        thread = threading.current_thread()

        # Keep maintaining workers until the cache gets drained, unless the pool
        # is terminated.
        while thread._state == RUN or (pool._cache and thread._state != TERMINATE):
            pool._maintain_pool()
            time.sleep(0.1)
        # send sentinel to stop workers
        pool._taskqueue.put(None)
        util.debug('worker handler exiting')

    @staticmethod
    def _handle_tasks(taskqueue, put, outqueue, pool, cache):
        """
        Thread body: forward task sequences from `taskqueue` through `put`
        until a `None` sentinel is read, then send sentinels to the result
        handler (`outqueue`) and once per entry of `pool`.
        """
        thread = threading.current_thread()

        for taskseq, set_length in iter(taskqueue.get, None):
            task = None
            try:
                # iterating taskseq cannot fail
                for task in taskseq:
                    # A truthy state means the thread is no longer in RUN.
                    if thread._state:
                        util.debug('task handler found thread._state != RUN')
                        break
                    try:
                        put(task)
                    except Exception as e:
                        # Record the failure as the result of this task.
                        job, idx = task[:2]
                        try:
                            cache[job]._set(idx, (False, e))
                        except KeyError:
                            pass
                else:
                    # Sequence exhausted without a break: report its length.
                    if set_length:
                        util.debug('doing set_length()')
                        idx = task[1] if task else -1
                        set_length(idx + 1)
                    continue
                break
            finally:
                # Drop references so the objects can be collected.
                task = taskseq = job = None
        else:
            util.debug('task handler got sentinel')

        try:
            # tell result handler to finish when cache is empty
            util.debug('task handler sending sentinel to result handler')
            outqueue.put(None)

            # tell workers there is no more work
            util.debug('task handler sending sentinel to workers')
            for p in pool:
                put(None)
        except OSError:
            util.debug('task handler got OSError when sending sentinels')

        util.debug('task handler exiting')

    @staticmethod
    def _handle_results(outqueue, get, cache):
        """
        Thread body: read `(job, i, obj)` results through `get` and store them
        in `cache[job]` until a sentinel or termination, then drain what is
        left while the cache is not empty.
        """
        thread = threading.current_thread()

        while 1:
            try:
                task = get()
            except (OSError, EOFError):
                util.debug('result handler got EOFError/OSError -- exiting')
                return

            if thread._state:
                assert thread._state == TERMINATE
                util.debug('result handler found thread._state=TERMINATE')
                break

            if task is None:
                util.debug('result handler got sentinel')
                break

            job, i, obj = task
            try:
                cache[job]._set(i, obj)
            except KeyError:
                # The job is no longer in the cache; ignore its result.
                pass
            task = job = obj = None

        # Keep collecting results for jobs that are still pending.
        while cache and thread._state != TERMINATE:
            try:
                task = get()
            except (OSError, EOFError):
                util.debug('result handler got EOFError/OSError -- exiting')
                return

            if task is None:
                util.debug('result handler ignoring extra sentinel')
                continue
            job, i, obj = task
            try:
                cache[job]._set(i, obj)
            except KeyError:
                pass
            task = job = obj = None

        if hasattr(outqueue, '_reader'):
            util.debug('ensuring that outqueue is not full')
            # If we don't make room available in outqueue then
            # attempts to add the sentinel (None) to outqueue may
            # block. There is guaranteed to be no more than 2 sentinels.
            try:
                for i in range(10):
                    if not outqueue._reader.poll():
                        break
                    get()
            except (OSError, EOFError):
                pass

        util.debug('result handler exiting: len(cache)=%s, thread._state=%s', len(cache), thread._state)

    @staticmethod
    def _get_tasks(func, it, size):
        """ Yield `(func, chunk)` pairs where `chunk` is a tuple of up to `size` items of `it`. """
        it = iter(it)
        while 1:
            x = tuple(itertools.islice(it, size))
            if not x:
                return
            yield (func, x)

    def __reduce__(self):
        """ Always raises NotImplementedError. """
        raise NotImplementedError( 'pool objects cannot be passed between processes or pickled' )

    def close(self):
        """ Move the pool from RUN to CLOSE; other states are left unchanged. """
        util.debug('closing pool')
        if self._state == RUN:
            self._state = CLOSE
            # self._worker_handler._state = CLOSE

    def terminate(self):
        """ Mark the pool as terminated and call `clean()` on the executor. """
        util.debug('terminating pool')
        self._state = TERMINATE
        # self._worker_handler._state = TERMINATE
        # self._terminate()
        self._executor.clean()

    def join(self):
        """ Only checks that the pool was closed or terminated; nothing is waited for. """
        util.debug('joining pool')
        assert self._state in (CLOSE, TERMINATE)
        # self._worker_handler.join()
        # self._task_handler.join()
        # self._result_handler.join()
        # for p in self._pool:
        #     p.join()

    @staticmethod
    def _help_stuff_finish(inqueue, task_handler, size):
        """ Drain `inqueue` while the task handler thread is alive; `size` is unused. """
        # task_handler may be blocked trying to put items on inqueue
        util.debug('removing tasks from inqueue until task handler finished')
        while task_handler.is_alive() and inqueue._reader.poll():
            inqueue._reader.recv()
            time.sleep(0)

    @classmethod
    def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool, worker_handler, task_handler, result_handler, cache):
        """
        Flag the three handler threads as terminated, join them, and terminate
        and join the worker processes that are still running.
        """
        # this is guaranteed to only be called once
        util.debug('finalizing pool')

        worker_handler._state = TERMINATE
        task_handler._state = TERMINATE

        util.debug('helping task handler/workers to finish')
        cls._help_stuff_finish(inqueue, task_handler, len(pool))

        assert result_handler.is_alive() or len(cache) == 0

        result_handler._state = TERMINATE
        outqueue.put(None)  # sentinel

        # We must wait for the worker handler to exit before terminating
        # workers because we don't want workers to be restarted behind our back.
        util.debug('joining worker handler')
        if threading.current_thread() is not worker_handler:
            worker_handler.join()

        # Terminate workers which haven't already finished.
        if pool and hasattr(pool[0], 'terminate'):
            util.debug('terminating workers')
            for p in pool:
                if p.exitcode is None:
                    p.terminate()

        util.debug('joining task handler')
        if threading.current_thread() is not task_handler:
            task_handler.join()

        util.debug('joining result handler')
        if threading.current_thread() is not result_handler:
            result_handler.join()

        if pool and hasattr(pool[0], 'terminate'):
            util.debug('joining pool workers')
            for p in pool:
                if p.is_alive():
                    # worker has not yet exited
                    util.debug('cleaning up worker %d' % p.pid)
                    p.join()

    def __enter__(self):
        """ Return the pool itself. """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """ Terminate the pool when the `with` block is left. """
        self.terminate()
def sort_command(input_prefix, output_prefix, max_parallelism, image):
    """Sort the objects under ``input_prefix`` into ``output_prefix``.

    The work is done in ``num_shuffles`` rounds of ``radix_sort_by_byte``
    (as planned by ``make_plan``), followed by one ``sort_category`` pass.
    After each round the produced keys are grouped by their category stack
    and split into partitions for the next round.

    Raises:
        AssertionError: If the total size of the output objects differs from
            the total size of the input objects.
    """
    storage_client = Storage()

    with FunctionExecutor(runtime=image, workers=max_parallelism) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']

        input_info_list = storage_client.list_objects(bucket, input_prefix + '/')
        input_size = sum(info['Size'] for info in input_info_list)
        num_shuffles, last_values_per_category = make_plan(input_size)

        current_values_per_category = 1
        # Initially every input object forms its own single-key work item.
        current_keys_list = []
        for key_name in storage_client.list_keys(bucket, input_prefix + '/'):
            current_keys_list.append({
                'keys_list': [key_name],
                'prefix': input_prefix + '-intermediate0',
                'category_stack': []
            })

        for current_shuffle in range(num_shuffles):
            # Change values per category of last shuffle
            if current_shuffle == num_shuffles - 1:
                current_values_per_category = last_values_per_category

            radix_sort_futures = fexec.map(
                radix_sort_by_byte,
                current_keys_list,
                extra_args={'values_per_category': current_values_per_category},
                include_modules=['util'])
            radix_sort_results = fexec.get_result(fs=radix_sort_futures)

            # Group the produced keys by the full category path they belong to.
            categories_keys_lists = {}
            for res in radix_sort_results:
                parent_stack = res['category_stack']
                for key_name in res['keys_list']:
                    # The category id is the third path component from the end.
                    category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])
                    stack_str = '/'.join(
                        str(x) for x in parent_stack + [category_id])
                    categories_keys_lists.setdefault(stack_str, []).append(key_name)

            # Partition category lists
            # Attach prefix metadata so that sorter knows what to name files
            each_category_size = input_size / (
                (256 / current_values_per_category) * (current_shuffle + 1))
            num_partitions_per_category = math.ceil(
                each_category_size / buffer_size_to_categorize)

            next_prefix = f'{input_prefix}-intermediate{str(current_shuffle + 1)}'
            current_keys_list = []
            for category_stack_str, cat_keys_list in categories_keys_lists.items():
                for sub_list in np.array_split(cat_keys_list,
                                               num_partitions_per_category):
                    current_keys_list.append({
                        'keys_list': sub_list,
                        'prefix': next_prefix,
                        'category_stack': [int(x) for x in category_stack_str.split('/')]
                    })

        consider_last_byte_sorted = last_values_per_category == 1

        # The final pass writes straight to the output prefix.
        for entry in current_keys_list:
            entry['prefix'] = output_prefix

        sorted_keys_list = sorted(current_keys_list,
                                  key=lambda item: item['category_stack'])
        sort_category_futures = fexec.map(
            sort_category,
            sorted_keys_list,
            extra_args={'consider_last_byte_sorted': consider_last_byte_sorted},
            include_modules=['util'])
        # Block until the final pass has completed.
        fexec.get_result(fs=sort_category_futures)

    # Check if size of output matches size of input
    output_info_list = storage_client.list_objects(bucket, output_prefix)
    output_size = sum(info['Size'] for info in output_info_list)
    assert input_size == output_size, f'input size: {input_size}, output_size: {output_size}'

    print('Done!')
def __init__(self, process_obj):
    """Flush the standard streams, create an executor and launch ``process_obj``."""
    util._flush_std_streams()
    self.returncode = None  # no result known yet
    executor = FunctionExecutor()
    self._executor = executor
    self._launch(process_obj)
class CloudProcess:
    """Process-like object whose target is executed through a Lithops executor."""

    def __init__(self, group=None, target=None, name=None, args=None,
                 kwargs=None, *, daemon=None):
        """Prepare the process; nothing is executed until ``start()``.

        Args:
            group: Must be ``None``; process grouping is not implemented.
            target: Callable to execute.
            name: Optional name; defaults to ``<ClassName>-<identity>``.
            args: Positional arguments for ``target`` (stored as a tuple).
            kwargs: Keyword arguments for ``target`` (stored as a new dict).
            daemon: Optional daemon flag, applied when not ``None``.
        """
        assert group is None, 'process grouping is not implemented'
        count = next(_process_counter)

        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}

        self._config = {}
        self._identity = count
        self._parent_pid = os.getpid()
        self._target = target
        self._args = tuple(args)
        self._kwargs = dict(kwargs)
        self._name = name or (type(self).__name__ + '-' + str(self._identity))

        # `_forked` has to exist before `daemon` is assigned: the daemon
        # setter asserts on it, so the previous ordering raised
        # AttributeError whenever a daemon value was passed in.
        self._forked = False
        if daemon is not None:
            self.daemon = daemon

        lithops_config = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)
        self._executor = FunctionExecutor(**lithops_config)
        self._sentinel = object()
        self._logger_thread = None
        self._redis = util.get_redis_client()

    def _logger_monitor(self, stream):
        """Thread body: copy messages published on ``stream`` to stdout.

        Loops forever; it is meant to run in a daemon thread (see ``start``).
        """
        logger.debug('Starting logger monitor thread')
        redis_pubsub = self._redis.pubsub()
        redis_pubsub.subscribe(stream)

        while True:
            msg = redis_pubsub.get_message(ignore_subscribe_messages=True,
                                           timeout=10)
            # Nothing was received this round; poll again.
            if msg is None:
                continue
            sys.stdout.write(msg['data'].decode('utf-8'))

    def run(self):
        """
        Method to be run in sub-process; can be overridden in sub-class
        """
        if self._target:
            self._target(*self._args, **self._kwargs)

    def start(self):
        """
        Start child process

        The target is wrapped in a ``CloudWorker`` and submitted with
        ``call_async``; ``run()`` is not involved. Afterwards the target and
        its arguments are deleted from the instance.
        """
        assert not self._forked, 'cannot start a process twice'
        assert self._parent_pid == os.getpid(), \
            'can only start a process object created by current process'

        cloud_worker = CloudWorker(self._target)

        extra_env = {}
        if mp_config.get_parameter(mp_config.STREAM_STDOUT):
            # The executor id doubles as the name of the log stream.
            stream = self._executor.executor_id
            logger.debug('Log streaming enabled, stream name: {}'.format(stream))
            cloud_worker.log_stream = stream
            self._logger_thread = threading.Thread(target=self._logger_monitor,
                                                   args=(stream, ))
            # Daemon thread: its endless loop must not keep the program alive.
            self._logger_thread.daemon = True
            self._logger_thread.start()

        self._executor.call_async(cloud_worker,
                                  {'args': self._args, 'kwargs': self._kwargs},
                                  extra_env=extra_env)
        del self._target, self._args, self._kwargs
        self._forked = True

    def terminate(self):
        """
        Terminate process; sends SIGTERM signal or uses TerminateProcess()
        """
        raise NotImplementedError()

    def join(self, timeout=None):
        """
        Wait until child process terminates

        ``timeout`` is accepted but not used.
        """
        assert self._parent_pid == os.getpid(), 'can only join a child process'
        assert self._forked, 'can only join a started process'
        self._executor.wait()

    def is_alive(self):
        """
        Return whether process is alive
        """
        raise NotImplementedError()

    @property
    def name(self):
        """Name of the process."""
        return self._name

    @name.setter
    def name(self, name):
        assert isinstance(name, str), 'name must be a string'
        self._name = name

    @property
    def daemon(self):
        """
        Return whether process is a daemon
        """
        return self._config.get('daemon', False)

    @daemon.setter
    def daemon(self, daemonic):
        """
        Set whether process is a daemon
        """
        assert not self._forked, 'process has already started'
        self._config['daemon'] = daemonic

    @property
    def authkey(self):
        """Authorization key; raises ``KeyError`` when none was set."""
        return self._config['authkey']

    @authkey.setter
    def authkey(self, authkey):
        """
        Set authorization key of process
        """
        self._config['authkey'] = authkey

    @property
    def exitcode(self):
        """
        Return exit code of process or `None` if it has yet to stop
        """
        raise NotImplementedError()

    @property
    def ident(self):
        """
        Return identifier (PID) of process or `None` if it has yet to start
        """
        raise NotImplementedError()

    pid = ident

    @property
    def sentinel(self):
        """
        Return a file descriptor (Unix) or handle (Windows) suitable for
        waiting for process termination.
        """
        try:
            return self._sentinel
        except AttributeError:
            raise ValueError("process not started")
class Pool(object):
    """
    Class which supports an async version of applying functions to arguments.

    Work is submitted through a `FunctionExecutor`; every call wraps the user
    function in a `CloudWorker` together with the pool's initializer.
    """
    _wrap_exception = True
    Process = CloudProcess

    def __init__(self, processes=None, initializer=None, initargs=None, maxtasksperchild=None, context=None):
        """
        Validate the arguments and create the underlying executor.

        :param processes: number of workers requested from the executor;
            when None the executor's own worker count is used.
        :param initializer: optional callable forwarded to every worker.
        :param initargs: arguments for `initializer` (defaults to `()`).
        :param maxtasksperchild: stored on the instance only.
        :param context: accepted but not used here.
        :raises ValueError: if `processes` is lower than 1.
        :raises TypeError: if `initializer` is given but is not callable.
        """
        # Validate before building anything so that an invalid call does not
        # leave a freshly created executor behind.
        if processes is not None and processes < 1:
            raise ValueError("Number of processes must be at least 1")
        if initializer is not None and not callable(initializer):
            raise TypeError('initializer must be a callable')

        if initargs is None:
            initargs = ()

        self._taskqueue = queue.Queue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs
        self._remote_logger = None

        lithops_conf = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)

        if processes is not None:
            self._processes = processes
            self._executor = FunctionExecutor(workers=processes, **lithops_conf)
        else:
            self._executor = FunctionExecutor(**lithops_conf)
            self._processes = self._executor.invoker.workers

    def _attach_log_stream(self, cloud_worker):
        """
        Enable stdout streaming for `cloud_worker` when STREAM_STDOUT is set.

        The logging feed is created once per pool and reused by later calls.
        Every call uses the same stream name (the executor id), so starting
        an additional feed per call would leave the earlier ones running
        until `terminate()` with no reference kept to stop them.
        NOTE(review): assumes a started `RemoteLoggingFeed` keeps serving its
        stream until `stop()` is called -- confirm against `util`.
        """
        if not mp_config.get_parameter(mp_config.STREAM_STDOUT):
            return
        stream = self._executor.executor_id
        util.debug('Log streaming enabled, stream name: {}'.format(stream))
        if self._remote_logger is None:
            self._remote_logger = util.RemoteLoggingFeed(stream)
            self._remote_logger.start()
        cloud_worker.log_stream = stream

    def apply(self, func, args=(), kwds=None):
        """
        Equivalent of `func(*args, **kwds)`.
        """
        assert self._state == RUN
        if kwds and not args:
            # NOTE(review): with keyword arguments only, 'args' is sent as an
            # empty dict rather than an empty tuple. Kept as in the original;
            # confirm the worker relies on this.
            args = {}
        return self.apply_async(func, args, kwds).get()

    def map(self, func, iterable, chunksize=None):
        """
        Apply `func` to each element in `iterable`, collecting the results
        in a list that is returned.

        Note that `_map_async` unpacks tuple/list elements as positional
        arguments and dict elements as keyword arguments.
        """
        return self._map_async(func, iterable, chunksize).get()

    def starmap(self, func, iterable, chunksize=None):
        """
        Like `map()` method but the elements of the `iterable` are expected to
        be iterables as well and will be unpacked as arguments. Hence
        `func` and (a, b) becomes func(a, b).
        """
        return self._map_async(func, iterable, chunksize=chunksize).get()

    def starmap_async(self, func, iterable, chunksize=None, callback=None, error_callback=None):
        """
        Asynchronous version of `starmap()` method.
        """
        return self._map_async(func, iterable, chunksize=chunksize,
                               callback=callback, error_callback=error_callback)

    def imap(self, func, iterable, chunksize=1):
        """
        Equivalent of `map()` -- can be MUCH slower than `Pool.map()`.

        All results are computed through `map()` before the iterator is
        returned.
        """
        res = self.map(func, iterable, chunksize=chunksize)
        return IMapIterator(res)

    def imap_unordered(self, func, iterable, chunksize=1):
        """
        Like `imap()` method but ordering of results is arbitrary.

        Implemented exactly like `imap()`, on top of `map()`.
        """
        res = self.map(func, iterable, chunksize=chunksize)
        return IMapIterator(res)

    def apply_async(self, func, args=(), kwds=None, callback=None, error_callback=None):
        """
        Asynchronous version of `apply()` method.

        :param func: callable to run remotely.
        :param args: positional arguments for `func`.
        :param kwds: keyword arguments for `func`; None means no keywords.
        :param callback: passed through to `ApplyResult`.
        :param error_callback: passed through to `ApplyResult`.
        :return: an `ApplyResult` wrapping the submitted future.
        :raises ValueError: if the pool is not in the RUN state.
        """
        if self._state != RUN:
            raise ValueError("Pool not running")
        if kwds is None:
            kwds = {}

        cloud_worker = CloudWorker(func=func, initializer=self._initializer,
                                   initargs=self._initargs)
        self._attach_log_stream(cloud_worker)

        extra_env = mp_config.get_parameter(mp_config.ENV_VARS)

        futures = self._executor.call_async(cloud_worker,
                                            data={'args': args, 'kwargs': kwds},
                                            extra_env=extra_env)

        return ApplyResult(self._executor, [futures], callback, error_callback)

    def map_async(self, func, iterable, chunksize=None, callback=None, error_callback=None):
        """
        Asynchronous version of `map()` method.
        """
        return self._map_async(func, iterable, chunksize, callback, error_callback)

    def _map_async(self, func, iterable, chunksize=None, callback=None, error_callback=None):
        """
        Helper function to implement map, starmap and their async counterparts.

        The type of the first element decides how every element is passed to
        `func`: dicts become keyword arguments, tuples/lists become positional
        arguments, anything else becomes a single positional argument.
        `chunksize` is accepted but not used here.

        :return: a `MapResult` wrapping the submitted futures.
        :raises ValueError: if the pool is not in the RUN state.
        :raises IndexError: if `iterable` is empty.
        """
        if self._state != RUN:
            raise ValueError("Pool not running")

        # Materialise unconditionally: generators are single-pass, and sized
        # containers such as sets or dicts cannot be indexed with [0].
        iterable = list(iterable)

        cloud_worker = CloudWorker(func=func, initializer=self._initializer,
                                   initargs=self._initargs)

        first = iterable[0]
        if isinstance(first, dict):
            fmt_args = [{'args': (), 'kwargs': kwargs} for kwargs in iterable]
        elif isinstance(first, (tuple, list)):
            fmt_args = [{'args': args, 'kwargs': {}} for args in iterable]
        else:
            fmt_args = [{'args': (args, ), 'kwargs': {}} for args in iterable]

        self._attach_log_stream(cloud_worker)

        extra_env = mp_config.get_parameter(mp_config.ENV_VARS)

        futures = self._executor.map(cloud_worker, fmt_args, extra_env=extra_env)

        return MapResult(self._executor, futures, callback, error_callback)

    def __reduce__(self):
        raise NotImplementedError(
            'pool objects cannot be passed between processes or pickled')

    def close(self):
        """Move a running pool to the CLOSE state; no new work is accepted."""
        util.debug('closing pool')
        if self._state == RUN:
            self._state = CLOSE

    def terminate(self):
        """Mark the pool as terminated and stop the log feed, if one is active."""
        util.debug('terminating pool')
        self._state = TERMINATE
        if self._remote_logger:
            self._remote_logger.stop()
            self._remote_logger = None

    def join(self):
        """Check that the pool was closed or terminated; does not block."""
        util.debug('joining pool')
        assert self._state in (CLOSE, TERMINATE)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.terminate()
class CloudProcess:
    """
    Process-like object that runs its target through a `FunctionExecutor`.

    Method and property names follow `multiprocessing.Process`. `terminate`,
    `is_alive` and `exitcode` are not implemented and raise
    `NotImplementedError`.
    """

    def __init__(self, group=None, target=None, name=None, args=None, kwargs=None, *, daemon=None):
        """
        Store the call description and create the executor that will run it.

        :param group: must be None; process grouping is not implemented.
        :param target: callable submitted by `start()`.
        :param name: process name; defaults to '<ClassName>-<counter value>'.
        :param args: positional arguments for `target` (stored as a tuple).
        :param kwargs: keyword arguments for `target` (stored as a dict copy).
        :param daemon: optional daemon flag, stored in the process config.
        """
        assert group is None, 'process grouping is not implemented'

        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}

        self._config = {}
        self._parent_pid = os.getpid()
        self._target = target
        self._args = tuple(args)
        self._kwargs = dict(kwargs)
        self._name = name or (type(self).__name__ + '-' + str(next(_process_counter)))
        # `_pid` must exist before the `daemon` setter runs (it asserts on it).
        self._pid = None
        if daemon is not None:
            self.daemon = daemon

        lithops_config = mp_config.get_parameter(mp_config.LITHOPS_CONFIG)
        self._executor = FunctionExecutor(**lithops_config)
        self._future = None
        self._sentinel = object()
        self._remote_logger = None
        self._redis = util.get_redis_client()

    @staticmethod
    def _target_label(target):
        """
        Return a printable name for `target`.

        Falls back to the type name for callables without `__name__`, such as
        `functools.partial` objects or instances defining `__call__`.
        """
        label = getattr(target, '__name__', None)
        if label is None:
            label = type(target).__name__
        return label

    def run(self):
        """
        Method to be run in sub-process; can be overridden in sub-class
        """
        if self._target:
            self._target(*self._args, **self._kwargs)

    def start(self):
        """
        Start child process

        Submits the target through `call_async` and records an identifier
        built from the returned future's executor, job and call ids. The
        target and its arguments are dropped from the instance afterwards.

        :raises TypeError: if no target was given.
        """
        assert not self._pid, 'cannot start a process twice'
        assert self._parent_pid == os.getpid(), \
            'can only start a process object created by current process'
        if self._target is None:
            # Previously this surfaced as an AttributeError on
            # `None.__name__`, after log streaming had already been set up.
            raise TypeError('target must be a callable')

        self._remote_logger, stream = util.setup_log_streaming(self._executor)

        extra_env = mp_config.get_parameter(mp_config.ENV_VARS)

        process_name = '-'.join([
            'CloudProcess',
            str(next(_process_counter)),
            self._target_label(self._target),
        ])

        payload = {
            'func': self._target,
            'data': {
                'args': self._args,
                'kwargs': self._kwargs
            },
            'initializer': None,
            'initargs': None,
            'name': process_name,
            'log_stream': stream,
            'unpack_args': True
        }
        self._future = self._executor.call_async(cloud_process_wrapper, payload,
                                                 extra_env=extra_env)
        self._pid = '/'.join([
            self._future.executor_id,
            self._future.job_id,
            self._future.call_id,
        ])
        del self._target, self._args, self._kwargs

    def terminate(self):
        """
        Terminate process; sends SIGTERM signal or uses TerminateProcess()
        """
        raise NotImplementedError()

    def join(self, timeout=None):
        """
        Wait until child process terminates

        Whether or not waiting fails, the log feed (if any) is stopped and
        the execution details are exported; an error raised while waiting is
        propagated afterwards. `timeout` is accepted for signature
        compatibility but is not used.
        """
        assert self._parent_pid == os.getpid(), 'can only join a child process'
        assert self._pid, 'can only join a started process'

        try:
            self._executor.wait(fs=[self._future])
        finally:
            if self._remote_logger:
                self._remote_logger.stop()
            util.export_execution_details([self._future], self._executor)

    def is_alive(self):
        """
        Return whether process is alive
        """
        raise NotImplementedError()

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, name):
        assert isinstance(name, str), 'name must be a string'
        self._name = name

    @property
    def daemon(self):
        """
        Return whether process is a daemon
        """
        return self._config.get('daemon', False)

    @daemon.setter
    def daemon(self, daemonic):
        """
        Set whether process is a daemon
        """
        assert not self._pid, 'process has already started'
        self._config['daemon'] = daemonic

    @property
    def authkey(self):
        # Raises KeyError when no authkey has been set.
        return self._config['authkey']

    @authkey.setter
    def authkey(self, authkey):
        """
        Set authorization key of process
        """
        self._config['authkey'] = authkey

    @property
    def exitcode(self):
        """
        Return exit code of process or `None` if it has yet to stop
        """
        raise NotImplementedError()

    @property
    def ident(self):
        """
        Return identifier (PID) of process or `None` if it has yet to start
        """
        return self._pid

    pid = ident

    @property
    def sentinel(self):
        """
        Return a file descriptor (Unix) or handle (Windows) suitable for
        waiting for process termination.
        """
        try:
            return self._sentinel
        except AttributeError:
            raise ValueError("process not started") from None