def map_reduce(self, map_function, map_iterdata, reduce_function, chunksize=None,
               worker_processes=None, extra_args=None, extra_env=None,
               map_runtime_memory=None, obj_chunk_size=None, obj_chunk_number=None,
               reduce_runtime_memory=None, chunk_size=None, chunk_n=None,
               timeout=None, invoke_pool_threads=None, reducer_one_per_object=False,
               reducer_wait_local=False, include_modules=None, exclude_modules=None):
    """
    Map the map_function over the data and apply the reduce_function across all
    futures. This method is executed all within CF.

    :param map_function: the function to map over the data
    :param map_iterdata: An iterable of input data
    :param reduce_function: the function to reduce over the futures
    :param chunksize: Split map_iterdata in chunks of this size.
                      Lithops spawns 1 worker per resulting chunk. Default 1
    :param worker_processes: Number of concurrent/parallel processes in each
                             worker. Default 1
    :param extra_args: Additional arguments to pass to function activation.
                       Default None.
    :param extra_env: Additional environment variables for action environment.
                      Default None.
    :param map_runtime_memory: Memory to use to run the map function.
                               Default None (loaded from config).
    :param obj_chunk_size: the size of the data chunks to split each object.
                           'None' for processing the whole file in one
                           function activation.
    :param obj_chunk_number: Number of chunks to split each object. 'None' for
                             processing the whole file in one function
                             activation.
    :param reduce_runtime_memory: Memory to use to run the reduce function.
                                  Default None (loaded from config).
    :param chunk_size: Deprecated alias forwarded to create_map_job
                       (presumably superseded by obj_chunk_size — confirm).
    :param chunk_n: Deprecated alias forwarded to create_map_job
                    (presumably superseded by obj_chunk_number — confirm).
    :param timeout: Time that the functions have to complete their execution
                    before raising a timeout.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param reducer_one_per_object: Set one reducer per object after running
                                   the partitioner
    :param reducer_wait_local: Wait for the map results locally before
                               spawning the reduce stage
    :param include_modules: Explicitly pickle these dependencies.
    :param exclude_modules: Explicitly keep these modules from pickled
                            dependencies.

    :return: A list with size `len(map_iterdata)` of futures (map futures
             followed by reduce futures).
    """
    # FIX: the original signature used mutable default arguments ([]), which
    # are shared between calls. Use a None sentinel and normalize to a fresh
    # list so downstream code still receives [] by default, as before.
    include_modules = [] if include_modules is None else include_modules
    exclude_modules = [] if exclude_modules is None else exclude_modules

    self.last_call = 'map_reduce'
    map_job_id = self._create_job_id('M')

    # Select the runtime for the map stage before building the job.
    runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)

    map_job = create_map_job(self.config, self.internal_storage,
                             self.executor_id, map_job_id,
                             map_function=map_function,
                             iterdata=map_iterdata,
                             chunksize=chunksize,
                             worker_processes=worker_processes,
                             runtime_meta=runtime_meta,
                             runtime_memory=map_runtime_memory,
                             extra_args=extra_args,
                             extra_env=extra_env,
                             chunk_size=chunk_size,
                             chunk_n=chunk_n,
                             obj_chunk_size=obj_chunk_size,
                             obj_chunk_number=obj_chunk_number,
                             include_modules=include_modules,
                             exclude_modules=exclude_modules,
                             execution_timeout=timeout,
                             invoke_pool_threads=invoke_pool_threads)

    map_futures = self.invoker.run_job(map_job)
    self.futures.extend(map_futures)

    # Optionally block here until all map activations complete before
    # launching the reducer.
    if reducer_wait_local:
        wait(fs=map_futures, internal_storage=self.internal_storage,
             job_monitor=self.job_monitor)

    reduce_job_id = map_job_id.replace('M', 'R')

    runtime_meta = self.invoker.select_runtime(reduce_job_id, reduce_runtime_memory)

    reduce_job = create_reduce_job(self.config, self.internal_storage,
                                   self.executor_id, reduce_job_id,
                                   reduce_function, map_job, map_futures,
                                   runtime_meta=runtime_meta,
                                   runtime_memory=reduce_runtime_memory,
                                   reducer_one_per_object=reducer_one_per_object,
                                   extra_env=extra_env,
                                   include_modules=include_modules,
                                   exclude_modules=exclude_modules)

    reduce_futures = self.invoker.run_job(reduce_job)
    self.futures.extend(reduce_futures)

    # The reducer consumes the map outputs; mark map futures so their
    # results are not fetched again by the caller.
    for f in map_futures:
        f._produce_output = False

    return map_futures + reduce_futures
def map_reduce(self,
               map_function: Callable,
               map_iterdata: List[Union[List[Any], Tuple[Any, ...], Dict[str, Any]]],
               reduce_function: Callable,
               chunksize: Optional[int] = None,
               extra_args: Optional[Union[List[Any], Tuple[Any, ...], Dict[str, Any]]] = None,
               extra_env: Optional[Dict[str, str]] = None,
               map_runtime_memory: Optional[int] = None,
               reduce_runtime_memory: Optional[int] = None,
               obj_chunk_size: Optional[int] = None,
               obj_chunk_number: Optional[int] = None,
               timeout: Optional[int] = None,
               reducer_one_per_object: Optional[bool] = False,
               spawn_reducer: Optional[int] = 20,
               include_modules: Optional[List[str]] = None,
               exclude_modules: Optional[List[str]] = None) -> FuturesList:
    """
    Map the map_function over the data and apply the reduce_function across
    all futures.

    :param map_function: The function to map over the data
    :param map_iterdata: An iterable of input data
    :param reduce_function: The function to reduce over the futures
    :param chunksize: Split map_iterdata in chunks of this size.
                      Lithops spawns 1 worker per resulting chunk. Default 1
    :param extra_args: Additional arguments to pass to function activation.
                       Default None
    :param extra_env: Additional environment variables for action environment.
                      Default None
    :param map_runtime_memory: Memory to use to run the map function.
                               Default None (loaded from config)
    :param reduce_runtime_memory: Memory to use to run the reduce function.
                                  Default None (loaded from config)
    :param obj_chunk_size: the size of the data chunks to split each object.
                           'None' for processing the whole file in one
                           function activation
    :param obj_chunk_number: Number of chunks to split each object. 'None' for
                             processing the whole file in one function
                             activation
    :param timeout: Time that the functions have to complete their execution
                    before raising a timeout
    :param reducer_one_per_object: Set one reducer per object after running
                                   the partitioner
    :param spawn_reducer: Percentage of done map functions before spawning the
                          reduce function. Default 20
    :param include_modules: Explicitly pickle these dependencies.
    :param exclude_modules: Explicitly keep these modules from pickled
                            dependencies.

    :return: A FuturesList with size `len(map_iterdata)` of futures
             (map futures followed by reduce futures).
    """
    # FIX: the original signature used mutable default arguments ([]), shared
    # across calls. Use a None sentinel and normalize to a fresh list so the
    # job builders still receive [] by default, exactly as before.
    include_modules = [] if include_modules is None else include_modules
    exclude_modules = [] if exclude_modules is None else exclude_modules

    self.last_call = 'map_reduce'
    map_job_id = self._create_job_id('M')

    runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)

    map_job = create_map_job(config=self.config,
                             internal_storage=self.internal_storage,
                             executor_id=self.executor_id,
                             job_id=map_job_id,
                             map_function=map_function,
                             iterdata=map_iterdata,
                             chunksize=chunksize,
                             runtime_meta=runtime_meta,
                             runtime_memory=map_runtime_memory,
                             extra_args=extra_args,
                             extra_env=extra_env,
                             obj_chunk_size=obj_chunk_size,
                             obj_chunk_number=obj_chunk_number,
                             include_modules=include_modules,
                             exclude_modules=exclude_modules,
                             execution_timeout=timeout)

    map_futures = self.invoker.run_job(map_job)
    self.futures.extend(map_futures)

    # When chaining executions (the input is itself a FuturesList), the map
    # stage consumes those outputs, so don't fetch them again for the caller.
    if isinstance(map_iterdata, FuturesList):
        for fut in map_iterdata:
            fut._produce_output = False

    # Unless the reducer is spawned immediately (ALWAYS), wait until the
    # requested percentage of map activations is done before launching it.
    if spawn_reducer != ALWAYS:
        self.wait(map_futures, return_when=spawn_reducer)
        logger.debug(
            f'ExecutorID {self.executor_id} | JobID {map_job_id} - '
            f'{spawn_reducer}% of map activations done. Spawning reduce stage'
        )

    reduce_job_id = map_job_id.replace('M', 'R')

    runtime_meta = self.invoker.select_runtime(reduce_job_id, reduce_runtime_memory)

    reduce_job = create_reduce_job(config=self.config,
                                   internal_storage=self.internal_storage,
                                   executor_id=self.executor_id,
                                   reduce_job_id=reduce_job_id,
                                   reduce_function=reduce_function,
                                   map_job=map_job,
                                   map_futures=map_futures,
                                   runtime_meta=runtime_meta,
                                   runtime_memory=reduce_runtime_memory,
                                   reducer_one_per_object=reducer_one_per_object,
                                   extra_env=extra_env,
                                   include_modules=include_modules,
                                   exclude_modules=exclude_modules)

    reduce_futures = self.invoker.run_job(reduce_job)
    self.futures.extend(reduce_futures)

    # The reducer consumes the map outputs; mark map futures so their
    # results are not fetched again by the caller.
    for f in map_futures:
        f._produce_output = False

    return create_futures_list(map_futures + reduce_futures, self)