def map_reduce(self, map_function, map_iterdata, reduce_function, extra_env=None,
               map_runtime_memory=None, reduce_runtime_memory=None, extra_meta=None,
               chunk_size=None, remote_invocation=False, remote_invocation_groups=None,
               timeout=EXECUTION_TIMEOUT, reducer_one_per_object=False,
               reducer_wait_local=False, invoke_pool_threads=500,
               overwrite_invoke_args=None, exclude_modules=None):
    """
    Run a map job over ``map_iterdata`` and then a reduce job over the map futures.

    The map job is built with ``create_map_job`` and launched with
    ``self.invoker.run``; the reduce job is built with ``create_reduce_job``
    from the map futures and launched the same way. Both jobs are recorded in
    ``self.jobs`` and the executor state is set to running.

    :param map_function: the function to map over the data
    :param map_iterdata: the data to map over (passed to ``create_map_job`` as ``iterdata``)
    :param reduce_function: the function to reduce over the futures
    :param extra_env: Additional environment variables for action environment. Default None.
    :param map_runtime_memory: Passed to ``create_map_job`` as ``runtime_memory``. Default None.
    :param reduce_runtime_memory: Passed to ``create_reduce_job``. Default None.
    :param extra_meta: Additional metadata to pass to action. Default None.
    :param chunk_size: the size of the data chunks. 'None' for processing the whole file in one map
    :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False'
    :param remote_invocation_groups: Passed unchanged to ``create_map_job``. Default None.
    :param timeout: Time that the functions have to complete their execution before raising a timeout.
    :param reducer_one_per_object: Set one reducer per object after running the partitioner
    :param reducer_wait_local: Wait for results locally
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.

    :return: The map futures followed by the reduce futures (``map_futures + reduce_futures``).
             Every map future has ``produce_output`` set to False.
    :raises Exception: if the executor is already in the finished state.
    """
    if self._state == ExecutorState.finished:
        raise Exception('You cannot run map_reduce() in the current state.'
                        ' Create a new FunctionExecutor() instance.')

    def _track(job_desc, job_futures):
        # Bookkeeping record stored for a launched job, keyed by the id the job carries.
        self.jobs[job_desc['job_id']] = {'futures': job_futures,
                                         'total': job_desc['total_calls'],
                                         'state': JobState.running}

    # The id is the number of jobs registered so far, zero-padded to 3 digits.
    # NOTE(review): the same job_id is handed to both the map and the reduce
    # job; if create_reduce_job keeps it as job['job_id'], the reduce record
    # replaces the map record in self.jobs -- confirm.
    job_id = str(len(self.jobs)).zfill(3)

    # --- map phase ---
    map_job, parts_per_object = create_map_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        job_id,
        map_function=map_function,
        iterdata=map_iterdata,
        extra_env=extra_env,
        extra_meta=extra_meta,
        obj_chunk_size=chunk_size,
        runtime_memory=map_runtime_memory,
        remote_invocation=remote_invocation,
        remote_invocation_groups=remote_invocation_groups,
        invoke_pool_threads=invoke_pool_threads,
        exclude_modules=exclude_modules,
        is_cf_cluster=self.is_cf_cluster,
        overwrite_invoke_args=overwrite_invoke_args,
        execution_timeout=timeout,
    )
    map_futures = self.invoker.run(map_job)
    _track(map_job, map_futures)
    self._state = ExecutorState.running

    # Optionally block here until the map futures have been monitored.
    if reducer_wait_local:
        self.monitor(futures=map_futures)

    # --- reduce phase ---
    reduce_job = create_reduce_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        job_id,
        reduce_function,
        reduce_runtime_memory,
        map_futures,
        parts_per_object,
        reducer_one_per_object,
        extra_env,
        extra_meta,
    )
    reduce_futures = self.invoker.run(reduce_job)
    _track(reduce_job, reduce_futures)

    # Map futures are flagged as not producing output before being returned.
    for map_future in map_futures:
        map_future.produce_output = False

    return map_futures + reduce_futures
def map_reduce(self, map_function, map_iterdata, reduce_function, extra_params=None,
               extra_env=None, map_runtime_memory=None, reduce_runtime_memory=None,
               chunk_size=None, chunk_n=None, remote_invocation=False,
               remote_invocation_groups=None, timeout=EXECUTION_TIMEOUT,
               reducer_one_per_object=False, reducer_wait_local=False,
               invoke_pool_threads=450, include_modules=[], exclude_modules=[]):
    """
    Run a map job over ``map_iterdata`` and then a reduce job over the map futures.

    A runtime is selected through ``self.invoker.select_runtime`` for each of
    the two jobs. The map job gets an id prefixed with ``M`` and the reduce
    job the same number prefixed with ``R``; both are recorded in
    ``self.jobs`` and the executor state is set to Running.

    :param map_function: the function to map over the data
    :param map_iterdata: the data to map over (passed to ``create_map_job`` as ``iterdata``)
    :param reduce_function: the function to reduce over the futures
    :param extra_params: Additional parameters to pass to function activation. Default None.
    :param extra_env: Additional environment variables for action environment. Default None.
    :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
    :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
    :param chunk_size: the size of the data chunks to split each object. 'None' for processing
                       the whole file in one function activation.
    :param chunk_n: Number of chunks to split each object. 'None' for processing the whole
                    file in one function activation.
    :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False'
    :param remote_invocation_groups: Passed unchanged to ``create_map_job``. Default None.
    :param timeout: Time that the functions have to complete their execution before raising a timeout.
    :param reducer_one_per_object: Set one reducer per object after running the partitioner
    :param reducer_wait_local: Wait for results locally
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: Explicitly pickle these dependencies.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.

    :return: The map futures followed by the reduce futures (``map_futures + reduce_futures``).
             Every map future has ``produce_output`` set to False.
    :raises Exception: if the executor is already in the Finished state.
    """
    # NOTE(review): include_modules / exclude_modules default to list literals
    # that are shared between calls; this is only safe while no callee mutates
    # them -- confirm before relying on it.
    if self._state == FunctionExecutor.State.Finished:
        raise Exception('You cannot run map_reduce() in the current state.'
                        ' Create a new FunctionExecutor() instance.')

    # Numeric part of the id: count of jobs registered so far, zero-padded to 3 digits.
    job_number = str(len(self.jobs)).zfill(3)

    # --- map phase ---
    map_job_id = 'M{}'.format(job_number)
    map_runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)
    map_job = create_map_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        map_job_id,
        map_function=map_function,
        iterdata=map_iterdata,
        runtime_meta=map_runtime_meta,
        runtime_memory=map_runtime_memory,
        extra_params=extra_params,
        extra_env=extra_env,
        obj_chunk_size=chunk_size,
        obj_chunk_number=chunk_n,
        remote_invocation=remote_invocation,
        remote_invocation_groups=remote_invocation_groups,
        invoke_pool_threads=invoke_pool_threads,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
        is_remote_cluster=self.is_remote_cluster,
        execution_timeout=timeout,
    )
    map_futures = self.invoker.run(map_job)
    self.jobs[map_job_id] = {'futures': map_futures, 'state': JobState.Running}
    self._state = FunctionExecutor.State.Running

    # Optionally block until the map futures are done before launching the reducer.
    if reducer_wait_local:
        self.wait(fs=map_futures)

    # --- reduce phase ---
    reduce_job_id = 'R{}'.format(job_number)
    reduce_runtime_meta = self.invoker.select_runtime(reduce_job_id, reduce_runtime_memory)
    reduce_job = create_reduce_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        reduce_job_id,
        reduce_function,
        map_job,
        map_futures,
        runtime_meta=reduce_runtime_meta,
        reducer_one_per_object=reducer_one_per_object,
        runtime_memory=reduce_runtime_memory,
        extra_env=extra_env,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
    )
    reduce_futures = self.invoker.run(reduce_job)
    self.jobs[reduce_job_id] = {'futures': reduce_futures, 'state': JobState.Running}

    # Map futures are flagged as not producing output before being returned.
    for map_future in map_futures:
        map_future.produce_output = False

    return map_futures + reduce_futures
def map_reduce(self, map_function, map_iterdata, reduce_function, extra_params=None,
               extra_env=None, map_runtime_memory=None, reduce_runtime_memory=None,
               chunk_size=None, chunk_n=None, timeout=EXECUTION_TIMEOUT,
               invoke_pool_threads=500, reducer_one_per_object=False,
               reducer_wait_local=False, include_modules=[], exclude_modules=[]):
    """
    Run a map job over ``map_iterdata`` and then a reduce job over the map futures.

    The map job id comes from ``self._create_job_id('M')``; the reduce job id
    is that id with every ``M`` replaced by ``R``. A runtime is selected
    through ``self.invoker.select_runtime`` for each job, the futures of both
    jobs are appended to ``self.futures``, and the executor state is set to
    Running once both jobs have been launched.

    :param map_function: the function to map over the data
    :param map_iterdata: the data to map over (passed to ``create_map_job`` as ``iterdata``)
    :param reduce_function: the function to reduce over the futures
    :param extra_params: Additional parameters to pass to function activation. Default None.
    :param extra_env: Additional environment variables for action environment. Default None.
    :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
    :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
    :param chunk_size: the size of the data chunks to split each object. 'None' for processing
                       the whole file in one function activation.
    :param chunk_n: Number of chunks to split each object. 'None' for processing the whole
                    file in one function activation.
    :param timeout: Time that the functions have to complete their execution before raising a timeout.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param reducer_one_per_object: Set one reducer per object after running the partitioner
    :param reducer_wait_local: Wait for results locally
    :param include_modules: Explicitly pickle these dependencies.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.

    :return: The map futures followed by the reduce futures (``map_futures + reduce_futures``).
             Every map future has ``produce_output`` set to False.
    """
    # NOTE(review): include_modules / exclude_modules default to list literals
    # that are shared between calls; this is only safe while no callee mutates
    # them -- confirm before relying on it.

    # --- map phase ---
    map_job_id = self._create_job_id('M')
    map_runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)
    map_job = create_map_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        map_job_id,
        map_function=map_function,
        iterdata=map_iterdata,
        runtime_meta=map_runtime_meta,
        runtime_memory=map_runtime_memory,
        extra_params=extra_params,
        extra_env=extra_env,
        obj_chunk_size=chunk_size,
        obj_chunk_number=chunk_n,
        invoke_pool_threads=invoke_pool_threads,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
        execution_timeout=timeout,
    )
    map_futures = self.invoker.run(map_job)
    self.futures.extend(map_futures)

    # Optionally block until the map futures are done before launching the reducer.
    if reducer_wait_local:
        self.wait(fs=map_futures)

    # --- reduce phase ---
    reduce_job_id = map_job_id.replace('M', 'R')
    reduce_runtime_meta = self.invoker.select_runtime(reduce_job_id, reduce_runtime_memory)
    reduce_job = create_reduce_job(
        self.config,
        self.internal_storage,
        self.executor_id,
        reduce_job_id,
        reduce_function,
        map_job,
        map_futures,
        runtime_meta=reduce_runtime_meta,
        reducer_one_per_object=reducer_one_per_object,
        runtime_memory=reduce_runtime_memory,
        extra_env=extra_env,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
    )
    reduce_futures = self.invoker.run(reduce_job)
    self.futures.extend(reduce_futures)

    # Map futures are flagged as not producing output before being returned.
    for map_future in map_futures:
        map_future.produce_output = False

    self._state = FunctionExecutor.State.Running

    return map_futures + reduce_futures