Пример #1
0
    def map_reduce(self, map_function, map_iterdata, reduce_function, extra_env=None,
                   map_runtime_memory=None, reduce_runtime_memory=None,
                   extra_meta=None, chunk_size=None, remote_invocation=False,
                   remote_invocation_groups=None, timeout=EXECUTION_TIMEOUT,
                   reducer_one_per_object=False, reducer_wait_local=False,
                   invoke_pool_threads=500, overwrite_invoke_args=None,
                   exclude_modules=None):
        """
        Run a map job over the data, then a reduce job over the resulting map futures.

        Both jobs are built with the module-level ``create_map_job`` /
        ``create_reduce_job`` helpers, launched through ``self.invoker.run`` and
        recorded in ``self.jobs``.

        :param map_function: the function to map over the data
        :param map_iterdata: the data to map over (forwarded as ``iterdata``)
        :param reduce_function: the function to reduce over the map futures
        :param extra_env: Additional environment variables for action environment. Default None.
        :param map_runtime_memory: runtime memory forwarded to the map job. Default None.
        :param reduce_runtime_memory: runtime memory forwarded to the reduce job. Default None.
        :param extra_meta: Additional metadata to pass to action. Default None.
        :param chunk_size: the size of the data chunks (forwarded as ``obj_chunk_size``).
                           'None' for processing the whole file in one map
        :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False'
        :param remote_invocation_groups: forwarded unchanged to the map job. Default None.
        :param timeout: Time that the functions have to complete their execution before raising
                        a timeout (forwarded as ``execution_timeout``).
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param reducer_wait_local: When true, ``self.monitor`` is called on the map futures
                                   before the reduce job is created.
        :param invoke_pool_threads: Number of threads to use to invoke.
        :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.
        :return: the list of map futures followed by the reduce futures
        :raises Exception: if the executor is already in the finished state
        """

        if self._state == ExecutorState.finished:
            message = ('You cannot run map_reduce() in the current state.'
                       ' Create a new FunctionExecutor() instance.')
            raise Exception(message)

        # Job ids are the zero-padded count of the jobs registered so far.
        n_jobs = len(self.jobs)
        job_id = str(n_jobs).zfill(3)

        # ---- map phase ----
        map_options = {
            'map_function': map_function,
            'iterdata': map_iterdata,
            'extra_env': extra_env,
            'extra_meta': extra_meta,
            'obj_chunk_size': chunk_size,
            'runtime_memory': map_runtime_memory,
            'remote_invocation': remote_invocation,
            'remote_invocation_groups': remote_invocation_groups,
            'invoke_pool_threads': invoke_pool_threads,
            'exclude_modules': exclude_modules,
            'is_cf_cluster': self.is_cf_cluster,
            'overwrite_invoke_args': overwrite_invoke_args,
            'execution_timeout': timeout,
        }
        map_job, parts_per_object = create_map_job(self.config,
                                                   self.internal_storage,
                                                   self.executor_id,
                                                   job_id,
                                                   **map_options)
        map_futures = self.invoker.run(map_job)
        self.jobs[map_job['job_id']] = {
            'futures': map_futures,
            'total': map_job['total_calls'],
            'state': JobState.running,
        }
        self._state = ExecutorState.running

        if reducer_wait_local:
            self.monitor(futures=map_futures)

        # ---- reduce phase ----
        # NOTE(review): the reduce job is created with the same job_id as the map
        # job; if both report the same job['job_id'], the entry below replaces the
        # map entry in self.jobs -- confirm against create_reduce_job.
        reduce_job = create_reduce_job(self.config,
                                       self.internal_storage,
                                       self.executor_id,
                                       job_id,
                                       reduce_function,
                                       reduce_runtime_memory,
                                       map_futures,
                                       parts_per_object,
                                       reducer_one_per_object,
                                       extra_env,
                                       extra_meta)
        reduce_futures = self.invoker.run(reduce_job)
        self.jobs[reduce_job['job_id']] = {
            'futures': reduce_futures,
            'total': reduce_job['total_calls'],
            'state': JobState.running,
        }

        # Flag every map future as not producing output.
        for map_future in map_futures:
            map_future.produce_output = False

        return map_futures + reduce_futures
Пример #2
0
    def map_reduce(self,
                   map_function,
                   map_iterdata,
                   reduce_function,
                   extra_params=None,
                   extra_env=None,
                   map_runtime_memory=None,
                   reduce_runtime_memory=None,
                   chunk_size=None,
                   chunk_n=None,
                   remote_invocation=False,
                   remote_invocation_groups=None,
                   timeout=EXECUTION_TIMEOUT,
                   reducer_one_per_object=False,
                   reducer_wait_local=False,
                   invoke_pool_threads=450,
                   include_modules=[],
                   exclude_modules=[]):
        """
        Run a map job over the data, then a reduce job over the resulting map futures.

        A runtime is selected through ``self.invoker.select_runtime`` for each
        phase; the jobs are registered in ``self.jobs`` under ids prefixed with
        'M' (map) and 'R' (reduce).

        NOTE(review): ``include_modules`` / ``exclude_modules`` default to shared
        list objects. This method only forwards them; confirm nothing downstream
        mutates them.

        :param map_function: the function to map over the data
        :param map_iterdata: the data to map over (forwarded as ``iterdata``)
        :param reduce_function: the function to reduce over the map futures
        :param extra_params: Additional parameters to pass to function activation. Default None.
        :param extra_env: Additional environment variables for action environment. Default None.
        :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
        :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
        :param chunk_size: the size of the data chunks to split each object. 'None' for processing
                           the whole file in one function activation.
        :param chunk_n: Number of chunks to split each object. 'None' for processing the whole
                        file in one function activation.
        :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False'
        :param remote_invocation_groups: forwarded unchanged to the map job. Default None.
        :param timeout: Time that the functions have to complete their execution before raising
                        a timeout (forwarded as ``execution_timeout``).
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param reducer_wait_local: When true, ``self.wait`` is called on the map futures
                                   before the reduce job is created.
        :param invoke_pool_threads: Number of threads to use to invoke.
        :param include_modules: Explicitly pickle these dependencies.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.

        :return: the list of map futures followed by the reduce futures
        :raises Exception: if the executor is already in the Finished state
        """
        if self._state == FunctionExecutor.State.Finished:
            message = ('You cannot run map_reduce() in the current state.'
                       ' Create a new FunctionExecutor() instance.')
            raise Exception(message)

        # Both phases share the zero-padded count of jobs registered so far.
        sequence = str(len(self.jobs)).zfill(3)
        map_job_id = 'M' + sequence
        reduce_job_id = 'R' + sequence

        # ---- map phase ----
        map_runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)
        map_options = {
            'map_function': map_function,
            'iterdata': map_iterdata,
            'runtime_meta': map_runtime_meta,
            'runtime_memory': map_runtime_memory,
            'extra_params': extra_params,
            'extra_env': extra_env,
            'obj_chunk_size': chunk_size,
            'obj_chunk_number': chunk_n,
            'remote_invocation': remote_invocation,
            'remote_invocation_groups': remote_invocation_groups,
            'invoke_pool_threads': invoke_pool_threads,
            'include_modules': include_modules,
            'exclude_modules': exclude_modules,
            'is_remote_cluster': self.is_remote_cluster,
            'execution_timeout': timeout,
        }
        map_job = create_map_job(self.config, self.internal_storage,
                                 self.executor_id, map_job_id, **map_options)

        map_futures = self.invoker.run(map_job)
        self.jobs[map_job_id] = {'futures': map_futures, 'state': JobState.Running}
        self._state = FunctionExecutor.State.Running

        if reducer_wait_local:
            self.wait(fs=map_futures)

        # ---- reduce phase ----
        reduce_runtime_meta = self.invoker.select_runtime(reduce_job_id,
                                                          reduce_runtime_memory)
        reduce_options = {
            'runtime_meta': reduce_runtime_meta,
            'reducer_one_per_object': reducer_one_per_object,
            'runtime_memory': reduce_runtime_memory,
            'extra_env': extra_env,
            'include_modules': include_modules,
            'exclude_modules': exclude_modules,
        }
        reduce_job = create_reduce_job(self.config, self.internal_storage,
                                       self.executor_id, reduce_job_id,
                                       reduce_function, map_job, map_futures,
                                       **reduce_options)

        reduce_futures = self.invoker.run(reduce_job)
        self.jobs[reduce_job_id] = {'futures': reduce_futures, 'state': JobState.Running}

        # Flag every map future as not producing output.
        for map_future in map_futures:
            map_future.produce_output = False

        return map_futures + reduce_futures
Пример #3
0
    def map_reduce(self,
                   map_function,
                   map_iterdata,
                   reduce_function,
                   extra_params=None,
                   extra_env=None,
                   map_runtime_memory=None,
                   reduce_runtime_memory=None,
                   chunk_size=None,
                   chunk_n=None,
                   timeout=EXECUTION_TIMEOUT,
                   invoke_pool_threads=500,
                   reducer_one_per_object=False,
                   reducer_wait_local=False,
                   include_modules=[],
                   exclude_modules=[]):
        """
        Run a map job over the data, then a reduce job over the resulting map futures.

        A runtime is selected through ``self.invoker.select_runtime`` for each
        phase, and the futures of both jobs are appended to ``self.futures``.

        NOTE(review): ``include_modules`` / ``exclude_modules`` default to shared
        list objects. This method only forwards them; confirm nothing downstream
        mutates them.

        :param map_function: the function to map over the data
        :param map_iterdata: the data to map over (forwarded as ``iterdata``)
        :param reduce_function: the function to reduce over the map futures
        :param extra_params: Additional parameters to pass to function activation. Default None.
        :param extra_env: Additional environment variables for action environment. Default None.
        :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
        :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
        :param chunk_size: the size of the data chunks to split each object. 'None' for processing
                           the whole file in one function activation.
        :param chunk_n: Number of chunks to split each object. 'None' for processing the whole
                        file in one function activation.
        :param timeout: Time that the functions have to complete their execution before raising
                        a timeout (forwarded as ``execution_timeout``).
        :param invoke_pool_threads: Number of threads to use to invoke.
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param reducer_wait_local: When true, ``self.wait`` is called on the map futures
                                   before the reduce job is created.
        :param include_modules: Explicitly pickle these dependencies.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.

        :return: the list of map futures followed by the reduce futures
        """
        # ---- map phase ----
        map_job_id = self._create_job_id('M')
        map_runtime_meta = self.invoker.select_runtime(map_job_id, map_runtime_memory)

        map_options = {
            'map_function': map_function,
            'iterdata': map_iterdata,
            'runtime_meta': map_runtime_meta,
            'runtime_memory': map_runtime_memory,
            'extra_params': extra_params,
            'extra_env': extra_env,
            'obj_chunk_size': chunk_size,
            'obj_chunk_number': chunk_n,
            'invoke_pool_threads': invoke_pool_threads,
            'include_modules': include_modules,
            'exclude_modules': exclude_modules,
            'execution_timeout': timeout,
        }
        map_job = create_map_job(self.config, self.internal_storage,
                                 self.executor_id, map_job_id, **map_options)

        map_futures = self.invoker.run(map_job)
        self.futures.extend(map_futures)

        if reducer_wait_local:
            self.wait(fs=map_futures)

        # ---- reduce phase ----
        # str.replace swaps every 'M' in the id, not only a leading one.
        # NOTE(review): presumably 'M' appears only as the prefix -- confirm
        # against _create_job_id.
        reduce_job_id = map_job_id.replace('M', 'R')
        reduce_runtime_meta = self.invoker.select_runtime(reduce_job_id,
                                                          reduce_runtime_memory)

        reduce_options = {
            'runtime_meta': reduce_runtime_meta,
            'reducer_one_per_object': reducer_one_per_object,
            'runtime_memory': reduce_runtime_memory,
            'extra_env': extra_env,
            'include_modules': include_modules,
            'exclude_modules': exclude_modules,
        }
        reduce_job = create_reduce_job(self.config, self.internal_storage,
                                       self.executor_id, reduce_job_id,
                                       reduce_function, map_job, map_futures,
                                       **reduce_options)

        reduce_futures = self.invoker.run(reduce_job)
        self.futures.extend(reduce_futures)

        # Flag every map future as not producing output.
        for map_future in map_futures:
            map_future.produce_output = False

        self._state = FunctionExecutor.State.Running

        return map_futures + reduce_futures