Пример #1
0
    def _push(self, data):
        """Push data to all downstream nodes in parallel.

        Overrides Consecution's push. An executor is created from
        ``self.executor_class`` (configured by the "executor_kwargs" context
        entry), one task is submitted per downstream node, and the call
        blocks until every task has finished.

        Parameters
        ----------
        data
            The data to hand downstream. When the "split" context entry is
            truthy the data is divided among the downstream nodes; otherwise
            each node receives the complete data.

        """
        if self._logging == "output":
            self._write_log(data)

        pool_kwargs = self.context.get("executor_kwargs", None) or {}
        with self.executor_class(**pool_kwargs) as pool:
            do_split = self.context.get("split", False)
            node_total = len(self._downstream_nodes)
            info(
                "%s: split=%s, %d downstream nodes"
                % (self.__class__.__name__, do_split, node_total),
                label="push",
            )

            if do_split:
                # One slice of the data per downstream node, matched by index
                chunks = divide_data(data, len(self._downstream_nodes))
                pending = [
                    pool.submit(self._downstream_nodes[idx]._process, chunk)
                    for idx, chunk in enumerate(chunks)
                ]
            else:
                # Every downstream node gets the full data
                pending = [
                    pool.submit(node._process, data)
                    for node in self._downstream_nodes
                ]

            # Block until all tasks finish; result() re-raises task errors
            for finished in self.__class__.as_completed_func(pending):
                finished.result()
Пример #2
0
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Setup node contexts and consume data with the pipeline

        The data is divided into slices and one ``self.consume_task`` is
        queued (via ``delay``) per slice.

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to clean up functions to be run after
            data processing is complete.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is to inspect the celery app and set split_count = worker count.
        synchronous : bool, optional
            If False, return AsyncResults. If True, wait for tasks to complete and
            return their results, if any.
        timeout : int or float, optional
            If waiting for results, pass this as timeout to AsyncResult.get().
        **node_contexts
            Keyword arguments that are node_name->param_dict

        Returns
        -------
        list
            The values returned by ``AsyncResult.get()`` when ``synchronous``
            is True, otherwise the objects returned by ``consume_task.delay``.

        Raises
        ------
        RuntimeError
            If ``split_count`` was not given and no celery worker replied to
            the stats inspection, so a worker count cannot be determined.

        """
        if not split_count:
            dbg("determining split count from app celery worker count")
            app_stats = self.consume_task.app.control.inspect().stats()
            if app_stats is None:
                # inspect().stats() yields None (not an empty dict) when no
                # worker replies; fail with an actionable message instead of
                # an AttributeError on None.
                raise RuntimeError(
                    "No celery workers responded to inspect().stats(); "
                    "start a worker or pass split_count explicitly")
            split_count = len(app_stats)

        split_count = split_count_helper(data, split_count)
        if data is None:
            # No data: still queue split_count tasks, each with a None payload
            splits = [None for _ in range(split_count)]
        else:
            splits = divide_data(data, split_count)

        dbg("%s: data len: %s, splits: %d" %
            (self.__class__.__name__, size(data, "n/a"), split_count))

        async_results = [
            self.consume_task.delay(self.pipeline,
                                    split,
                                    cleanup=cleanup,
                                    **node_contexts)
            for split in splits
        ]

        if synchronous:
            results = []
            for async_result in async_results:
                try:
                    results.append(async_result.get(timeout=timeout))
                finally:
                    # Always forget the result once get() returned or raised
                    async_result.forget()
            return results

        return async_results
Пример #3
0
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Setup node contexts and consume data with the pipeline

        The data is divided into slices and one ``consume`` call is submitted
        to the executor per slice.

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to clean up functions to be run after
            data processing is complete.
        split_count : int, optional
            How many slices to split the data into for parallel processing.
            Default is the worker count reported by
            ``self.get_worker_count(executor)``.
        synchronous : bool, optional
            If False, return Futures. If True, wait for futures to complete and
            return their results, if any.
        timeout : int or float, optional
            Passed to ``self.get_results()`` when ``synchronous`` is True.
            Ignored if synchronous=False.
        **node_contexts
            Keyword arguments that are node_name->param_dict

        Returns
        -------
        The value of ``self.get_results(futures, timeout=timeout)`` when
        ``synchronous`` is True, otherwise the list of submitted futures.

        """
        # NOTE(review): if get_executor() returns a concurrent.futures
        # executor, leaving this with-block waits for pending work, so
        # synchronous=False may still block here -- confirm.
        with self.get_executor() as executor:
            worker_count = self.get_worker_count(executor)
            split_count = split_count_helper(data, split_count or worker_count)
            if data is None:
                # No data: still submit split_count tasks with a None payload
                splits = [None for _ in range(split_count)]
            else:
                splits = divide_data(data, split_count)

            # Argument order must follow the format string: splits, then workers
            info("%s: data len: %s, splits: %d, workers: %d" % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            ))

            futures = [
                executor.submit(consume,
                                self.pipeline,
                                split,
                                cleanup=cleanup,
                                **node_contexts)
                for split in splits
            ]

            if synchronous:
                return self.get_results(futures, timeout=timeout)

            return futures
Пример #4
0
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Set up node contexts and consume data with the pipeline via RQ

        The data is divided into slices and one ``rq_consume`` job is
        enqueued on ``self.queue`` per slice.

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to clean up functions to be run after
            data processing is complete.
        split_count : int, optional
            Number of slices to split the data into for parallel processing.
            Defaults to the number of workers found for ``self.queue``.
        synchronous : bool, optional
            If False, return the enqueued Jobs. If True, wait for the jobs
            and return their results, if any.
        timeout : int or float, optional
            Passed to ``get_async_results`` when waiting for results.
        **node_contexts
            Keyword arguments that are node_name->param_dict

        """
        if not split_count:
            dbg("determining split count from rq worker count")
            split_count = len(Worker.all(queue=self.queue))

        split_count = split_count_helper(data, split_count)
        # With no data, still enqueue split_count jobs, each given None
        splits = (
            [None for _ in range(split_count)]
            if data is None
            else divide_data(data, split_count)
        )

        class_name = self.__class__.__name__
        dbg("%s: data len: %s, splits: %d" %
            (class_name, size(data, "n/a"), split_count))

        jobs = [
            self.queue.enqueue(
                rq_consume,
                args=(self.pipeline, chunk),
                kwargs=dict(cleanup=cleanup, **node_contexts),
            )
            for chunk in splits
        ]

        if not synchronous:
            return jobs

        return get_async_results(jobs, timeout=timeout)
Пример #5
0
    def run(self,
            data,
            func,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async):
        """Use asyncio to apply func to slices of data

        Parameters
        ----------
        data
            An iterable to process
        func : callable
            An async callable; it is called once per slice and the resulting
            coroutine is scheduled as a task on the event loop.
        split_count : int, optional
            Number of slices to split the data into for concurrent
            processing. Defaults to len(data).
        timeout : int or float, optional
            Passed to ``self.get_results`` when push_type waits for results;
            unused otherwise.
        push_type : str, optional
            If "async", push each Future individually right away.
            If "input", push the input data immediately after task submission.
            If "result", collect the results synchronously and push them.

        Raises
        ------
        AssertionError
            If push_type is not one of the supported values.

        """
        split_count = split_count or len(data)
        chunks = divide_data(data, split_count)
        info("%s: data len: %s, splits: %s" %
             (self.__class__.__name__, size(data, "n/a"), split_count))

        loop, owns_loop = get_or_create_event_loop()

        try:
            tasks = [loop.create_task(func(chunk)) for chunk in chunks]

            if push_type == PushTypes.Async:
                for task in tasks:
                    self.push(task)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                collected = self.get_results(tasks, timeout=timeout)
                self.push(collected)
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)
        finally:
            # Closing is only safe when the loop was created here and every
            # task was awaited here, i.e. the "result" push type.
            if owns_loop and push_type == PushTypes.Result:
                loop.close()
Пример #6
0
 def get_splits(self, data, split_count):
     """Return data divided into split_count slices by divide_data"""
     slices = divide_data(data, split_count)
     return slices
Пример #7
0
    def run(self,
            data,
            func,
            executor=None,
            executor_kwargs=None,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async,
            **kwargs):
        """Apply func to slices of data using a parallel executor

        Parameters
        ----------
        data
            An iterable to process; validated with ``self.check_data`` first
        func : callable
            A callable that will be passed data to operate on in parallel
        executor : Executor, optional
            If passed, use this executor instead of creating one. An executor
            supplied by the caller is not shut down here.
        executor_kwargs : dict, optional
            Keyword arguments to pass when initializing an executor.
        split_count : int, optional
            Number of slices to split the data into for parallel processing.
            Defaults to the worker count from ``self.get_worker_count``.
        timeout : int or float, optional
            Passed to ``self.get_results`` when push_type waits for results;
            unused otherwise.
        push_type : str, optional
            If "async", push each Future individually right away.
            If "input", push the input data immediately after task submission.
            If "result", collect the results synchronously and push them.
        **kwargs
            Keyword arguments forwarded to ``self.submit``

        Raises
        ------
        AssertionError
            If push_type is not one of the supported values.

        """
        self.check_data(data)

        # Only an executor created in this call is shut down at the end
        shutdown = not executor
        if shutdown:
            executor = self.get_executor(**(executor_kwargs or {}))

        try:
            worker_count = self.get_worker_count(executor)
            split_count = split_count or worker_count
            chunks = divide_data(data, split_count)
            log_fields = (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
            info("%s: data len: %s, splits: %s, workers: %d" % log_fields)
            pending = self.submit(executor, func, chunks, **kwargs)

            if push_type == PushTypes.Async:
                for item in pending:
                    self.push(item)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                collected = self.get_results(pending, timeout=timeout)
                self.push(collected)
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)

        finally:
            if shutdown:
                self.shutdown_executor(executor)