def _push(self, data):
    """Override Consecution's push such that we can push in parallel"""
    if self._logging == "output":
        self._write_log(data)

    executor_kwargs = self.context.get("executor_kwargs", None) or {}
    with self.executor_class(**executor_kwargs) as executor:
        futures = []
        do_split = self.context.get("split", False)
        info(
            "%s: split=%s, %d downstream nodes"
            % (self.__class__.__name__, do_split, len(self._downstream_nodes)),
            label="push",
        )

        if do_split:
            # Split the data among the downstream nodes
            splits = divide_data(data, len(self._downstream_nodes))
            for i, split in enumerate(splits):
                node = self._downstream_nodes[i]
                futures.append(executor.submit(node._process, split))
        else:
            # Pass complete data to each downstream node
            for downstream in self._downstream_nodes:
                futures.append(executor.submit(downstream._process, data))

        # Wait for results
        for future in self.__class__.as_completed_func(futures):
            future.result()
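
# A minimal, self-contained sketch of the fan-out pattern _push implements,
# using concurrent.futures directly. divide_data_example and the print
# stand-ins below are illustrative, not this library's API.
from concurrent.futures import ThreadPoolExecutor, as_completed

def divide_data_example(data, count):
    # Simplified stand-in for divide_data: `count` roughly equal slices
    size, rem = divmod(len(data), count)
    slices, start = [], 0
    for i in range(count):
        end = start + size + (1 if i < rem else 0)
        slices.append(data[start:end])
        start = end
    return slices

downstream = [print, print]  # stand-ins for downstream node._process callables
data = list(range(10))

with ThreadPoolExecutor() as executor:
    # split=True behavior: each downstream node gets its own slice
    futures = [
        executor.submit(node, split)
        for node, split in zip(downstream, divide_data_example(data, len(downstream)))
    ]
    for future in as_completed(futures):
        future.result()  # surfaces any exception raised in a worker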
def consume(
    self,
    data=None,
    cleanup=None,
    split_count=None,
    synchronous=False,
    timeout=None,
    **node_contexts
):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after
        data processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to use executor._max_workers.
    synchronous : bool, optional
        If False, return Futures. If True, wait for futures to complete
        and return their results, if any.
    timeout : int or float, optional
        Raises a concurrent.futures.TimeoutError if __next__() is called
        and the result isn't available after timeout seconds from the
        original call to as_completed(). Ignored if synchronous=False.
    **node_contexts
        Keyword arguments that are node_name->param_dict

    """
    with self.get_executor() as executor:
        worker_count = self.get_worker_count(executor)
        split_count = split_count_helper(data, split_count or worker_count)
        if data is None:
            splits = [None for s in range(split_count)]
        else:
            splits = divide_data(data, split_count)

        futures = []
        info(
            "%s: data len: %s, splits: %d, workers: %d"
            % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
        )
        for split in splits:
            futures.append(
                executor.submit(
                    consume, self.pipeline, split, cleanup=cleanup, **node_contexts
                )
            )

        if synchronous:
            return self.get_results(futures, timeout=timeout)

        return futures
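
# The timeout semantics described in the docstring above are those of
# concurrent.futures.as_completed. A self-contained illustration,
# independent of this library:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(time.sleep, 2)]
    try:
        for future in as_completed(futures, timeout=0.1):
            future.result()
    except TimeoutError:
        print("result not available within timeout")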
def run(self, data, func, split_count=None, timeout=None, push_type=PushTypes.Async):
    """Use asyncio to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        An async callable that will be passed data to operate on using
        asyncio.
    split_count : int, optional
        How many slices to split the data into for concurrent processing.
        Default is to set split_count = len(data).
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately.
        If "input", push the input data immediately after task submission.
        If "result", collect the result synchronously and push it.

    """
    split_count = split_count or len(data)
    splits = divide_data(data, split_count)
    info(
        "%s: data len: %s, splits: %s"
        % (self.__class__.__name__, size(data, "n/a"), split_count)
    )

    loop, close = get_or_create_event_loop()

    try:
        futures = [loop.create_task(func(split)) for split in splits]

        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if close and push_type == PushTypes.Result:
            # We can only be sure it's safe to close the event loop if it
            # was created here and all processing took place in here.
            loop.close()
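
# Self-contained sketch of the pattern above: one asyncio task per data
# slice, awaited together. The push/PushTypes machinery is omitted and
# double_all is a hypothetical async func.
import asyncio

async def double_all(split):
    await asyncio.sleep(0)  # yield control, as real async work would
    return [x * 2 for x in split]

async def main(data, split_count):
    splits = [data[i::split_count] for i in range(split_count)]
    tasks = [asyncio.create_task(double_all(s)) for s in splits]
    return await asyncio.gather(*tasks)

print(asyncio.run(main(list(range(10)), split_count=3)))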
def run(
    self,
    data,
    func,
    executor=None,
    executor_kwargs=None,
    split_count=None,
    timeout=None,
    push_type=PushTypes.Async,
    **kwargs
):
    """Use a parallel executor to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        A callable that will be passed data to operate on in parallel
    executor : Executor, optional
        If passed, use this executor instead of creating one.
    executor_kwargs : dict, optional
        Keyword arguments to pass when initializing an executor.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to set split_count = number of workers.
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately.
        If "input", push the input data immediately after task submission.
        If "result", collect the result synchronously and push it.
    **kwargs
        Keyword arguments passed to the executor when submitting work

    """
    self.check_data(data)

    shutdown = True
    if executor:
        shutdown = False
    else:
        executor_kwargs = executor_kwargs or {}
        executor = self.get_executor(**executor_kwargs)

    try:
        worker_count = self.get_worker_count(executor)
        split_count = split_count or worker_count
        splits = divide_data(data, split_count)
        info(
            "%s: data len: %s, splits: %s, workers: %d"
            % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
        )
        futures = self.submit(executor, func, splits, **kwargs)

        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if shutdown:
            self.shutdown_executor(executor)
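
# Sketch of the submit-and-collect flow for push_type="result", with a real
# ThreadPoolExecutor standing in for get_executor/self.submit/self.push.
# (_max_workers is the private attribute the consume() docstring above
# references as the default worker count.)
from concurrent.futures import ThreadPoolExecutor

def func(split):
    return sum(split)

data = list(range(100))
with ThreadPoolExecutor(max_workers=4) as executor:
    worker_count = executor._max_workers  # mirrors get_worker_count
    splits = [data[i::worker_count] for i in range(worker_count)]
    futures = [executor.submit(func, s) for s in splits]
    results = [f.result(timeout=30) for f in futures]  # "result" waits
print(results)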
def _get_script_args(self):
    """Generate all tlbx Args for this Glider"""
    node_lookup = self.glider.get_node_lookup()
    script_args = OrderedDict()  # Map of arg names to Args
    arg_dests = {}  # Map of arg dests back to names
    node_arg_names = defaultdict(set)

    requires_data = not isinstance(self.glider.top_node, NoInputNode)
    if requires_data and not self.blacklisted("", SCRIPT_DATA_ARG):
        script_args[SCRIPT_DATA_ARG] = Arg(SCRIPT_DATA_ARG, nargs="+")

    def add_script_arg(node, arg_name, **kwargs):
        script_arg = self._get_script_arg(node, arg_name, **kwargs)
        if not script_arg:
            return

        script_args[script_arg.name] = script_arg
        arg_dests[script_arg.dest] = script_arg.name
        node_arg_names[arg_name].add(script_arg.name)

    for node in node_lookup.values():
        node_help = {}
        if FunctionDoc:
            try:
                # Only works if run() has docs in numpydoc format
                docs = FunctionDoc(node.run)
                node_help = {v.name: "\n".join(v.desc) for v in docs["Parameters"]}
            except Exception as e:
                info("failed to parse node '%s' run() docs: %s" % (node.name, str(e)))

        for arg_name, _ in node.run_args.items():
            add_script_arg(
                node,
                arg_name,
                required=True,
                arg_help=node_help.get(arg_name, None),
            )

        for kwarg_name, kwarg_default in node.run_kwargs.items():
            add_script_arg(
                node,
                kwarg_name,
                required=False,
                default=kwarg_default,
                arg_help=node_help.get(kwarg_name, None),
            )

    def assert_arg_present(custom_arg, arg_name):
        raiseifnot(
            arg_name in script_args,
            (
                "Custom arg %s with dest=%s maps to node arg=%s "
                "which is not in the script arg list. Check for "
                "conflicting args that cover the same node arg."
                % (custom_arg.name, custom_arg.dest, arg_name)
            ),
        )

    for custom_arg in self.custom_args:
        raiseif(
            self.blacklisted("", custom_arg.name),
            "Blacklisted arg '%s' passed as a custom arg" % custom_arg.name,
        )

        if custom_arg.dest in node_arg_names:
            # Find and delete all node-based args this will cover
            for arg_name in node_arg_names[custom_arg.dest]:
                assert_arg_present(custom_arg, arg_name)
                del script_args[arg_name]

        if custom_arg.dest in arg_dests:
            # Remove the original arg that this custom arg will satisfy
            arg_name = arg_dests[custom_arg.dest]
            assert_arg_present(custom_arg, arg_name)
            del script_args[arg_name]

        script_args[custom_arg.name] = custom_arg
        arg_dests[custom_arg.dest] = custom_arg.name

    return script_args.values()
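
# Sketch of the numpydoc parsing used above to pull per-parameter help text
# from a node's run() docstring. Requires numpydoc; the run() below is a
# made-up example.
from numpydoc.docscrape import FunctionDoc

def run(self, data, chunksize=None):
    """Example node run method

    Parameters
    ----------
    data
        An iterable to process
    chunksize : int, optional
        How many rows to process per batch

    """

docs = FunctionDoc(run)
node_help = {v.name: "\n".join(v.desc) for v in docs["Parameters"]}
print(node_help["chunksize"])  # -> How many rows to process per batch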