def collect_requirements(graph) -> Tuple[iset, iset]:
    """Collect & split datanodes in (possibly overlapping) `needs`/`provides`."""
    operations = list(yield_ops(graph))
    provides = iset(p for op in operations for p in op.provides)
    needs = iset(_optionalized(graph, n) for op in operations for n in op.needs)
    return needs, provides
def validate(self, inputs: Items = UNSET, outputs: Items = UNSET):
    """
    Scream on invalid inputs, outputs or no operations in graph.

    :param inputs:
        the inputs that this plan was :term:`compile`\\d for, or MORE;
        will scream if LESS...
    :param outputs:
        the outputs that this plan was :term:`compile`\\d for, or LESS;
        will scream if MORE...

    :raises ValueError:
        *Unsolvable graph...*
            if it cannot produce any `outputs` from the given `inputs`.
        *Plan needs more inputs...*
            if given `inputs` mismatched plan's :attr:`needs`.
        *Unreachable outputs...*
            if net cannot produce asked `outputs`.
    """
    if not self.dag:
        raise ValueError(
            f"Unsolvable graph:\n  +--{self.net}"
            f"\n  +--possible inputs: {list(self.net.needs)}"
            f"\n  +--possible outputs: {list(self.net.provides)}"
        )

    if inputs is UNSET:
        inputs = self.needs
    if outputs is UNSET:
        outputs = self.provides

    # Check plan<-->inputs mismatch.
    #
    missing = iset(self.needs) - set(inputs)
    if missing:
        raise ValueError(
            f"Plan needs more inputs: {list(missing)}"
            f"\n  given inputs: {list(inputs)}\n  {self}"
        )

    if outputs:
        unknown = (
            iset(astuple(outputs, "outputs", allowed_types=abc.Sequence))
            - self.provides
        )
        if unknown:
            raise ValueError(
                f"Unreachable outputs {list(unknown)}\n for given inputs {list(inputs)}"
                f"\n for graph: {self}"
            )
def _filter_projects_by_pnames(self, projects, version, *pnames): """Separate `version` from `pnames`, scream if unknown pnames.""" if pnames: all_pnames = [prj.pname for prj in projects] pnames = iset(pnames) unknown_projects = (pnames - iset(all_pnames)) if unknown_projects: raise cmdlets.CmdException( "Unknown project(s): %s\n Choose from existing one(s): %s" % (', '.join(unknown_projects), ', '.join(all_pnames))) projects = [p for p in projects if p.pname in pnames] return version, projects
def _inherit_parent_cmd(self, change): """ Inherit config-related stuff from up the cmd-chain. """ if self.parent: ## Collect parents, ordered like that: # subapp, self, parent1, ... # cmd_chain = self.my_cmd_chain() ## Collect separately and merge SPECs separately, # to prepend them before SPECs at the end. # conf_classes = list( itz.concat(cmd.conf_classes for cmd in cmd_chain)) ## Merge aliases/flags reversed. # cmd_aliases = dtz.merge(cmd.cmd_aliases for cmd in cmd_chain[::-1]) cmd_flags = dtz.merge(cmd.cmd_flags for cmd in cmd_chain[::-1]) else: ## We are root. cmd_chain = [self] conf_classes = list(self.conf_classes) cmd_aliases = self.cmd_aliases cmd_flags = self.cmd_flags cmd_classes = [type(cmd) for cmd in cmd_chain] self.classes = list(iset(cmd_classes + conf_classes)) self.aliases.update(cmd_aliases) self.flags.update(cmd_flags)
def load_config_files(self):
    """
    Load default and user-specified override config-files.

    Config-files, in descending order:

    - user-overrides:
      - :envvar:`<APPNAME>_CONFIG_FILE`, or if not set,
      - :attr:`config_file`;

    - default config-files:
      - ~/.<appname>/<appname>_config.{json,py} and
      - <this-file's-folder>/<appname>_config.{json,py}.
    """
    # Load "standard" configs,
    # path-list in descending priority order.
    #
    paths = list(iset([default_config_dir(), _mydir]))
    self.load_config_file(default_config_fname(), path=paths)

    # Load "user" configs.
    #
    user_conf_fpaths = self.user_config_fpaths
    for fp in user_conf_fpaths[::-1]:
        cdir, cfname = osp.split(fp)
        self.load_config_file(cfname, path=cdir)
def _topo_sort_nodes(dag) -> iset:
    """
    Topo-sort dag by execution order & operation-insertion order to break ties.

    This means (probably!?) that the first inserted wins the `needs`,
    but the last one wins the `provides` (and the final solution).

    Inform user in case of cycles.
    """
    node_keys = dict(zip(dag.nodes, count()))
    try:
        return iset(nx.lexicographical_topological_sort(dag, key=node_keys.get))
    except nx.NetworkXUnfeasible as ex:
        import sys
        from textwrap import dedent

        tb = sys.exc_info()[2]
        msg = dedent(
            f"""
            {ex}

            TIP:
                Launch a post-mortem debugger, move 3 frames UP, and plot the
                `graphtik.planning.Network` class in `self`
                to discover the cycle.

                If GRAPHTIK_DEBUG enabled, this plot will be stored in the tmp-folder
                automatically :-)
            """
        )
        raise nx.NetworkXUnfeasible(msg).with_traceback(tb)
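# Editor's note: a tiny sketch of the tie-breaking above (toy graph with
# plain-string nodes; assumes only `networkx` and the `iset` alias in scope):
#
#     >>> import networkx as nx
#     >>> g = nx.DiGraph([("b", "z"), ("a", "z")])   # insertion order: b, z, a
#     >>> list(_topo_sort_nodes(g))
#     ['b', 'a', 'z']
#
# Both `a` & `b` are immediately runnable, but `b` was inserted first, so it sorts first.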
def _slices_to_ids(slices, thelist): from boltons.setutils import IndexedSet as iset all_ids = list(range(len(thelist))) mask_ids = iset() for aslice in slices: mask_ids.update(all_ids[aslice]) return list(mask_ids)
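# Editor's note: a minimal usage sketch with hypothetical data:
#
#     >>> _slices_to_ids([slice(0, 2), slice(3, None)], list("abcde"))
#     [0, 1, 3, 4]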
def __call__(self, *operations):
    """
    Composes a collection of operations into a single computation graph,
    obeying the ``merge`` property, if set in the constructor.

    :param operations:
        Each argument should be an operation instance created using
        ``operation``.

    :return:
        Returns a special type of operation class, which represents an
        entire computation graph as a single operation.
    """
    assert len(operations), "no operations provided to compose"

    # If merge is desired, deduplicate operations before building network
    if self.merge:
        merge_set = iset()  # Preserve given node order.
        for op in operations:
            if isinstance(op, NetworkOperation):
                netop_nodes = nx.topological_sort(op.net.graph)
                merge_set.update(s for s in netop_nodes if isinstance(s, Operation))
            else:
                merge_set.add(op)
        operations = merge_set

    provides = iset(p for op in operations for p in op.provides)
    # Mark them all as optional, now that #18 calmly ignores
    # non-fully satisfied operations.
    needs = iset(optional(n) for op in operations for n in op.needs) - provides

    # Build network
    net = Network()
    for op in operations:
        net.add_op(op)

    return NetworkOperation(
        name=self.name, needs=needs, provides=provides, params={}, net=net
    )
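# Editor's note: a hedged usage sketch in the legacy graphkit style
# (assumes `operation` from this module and two plain functions `mul`/`sub`):
#
#     graph = compose(name="graph")(
#         operation(name="mul", needs=["a", "b"], provides=["ab"])(mul),
#         operation(name="sub", needs=["a", "ab"], provides=["a_minus_ab"])(sub),
#     )
#     out = graph({"a": 2, "b": 5}, outputs=["a_minus_ab"])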
def _glob_find_files(pattern_pairs: Iterable[Tuple[str, str]], mybase: Path):
    from boltons.setutils import IndexedSet as iset

    files = iset()
    notfiles = set()  # type: ignore
    for positive, negative in pattern_pairs:
        if positive:
            new_files = iset(mybase.glob(positive))
            cleared_files = [
                f for f in new_files if not any(nf in f.parents for nf in notfiles)
            ]
            files.update(cleared_files)
        elif negative:
            new_notfiles = mybase.glob(negative)
            notfiles.update(new_notfiles)
        else:
            raise AssertionError("Both in (positive, negative) pair are None!")

    return files
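# Editor's note: order matters in `pattern_pairs` - a negative pattern only
# excludes files from positives processed *after* it, so list negatives first
# (hypothetical patterns):
#
#     from pathlib import Path
#
#     files = _glob_find_files(
#         [(None, "build"), ("**/*.py", None)],  # skip anything under `build/`
#         Path("."))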
def collect_fpaths(self, path_list):
    """
    Collects all (``.json|.py``) files present in the `path_list`, (descending order).

    :param path_list:
        A list of paths (absolute or relative, files or folders).
    :type path_list: List[Text]
    :return:
        fully-normalized paths, with ext
    """
    collected_paths = self.collected_paths = iset()
    cfg_exts = self.supported_cfg_extensions

    def try_file_extensions(basepath):
        loaded_any = False
        for ext in cfg_exts:
            f = fu.ensure_file_ext(basepath, ext)
            if f in collected_paths:
                continue

            loaded = osp.isfile(f)
            self.visit_file(f, loaded=loaded)
            loaded_any |= loaded

        ## Load any files in `conf.d/`, alphabetically-sorted.
        #
        for ext in ('', ) + cfg_exts:
            if basepath.endswith(ext):
                ## NOTE: `str.rstrip()` would strip *characters*, not the suffix.
                stripped = basepath[:-len(ext)] if ext else basepath
                conf_d = fu.ensure_file_ext(stripped, '.d')
                if os.path.isdir(conf_d):
                    for f in sorted(os.listdir(conf_d)):
                        loaded = f.endswith(cfg_exts)
                        self.visit_file(osp.join(conf_d, f),
                                        loaded=loaded)
                        loaded_any |= loaded

        return loaded_any

    def _derive_config_fpaths(path):  # -> List[Text]: TODO: enable cmdlet typing comments
        """Return multiple *existent* fpaths for each config-file path (folder/file)."""
        p = fu.convpath(path)
        loaded_any = try_file_extensions(p)
        ## Do not strip ext if has matched WITH ext.
        if not loaded_any:
            try_file_extensions(osp.splitext(p)[0])

    for cf in path_list:
        _derive_config_fpaths(cf)

    return list(collected_paths)
def _process_dependencies(
    deps: Collection[str],
) -> Tuple[Collection[str], Collection[str]]:
    """
    Strip or singularize any :term:`implicit`/:term:`sideffects` in `deps`.

    :return:
        a x2 tuple ``(op_deps, fn_deps)``, where any instances of
        :term:`sideffects` in `deps` are processed like this:

        `op_deps`
            - any :func:`.sfxed` is replaced by a sequence of ":func:`singularized
              <.dep_singularized>`" instances, one for each item in its
              :term:`sfx_list`;
            - any duplicates are discarded;
            - order is irrelevant, since they don't reach the function.

        `fn_deps`
            the dependencies consumed/produced by underlying functions,
            in the order they are first met.  In particular, it replaces
            any :func:`.sfxed` by the :func:`stripped <.dep_stripped>`, unless ...

            - it had been declared as :term:`implicit`, in which case,
              it is discarded;
            - any :func:`.sfx` are simply dropped.
    """
    #: To dedupe any `sideffected`.
    seen_sideffecteds = set()

    def as_fn_deps(dep):
        """Strip and dedupe any sfxed, drop any sfx and implicit."""
        if is_implicit(dep):  # must ignore also `sfxed`s
            pass
        elif is_sfxed(dep):
            dep = dep_stripped(dep)
            if dep not in seen_sideffecteds:
                seen_sideffecteds.add(dep)
                return (dep, )
        elif not is_sfx(dep):  # must kick in after `sfxed`
            return (dep, )
        return ()

    assert deps is not None
    if deps:
        op_deps = iset(nn for n in deps for nn in dep_singularized(n))
        fn_deps = tuple(nn for n in deps for nn in as_fn_deps(n))
        return op_deps, fn_deps
    else:
        return deps, deps
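# Editor's note: an illustrative call, assuming graphtik's `sfx`/`sfxed`
# modifier factories (exact reprs elided):
#
#     op_deps, fn_deps = _process_dependencies(["a", sfx("b"), sfxed("d", "x", "y")])
#     # op_deps: {'a', sfx('b'), sfxed('d', 'x'), sfxed('d', 'y')}  - singularized
#     # fn_deps: ('a', 'd')  - sfxed stripped to its sideffected, sfx dropped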
def _autodiscover_project_basepaths(self) -> Dict[str, Path]:
    """
    Invoked when no config exists (or asked to update it) to guess projects.

    :return:
        a mapping of {pnames: basepaths}
    """
    from . import engrave

    if not self.autodiscover_subproject_projects:
        raise cmdlets.CmdException(
            "No `Polyvers.autodiscover_subproject_projects` param given!")

    fproc = engrave.FileProcessor(parent=self)
    with self.errlogged(doing='discovering project paths',
                        info_log=self.log.info):
        scan_projects = self.autodiscover_subproject_projects
        #: Dict[Path,
        #:      List[Tuple[pvproject.Project, Engrave, Graft, List[Match]]]]
        match_map = fproc.scan_projects(scan_projects)

    ## Accept projects only if one, and only one,
    #  pair (pname <--> path) matched.
    #
    pname_path_pairs: List[Tuple[str, Path]] = [
        (match.groupdict()['pname'].decode('utf-8'),
         fpath.parent / (prj.basepath or '.'))
        for fpath, quadruples in match_map.items()
        for prj, _eng, _graft, match in quadruples]
    unique_pname_paths = iset(pname_path_pairs)

    ## check basepath conflicts.
    #
    projects: Dict[str, Path] = {}
    dupe_projects: Dict[str, Set[Path]] = defaultdict(set)
    for pname, basepath in unique_pname_paths:
        dupe_basepath = projects.get(pname)
        if dupe_basepath and dupe_basepath != basepath:
            dupe_projects[pname].add(basepath)
        else:
            projects[pname] = basepath

    if dupe_projects:
        raise cmdlets.CmdException(
            "Discovered conflicting project-basepaths: %s" %
            yu.ydumps(dupe_projects))

    return projects
def operation_executed(self, op, outputs):
    """
    Invoked once per operation, with its results.

    It will update :attr:`executed` with the operation status and
    if `outputs` were partials, it will update :attr:`canceled`
    with the unsatisfied ops downstream of `op`.

    :param op:
        the operation that completed ok
    :param outputs:
        the named values the `op` actually produced,
        which may be a subset of its `provides`.  Sideffects are not considered.
    """

    def collect_canceled_sideffects(dep, val) -> Collection:
        """yield any sfx `dep` with falsy value, singularizing sideffected."""
        if val or not is_sfx(dep):
            return ()
        return dep_singularized(dep)

    self._populate_op_layer_with_outputs(op, outputs)

    if first_solid(self.is_reschedule, getattr(op, "rescheduled", None)):
        ## Find which provides have been broken?
        #
        # OPTIMIZE: could use _fn_provides
        missing_outs = iset(op.provides) - set(outputs)
        sfx = {out for out in missing_outs if is_sfx(out)}
        canceled_sideffects = [
            sf
            for k, v in outputs.items()
            for sf in collect_canceled_sideffects(k, v)
        ]
        outs_to_break = (missing_outs - sfx) | canceled_sideffects
        log.info(
            "... (%s) missing partial outputs %s from rescheduled %s.",
            self.solid,
            list(outs_to_break),
            op,
        )

        if outs_to_break:
            dag = self.dag
            dag.remove_edges_from((op, out) for out in outs_to_break)
            self._reschedule(dag, "rescheduled", op)
            # list used by `check_if_incomplete()`
            self.broken[op] = outs_to_break
def __init__(
    self,
    *,
    excludes: Iterable[_FnKey] = None,
    base_modules: Iterable[Union[ModuleType, str]] = None,
    predicate: Callable[[Any], bool] = None,
    include_methods=None,
    sep=None,
):
    super().__init__(sep)
    if include_methods is not None:
        # Override the class-level default only when explicitly given.
        self.include_methods = bool(include_methods)
    self._seen: Set[int] = set()
    self.excludes = set(excludes or ())
    self.base_modules = iset(
        sys.modules[m] if isinstance(m, str) else m
        for m in (base_modules or ()))
    self.predicate = predicate
    self.collected = []
def yield_files(self, *fpaths):
    """
    :return:
        yields 2-tuples ``(fpath, file_text)``
    """
    import io
    import os
    from boltons.setutils import IndexedSet as iset

    fpaths = iset(fpaths) or ['-']
    for fpath in fpaths:
        if fpath == '-':
            msg = "Reading STDIN."
            if getattr(sys.stdin, 'isatty', lambda: False)():
                msg += ("..paste text, then [Ctrl+%s] to exit!" %
                        ('Z' if sys.platform == 'win32' else 'D'))
            self.log.info(msg)
            text = sys.stdin.read()
            yield "<STDIN: %i-chars>" % len(text), text
        else:
            fpath = convpath(fpath, abs_path=False)
            if osp.exists(fpath):
                afpath = convpath(fpath, abs_path=True)
                if osp.exists(afpath):
                    fpath = afpath
            else:
                self.log.error(
                    "File to read '%s' not found!"
                    "\n  CWD: %s", fpath, os.getcwd())
                continue

            try:
                with io.open(fpath, 'rt') as fin:
                    text = fin.read()
                yield fpath, text
            except Exception as ex:
                self.log.error(
                    "Reading file-path '%s' failed due to: %r",
                    fpath, ex,
                    exc_info=self.verbose)  # WARN: from `cmdlets.Spec`
                continue
def check_if_incomplete(self) -> Optional[IncompleteExecutionError]:
    """Return a :class:`IncompleteExecutionError` if `pipeline` operations failed/canceled."""
    failures = {
        op: ex for op, ex in self.executed.items() if isinstance(ex, Exception)
    }
    incomplete = iset(chain(self.canceled, failures.keys()))
    if incomplete:
        incomplete = list(yield_node_names(incomplete))
        partial_msgs = [
            f"\n  +--{op.name}: {list(pouts)}" for op, pouts in self.broken.items()
        ]
        err_msgs = [
            f"\n  +--{op.name}: {type(ex).__name__}('{ex}')"
            for op, ex in failures.items()
        ]
        msg = (
            f"Not completed x{len(incomplete)} operations {list(incomplete)}"
            f" due to x{len(failures)} failures and x{len(partial_msgs)} partial-ops:"
            f"{''.join(err_msgs)}{''.join(partial_msgs)}"
        )
        return IncompleteExecutionError(msg, self)
def collect_gpgs(): inc_errors = 1 gpg_kws = {} gpg_paths = iset( itt.chain.from_iterable(pndlu.where(prog) for prog in ('gpg2', 'gpg'))) gnupghome = osp.expanduser('~/.gnupg') gpg_avail = [] for gpg_path in gpg_paths: try: gpg = gnupg.GPG(gpgbinary=gpg_path, **gpg_kws) row = _describe_gpg(gpg) except Exception as ex: #raise if inc_errors: row = (gpg_path, '%s: %s' % (type(ex).__name__, str(ex)), None, None) else: continue gpg_avail.append(row) cols = ['GnuPG path', 'Version', '#PRIV', '#TOTAL'] gpg_avail = pd.DataFrame(gpg_avail, columns=cols) return gpg_avail
def inputs_for_recompute(
    graph,
    inputs: Sequence[str],
    recompute_from: Sequence[str],
    recompute_till: Sequence[str] = None,
) -> Tuple[iset, iset]:
    """
    Clears the inputs between `recompute_from >--<= recompute_till`.

    :param graph:
        MODIFIED, at most 2 helper nodes inserted
    :param inputs:
        a sequence
    :param recompute_from:
        None or a sequence, including any out-of-graph deps (logged)
    :param recompute_till:
        (optional) a sequence, only in-graph deps.

    :return:
        a 2-tuple with the reduced `inputs` by the dependencies that
        must be removed from the graph to recompute (along with those dependencies).

    It works by temporarily adding x2 nodes to find and remove
    the intersection of::

        strict-descendants(recompute_from) & ancestors(recompute_till)

    FIXME: merge recompute() with traversing unsatisfied (see ``test_recompute_NEEDS_FIX``)
    bc it clears inputs of unsatisfied ops (cannot be replaced later)
    """
    START, STOP = "_TMP.RECOMPUTE_FROM", "_TMP.RECOMPUTE_TILL"

    deps = set(yield_datanodes(graph.nodes))
    recompute_from = iset(recompute_from)  # traversed in logs
    inputs = iset(inputs)  # returned

    bad = recompute_from - deps
    if bad:
        log.info("... ignoring unknown `recompute_from` dependencies: %s", list(bad))
        recompute_from = recompute_from & deps  # avoid sideffect in `recompute_from`
        assert recompute_from, f"Given unknown-only `recompute_from` {locals()}"

    graph.add_edges_from((START, i) for i in recompute_from)
    # strictly-downstreams from START
    between_deps = iset(nx.descendants(graph, START)) & deps - recompute_from
    if recompute_till:
        graph.add_edges_from((i, STOP) for i in recompute_till)  # edge reversed!
        # upstreams from STOP
        upstreams = set(nx.ancestors(graph, STOP)) & deps
        between_deps &= upstreams

    recomputes = between_deps & inputs
    new_inputs = iset(inputs) - recomputes

    if log.isEnabledFor(logging.DEBUG):
        log.debug(
            "... recompute x%i data%s means deleting x%i inputs%s,"
            " to arrive from x%i %s -> x%i %s.",
            len(between_deps),
            list(between_deps),
            len(recomputes),
            list(recomputes),
            len(inputs),
            list(inputs),
            len(new_inputs),
            list(new_inputs),
        )

    return new_inputs, recomputes
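# Editor's note: a toy walk-through on a pure-data chain (assumes plain string
# nodes count as data-nodes; real graphs interleave operation nodes):
#
#     >>> import networkx as nx
#     >>> g = nx.DiGraph([("a", "b"), ("b", "c")])
#     >>> new_inputs, recomputes = inputs_for_recompute(
#     ...     g, inputs=["a", "b"], recompute_from=["a"])
#     # new_inputs == {'a'}, recomputes == {'b'}:
#     # `b` lies strictly downstream of `a`, so it is cleared from the inputs.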
def reparse_operation_data(
    name, needs, provides, aliases=(), cwd: Sequence[str] = None
) -> Tuple[str, Collection[str], Collection[str], Collection[Tuple[str, str]]]:
    """
    Validate & reparse operation data as lists.

    :return:
        name, needs, provides, aliases

    Kept as a separate function so that client code building operations
    can reuse it to detect errors early.
    """
    from .jsonpointer import jsonp_path

    if name is not None and not isinstance(name, str):
        raise TypeError(f"Non-str `name` given: {name}")

    cwd_parts = jsonp_path(cwd) if cwd else ()

    # Allow single string-value for needs parameter
    needs = astuple(needs, "needs", allowed_types=cabc.Collection)
    if not all(isinstance(i, str) for i in needs):
        raise TypeError(f"All `needs` must be str, got: {needs!r}")
    needs = jsonp_ize_all(needs, cwd_parts)

    # Allow single value for provides parameter
    provides = astuple(provides, "provides", allowed_types=cabc.Collection)
    if not all(isinstance(i, str) for i in provides):
        raise TypeError(f"All `provides` must be str, got: {provides!r}")
    provides = jsonp_ize_all(provides, cwd_parts)

    aliases = as_renames(aliases, "aliases")
    if aliases:
        ## Sanity checks, or `jsonp_ize_all()` would fail.
        #
        if not all(
            src and isinstance(src, str) and dst and isinstance(dst, str)
            for src, dst in aliases
        ):
            raise TypeError(f"All `aliases` must be non-empty str, got: {aliases!r}")
        # XXX: Why jsonp_ize here? (and not everywhere, or nowhere in fnop?)
        aliases = [
            (prefixed(src, cwd_parts), prefixed(dst, cwd_parts))
            for src, dst in aliases
        ]

        if any(1 for src, dst in aliases if dst in provides):
            bad = ", ".join(
                f"{src} -> {dst}" for src, dst in aliases if dst in provides
            )
            raise ValueError(
                f"The `aliases` [{bad}] clash with existing provides in {list(provides)}!"
            )

        aliases_src = iset(src for src, _dst in aliases)
        all_provides = iset(provides) | (dep_stripped(d) for d in provides)
        if not aliases_src <= all_provides:
            bad_alias_sources = aliases_src - all_provides
            bad_aliases = ", ".join(
                f"{src!r}-->{dst!r}"
                for src, dst in aliases
                if src in bad_alias_sources
            )
            raise ValueError(
                f"The `aliases` [{bad_aliases}] rename non-existent provides in {list(all_provides)}!"
            )

        sfx_aliases = [
            f"{src!r} -> {dst!r}"
            for src, dst in aliases
            if is_pure_sfx(src) or is_pure_sfx(dst)
        ]
        if sfx_aliases:
            raise ValueError(
                f"The `aliases` must not contain `sideffects` {sfx_aliases}"
                "\n  Simply add any extra `sideffects` in the `provides`."
            )
        implicit_aliases = [
            f"{'<implicit>' if bad_src else ''}{src!r} -> "
            f"{dst!r}{'<implicit>' if bad_dst else ''}"
            for src, dst in aliases
            for bad_src in [
                is_implicit(src) or any(is_implicit(i) for i in provides if i == src)
            ]
            for bad_dst in [is_implicit(dst)]
            if bad_src or bad_dst
        ]
        if implicit_aliases:
            raise ValueError(
                f"The `aliases` must not contain `implicits`: {implicit_aliases}"
                "\n  Simply add any extra `implicits` in the `provides`."
            )

    return name, needs, provides, aliases
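# Editor's note: a quick smoke-test of the happy path (hypothetical names;
# resulting collection types may differ slightly):
#
#     name, needs, provides, aliases = reparse_operation_data(
#         "op", needs="a", provides=["b", "c"], aliases={"b": "B"})
#     # name == "op"; needs == ("a",); provides == ("b", "c"); aliases == [("b", "B")]
#     # A ValueError would scream if "B" were already in `provides`,
#     # or if an alias-source were not a provide.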
def build_network(
    operations,
    rescheduled=None,
    endured=None,
    parallel=None,
    marshalled=None,
    node_props=None,
    renamer=None,
    excludes=None,
):
    """
    The :term:`network` factory that does :term:`operation merging` before constructing it.

    :param renamer:
        see the `nest` parameter in :func:`.compose`
    """
    kw = {
        k: v
        for k, v in locals().items()
        if v is not None and k not in ("operations", "excludes")
    }

    def proc_op(op, parent=None):
        """clone FuncOperation with certain props changed"""
        ## Convey any node-props specified in the pipeline here
        #  to all sub-operations.
        #
        if kw:
            op_kw = kw.copy()

            if node_props:
                op_kw["node_props"] = {**op.node_props, **node_props}

            if callable(renamer):

                def parent_wrapper(ren_args: RenArgs) -> str:
                    # Provide RenArgs.parent.
                    return renamer(ren_args._replace(parent=parent))

                op_kw["renamer"] = parent_wrapper
            op = op.withset(**op_kw)

        ## Last minute checks, couldn't check earlier due to builder pattern.
        #
        if hasattr(op, "fn"):
            op.validate_fn_name()
        if not op.provides:
            raise TypeError(f"`provides` must not be empty!")

        return op

    merge_set = iset()  # Preserve given node order.
    for op in operations:
        if isinstance(op, Pipeline):
            merge_set.update(proc_op(s, op) for s in op.ops)
        else:
            merge_set.add(proc_op(op))

    if excludes is not None:
        excludes = {op for op in merge_set if op in aslist(excludes, "excludes")}
        if excludes:
            merge_set = [op for op in merge_set if op not in excludes]
            log.info("Compose excluded %i operations %s.", len(excludes), excludes)

    assert all(bool(n) for n in merge_set)

    from .planning import Network  # Imported here not to affect locals() at the top.

    return Network(*merge_set)
def _zip_results_returns_dict(self, results, is_rescheduled) -> dict:
    if hasattr(results, "_asdict"):  # named tuple
        results = results._asdict()
    elif isinstance(results, cabc.Mapping):
        pass
    elif hasattr(results, "__dict__"):  # regular object
        results = vars(results)
    else:
        raise ValueError(
            "Expected results as mapping, named_tuple, object, "
            f"got {type(results).__name__!r}: {results}\n  {self}"
            f"\n  {debug_var_tip}")

    fn_required = self._fn_provides
    if fn_required:
        renames = {get_keyword(i): i for i in fn_required}
        # +1 useless key: None
        renames.pop(None, None)
        fn_expected = fn_required = [get_keyword(i) or i for i in fn_required]
    else:
        fn_expected = fn_required = renames = ()

    if is_rescheduled:
        # Canceled sfx(ed) are welcomed.
        fn_expected = iset([*fn_expected, *(i for i in self.provides if is_sfx(i))])

    res_names = results.keys()

    ## Clip unknown outputs (handy for reuse).
    #
    unknown = [i for i in (res_names - fn_expected) if not is_pure_sfx(i)]
    if unknown:
        unknown = list(unknown)
        log.warning(
            "Results%s contained +%i unknown provides%s - will DELETE them!\n  %s",
            list(res_names), len(unknown), list(unknown), self,
        )
        # Filter results, don't mutate them.
        # NOTE: too invasive when no-evictions!?
        results = {k: v for k, v in results.items() if k not in unknown}

    mismatched = fn_required - res_names
    if mismatched:
        if is_rescheduled:
            log.warning("... Op %r did not provide%s", self.name, list(mismatched))
        else:
            raise ValueError(
                f"Got x{len(results)} results({list(results)}) mismatched "
                f"-{len(mismatched)} provides({list(fn_expected)}):"
                f" {list(mismatched)}\n  {self}\n  {debug_var_tip}")

    if renames:
        results = {renames.get(k, k): v for k, v in results.items()}

    return results
def _build_execution_steps(self, dag, inputs, outputs):
    """
    Create the list of operation-nodes & *instructions* evaluating all operations,
    with the *instructions* needed a) to free memory and b) to avoid overwriting
    given intermediate inputs.

    :param dag:
        The original dag, pruned; not broken.
    :param outputs:
        outp-names to decide whether to add (and which) evict-instructions

    Instances of :class:`_EvictInstruction` are inserted in `steps` between
    operation nodes to reduce the memory footprint of solutions while
    the computation is running.
    An evict-instruction is inserted whenever a *need* is not used
    by any other *operation* further down the DAG.
    """
    steps = []

    # create an execution order such that each layer's needs are provided.
    ordered_nodes = iset(nx.topological_sort(dag))

    # Add Operations evaluation steps, and instructions to free and "pin"
    # data.
    for i, node in enumerate(ordered_nodes):

        if isinstance(node, _DataNode):
            if node in inputs and dag.pred[node]:
                # Add a pin-instruction only when there is another operation
                # generating this data as output.
                steps.append(_PinInstruction(node))

        elif isinstance(node, Operation):
            steps.append(node)

            # Keep all values in solution if not specific outputs asked.
            if not outputs:
                continue

            # Add instructions to evict predecessors as possible.  A
            # predecessor may be evicted if it is a data placeholder that
            # is no longer needed by future Operations.
            # It shouldn't make a difference if it were the broken dag
            # bc these are preds of data (provides), and we scan here
            # preds of ops (needs).
            for need in dag.pred[node]:
                log.debug("checking if node %s can be evicted", need)
                for future_node in ordered_nodes[i + 1 :]:
                    if (
                        isinstance(future_node, Operation)
                        and need in future_node.needs
                    ):
                        break
                else:
                    if need not in outputs:
                        log.debug("  adding evict-instruction for %s", need)
                        steps.append(_EvictInstruction(need))
        else:
            raise AssertionError("Unrecognized network graph node %r" % node)

    return steps
def _prune_graph(
    self, inputs: Items, outputs: Items, predicate: NodePredicate = None
) -> Tuple[nx.DiGraph, Tuple, Tuple, Tuple, OpMap]:
    """
    Determines what graph steps need to run to get to the requested
    outputs from the provided inputs:
    - Eliminate steps that are not on a path arriving to requested outputs;
    - Eliminate unsatisfied operations: partial inputs or no outputs needed;
    - consolidate the list of needs & provides.

    :param inputs:
        The names of all given inputs.
    :param outputs:
        The desired output names.  This can also be ``None``, in which
        case the necessary steps are all graph nodes that are reachable
        from the provided inputs.
    :param predicate:
        the :term:`node predicate` is a 2-argument callable(op, node-data)
        that should return true for nodes to include; if None, all nodes included.

    :return:
        a 5-tuple:

        - the *pruned* :term:`execution dag`,
        - the topologically-sorted nodes of that dag,
        - net's needs & outputs based on the given inputs/outputs and the net
          (may overlap, see :func:`collect_requirements()`),
        - an {op, prune-explanation} dictionary

        Use the returned `needs/provides` to build a new plan.

    :raises ValueError:
        - if `outputs` asked do not exist in network, with msg:

            *Unknown output nodes: ...*
    """
    # TODO: break cycles based on weights here.
    dag = self.graph

    ## When `inputs` is None, we have to keep all possible input nodes
    #  and this is achieved with 2 tricky locals:
    #
    #  inputs
    #      it is kept falsy, to disable the edge-breaking, so that
    #      the ascending_from_outputs that follows can reach all input nodes;
    #      including intermediate ones;
    #  satisfied_inputs
    #      it is filled with all possible input nodes, to trick
    #      `unsatisfied_operations()` to assume their operations are satisfied,
    #      and keep them.
    #
    if inputs is None and outputs is None:
        satisfied_inputs, outputs = self.needs, self.provides
    else:
        if inputs is None:  # outputs: NOT None
            satisfied_inputs = self.needs - outputs
        else:  # inputs: NOT None, outputs: maybe None
            # Just ignore `inputs` not in the graph.
            satisfied_inputs = inputs = iset(inputs) & dag.nodes

    ## Scream on unknown `outputs`.
    #
    if outputs:
        unknown_outputs = iset(outputs) - dag.nodes
        if unknown_outputs:
            raise ValueError(
                f"Unknown output nodes: {list(unknown_outputs)}\n  {self}"
                "\n  (tip: set GRAPHTIK_DEBUG envvar to view Op details in print-outs)"
            )

    assert isinstance(satisfied_inputs, abc.Collection)
    assert inputs is None or isinstance(inputs, abc.Collection)
    assert outputs is None or isinstance(outputs, abc.Collection)

    broken_dag = dag.copy()  # preserve net's graph
    if predicate:
        self._apply_graph_predicate(broken_dag, predicate)

    # Break the incoming edges to all given inputs.
    #
    # Nodes producing any given intermediate inputs are unnecessary
    # (unless they are also used elsewhere).
    # To discover which ones to prune, we break their incoming edges
    # and they will drop out while collecting ancestors from the outputs.
    #
    if inputs:
        for n in inputs:
            # Coalesce to a list, to avoid concurrent modification.
            broken_dag.remove_edges_from(
                list(
                    (src, dst)
                    for src, dst, subdoc in broken_dag.in_edges(n, data="subdoc")
                    if not subdoc
                )
            )

    comments: OpMap = {}

    # Drop stray input values and operations (if any).
    if outputs is not None:
        ## If caller requested specific outputs, we can prune any
        #  unrelated nodes further up the dag.
        #
        ending_in_outputs = set()
        for out in yield_chaindocs(dag, outputs, ending_in_outputs):
            # TODO: speedup prune-by-outs with traversing code
            ending_in_outputs.update(nx.ancestors(broken_dag, out))
            ending_in_outputs.add(out)
        # Clone it, to modify it, or a BUG shows up much later
        # (e.g. in eviction planning).
        broken_dag = broken_dag.subgraph(ending_in_outputs).copy()

        irrelevant_ops = [op for op in yield_ops(dag) if op not in ending_in_outputs]
        if irrelevant_ops:
            comments.update((op, "outputs-irrelevant") for op in irrelevant_ops)
            log.info(
                "... dropping output-irrelevant ops%s.\n  +--outputs: %s",
                irrelevant_ops,
                outputs,
            )

    # Prune unsatisfied operations (those with partial inputs or no outputs).
    unsatisfied, sorted_nodes = unsatisfied_operations(broken_dag, satisfied_inputs)
    comments.update(unsatisfied)
    # Clone it, to modify it.
    pruned_dag = dag.subgraph(broken_dag.nodes - unsatisfied).copy()
    ## Clean unlinked data-nodes (except those both given & asked).
    #
    unlinked_data = set(nx.isolates(pruned_dag))
    if outputs is not None:
        # FIXME: must cast to simple set due to mahmoud/boltons#252 (boltons < v20.1)
        unlinked_data -= set(satisfied_inputs & outputs)
    pruned_dag.remove_nodes_from(unlinked_data)

    inputs = iset(
        _optionalized(pruned_dag, n) for n in satisfied_inputs if n in pruned_dag
    )
    if outputs is None:
        outputs = iset(
            n
            for n in self.provides
            if n not in inputs and n in pruned_dag and not is_sfx(n)
        )
    else:
        # filter-out from new `provides` if pruned.
        outputs = iset(n for n in outputs if n in pruned_dag)

    assert isinstance(inputs, abc.Collection)
    assert isinstance(outputs, abc.Collection)

    return pruned_dag, sorted_nodes, tuple(inputs), tuple(outputs), comments
def __init__(
    self,
    fn: Callable = None,
    name=None,
    needs: Items = None,
    provides: Items = None,
    aliases: Mapping = None,
    *,
    cwd=None,
    rescheduled=None,
    endured=None,
    parallel=None,
    marshalled=None,
    returns_dict=None,
    node_props: Mapping = None,
):
    """
    Build a new operation out of some function and its requirements.

    See :func:`.operation` for the full documentation of parameters,
    study the code for attributes (or read them from rendered sphinx site).
    """
    from .jsonpointer import jsonp_path

    super().__init__()
    node_props = node_props if node_props else {}

    if fn and not callable(fn):
        raise TypeError(f"Operation was provided with a non-callable: {fn}")
    if not isinstance(node_props, cabc.Mapping):
        raise TypeError(
            f"Operation `node_props` must be a dict, was {type(node_props).__name__!r}: {node_props}"
        )

    if name is None and fn:
        name = func_name(fn, None, mod=0, fqdn=0, human=0, partials=1)
    ## Overwrite reparsed op-data.
    name, needs, provides, aliases = reparse_operation_data(
        name, needs, provides, aliases, cwd
    )

    user_needs, user_provides = needs, provides
    needs, _fn_needs = _process_dependencies(needs)
    provides, _fn_provides = _process_dependencies(provides)
    alias_dst = aliases and tuple(dst for _src, dst in aliases)
    provides = iset((*provides, *alias_dst))

    # TODO: enact conveyor fn if varargs in the outputs.
    if fn is None and name and len(_fn_needs) == len(_fn_provides):
        log.debug(
            "Auto-setting conveyor identity function on op(%s) for needs(%s) --> provides(%s)",
            name,
            needs,
            provides,
        )
        fn = identity_fn

    #: The :term:`operation`'s underlying function.
    self.fn = fn
    #: a name for the operation (e.g. `'conv1'`, `'sum'`, etc..);
    #: any "parents split by dots(``.``)".
    #: :seealso: :ref:`operation-nesting`
    self.name = name

    #: Fake function attributes.
    #:
    if fn:
        update_wrapper(
            self,
            fn,
            assigned=("__module__", "__doc__", "__annotations__"),
            updated=(),
        )
    self.__name__ = name
    qname = getattr(fn, "__qualname__", None) or name
    if qname:
        # "ab.cd" => "ab.NAME", "ab" => "NAME", "" => "NAME"
        qname = ".".join((*qname.split(".")[:-1], name))
    self.__qualname__ = qname

    #: Dependencies ready to lay the graph for :term:`pruning`
    #: (NO-DUPES, SFX, SINGULAR :term:`sideffected`\s).
    self.needs = needs
    #: The :term:`needs` as given by the user, stored for *builder pattern*
    #: to work.
    self._user_needs = user_needs
    #: Value names the underlying function requires
    #: (DUPES preserved, NO-SFX, STRIPPED :term:`sideffected`).
    self._fn_needs = _fn_needs

    #: Value names ready to lay the graph for :term:`pruning`
    #: (NO DUPES, ALIASES, SFX, SINGULAR sideffecteds, +alias destinations).
    self.provides = provides
    #: The :term:`provides` as given by the user, stored for *builder pattern*
    #: to work.
    self._user_provides = user_provides
    #: Value names the underlying function produces
    #: (DUPES, NO-ALIASES, NO-SFX, STRIPPED :term:`sideffected`).
    self._fn_provides = _fn_provides

    #: an optional mapping of `fn_provides` to additional ones, together
    #: comprising this operation's `provides`.
    #:
    #: You cannot alias an :term:`alias`.
    self.aliases = aliases

    #: The :term:`current-working-document`; when defined, all non-root
    #: `dependencies` become :term:`jsonp` and are prefixed with this.
    self.cwd = cwd

    #: If true, underlying *callable* may produce a subset of `provides`,
    #: and the :term:`plan` must then :term:`reschedule` after the operation
    #: has executed.  In that case, it makes more sense for the *callable*
    #: to `returns_dict`.
    self.rescheduled = rescheduled
    #: If true, even if *callable* fails, solution will :term:`reschedule`;
    #: ignored if :term:`endurance` enabled globally.
    self.endured = endured
    #: execute in (deprecated) :term:`parallel`
    self.parallel = parallel
    #: If true, operation will be :term:`marshalled <marshalling>` while computed,
    #: along with its `inputs` & `outputs`
    #: (useful when run in (deprecated) `parallel` with a :term:`process pool`).
    self.marshalled = marshalled
    #: If true, it means the underlying function :term:`returns dictionary`,
    #: and no further processing is done on its results,
    #: i.e. the returned output-values are not zipped with `provides`.
    #:
    #: It does not have to return any :term:`alias` `outputs`.
    #:
    #: Can be changed amidst execution by the operation's function.
    self.returns_dict = returns_dict
    #: Added as-is into NetworkX graph, and you may filter operations by
    #: :meth:`.Pipeline.withset()`.
    #: Also plot-rendering affected if they match `Graphviz` properties,
    #: if they start with :data:`.USER_STYLE_PREFFIX`,
    #: unless they start with underscore(``_``).
    self.node_props = node_props
def _prune_graph(self, outputs, inputs):
    """
    Determines what graph steps need to run to get to the requested
    outputs from the provided inputs:
    - Eliminate steps that are not on a path arriving to requested outputs.
    - Eliminate unsatisfied operations: partial inputs or no outputs needed.

    :param iterable outputs:
        A list of desired output names.  This can also be ``None``, in which
        case the necessary steps are all graph nodes that are reachable
        from one of the provided inputs.
    :param iterable inputs:
        The names of all given inputs.

    :return:
        the *pruned_dag*
    """
    dag = self.graph

    # Ignore input names that aren't in the graph.
    graph_inputs = set(dag.nodes) & set(inputs)  # unordered, iterated, but ok

    # Scream if some requested outputs aren't in the graph.
    unknown_outputs = iset(outputs) - dag.nodes
    if unknown_outputs:
        raise ValueError(
            "Unknown output node(s) asked: %s" % ", ".join(unknown_outputs)
        )

    broken_dag = dag.copy()  # preserve net's graph

    # Break the incoming edges to all given inputs.
    #
    # Nodes producing any given intermediate inputs are unnecessary
    # (unless they are also used elsewhere).
    # To discover which ones to prune, we break their incoming edges
    # and they will drop out while collecting ancestors from the outputs.
    broken_edges = set()  # unordered, not iterated
    for given in graph_inputs:
        broken_edges.update(broken_dag.in_edges(given))
    broken_dag.remove_edges_from(broken_edges)

    # Drop stray input values and operations (if any).
    broken_dag.remove_nodes_from(list(nx.isolates(broken_dag)))

    if outputs:
        # If caller requested specific outputs, we can prune any
        # unrelated nodes further up the dag.
        ending_in_outputs = set()
        for output_name in outputs:
            ending_in_outputs.add(_DataNode(output_name))
            ending_in_outputs.update(nx.ancestors(dag, output_name))
        broken_dag = broken_dag.subgraph(ending_in_outputs)

    # Prune unsatisfied operations (those with partial inputs or no outputs).
    unsatisfied = self._collect_unsatisfied_operations(broken_dag, inputs)
    # Clone it so that it is picklable.
    pruned_dag = dag.subgraph(broken_dag.nodes - unsatisfied).copy()

    assert all(isinstance(n, (Operation, _DataNode)) for n in pruned_dag), pruned_dag

    return pruned_dag, broken_edges