def build_outs_trie(stages): outs = Trie() for stage in filter(bool, stages): # bug? not using it later for out in stage.outs: out_key = out.path_info.parts # Check for dup outs if out_key in outs: dup_stages = [stage, outs[out_key].stage] raise OutputDuplicationError(str(out), dup_stages) # Check for overlapping outs if outs.has_subtrie(out_key): parent = out overlapping = first(outs.values(prefix=out_key)) else: parent = outs.shortest_prefix(out_key).value overlapping = out if parent and overlapping: msg = ( "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n" "overlap. To avoid unpredictable behaviour, " "rerun command with non overlapping outs paths." ).format( str(parent), parent.stage.addressing, str(overlapping), overlapping.stage.addressing, ) raise OverlappingOutputPathsError(parent, overlapping, msg) outs[out_key] = out return outs
def build_outs_trie(stages): outs = Trie() for stage in stages: for out in stage.outs: out_key = out.fs.path.parts(out.fs_path) # Check for dup outs if out_key in outs: dup_stages = [stage, outs[out_key].stage] raise OutputDuplicationError(str(out), dup_stages) # Check for overlapping outs if outs.has_subtrie(out_key): parent = out overlapping = first(outs.values(prefix=out_key)) else: parent = outs.shortest_prefix(out_key).value overlapping = out if parent and overlapping: msg = ( "The output paths:\n'{}'('{}')\n'{}'('{}')\n" "overlap and are thus in the same tracked directory.\n" "To keep reproducibility, outputs should be in separate " "tracked directories or tracked individually.").format( str(parent), parent.stage.addressing, str(overlapping), overlapping.stage.addressing, ) raise OverlappingOutputPathsError(parent, overlapping, msg) outs[out_key] = out return outs
def _collect_graph(self, stages=None): """Generate a graph by using the given stages on the given directory The nodes of the graph are the stage's path relative to the root. Edges are created when the output of one stage is used as a dependency in other stage. The direction of the edges goes from the stage to its dependency: For example, running the following: $ dvc run -o A "echo A > A" $ dvc run -d A -o B "echo B > B" $ dvc run -d B -o C "echo C > C" Will create the following graph: ancestors <-- | C.dvc -> B.dvc -> A.dvc | | | --> descendants | ------- pipeline ------> | v (weakly connected components) Args: stages (list): used to build a graph, if None given, collect stages in the repository. Raises: OutputDuplicationError: two outputs with the same path StagePathAsOutputError: stage inside an output directory OverlappingOutputPathsError: output inside output directory CyclicGraphError: resulting graph has cycles """ import networkx as nx from dvc.exceptions import ( OutputDuplicationError, StagePathAsOutputError, OverlappingOutputPathsError, ) G = nx.DiGraph() stages = stages or self.stages stages = [stage for stage in stages if stage] outs = {} for stage in stages: for out in stage.outs: if out.path_info in outs: dup_stages = [stage, outs[out.path_info].stage] raise OutputDuplicationError(str(out), dup_stages) outs[out.path_info] = out for stage in stages: for out in stage.outs: for p in out.path_info.parents: if p in outs: raise OverlappingOutputPathsError(outs[p], out) for stage in stages: stage_path_info = PathInfo(stage.path) for p in chain([stage_path_info], stage_path_info.parents): if p in outs: raise StagePathAsOutputError(stage, str(outs[p])) for stage in stages: G.add_node(stage) for dep in stage.deps: if dep.path_info is None: continue for out_path_info, out in outs.items(): if out_path_info.overlaps(dep.path_info): G.add_node(out.stage) G.add_edge(stage, out.stage) check_acyclic(G) return G
def _collect_graph(self, stages): """Generate a graph by using the given stages on the given directory The nodes of the graph are the stage's path relative to the root. Edges are created when the output of one stage is used as a dependency in other stage. The direction of the edges goes from the stage to its dependency: For example, running the following: $ dvc run -o A "echo A > A" $ dvc run -d A -o B "echo B > B" $ dvc run -d B -o C "echo C > C" Will create the following graph: ancestors <-- | C.dvc -> B.dvc -> A.dvc | | | --> descendants | ------- pipeline ------> | v (weakly connected components) Args: stages (list): used to build a graph, if None given, collect stages in the repository. Raises: OutputDuplicationError: two outputs with the same path StagePathAsOutputError: stage inside an output directory OverlappingOutputPathsError: output inside output directory CyclicGraphError: resulting graph has cycles """ import networkx as nx from pygtrie import Trie from dvc.exceptions import ( OutputDuplicationError, OverlappingOutputPathsError, StagePathAsOutputError, ) G = nx.DiGraph() stages = stages or self.stages outs = Trie() # Use trie to efficiently find overlapping outs and deps for stage in filter(bool, stages): # bug? not using it later for out in stage.outs: out_key = out.path_info.parts # Check for dup outs if out_key in outs: dup_stages = [stage, outs[out_key].stage] raise OutputDuplicationError(str(out), dup_stages) # Check for overlapping outs if outs.has_subtrie(out_key): parent = out overlapping = first(outs.values(prefix=out_key)) else: parent = outs.shortest_prefix(out_key).value overlapping = out if parent and overlapping: msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n" "overlap. To avoid unpredictable behaviour, " "rerun command with non overlapping outs paths." ).format( str(parent), parent.stage.addressing, str(overlapping), overlapping.stage.addressing, ) raise OverlappingOutputPathsError(parent, overlapping, msg) outs[out_key] = out for stage in stages: out = outs.shortest_prefix(PathInfo(stage.path).parts).value if out: raise StagePathAsOutputError(stage, str(out)) # Building graph G.add_nodes_from(stages) for stage in stages: for dep in stage.deps: if dep.path_info is None: continue dep_key = dep.path_info.parts overlapping = [n.value for n in outs.prefixes(dep_key)] if outs.has_subtrie(dep_key): overlapping.extend(outs.values(prefix=dep_key)) G.add_edges_from((stage, out.stage) for out in overlapping) check_acyclic(G) return G
def graph(self, stages=None, from_directory=None): """Generate a graph by using the given stages on the given directory The nodes of the graph are the stage's path relative to the root. Edges are created when the output of one stage is used as a dependency in other stage. The direction of the edges goes from the stage to its dependency: For example, running the following: $ dvc run -o A "echo A > A" $ dvc run -d A -o B "echo B > B" $ dvc run -d B -o C "echo C > C" Will create the following graph: ancestors <-- | C.dvc -> B.dvc -> A.dvc | | | --> descendants | ------- pipeline ------> | v (weakly connected components) Args: stages (list): used to build a graph, if None given, use the ones on the `from_directory`. from_directory (str): directory where to look at for stages, if None is given, use the current working directory Raises: OutputDuplicationError: two outputs with the same path StagePathAsOutputError: stage inside an output directory OverlappingOutputPathsError: output inside output directory CyclicGraphError: resulting graph has cycles """ import networkx as nx from dvc.exceptions import ( OutputDuplicationError, StagePathAsOutputError, OverlappingOutputPathsError, ) G = nx.DiGraph() G_active = nx.DiGraph() stages = stages or self.stages(from_directory, check_dag=False) stages = [stage for stage in stages if stage] outs = [] for stage in stages: for out in stage.outs: existing = [] for o in outs: if o.path_info == out.path_info: existing.append(o.stage) in_o_dir = out.path_info.isin(o.path_info) in_out_dir = o.path_info.isin(out.path_info) if in_o_dir or in_out_dir: raise OverlappingOutputPathsError(o, out) if existing: stages = [stage.relpath, existing[0].relpath] raise OutputDuplicationError(str(out), stages) outs.append(out) for stage in stages: stage_path_info = PathInfo(stage.path) for out in outs: if stage_path_info.isin(out.path_info): raise StagePathAsOutputError(stage.wdir, stage.relpath) for stage in stages: node = os.path.relpath(stage.path, self.root_dir) G.add_node(node, stage=stage) G_active.add_node(node, stage=stage) for dep in stage.deps: for out in outs: if (out.path_info != dep.path_info and not dep.path_info.isin(out.path_info) and not out.path_info.isin(dep.path_info)): continue dep_stage = out.stage dep_node = os.path.relpath(dep_stage.path, self.root_dir) G.add_node(dep_node, stage=dep_stage) G.add_edge(node, dep_node) if not stage.locked: G_active.add_node(dep_node, stage=dep_stage) G_active.add_edge(node, dep_node) self._check_cyclic_graph(G) return G, G_active
def graph(self, stages=None, from_directory=None): import networkx as nx from dvc.exceptions import ( OutputDuplicationError, StagePathAsOutputError, OverlappingOutputPathsError, ) G = nx.DiGraph() G_active = nx.DiGraph() stages = stages or self.stages(from_directory, check_dag=False) stages = [stage for stage in stages if stage] outs = [] for stage in stages: for out in stage.outs: existing = [] for o in outs: if o.path == out.path: existing.append(o.stage) in_o_dir = out.path.startswith(o.path + o.sep) in_out_dir = o.path.startswith(out.path + out.sep) if in_o_dir or in_out_dir: raise OverlappingOutputPathsError(o, out) if existing: stages = [stage.relpath, existing[0].relpath] raise OutputDuplicationError(out.path, stages) outs.append(out) for stage in stages: path_dir = os.path.dirname(stage.path) + os.sep for out in outs: if path_dir.startswith(out.path + os.sep): raise StagePathAsOutputError(stage.wdir, stage.relpath) # collect the whole DAG for stage in stages: node = os.path.relpath(stage.path, self.root_dir) G.add_node(node, stage=stage) G_active.add_node(node, stage=stage) for dep in stage.deps: for out in outs: if (out.path != dep.path and not dep.path.startswith(out.path + out.sep) and not out.path.startswith(dep.path + dep.sep)): continue dep_stage = out.stage dep_node = os.path.relpath(dep_stage.path, self.root_dir) G.add_node(dep_node, stage=dep_stage) G.add_edge(node, dep_node) if not stage.locked: G_active.add_node(dep_node, stage=dep_stage) G_active.add_edge(node, dep_node) self._check_cyclic_graph(G) return G, G_active