def build_outs_trie(stages):
    """Index the outputs of ``stages`` in a trie keyed by their path parts.

    Each output is validated against the outputs inserted before it: an
    identical key is reported as a duplicate, and a key that prefixes (or is
    prefixed by) an already inserted key is reported as an overlap.

    Args:
        stages: iterable of stages; falsy entries are skipped. Every other
            entry must expose ``outs``.

    Returns:
        The populated ``Trie`` mapping ``out.path_info.parts`` to the output.

    Raises:
        OutputDuplicationError: two outputs share the same key.
        OverlappingOutputPathsError: one output is nested under another.
    """
    trie = Trie()

    # Falsy stages contribute no outputs.
    for stage in (candidate for candidate in stages if candidate):
        for out in stage.outs:
            key = out.path_info.parts

            # Exactly the same path was already registered.
            if key in trie:
                owners = [stage, trie[key].stage]
                raise OutputDuplicationError(str(out), owners)

            if trie.has_subtrie(key):
                # Something already stored lives below the new output.
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                # Otherwise look for a stored output above the new one;
                # ``parent`` stays falsy when there is none.
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                template = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                )
                msg = template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out

    return trie
def build_outs_trie(stages):
    """Index the outputs of ``stages`` in a trie keyed by their path parts.

    The key of an output is ``out.fs.path.parts(out.fs_path)``. Each output
    is validated against the outputs inserted before it: an identical key is
    a duplicate, and a key that prefixes (or is prefixed by) an already
    inserted key is an overlap.

    Args:
        stages: iterable of stages, each exposing ``outs``.

    Returns:
        The populated ``Trie`` mapping path parts to the output.

    Raises:
        OutputDuplicationError: two outputs share the same key.
        OverlappingOutputPathsError: one output is nested under another.
    """
    trie = Trie()

    for stage in stages:
        for out in stage.outs:
            key = out.fs.path.parts(out.fs_path)

            # Exactly the same path was already registered.
            if key in trie:
                owners = [stage, trie[key].stage]
                raise OutputDuplicationError(str(out), owners)

            if trie.has_subtrie(key):
                # Something already stored lives below the new output.
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                # Otherwise look for a stored output above the new one;
                # ``parent`` stays falsy when there is none.
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                template = (
                    "The output paths:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap and are thus in the same tracked directory.\n"
                    "To keep reproducibility, outputs should be in separate "
                    "tracked directories or tracked individually."
                )
                msg = template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out

    return trie
def _check_output_duplication(self, outs):
    """Reject ``outs`` whose path is already produced by a different stage.

    Every output of every stage returned by ``self.stages()`` is compared
    with every entry of ``outs``.

    Args:
        outs: iterable of outputs to validate; it is walked once per known
            output, so it must be re-iterable.

    Raises:
        OutputDuplicationError: an entry of ``outs`` has the same ``path`` as
            a known output while their ``stage.path`` values differ.
    """
    from dvc.exceptions import OutputDuplicationError

    for stage in self.stages():
        for known in stage.outs:
            for candidate in outs:
                if known.path != candidate.path:
                    continue
                # Same path from the same stage file is not a conflict.
                if known.stage.path == candidate.stage.path:
                    continue
                clashing = [known.stage.relpath, candidate.stage.relpath]
                raise OutputDuplicationError(known.path, clashing)
def graph(self, stages=None, from_directory=None):
    """Build the dependency graphs for the given stages.

    Nodes are stage paths relative to ``self.root_dir`` and carry the stage
    object in the ``stage`` node attribute. An edge goes from a stage to
    every stage that owns an output equal to, containing, or contained in
    one of its dependencies.

    Args:
        stages: stages to use; when falsy they are collected with
            ``self.stages(from_directory)``. Falsy entries are dropped.
        from_directory: forwarded to ``self.stages`` when ``stages`` is not
            given.

    Returns:
        tuple: ``(G, G_active)``. ``G_active`` has the same nodes as ``G``
        but omits edges that start at a locked stage.

    Raises:
        OutputDuplicationError: two outputs share the same path.
        WorkingDirectoryAsOutputError: a stage's ``cwd`` is an output or
            lies inside one.

    ``self._check_cyclic_graph(G)`` is run before returning and may raise as
    well.
    """
    import networkx as nx

    from dvc.exceptions import (
        OutputDuplicationError,
        WorkingDirectoryAsOutputError,
    )

    G = nx.DiGraph()
    G_active = nx.DiGraph()
    stages = stages or self.stages(from_directory)
    stages = [stage for stage in stages if stage]

    # Keep the outputs in discovery order and index them by path, so that
    # duplicate detection is one dict lookup instead of a rescan of every
    # output seen so far.
    outs = []
    outs_by_path = {}
    for stage in stages:
        for out in stage.outs:
            existing = outs_by_path.get(out.path)
            if existing is not None:
                raise OutputDuplicationError(
                    out.path, [stage.relpath, existing.stage.relpath]
                )
            outs.append(out)
            outs_by_path[out.path] = out

    # A stage must not run from a directory that is itself an output.
    for stage in stages:
        for out in outs:
            overlaps = stage.cwd == out.path or stage.cwd.startswith(
                out.path + os.sep
            )
            if overlaps:
                raise WorkingDirectoryAsOutputError(stage.cwd, stage.relpath)

    # collect the whole DAG
    for stage in stages:
        node = os.path.relpath(stage.path, self.root_dir)
        G.add_node(node, stage=stage)
        G_active.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in outs:
                related = (
                    out.path == dep.path
                    or dep.path.startswith(out.path + out.sep)
                    or out.path.startswith(dep.path + dep.sep)
                )
                if not related:
                    continue

                dep_stage = out.stage
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)
                if not stage.locked:
                    G_active.add_node(dep_node, stage=dep_stage)
                    G_active.add_edge(node, dep_node)

    self._check_cyclic_graph(G)

    return G, G_active
def _find_output_by_path(self, path, outs=None):
    """Return the output whose path equals the absolute form of ``path``.

    Args:
        path: path to look up; it is normalized with ``os.path.abspath``.
        outs: outputs to search. When falsy (``None`` or empty), the outputs
            of all ``self.active_stages()`` are searched instead.

    Returns:
        The single matching output, or ``None`` when nothing matches.

    Raises:
        OutputDuplicationError: more than one output has that path.
    """
    from dvc.exceptions import OutputDuplicationError

    candidates = outs
    if not candidates:
        candidates = []
        for stage in self.active_stages():
            candidates.extend(stage.outs)

    target = os.path.abspath(path)
    hits = [candidate for candidate in candidates if candidate.path == target]
    owners = [hit.stage.path for hit in hits]

    if len(owners) > 1:
        raise OutputDuplicationError(path, owners)

    if hits:
        return hits[0]
    return None
def check_graphs(
    repo: "Repo", stage: Union["Stage", "PipelineStage"], force: bool = True
) -> None:
    """Validate the repo graph with ``stage`` added to it.

    With ``force`` set, ``stage`` is first removed from ``repo.stages`` (a
    ``ValueError`` from that removal is ignored). Without it,
    ``_check_stage_exists`` is called for ``stage`` and ``stage.path``.
    Afterwards ``repo.check_modified_graph([stage])`` runs.

    Raises:
        OutputDuplicationError: re-raised from the graph check with ``stage``
            itself removed from the reported stages.
    """
    from dvc.exceptions import OutputDuplicationError

    try:
        if not force:
            _check_stage_exists(repo, stage, stage.path)
        else:
            try:
                repo.stages.remove(stage)
            except ValueError:
                # Removal is best effort; the failure is deliberately ignored.
                pass
        repo.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        output = exc.output
        other_stages = set(exc.stages) - {stage}
        raise OutputDuplicationError(output, other_stages)
def _find_output_by_path(self, path, outs=None, recursive=False):
    """Find the outputs that correspond to ``path``.

    Args:
        path: path to look up; it is normalized with ``os.path.abspath``.
        outs: outputs to search. When falsy (``None`` or empty), the outputs
            of all ``self.active_stages()`` are searched instead.
        recursive: when true and ``path`` is an existing directory, return
            every output located at or below that directory instead of only
            an exact match.

    Returns:
        list: the matching outputs, possibly empty.

    Raises:
        OutputDuplicationError: an exact-path lookup matched more than one
            output. Recursive directory lookups legitimately match several
            outputs and never raise this.
    """
    if not outs:
        astages = self.active_stages()
        outs = [out for stage in astages for out in stage.outs]

    abs_path = os.path.abspath(path)

    if recursive and os.path.isdir(abs_path):
        # Compare against "<dir><sep>" so that a sibling sharing only a name
        # prefix (e.g. "data" vs "database") is not treated as being inside
        # the directory. os.path.join(p, "") also copes with the root dir.
        dir_prefix = os.path.join(abs_path, "")
        matched = []
        for out in outs:
            out_path = os.path.abspath(out.path)
            if out_path == abs_path or out_path.startswith(dir_prefix):
                matched.append(out)
        return matched

    matched = [out for out in outs if out.path == abs_path]
    if len(matched) > 1:
        # Imported lazily: only the error path needs it.
        from dvc.exceptions import OutputDuplicationError

        stages = [out.stage.relpath for out in matched]
        raise OutputDuplicationError(path, stages)

    return matched
def graph(self, from_directory=None):
    """Build the dependency graphs of the stages under ``from_directory``.

    Nodes are stage paths relative to ``self.root_dir`` and carry the stage
    object in the ``stage`` node attribute. An edge goes from a stage to
    every stage that owns an output equal to, containing, or contained in
    one of its dependencies.

    Args:
        from_directory: forwarded to ``self.stages``.

    Returns:
        tuple: ``(G, G_active)``. ``G_active`` has the same nodes as ``G``
        but omits edges that start at a locked stage.

    Raises:
        OutputDuplicationError: two outputs share the same path.
    """
    import networkx as nx
    from dvc.exceptions import OutputDuplicationError

    full_graph = nx.DiGraph()
    active_graph = nx.DiGraph()

    stages = self.stages(from_directory)

    all_outs = []
    seen = {}
    for stage in stages:
        for candidate in stage.outs:
            previous = seen.get(candidate.path, None)
            if previous is not None:
                clashing = [candidate.stage.relpath, previous.stage.relpath]
                raise OutputDuplicationError(candidate.path, clashing)
            all_outs.append(candidate)
            seen[candidate.path] = candidate

    # collect the whole DAG
    for stage in stages:
        node = os.path.relpath(stage.path, self.root_dir)
        full_graph.add_node(node, stage=stage)
        active_graph.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in all_outs:
                related = (
                    out.path == dep.path
                    or dep.path.startswith(out.path + out.sep)
                    or out.path.startswith(dep.path + dep.sep)
                )
                if not related:
                    continue

                owner = out.stage
                owner_node = os.path.relpath(owner.path, self.root_dir)
                full_graph.add_node(owner_node, stage=owner)
                full_graph.add_edge(node, owner_node)
                if not stage.locked:
                    active_graph.add_node(owner_node, stage=owner)
                    active_graph.add_edge(node, owner_node)

    return full_graph, active_graph
def _collect_graph(self, stages=None):
    """Build the stage dependency graph after validating all outputs.

    The nodes of the graph are the stage objects themselves. An edge goes
    from a stage to every stage that owns an output overlapping one of its
    dependencies, i.e. from a stage to what it depends on.

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): stages to build the graph from; when falsy,
            ``self.stages`` is used. Falsy entries are dropped.

    Returns:
        The resulting ``networkx.DiGraph``.

    Raises:
        OutputDuplicationError: two outputs with the same path
        OverlappingOutputPathsError: output inside output directory
        StagePathAsOutputError: stage inside an output directory

    ``check_acyclic`` is applied to the graph before it is returned (the
    original documentation names ``CyclicGraphError`` for cycles).
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    graph = nx.DiGraph()
    stages = stages or self.stages
    stages = [entry for entry in stages if entry]
    out_by_path = {}

    # Pass 1: register every output, rejecting identical paths.
    for stage in stages:
        for out in stage.outs:
            if out.path_info in out_by_path:
                owners = [stage, out_by_path[out.path_info].stage]
                raise OutputDuplicationError(str(out), owners)
            out_by_path[out.path_info] = out

    # Pass 2: no output may live below another output.
    for stage in stages:
        for out in stage.outs:
            for ancestor in out.path_info.parents:
                if ancestor in out_by_path:
                    raise OverlappingOutputPathsError(
                        out_by_path[ancestor], out
                    )

    # Pass 3: a stage path must not be an output nor live below one.
    for stage in stages:
        stage_path_info = PathInfo(stage.path)
        for location in chain([stage_path_info], stage_path_info.parents):
            if location in out_by_path:
                raise StagePathAsOutputError(
                    stage, str(out_by_path[location])
                )

    # Pass 4: connect each stage to the owners of what it depends on.
    for stage in stages:
        graph.add_node(stage)
        for dep in stage.deps:
            if dep.path_info is None:
                continue
            for out_path_info, out in out_by_path.items():
                if not out_path_info.overlaps(dep.path_info):
                    continue
                graph.add_node(out.stage)
                graph.add_edge(stage, out.stage)

    check_acyclic(graph)

    return graph
def _collect_graph(self, stages):
    """Build the stage dependency graph after validating all outputs.

    The nodes of the graph are the stage objects themselves. An edge goes
    from a stage to every stage that owns an output overlapping one of its
    dependencies, i.e. from a stage to what it depends on.

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): stages to build the graph from; when falsy,
            ``self.stages`` is used. Falsy entries are dropped.

    Returns:
        The resulting ``networkx.DiGraph``.

    Raises:
        OutputDuplicationError: two outputs with the same path
        OverlappingOutputPathsError: output inside output directory
        StagePathAsOutputError: stage inside an output directory

    ``check_acyclic`` is applied to the graph before it is returned (the
    original documentation names ``CyclicGraphError`` for cycles).
    """
    import networkx as nx
    from pygtrie import Trie

    from dvc.exceptions import (
        OutputDuplicationError,
        OverlappingOutputPathsError,
        StagePathAsOutputError,
    )

    G = nx.DiGraph()

    # Filter and materialize exactly once. Previously falsy entries were
    # skipped only while the trie was filled, so the later loops would still
    # touch them (``stage.path``, ``add_nodes_from``), and a one-shot
    # iterator would have been exhausted after the first pass.
    stages = [stage for stage in (stages or self.stages) if stage]

    outs = Trie()  # Use trie to efficiently find overlapping outs and deps

    for stage in stages:
        for out in stage.outs:
            out_key = out.path_info.parts

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                # A stored output lives below the new one.
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                # ``parent`` stays falsy when no stored output is above it.
                parent = outs.shortest_prefix(out_key).value
                overlapping = out

            if parent and overlapping:
                msg = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    # A stage path must not be an output nor live below one.
    for stage in stages:
        out = outs.shortest_prefix(PathInfo(stage.path).parts).value
        if out:
            raise StagePathAsOutputError(stage, str(out))

    # Building graph
    G.add_nodes_from(stages)
    for stage in stages:
        for dep in stage.deps:
            if dep.path_info is None:
                continue

            dep_key = dep.path_info.parts
            # Outputs at or above the dependency ...
            overlapping = [n.value for n in outs.prefixes(dep_key)]
            # ... plus outputs at or below it.
            if outs.has_subtrie(dep_key):
                overlapping.extend(outs.values(prefix=dep_key))

            G.add_edges_from((stage, out.stage) for out in overlapping)

    check_acyclic(G)

    return G
def graph(self, stages=None, from_directory=None):
    """Build the dependency graphs for the given stages.

    Nodes are stage paths relative to ``self.root_dir`` and carry the stage
    object in the ``stage`` node attribute. An edge goes from a stage to
    every stage that owns an output equal to, containing, or contained in
    one of its dependencies.

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): stages to build the graph from; when falsy they are
            collected with ``self.stages(from_directory, check_dag=False)``.
            Falsy entries are dropped.
        from_directory (str): forwarded to ``self.stages`` when ``stages``
            is not given.

    Returns:
        tuple: ``(G, G_active)``. ``G_active`` has the same nodes as ``G``
        but omits edges that start at a locked stage.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory

    ``self._check_cyclic_graph`` is applied to the full graph before
    returning (the original documentation names ``CyclicGraphError``).
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    full_graph = nx.DiGraph()
    active_graph = nx.DiGraph()

    stages = stages or self.stages(from_directory, check_dag=False)
    stages = [entry for entry in stages if entry]

    known_outs = []
    for stage in stages:
        for out in stage.outs:
            same_path_owners = []
            for known in known_outs:
                if known.path_info == out.path_info:
                    same_path_owners.append(known.stage)

                # Both directions are always evaluated before deciding.
                in_known_dir = out.path_info.isin(known.path_info)
                in_out_dir = known.path_info.isin(out.path_info)
                if in_known_dir or in_out_dir:
                    raise OverlappingOutputPathsError(known, out)

            if same_path_owners:
                stages = [stage.relpath, same_path_owners[0].relpath]
                raise OutputDuplicationError(str(out), stages)

            known_outs.append(out)

    # A stage file must not live inside an output.
    for stage in stages:
        stage_path_info = PathInfo(stage.path)
        for out in known_outs:
            if stage_path_info.isin(out.path_info):
                raise StagePathAsOutputError(stage.wdir, stage.relpath)

    for stage in stages:
        node = os.path.relpath(stage.path, self.root_dir)
        full_graph.add_node(node, stage=stage)
        active_graph.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in known_outs:
                if out.path_info != dep.path_info:
                    nested = dep.path_info.isin(
                        out.path_info
                    ) or out.path_info.isin(dep.path_info)
                    if not nested:
                        continue

                owner = out.stage
                owner_node = os.path.relpath(owner.path, self.root_dir)
                full_graph.add_node(owner_node, stage=owner)
                full_graph.add_edge(node, owner_node)
                if not stage.locked:
                    active_graph.add_node(owner_node, stage=owner)
                    active_graph.add_edge(node, owner_node)

    self._check_cyclic_graph(full_graph)

    return full_graph, active_graph
def graph(self, stages=None, from_directory=None):
    """Build the dependency graphs for the given stages.

    Nodes are stage paths relative to ``self.root_dir`` and carry the stage
    object in the ``stage`` node attribute. An edge goes from a stage to
    every stage that owns an output equal to, containing, or contained in
    one of its dependencies.

    Args:
        stages: stages to use; when falsy they are collected with
            ``self.stages(from_directory, check_dag=False)``. Falsy entries
            are dropped.
        from_directory: forwarded to ``self.stages`` when ``stages`` is not
            given.

    Returns:
        tuple: ``(G, G_active)``. ``G_active`` has the same nodes as ``G``
        but omits edges that start at a locked stage.

    Raises:
        OverlappingOutputPathsError: one output path lies inside another.
        OutputDuplicationError: two outputs share the same path.
        StagePathAsOutputError: a stage file's directory is an output or
            lies inside one.

    ``self._check_cyclic_graph`` is applied to the full graph before
    returning and may raise as well.
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    full_graph = nx.DiGraph()
    active_graph = nx.DiGraph()

    stages = stages or self.stages(from_directory, check_dag=False)
    stages = [entry for entry in stages if entry]

    known_outs = []
    for stage in stages:
        for out in stage.outs:
            same_path_owners = []
            for known in known_outs:
                if known.path == out.path:
                    same_path_owners.append(known.stage)

                # Both directions are always evaluated before deciding.
                in_known_dir = out.path.startswith(known.path + known.sep)
                in_out_dir = known.path.startswith(out.path + out.sep)
                if in_known_dir or in_out_dir:
                    raise OverlappingOutputPathsError(known, out)

            if same_path_owners:
                stages = [stage.relpath, same_path_owners[0].relpath]
                raise OutputDuplicationError(out.path, stages)

            known_outs.append(out)

    # The directory holding a stage file must not be (inside) an output.
    for stage in stages:
        stage_dir = os.path.dirname(stage.path) + os.sep
        for out in known_outs:
            if stage_dir.startswith(out.path + os.sep):
                raise StagePathAsOutputError(stage.wdir, stage.relpath)

    # collect the whole DAG
    for stage in stages:
        node = os.path.relpath(stage.path, self.root_dir)
        full_graph.add_node(node, stage=stage)
        active_graph.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in known_outs:
                related = (
                    out.path == dep.path
                    or dep.path.startswith(out.path + out.sep)
                    or out.path.startswith(dep.path + dep.sep)
                )
                if not related:
                    continue

                owner = out.stage
                owner_node = os.path.relpath(owner.path, self.root_dir)
                full_graph.add_node(owner_node, stage=owner)
                full_graph.add_edge(node, owner_node)
                if not stage.locked:
                    active_graph.add_node(owner_node, stage=owner)
                    active_graph.add_edge(node, owner_node)

    self._check_cyclic_graph(full_graph)

    return full_graph, active_graph
def run(self, fname=None, no_exec=False, single_stage=False, **kwargs):
    """Create a stage from the given options, run it and dump it.

    Args:
        fname: target file for a single-stage run; falls back to
            ``_get_file_path(kwargs)`` when falsy. Cannot be combined with a
            stage name.
        no_exec: when true the stage is not executed; ``stage.ignore_outs()``
            is called instead and the lock is not updated on dump.
        single_stage: build a ``Stage`` instead of a ``PipelineStage``.
        **kwargs: stage options. Read here: ``cmd`` (required), ``name``,
            ``params``, ``run_cache`` (default ``True``), ``force`` (default
            ``True``) and ``no_commit`` (default ``False``); the rest is
            forwarded to ``create_stage``.

    Returns:
        The created stage, or ``None`` when the run cache is enabled and
        ``stage.can_be_skipped`` is true.

    Raises:
        InvalidArgumentError: missing command or an invalid combination of
            ``name``, ``fname`` and ``single_stage``.
        InvalidStageName: the pipeline stage name fails ``is_valid_name``.
        OutputDuplicationError: re-raised from the graph check with the new
            stage itself removed from the reported stages.
    """
    from dvc.dvcfile import PIPELINE_FILE, Dvcfile
    from dvc.exceptions import InvalidArgumentError, OutputDuplicationError
    from dvc.stage import PipelineStage, Stage, create_stage, restore_meta
    from dvc.stage.exceptions import InvalidStageName

    if not kwargs.get("cmd"):
        raise InvalidArgumentError("command is not specified")

    stage_name = kwargs.get("name")
    if stage_name:
        if single_stage:
            raise InvalidArgumentError(
                "`-n|--name` is incompatible with `--single-stage`"
            )
        if fname:
            raise InvalidArgumentError(
                "`--file` is currently incompatible with `-n|--name` "
                "and requires `--single-stage`"
            )
    elif not single_stage:
        raise InvalidArgumentError("`-n|--name` is required")

    if single_stage:
        kwargs.pop("name", None)
        stage_cls = Stage
        path = fname or _get_file_path(kwargs)
    elif is_valid_name(stage_name):
        stage_cls = PipelineStage
        path = PIPELINE_FILE
    else:
        raise InvalidStageName

    cli_params = kwargs.pop("params", [])
    params = chunk_dict(parse_params_from_cli(cli_params))
    stage = create_stage(
        stage_cls, repo=self, path=path, params=params, **kwargs
    )
    restore_meta(stage)

    if kwargs.get("run_cache", True) and stage.can_be_skipped:
        return None

    dvcfile = Dvcfile(self, stage.path)
    try:
        if not kwargs.get("force", True):
            _check_stage_exists(dvcfile, stage)
        else:
            # Drop a previous definition of this stage, if there is one.
            with suppress(ValueError):
                self.stages.remove(stage)
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        output = exc.output
        other_stages = set(exc.stages) - {stage}
        raise OutputDuplicationError(output, other_stages)

    if not no_exec:
        stage.run(
            no_commit=kwargs.get("no_commit", False),
            run_cache=kwargs.get("run_cache", True),
        )
    else:
        stage.ignore_outs()

    dvcfile.dump(stage, update_lock=not no_exec)
    return stage