def collect_granular(
    self, target=None, with_deps=False, recursive=False, graph=None
):
    """Resolve ``target`` into a list of ``(stage, filter_info)`` pairs.

    When ``target`` is ambiguous it is resolved in this order of priority:
        - .dvc file or .yaml file
        - dir if recursive and directory exists
        - stage_name
        - output file

    A falsy ``target`` returns every stage in ``self.stages`` with no
    filter. A target that resolves to exactly one tracked output returns
    that output's stage paired with the absolute path of the target.

    Raises:
        NoOutputOrStageError: ``collect()`` raised ``StageNotFound``, or
            raised ``StageFileDoesNotExistError`` for a target that does
            not name a valid dvcfile.
        StageFileDoesNotExistError: re-raised unchanged when the target
            does name a valid dvcfile.
    """
    if not target:
        return [(stage, None) for stage in self.stages]

    def walks_directory():
        # Kept lazy so the filesystem is only probed when `recursive` is set.
        return recursive and os.path.isdir(target)

    file, name = parse_target(target)
    found = []

    # Optimization: do not collect the graph for a specific target
    if file:
        if not with_deps and is_valid_filename(file):
            found = self.get_stages(file, name)
    else:
        # Without a colon (or a dvcfile name) the target is ambiguous: it
        # may be a stage name in `dvc.yaml` or an output of some stage.
        logger.debug(
            "Checking if stage '%s' is in '%s'", target, PIPELINE_FILE
        )
        if not walks_directory():
            stage = self._collect_from_default_dvcfile(target)
            if stage:
                found = self._collect_pipeline(stage) if with_deps else [stage]

    if found:
        return [(stage, None) for stage in found]

    if not walks_directory():
        try:
            (out,) = self.find_outs_by_path(target, strict=False)
            filter_info = PathInfo(os.path.abspath(target))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            pass

    try:
        found = self.collect(target, with_deps, recursive, graph)
    except StageFileDoesNotExistError as exc:
        # collect() may have treated `target` as a stage name and complained
        # that dvc.yaml is missing, whereas the accurate message is that
        # neither a stage name nor a file of that name exists.
        if file and is_valid_filename(file):
            raise
        raise NoOutputOrStageError(target, exc.file) from exc
    except StageNotFound as exc:
        raise NoOutputOrStageError(target, exc.file) from exc

    return [(stage, None) for stage in found]
def _collect_specific_target(
    loader: "StageLoad",
    target: str,
    with_deps: bool,
    recursive: bool,
    accept_group: bool,
) -> Tuple[StageIter, "OptStr", "OptStr"]:
    """Try to load the stages for ``target`` without building the graph.

    Returns:
        A ``(stages, file, name)`` triple where ``file`` and ``name`` are
        what ``parse_target(target)`` produced. ``stages`` is an empty list
        when nothing could be loaded directly, leaving the caller to fall
        back to a slower lookup.
    """
    from dvc.dvcfile import is_valid_filename

    # Optimization: do not collect the graph for a specific target
    file, name = parse_target(target)

    if file:
        # An explicit file can be loaded from directly, provided it is a
        # dvcfile and dependencies were not requested.
        if with_deps or not is_valid_filename(file):
            return [], file, name
        return (
            loader.load_all(file, name, accept_group=accept_group),
            file,
            name,
        )

    # No file part: the target is ambiguous, it may be a stage name in the
    # `dvc.yaml` file or an output. A stage name in `dvc.yaml` wins; when
    # there is none, the caller treats the target as an output name.
    logger.debug("Checking if stage '%s' is in '%s'", target, PIPELINE_FILE)
    if recursive and loader.tree.isdir(target):
        return [], file, name

    stages = _maybe_collect_from_dvc_yaml(
        loader, target, with_deps, accept_group=accept_group
    )
    return stages or [], file, name
def get(url, path, out=None, rev=None):
    """Fetch ``path`` from the repository at ``url`` into ``out``.

    Args:
        url: location of the source repository.
        path: path inside that repository to fetch.
        out: destination; passed through ``resolve_output(path, out)``.
        rev: revision handed to ``external_repo``.

    Raises:
        GetDVCFileError: the resolved output name is itself a dvcfile name.
    """
    from dvc.external_repo import external_repo
    from dvc.dvcfile import is_valid_filename

    out = resolve_output(path, out)

    if is_valid_filename(out):
        raise GetDVCFileError()

    # The temporary cache lives right next to the output so both are on the
    # same filesystem and reflink/hardlink can be used. We avoid
    # tempfile.TemporaryDirectory because it will create a symlink to tmpfs,
    # which defeats the purpose and won't work with reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_dir, f".{shortuuid.uuid()}")

    try:
        with external_repo(url=url, rev=rev) as repo:
            if hasattr(repo, "cache"):
                repo.cache.local.cache_dir = tmp_dir

                # Try any links possible to avoid data duplication.
                #
                # Symlinks are left out: the cache is removed once we are
                # done, so the data would have to be copied over before
                # that anyway, and we might as well copy it right away.
                #
                # A theoretical "move" link type is left out too, because
                # the same cache file might be used a few times in a
                # directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            repo.pull_to(path, PathInfo(out))
    finally:
        # Runs on success and on failure alike.
        remove(tmp_dir)
def _func(fname):
    """Decide whether ``fname`` passes the filter.

    Every name passes while the enclosing scope's ``dvcfiles`` is truthy.
    Otherwise a name passes only if it is neither a dvcfile name nor the
    dvcignore file name.
    """
    if not dvcfiles:
        return (
            not is_valid_filename(fname)
            and fname != DvcIgnore.DVCIGNORE_FILE
        )
    return True
def parse_target(target, default=None):
    """Split a target string into a ``(path, name)`` pair.

    Args:
        target: the user-supplied target; matched against ``TARGET_REGEX``
            to extract its ``path`` and ``name`` groups.
        default: file to assume when the target carries no path part;
            falls back to ``PIPELINE_FILE``.

    Returns:
        - ``(None, None)`` for a falsy target.
        - ``(target, None)`` when the regex does not match, or when the
          target has a path but no name and is a valid dvcfile name.
        - ``(None, target)`` when the target has a path but no name and is
          not a valid dvcfile name.
        - ``(path, name)`` otherwise, with ``path`` defaulted if empty.

    Raises:
        DvcException: the path part points at the pipeline lock file; the
            message suggests the pipeline file instead.
    """
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK, is_valid_filename
    from dvc.exceptions import DvcException

    if not target:
        return None, None

    match = TARGET_REGEX.match(target)
    if not match:
        return target, None

    path, name = (
        match.group("path"),
        match.group("name"),
    )
    if path:
        if os.path.basename(path) == PIPELINE_LOCK:
            # Swap only the lock file's own name. A blanket
            # `target.replace(".lock", ".yaml", 1)` rewrites the first
            # ".lock" anywhere in the target, so a parent directory such as
            # "a.lock/" would be mangled while the file name stayed wrong.
            # `path` ends with PIPELINE_LOCK because its basename equals it.
            end = match.end("path")
            suggestion = (
                target[: end - len(PIPELINE_LOCK)]
                + PIPELINE_FILE
                + target[end:]
            )
            raise DvcException("Did you mean: `{}`?".format(suggestion))

        if not name:
            # A bare word is a file if it looks like a dvcfile, otherwise
            # it is handed back in the name slot.
            ret = (target, None)
            return ret if is_valid_filename(target) else ret[::-1]

    if not path:
        path = default or PIPELINE_FILE
        logger.debug("Assuming file to be '%s'", path)

    return path, name
def collect_granular(
    self,
    target: str = None,
    with_deps: bool = False,
    recursive: bool = False,
    graph: "DiGraph" = None,
    accept_group: bool = False,
) -> List[StageInfo]:
    """Collects a list of (stage, filter_info) from the given target.

    In case of ambiguity the target is resolved in this order of priority:
        - .dvc file or .yaml file
        - dir if recursive and directory exists
        - stage_name
        - output file

    Args:
        target: if not provided, all of the stages without any filters are
            returned.
            If `target` is a path to a dvc-tracked output,
            a (stage, output_path_info) is returned.
            Otherwise, the details above for `target` in `collect()`
            applies.

        (see `collect()` for other arguments)

    Raises:
        NoOutputOrStageError: `collect()` raised `StageNotFound`, or raised
            `StageFileDoesNotExistError` for a target that does not name a
            valid dvcfile.
    """
    if not target:
        return [StageInfo(stage) for stage in self.repo.stages]

    stages, file, _ = _collect_specific_target(
        self, target, with_deps, recursive, accept_group
    )
    if stages:
        return [StageInfo(stage) for stage in stages]

    # Nothing was loaded directly: unless we are asked to recurse into a
    # directory, see whether the target is a single tracked output.
    if not recursive or not self.tree.isdir(target):
        try:
            (out,) = self.repo.find_outs_by_path(target, strict=False)
            filter_info = PathInfo(os.path.abspath(target))
            return [StageInfo(out.stage, filter_info)]
        except OutputNotFoundError:
            pass

    try:
        stages = self.collect(
            target, with_deps, recursive, graph, accept_group=accept_group
        )
    except StageFileDoesNotExistError as exc:
        # collect() may have treated `target` as a stage name and complained
        # that dvc.yaml is missing, whereas the accurate message is that
        # neither a stage name nor a file of that name exists.
        if file and is_valid_filename(file):
            raise
        raise NoOutputOrStageError(target, exc.file) from exc
    except StageNotFound as exc:
        raise NoOutputOrStageError(target, exc.file) from exc

    return [StageInfo(stage) for stage in stages]
def _func(fname):
    """Decide whether ``fname`` passes the filter.

    Every name passes while the enclosing scope's ``dvcfiles`` is truthy.
    Otherwise a name passes only if it is neither a dvcfile name nor the
    dvcignore file name.
    """
    from dvc.dvcfile import is_valid_filename
    from dvc.ignore import DvcIgnore

    if not dvcfiles:
        return (
            not is_valid_filename(fname)
            and fname != DvcIgnore.DVCIGNORE_FILE
        )
    return True
def get(url, path, out=None, rev=None, jobs=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    Args:
        url: location of the source repository.
        path: path to fetch; an absolute path is read through a
            ``DataFileSystem`` over the repo, a relative one through
            ``repo.dvcfs``.
        out: destination; passed through ``resolve_output(path, out)``.
        rev: revision handed to ``external_repo``.
        jobs: forwarded to ``fs.get`` as ``batch_size``.

    Raises:
        GetDVCFileError: the resolved output name is itself a dvcfile name.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    out = resolve_output(path, out)

    if is_valid_filename(out):
        raise GetDVCFileError()

    # The temporary cache lives right next to the output so both are on the
    # same filesystem and reflink/hardlink can be used. We avoid
    # tempfile.TemporaryDirectory because it will create a symlink to tmpfs,
    # which defeats the purpose and won't work with reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_dir, f".{shortuuid.uuid()}")

    # Try any links possible to avoid data duplication.
    #
    # Symlinks are left out: the cache is removed once we are done, so the
    # data would have to be copied over before that anyway, and we might as
    # well copy it right away.
    #
    # A theoretical "move" link type is left out too, because the same
    # cache file might be used a few times in a directory.
    cache_types = ["reflink", "hardlink", "copy"]

    try:
        with external_repo(
            url=url, rev=rev, cache_dir=tmp_dir, cache_types=cache_types
        ) as repo:
            if not os.path.isabs(path):
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)
            else:
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path

            progress = Callback.as_tqdm_callback(
                desc=f"Downloading {fs.path.name(path)}",
                unit="files",
            )
            with progress as cb:
                fs.get(
                    fs_path,
                    os.path.abspath(out),
                    batch_size=jobs,
                    callback=cb,
                )
    finally:
        # Runs on success and on failure alike.
        remove(tmp_dir)
def _validate_output_path(cls, path, stage=None):
    """Reject output paths that are dvcfile names or dvcignore'd.

    Raises:
        cls.IsStageFileError: ``path`` is a valid dvcfile name.
        cls.IsIgnoredError: ``stage`` is given and the dvcignore check on
            its repo tree reports a match for ``path``.
    """
    from dvc.dvcfile import is_valid_filename

    if is_valid_filename(path):
        raise cls.IsStageFileError(path)

    if not stage:
        return

    result = stage.repo.tree.dvcignore.check_ignore(path)
    if result.match:
        raise cls.IsIgnoredError(result)
def _validate_output_path(self, path, stage=None):
    """Reject output paths that are dvcfile names or dvcignore'd.

    Raises:
        self.IsStageFileError: ``path`` is a valid dvcfile name.
        self.IsIgnoredError: ``stage`` is given and ``path``, joined onto
            ``stage.wdir``, is reported as ignored by
            ``self._is_path_dvcignore``.
    """
    from dvc.dvcfile import is_valid_filename

    if is_valid_filename(path):
        raise self.IsStageFileError(path)

    if not stage:
        return

    abs_path = os.path.join(stage.wdir, path)
    if not self._is_path_dvcignore(abs_path):
        return

    # The detailed check result is only fetched to build the error.
    result = stage.repo.dvcignore.check_ignore(abs_path)
    raise self.IsIgnoredError(result)
def parse_target(
    target: str, default: str = None, isa_glob: bool = False
) -> Tuple[Optional[str], Optional[str]]:
    """Split a target string into a ``(path, name)`` pair.

    Args:
        target: the user-supplied target.
        default: file to assume when the target carries no path part;
            falls back to ``PIPELINE_FILE``.
        isa_glob: when true, the target is split on its last ``:`` only
            and the right-hand side is returned as-is (``None`` if empty),
            skipping the regex.

    Returns:
        - ``(None, None)`` for a falsy target.
        - ``(target, None)`` when the regex does not match, or when the
          target has a path but no name and is a valid dvcfile name.
        - ``(None, target)`` when the target has a path but no name and is
          not a valid dvcfile name.
        - ``(path or default, name)`` otherwise; a non-empty key after the
          first ``JOIN`` separator is re-attached to a non-empty ``name``.

    Raises:
        DvcException: the path part points at the pipeline lock file; the
            message suggests the pipeline file instead.
    """
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK, is_valid_filename
    from dvc.exceptions import DvcException
    from dvc.parsing import JOIN

    if not target:
        return None, None

    default = default or PIPELINE_FILE
    if isa_glob:
        path, _, glob = target.rpartition(":")
        return path or default, glob or None

    # look for first "@", so as not to assume too much about stage name
    # eg: it might contain ":" in a generated stages from dict which might
    # affect further parsing with the regex.
    group, _, key = target.partition(JOIN)
    match = TARGET_REGEX.match(group)

    if not match:
        return target, None

    path, name = (
        match.group("path"),
        match.group("name"),
    )

    if name and key:
        name += f"{JOIN}{key}"

    if path:
        if os.path.basename(path) == PIPELINE_LOCK:
            # Swap only the lock file's own name. A blanket
            # `target.replace(".lock", ".yaml", 1)` rewrites the first
            # ".lock" anywhere in the target, so a parent directory such as
            # "a.lock/" would be mangled while the file name stayed wrong.
            # `group` is a prefix of `target`, so offsets from the match
            # index into `target` directly, and `path` ends with
            # PIPELINE_LOCK because its basename equals it.
            end = match.end("path")
            suggestion = (
                target[: end - len(PIPELINE_LOCK)]
                + PIPELINE_FILE
                + target[end:]
            )
            raise DvcException("Did you mean: `{}`?".format(suggestion))

        if not name:
            # A bare word is a file if it looks like a dvcfile, otherwise
            # it is handed back in the name slot.
            ret = (target, None)
            return ret if is_valid_filename(target) else ret[::-1]

    if not path:
        logger.trace(  # type: ignore[attr-defined]
            "Assuming file to be '%s'", default
        )

    return path or default, name
def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
    """Walk ``repo_walk`` and (optionally) ``dvc_walk`` in lockstep.

    Generator yielding ``(repo_root, dirs, files)`` for the current level,
    then everything yielded by the recursion into each entry of ``dirs``.

    Args:
        repo_walk: required walker; must be truthy.
        dvc_walk: optional second walker; when falsy this level
            contributes no dvc dirs or files.
        dvcfiles: when falsy, file names for which ``is_valid_filename``
            is true are left out of ``files``.

    NOTE(review): both walkers are presumably os.walk-style generators that
    honour in-place edits of the yielded dir list; confirm against the
    callers.
    """
    from dvc.dvcfile import is_valid_filename

    assert repo_walk
    try:
        _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else (None, [], []))
        repo_root, repo_dirs, repo_fnames = next(repo_walk)
    except StopIteration:
        # Either walker being exhausted ends this level silently.
        return

    # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
    dvc_set = set(dvc_dirs)
    repo_set = set(repo_dirs)
    dvc_only = list(dvc_set - repo_set)
    repo_only = list(repo_set - dvc_set)
    shared = list(dvc_set & repo_set)
    dirs = shared + dvc_only + repo_only

    # merge file lists (a set, so a name present in both appears once)
    files = {
        fname
        for fname in dvc_fnames + repo_fnames
        if dvcfiles or not is_valid_filename(fname)
    }

    yield repo_root, dirs, list(files)

    def is_dvc_repo(d):
        # `d` is a bare directory name relative to the current root.
        return self._is_dvc_repo(os.path.join(repo_root, d))

    # remove subrepos to prevent it from being traversed
    # (only repo-only dirs are considered as subrepo candidates)
    subrepos = set(filter(is_dvc_repo, repo_only))
    # set dir order for next recursion level - shared dirs first so that
    # next() for both generators recurses into the same shared directory
    dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
    repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

    for dirname in dirs:
        if dirname in subrepos:
            # Subrepos were dropped from `repo_dirs` above and are walked
            # by their own dedicated walk instead.
            dir_path = os.path.join(repo_root, dirname)
            yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
        elif dirname in shared:
            yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
        elif dirname in dvc_set:
            yield from self._dvc_walk(dvc_walk)
        elif dirname in repo_set:
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
def collect_granular(self, target, *args, **kwargs):
    """Resolve ``target`` into a list of ``(stage, filter_info)`` pairs.

    - A falsy target returns every stage in ``self.stages``, unfiltered.
    - A dvcfile target, when ``with_deps`` is not passed as a truthy
      keyword, is loaded directly from that file.
    - A target whose file part is exactly one tracked output returns that
      output's stage with the file's absolute path as the filter.
    - Anything else is delegated to ``self.collect(target, *args,
      **kwargs)``.
    """
    from ..dvcfile import Dvcfile, is_valid_filename

    if not target:
        return [(stage, None) for stage in self.stages]

    file, name = parse_target(target)

    wants_deps = kwargs.get("with_deps")
    if is_valid_filename(file) and not wants_deps:
        # Optimization: do not collect the graph for a specific .dvc target
        loaded = Dvcfile(self, file).stages.filter(name)
        return [(stage, None) for stage in loaded.values()]

    try:
        (out,) = self.find_outs_by_path(file, strict=False)
        filter_info = PathInfo(os.path.abspath(file))
        return [(out.stage, filter_info)]
    except OutputNotFoundError:
        # Not an output either: fall back to the general collection.
        collected = self.collect(target, *args, **kwargs)
        return [(stage, None) for stage in collected]
def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
    """Walk ``repo_walk`` and (optionally) ``dvc_walk`` in lockstep.

    Generator yielding ``(repo_root, dirs, files)`` for the current level,
    then everything yielded by the recursion into each entry of ``dirs``.
    Directories that are themselves DVC repos are walked through
    ``self._subrepo_walk`` instead of the shared walkers.

    Args:
        repo_walk: required walker; must be truthy.
        dvc_walk: optional second walker; when falsy this level
            contributes no dvc dirs or files.
        dvcfiles: when falsy, file names for which ``is_valid_filename``
            is true are left out of ``files``.

    NOTE(review): both walkers are presumably os.walk-style generators that
    honour in-place edits of the yielded dir list; confirm against the
    callers.
    """
    assert repo_walk
    try:
        _, dvc_dirs, dvc_fnames = (
            next(dvc_walk) if dvc_walk else (None, [], [])
        )
        repo_root, repo_dirs, repo_fnames = next(repo_walk)
    except StopIteration:
        # Either walker being exhausted ends this level silently.
        return

    # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
    dvc_set = set(dvc_dirs)
    repo_set = set(repo_dirs)
    dvc_only = list(dvc_set - repo_set)
    repo_only = list(repo_set - dvc_set)
    shared = list(dvc_set & repo_set)
    dirs = shared + dvc_only + repo_only

    # merge file lists (a set, so a name present in both appears once)
    files = {
        fname
        for fname in dvc_fnames + repo_fnames
        if dvcfiles or not is_valid_filename(fname)
    }

    yield repo_root, dirs, list(files)

    # Subrepos get their own walk below. They must also be dropped from the
    # lists the shared walkers recurse into: otherwise those walkers still
    # descend into the subrepo, and every later next() on them is paired
    # with the wrong directory.
    subrepos = {
        dirname
        for dirname in dirs
        if self._is_dvc_repo(os.path.join(repo_root, dirname))
    }

    # set dir order for next recursion level - shared dirs first so that
    # next() for both generators recurses into the same shared directory
    dvc_dirs[:] = [
        dirname
        for dirname in dirs
        if dirname in dvc_set and dirname not in subrepos
    ]
    repo_dirs[:] = [
        dirname
        for dirname in dirs
        if dirname in repo_set and dirname not in subrepos
    ]

    for dirname in dirs:
        if dirname in subrepos:
            dir_path = os.path.join(repo_root, dirname)
            yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
        elif dirname in shared:
            yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
        elif dirname in dvc_set:
            yield from self._dvc_walk(dvc_walk)
        elif dirname in repo_set:
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
def _walk(self, dvc_walk, repo_walk, dvcfiles=False):
    """Walk ``dvc_walk`` and ``repo_walk`` in lockstep.

    Generator yielding ``(repo_root, dirs, files)`` for the current level,
    then everything yielded by the recursion into each entry of ``dirs``.

    Args:
        dvc_walk: first walker; advanced once per level.
        repo_walk: second walker; its root is the one that is yielded.
        dvcfiles: when falsy, file names for which ``is_valid_filename``
            is true are left out of ``files``.

    NOTE(review): both walkers are presumably os.walk-style generators that
    honour in-place edits of the yielded dir list; confirm against the
    callers.
    """
    try:
        _, dvc_dirs, dvc_fnames = next(dvc_walk)
        repo_root, repo_dirs, repo_fnames = next(repo_walk)
    except StopIteration:
        # Either walker being exhausted ends this level silently.
        return

    # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
    dvc_set = set(dvc_dirs)
    repo_set = set(repo_dirs)
    dvc_only = list(dvc_set - repo_set)
    repo_only = list(repo_set - dvc_set)
    shared = list(dvc_set & repo_set)
    dirs = shared + dvc_only + repo_only

    # merge file lists (a set, so a name present in both appears once)
    files = {
        fname
        for fname in dvc_fnames + repo_fnames
        if dvcfiles or not is_valid_filename(fname)
    }

    yield repo_root, dirs, list(files)

    # set dir order for next recursion level - shared dirs first so that
    # next() for both generators recurses into the same shared directory
    dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
    repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

    for dirname in dirs:
        if dirname in shared:
            # Present in both trees: keep advancing both walkers together.
            yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles)
        elif dirname in dvc_set:
            yield from self._walk_one(dvc_walk)
        elif dirname in repo_set:
            yield from self._walk_one(repo_walk)
def _validate_output_path(cls, path):
    """Reject an output path that is itself a dvcfile name.

    Raises:
        cls.IsStageFileError: ``path`` is a valid dvcfile name.
    """
    from dvc.dvcfile import is_valid_filename

    if not is_valid_filename(path):
        return
    raise cls.IsStageFileError(path)
def is_dvcfile_and_not_ignored(root, file):
    """Tell whether ``file`` is a dvcfile name whose path is not ignored.

    The path handed to ``is_ignored`` is ``root`` and ``file`` joined with
    the enclosing scope's ``sep``.
    """
    valid = is_valid_filename(file)
    if not valid:
        # Hand back the validator's own falsy result untouched.
        return valid
    return not is_ignored(f"{root}{sep}{file}")