Example #1
File: __init__.py Project: stjordanis/dvc
    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
        config=None,
        url=None,
        repo_factory=None,
    ):
        from dvc.config import Config
        from dvc.data_cloud import DataCloud
        from dvc.fs.local import LocalFileSystem
        from dvc.lock import LockNoop, make_lock
        from dvc.objects.db import ODBManager
        from dvc.repo.live import Live
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop

        self.url = url
        self._fs_conf = {"repo_factory": repo_factory}

        if rev and not scm:
            scm = SCM(root_dir or os.curdir)

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
        )

        if scm:
            self._fs = scm.get_fs(rev)
        else:
            self._fs = LocalFileSystem(url=self.root_dir)

        self.config = Config(self.dvc_dir, fs=self.fs, config=config)
        self._uninitialized = uninitialized
        self._scm = scm

        # used by RepoFileSystem to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
            self.odb = ODBManager(self)
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.root_dir, self.tmp_dir, self.dvcignore)
            self.odb = ODBManager(self)

            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        self.live = Live(self)

        self.stage_collection_error_handler: Optional[
            Callable[[str, Exception], None]
        ] = None
        self._lock_depth = 0
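A minimal usage sketch for the constructor above (the "." and "HEAD" values are hypothetical, not part of the original snippet): passing rev makes the constructor resolve the revision through SCM and read the repository from a Git file system instead of the local working tree.

# Hedged usage sketch; assumes a Git-tracked DVC repository at ".".
from dvc.repo import Repo

repo = Repo(root_dir=".", rev="HEAD")  # pinned to a Git revision
print(repo.root_dir)                   # resolved repository root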
Example #2
File: __init__.py Project: qooglewb/dvc
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, repo=self)

        self.tree = WorkingTree(self.root_dir)

        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = [
            self.lock.lock_file,
            self.config.config_local_file,
            updater.updater_file,
            updater.lock.lock_file,
        ] + self.state.files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    @staticmethod
    def _check_cyclic_graph(graph):
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def _get_pipeline(self, node):
        pipelines = [i for i in self.pipelines() if i.has_node(node)]
        assert len(pipelines) == 1
        return pipelines[0]

    def collect(self, target, with_deps=False, recursive=False):
        import networkx as nx
        from dvc.stage import Stage

        if not target or (recursive and os.path.isdir(target)):
            return self.active_stages(target)

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)

        ret = []
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(G.node[n]["stage"])

        return ret

    def used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A dictionary with schemes (representing the outputs' locations)
            as keys, and lists of the outputs' `dumpd` as values.
        """

        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.brancher(all_branches=all_branches,
                                    all_tags=all_tags):
            if target:
                if recursive and os.path.isdir(target):
                    stages = self.stages(target)
                else:
                    stages = self.collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC-file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath))

                for out in stage.outs:
                    scheme = out.path_info.scheme
                    used_cache = out.get_used_cache(remote=remote,
                                                    force=force,
                                                    jobs=jobs)

                    cache[scheme].extend(
                        dict(entry, branch=branch) for entry in used_cache)

        return cache

    def graph(self, stages=None, from_directory=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stages' paths relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in another stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages used to build the graph; if None is
                given, use the ones in `from_directory`.

            from_directory (str): directory to look for stages in; if
                None is given, use the current working directory

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = {}

        for stage in stages:
            for out in stage.outs:
                if out.path_info in outs:
                    stages = [stage.relpath, outs[out.path_info].stage.relpath]
                    raise OutputDuplicationError(str(out), stages)
                outs[out.path_info] = out

        for stage in stages:
            for out in stage.outs:
                for p in out.path_info.parents:
                    if p in outs:
                        raise OverlappingOutputPathsError(outs[p], out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for p in chain([stage_path_info], stage_path_info.parents):
                if p in outs:
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                for out in outs:
                    if (out == dep.path_info or dep.path_info.isin(out)
                            or out.isin(dep.path_info)):
                        dep_stage = outs[out].stage
                        dep_node = relpath(dep_stage.path, self.root_dir)
                        G.add_node(dep_node, stage=dep_stage)
                        G.add_edge(node, dep_node)
                        if not stage.locked:
                            G_active.add_node(dep_node, stage=dep_stage)
                            G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        import networkx as nx

        G, G_active = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    @staticmethod
    def _filter_out_dirs(dirs, outs, root_dir):
        def filter_dirs(dname):
            path = os.path.join(root_dir, dname)
            for out in outs:
                if path == os.path.normpath(out):
                    return False
            return True

        return list(filter(filter_dirs, dirs))

    def stages(self, from_directory=None, check_dag=True):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related to
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir
        elif not os.path.isdir(from_directory):
            raise TargetNotDirectoryError(from_directory)

        stages = []
        outs = []

        for root, dirs, files in self.tree.walk(from_directory,
                                                dvcignore=self.dvcignore):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            dirs[:] = self._filter_out_dirs(dirs, outs, root)

        if check_dag:
            self.check_dag(stages)

        return stages

    def active_stages(self, from_directory=None):
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False):
        if not outs:
            # there is no `from_directory=path` argument because some data
            # files might be generated at an upper level, so we need to
            # look at all the files (from the project root_dir)
            stages = self.stages()
            outs = [out for stage in stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.scheme == "local" and out.fspath == abs_path:
                return True

            if is_dir and recursive and out.path_info.isin(abs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def find_out_by_relpath(self, relpath):
        path = os.path.join(self.root_dir, relpath)
        out, = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        out, = self.find_outs_by_path(path)
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        with self.state:
            cache_info = out.get_used_cache(remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        # Since pull may just skip with a warning, we need to check it here
        if not os.path.exists(cache_file):
            raise OutputFileMissingError(relpath(path, self.root_dir))

        return _open(cache_file, mode=mode, encoding=encoding)

    @cached_property
    def dvcignore(self):
        return DvcIgnoreFilter(self.root_dir)
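The graph() docstring above says edges point from a stage to the stage producing its dependency. A small stand-alone networkx sketch of that shape (the stage names are made up) shows why the postorder DFS used in collect() visits dependencies before dependents:

import networkx as nx

G = nx.DiGraph()
# C depends on B, B depends on A; edges point stage -> dependency stage.
G.add_edges_from([("C.dvc", "B.dvc"), ("B.dvc", "A.dvc")])

# Prints ['A.dvc', 'B.dvc', 'C.dvc']: dependencies first, which is the
# order a reproduction of C.dvc needs.
print(list(nx.dfs_postorder_nodes(G, "C.dvc")))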
Example #3
File: __init__.py Project: jdeguia/dvc
class Repo:
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.ls import ls
    from dvc.repo.freeze import freeze, unfreeze
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None, scm=None, rev=None):
        from dvc.state import State, StateNoop
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.plots import Plots
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        if scm:
            # use GitTree instead of WorkingTree as default repo tree instance
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = tree
            self.state = StateNoop()
        else:
            root_dir = self.find_root(root_dir)
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)
            self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        self._ignore()

    @property
    def tree(self):
        return self._tree

    @tree.setter
    def tree(self, tree):
        if is_working_tree(tree) or tree.tree_root == self.root_dir:
            root = None
        else:
            root = self.root_dir
        self._tree = (
            tree if isinstance(tree, CleanTree) else CleanTree(tree, root)
        )
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return f"{self.__class__.__name__}: '{self.root_dir}'"

    @classmethod
    def find_root(cls, root=None, tree=None):
        root_dir = os.path.realpath(root or os.curdir)

        if tree:
            if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)):
                return root_dir
            raise NotDvcRepoError(f"'{root}' does not contain DVC directory")

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError(f"directory '{root}' does not exist")

        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = (
            "you are not inside of a DVC repository "
            "(checked up to mount point '{}')"
        ).format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.tree.unprotect(PathInfo(target))

    def _ignore(self):
        flist = [self.config.files["local"], self.tmp_dir]

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def get_stage(self, path=None, name=None):
        if not path:
            path = PIPELINE_FILE
            logger.debug("Assuming '%s' to be a stage inside '%s'", name, path)

        dvcfile = Dvcfile(self, path)
        return dvcfile.stages[name]

    def get_stages(self, path=None, name=None):
        if not path:
            path = PIPELINE_FILE
            logger.debug("Assuming '%s' to be a stage inside '%s'", name, path)

        if name:
            return [self.get_stage(path, name)]

        dvcfile = Dvcfile(self, path)
        return list(dvcfile.stages.values())

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        # Building the graph might be costly for repos with many DVC-files,
        # so we provide this undocumented hack to skip it. See [1] for
        # more details. The hack can be used as:
        #
        #     repo = Repo(...)
        #     repo._skip_graph_checks = True
        #     repo.add(...)
        #
        # The user must then take care not to duplicate outs or introduce
        # cycles; otherwise DVC may behave unpredictably.
        #
        # [1] https://github.com/iterative/dvc/issues/2671
        if not getattr(self, "_skip_graph_checks", False):
            self._collect_graph(self.stages + new_stages)

    def _collect_inside(self, path, graph):
        import networkx as nx

        stages = nx.dfs_postorder_nodes(graph)
        return [stage for stage in stages if path_isin(stage.path, path)]

    def collect(
        self, target=None, with_deps=False, recursive=False, graph=None
    ):
        if not target:
            return list(graph) if graph else self.stages

        if recursive and os.path.isdir(target):
            return self._collect_inside(
                os.path.abspath(target), graph or self.graph
            )

        path, name = parse_target(target)
        stages = self.get_stages(path, name)
        if not with_deps:
            return stages

        res = set()
        for stage in stages:
            res.update(self._collect_pipeline(stage, graph=graph))
        return res

    def _collect_pipeline(self, stage, graph=None):
        import networkx as nx

        pipeline = get_pipeline(get_pipelines(graph or self.graph), stage)
        return nx.dfs_postorder_nodes(pipeline, stage)

    def _collect_from_default_dvcfile(self, target):
        dvcfile = Dvcfile(self, PIPELINE_FILE)
        if dvcfile.exists():
            return dvcfile.stages.get(target)

    def collect_granular(
        self, target=None, with_deps=False, recursive=False, graph=None
    ):
        """
        In case of ambiguity, priority is in the following order:
            - .dvc file or .yaml file
            - dir if recursive and directory exists
            - stage_name
            - output file
        """
        if not target:
            return [(stage, None) for stage in self.stages]

        file, name = parse_target(target)
        stages = []

        # Optimization: do not collect the graph for a specific target
        if not file:
            # parsing is ambiguous when it does not have a colon
            # or if it's not a dvcfile, as it can be a stage name
            # in `dvc.yaml` or an output in a stage.
            logger.debug(
                "Checking if stage '%s' is in '%s'", target, PIPELINE_FILE
            )
            if not (recursive and os.path.isdir(target)):
                stage = self._collect_from_default_dvcfile(target)
                if stage:
                    stages = (
                        self._collect_pipeline(stage) if with_deps else [stage]
                    )
        elif not with_deps and is_valid_filename(file):
            stages = self.get_stages(file, name)

        if not stages:
            if not (recursive and os.path.isdir(target)):
                try:
                    (out,) = self.find_outs_by_path(target, strict=False)
                    filter_info = PathInfo(os.path.abspath(target))
                    return [(out.stage, filter_info)]
                except OutputNotFoundError:
                    pass

            try:
                stages = self.collect(target, with_deps, recursive, graph)
            except StageFileDoesNotExistError as exc:
                # collect() might try to use `target` as a stage name
                # and raise an error saying that dvc.yaml does not exist,
                # whereas it should say that both the stage name and the
                # file do not exist.
                if file and is_valid_filename(file):
                    raise
                raise NoOutputOrStageError(target, exc.file) from exc
            except StageNotFound as exc:
                raise NoOutputOrStageError(target, exc.file) from exc

        return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
        used_run_cache=None,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A dictionary with schemes (representing the outputs' locations)
            mapped to items containing the outputs' `dumpd` names and their
            children (if the given output is a directory).
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            targets = targets or [None]

            pairs = cat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps
                )
                for target in targets
            )

            suffix = f"({branch})" if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        if used_run_cache:
            used_cache = self.stage_cache.get_used_cache(
                used_run_cache, remote=remote, force=force, jobs=jobs,
            )
            cache.update(used_cache)

        return cache

    def _collect_graph(self, stages):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stages.

        Edges are created when the output of one stage is used as a
        dependency in another stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages used to build the graph; if None is
                given, collect the stages in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in filter(bool, stages):  # bug? not using it later
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = (
                        "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                        "overlap. To avoid unpredictable behaviour, "
                        "rerun command with non overlapping outs paths."
                    ).format(
                        str(parent),
                        parent.stage.addressing,
                        str(overlapping),
                        overlapping.stage.addressing,
                    )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                overlapping = list(n.value for n in outs.prefixes(dep_key))
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)
        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        return self._collect_graph(self.stages)

    @cached_property
    def pipelines(self):
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related to
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        return self._collect_stages()

    @cached_property
    def plot_templates(self):
        from .plots.template import PlotTemplates

        return PlotTemplates(self.dvc_dir)

    def _collect_stages(self):
        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for file_name in filter(is_valid_filename, files):
                new_stages = self.get_stages(os.path.join(root, file_name))
                stages.extend(new_stages)
                outs.update(
                    out.fspath
                    for stage in new_stages
                    for out in stage.outs
                    if out.scheme == "local"
                )
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        path = os.path.join(self.root_dir, relpath)
        (out,) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""

        tree = RepoTree(self, stream=True)
        path = os.path.join(self.root_dir, path)
        try:
            with self.state:
                with tree.open(
                    path,
                    mode=mode,
                    encoding=encoding,
                    remote=remote,
                ) as fobj:
                    yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc

    def close(self):
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        return self._fetch(*args, **kwargs)

    def _reset(self):
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)
Example #4
def test_init_none(tmp_dir):
    assert isinstance(SCM(os.fspath(tmp_dir), no_scm=True), NoSCM)
Example #5
def test_init_no_git(tmp_dir):
    with pytest.raises(SCMError):
        SCM(os.fspath(tmp_dir))
Example #6
File: project.py Project: wellic/dvc
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        from dvc.logger import Logger
        from dvc.config import Config
        from dvc.state import LinkState, State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)
        self.link_state = LinkState(self)

        core = self.config._config[Config.SECTION_CORE]
        self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        import shutil
        from dvc.scm import SCM, Base
        from dvc.config import Config

        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git)."
            raise InitError(msg.format(root_dir))

        if os.path.isdir(dvc_dir):
            if not force:
                msg = "'{}' exists. Use '-f' to force."
                raise InitError(msg.format(os.path.relpath(dvc_dir)))
            shutil.rmtree(dvc_dir)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def destroy(self):
        import shutil

        for stage in self.stages():
            stage.remove()

        shutil.rmtree(self.dvc_dir)

    def _ignore(self):
        flist = [self.state.state_file,
                 self.state._lock_file.lock_file,
                 self.link_state.state_file,
                 self.link_state._lock_file.lock_file,
                 self.lock.lock_file,
                 self.config.config_local_file,
                 self.updater.updater_file]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        stage = Stage.loads(project=self,
                            outs=[fname],
                            add=True)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target, outs_only=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        stage.locked = False if unlock else True
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        import dvc.output as Output

        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        found = False
        for stage in self.stages():
            for out in stage.outs:
                if out.path != from_out.path:
                    continue

                if not stage.is_data_source:
                    msg = 'Dvcfile \'{}\' is not a data source.'
                    raise DvcException(msg.format(stage.rel_path))

                found = True
                to_out = Output.loads_from(out.stage,
                                           [to_path],
                                           out.cache,
                                           out.metric)[0]
                out.move(to_out)

                stage_base = os.path.basename(stage.path)
                stage_base = stage_base.rstrip(Stage.STAGE_FILE_SUFFIX)

                stage_dir = os.path.dirname(stage.path)
                from_base = os.path.basename(from_path)
                to_base = os.path.basename(to_path)
                if stage_base == from_base:
                    os.unlink(stage.path)
                    path = to_base + Stage.STAGE_FILE_SUFFIX
                    stage.path = os.path.join(stage_dir, path)

            stage.dump()

        if not found:
            msg = 'Unable to find dvcfile with output \'{}\''
            raise DvcException(msg.format(from_path))

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            metrics_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            metrics_no_cache=metrics_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def imp(self, url, out):
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[url],
                            outs=[out])

        stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node]

        if stage.locked:
            msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                  'going to be reproduced.'
            self.logger.warn(msg.format(stage.relpath))

        stage = stage.reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        import networkx as nx

        G = self.graph()[1]
        stages = nx.get_node_attributes(G, 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise NotDvcFileError(target)

        if recursive:
            return self._reproduce_stages(G, stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, G, stages, node, force):
        import networkx as nx

        result = []
        for n in nx.dfs_postorder_nodes(G, node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _cleanup_unused_links(self, all_stages):
        used = []
        for stage in all_stages:
            for out in stage.outs:
                used.append(out.path)
        self.link_state.remove_unused(used)

    def checkout(self, target=None):
        all_stages = self.active_stages()

        if target:
            if not Stage.is_stage_file(target):
                raise NotDvcFileError(target)
            stages = [Stage.load(self, target)]
        else:
            stages = all_stages

        self._cleanup_unused_links(all_stages)

        for stage in stages:
            if stage.locked:
                msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                      'going to be checked out.'
                self.logger.warn(msg.format(stage.relpath))

            stage.checkout()

    def _used_cache(self, target=None, all_branches=False, active=True):
        cache = {}
        cache['local'] = []
        cache['s3'] = []
        cache['gs'] = []
        cache['hdfs'] = []
        cache['ssh'] = []

        for branch in self.scm.brancher(all_branches=all_branches):
            if target:
                stages = [Stage.load(self, target)]
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    msg = 'DVC file \'{}\' is locked. Its dependencies are ' \
                          'not going to be pushed/pulled/fetched.'
                    self.logger.warn(msg.format(stage.relpath))

                for out in stage.outs:
                    if not out.use_cache or not out.info:
                        continue

                    info = out.dumpd()
                    info['branch'] = branch

                    cache[out.path_info['scheme']] += [info]

        return cache

    def gc(self, all_branches=False, cloud=False, remote=None):
        clist = self._used_cache(target=None,
                                 all_branches=all_branches,
                                 active=False)
        self.cache.local.gc(clist)

        if self.cache.s3:
            self.cache.s3.gc(clist)

        if self.cache.gs:
            self.cache.gs.gc(clist)

        if self.cache.ssh:
            self.cache.ssh.gc(clist)

        if self.cache.hdfs:
            self.cache.hdfs.gc(clist)

        if self.cache.azure:
            self.cache.azure.gc(clist)

        if cloud:
            self.cloud._get_cloud(remote).gc(clist)

    def push(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False):
        self.cloud.push(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote,
                        show_checksums=show_checksums)

    def fetch(self,
              target=None,
              jobs=1,
              remote=None,
              all_branches=False,
              show_checksums=False):
        self.cloud.pull(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote,
                        show_checksums=show_checksums)

    def pull(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False):
        self.fetch(target,
                   jobs,
                   remote=remote,
                   all_branches=all_branches,
                   show_checksums=show_checksums)
        self.checkout(target=target)

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.active_stages()

        for stage in stages:
            if stage.locked:
                msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                      'going to be shown in status output.'
                self.logger.warn(msg.format(stage.relpath))

            status.update(stage.status())

        return status

    def _cloud_status(self,
                      target=None,
                      jobs=1,
                      remote=None,
                      show_checksums=False):
        import dvc.remote.base as cloud

        status = {}
        for md5, ret in self.cloud.status(self._used_cache(target)['local'],
                                          jobs,
                                          remote=remote,
                                          show_checksums=show_checksums):
            if ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_NEW: 'new',
            }

            status[md5] = prefix_map[ret]

        return status

    def status(self,
               target=None,
               jobs=1,
               cloud=False,
               remote=None,
               show_checksums=False):
        if cloud:
            return self._cloud_status(target,
                                      jobs,
                                      remote=remote,
                                      show_checksums=show_checksums)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        import json
        from jsonpath_rw import parse

        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_xsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_hxsv(self, fd, hxsv_path, delimiter):
        import csv

        col, row = hxsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric_xsv(self, fd, xsv_path, delimiter):
        import csv

        col, row = xsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric(self, path, typ=None, xpath=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, 'r') as fd:
                if typ == 'json':
                    ret = self._read_metric_json(fd, xpath)
                elif typ == 'csv':
                    ret = self._read_metric_xsv(fd, xpath, ',')
                elif typ == 'tsv':
                    ret = self._read_metric_xsv(fd, xpath, '\t')
                elif typ == 'hcsv':
                    ret = self._read_metric_hxsv(fd, xpath, ',')
                elif typ == 'htsv':
                    ret = self._read_metric_hxsv(fd, xpath, '\t')
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def _find_output_by_path(self, path, outs=None):
        from dvc.exceptions import OutputDuplicationError

        if not outs:
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

        abs_path = os.path.abspath(path)
        matched = [out for out in outs if out.path == abs_path]
        stages = [out.stage.path for out in matched]
        if len(stages) > 1:
            raise OutputDuplicationError(path, stages)

        return matched[0] if matched else None

    def metrics_show(self,
                     path=None,
                     typ=None,
                     xpath=None,
                     all_branches=False):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches):
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

            if path:
                out = self._find_output_by_path(path, outs=outs)
                stage = out.stage.path if out else None
                if out and all([out.metric,
                                not typ,
                                isinstance(out.metric, dict)]):
                    entries = [(path,
                                out.metric.get(out.PARAM_METRIC_TYPE, None),
                                out.metric.get(out.PARAM_METRIC_XPATH, None))]
                else:
                    entries = [(path, typ, xpath)]
            else:
                metrics = filter(lambda o: o.metric, outs)
                stage = None
                entries = []
                for o in metrics:
                    if not typ and isinstance(o.metric, dict):
                        t = o.metric.get(o.PARAM_METRIC_TYPE, None)
                        x = o.metric.get(o.PARAM_METRIC_XPATH, None)
                    else:
                        t = typ
                        x = xpath
                    entries.append((o.path, t, x))

            for fname, t, x in entries:
                if stage:
                    self.checkout(stage)

                rel = os.path.relpath(fname)
                metric = self._read_metric(fname,
                                           typ=t,
                                           xpath=x)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            if all_branches:
                self.logger.info('{}:'.format(branch))
            for fname, metric in val.items():
                self.logger.info('\t{}: {}'.format(fname, metric))

        if res:
            return res

        if path:
            msg = 'File \'{}\' does not exist'.format(path)
        else:
            msg = 'No metric files in this repository. ' \
                  'Use \'dvc metrics add\' to add a metric file to track.'
        raise DvcException(msg)

    def _metrics_modify(self, path, typ=None, xpath=None, delete=False):
        out = self._find_output_by_path(path)
        if not out:
            msg = 'Unable to find file \'{}\' in the pipeline'.format(path)
            raise DvcException(msg)

        if out.path_info['scheme'] != 'local':
            msg = 'Output \'{}\' scheme \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.path, out.path_info['scheme']))

        if out.use_cache:
            msg = 'Cached output \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.rel_path))

        if typ:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_TYPE] = typ

        if xpath:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_XPATH] = xpath

        if delete:
            out.metric = None

        out._verify_metric()

        out.stage.dump()

    def metrics_modify(self, path=None, typ=None, xpath=None):
        self._metrics_modify(path, typ, xpath)

    def metrics_add(self, path, typ=None, xpath=None):
        if not typ:
            typ = 'raw'
        self._metrics_modify(path, typ, xpath)

    def metrics_remove(self, path):
        self._metrics_modify(path, delete=True)

    def graph(self):
        import networkx as nx

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = self.stages()

        outs = []
        for stage in stages:
            outs += stage.outs

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if out.path != dep.path \
                       and not dep.path.startswith(out.path + out.sep):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        return G, G_active

    def pipelines(self):
        import networkx as nx

        G, G_active = self.graph()

        if len(G.nodes()) == 0:
            return []

        # find pipeline ends aka "output stages"
        ends = [node for node, in_degree in G.in_degree() if in_degree == 0]

        # filter out subgraphs that didn't exist in original G
        pipelines = []
        for c in nx.weakly_connected_components(G_active):
            H = G_active.subgraph(c)
            found = False
            for node in ends:
                if node in H:
                    found = True
                    break
            if found:
                pipelines.append(H)

        return pipelines

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def active_stages(self):
        import networkx as nx

        stages = []
        for G in self.pipelines():
            stages.extend(list(nx.get_node_attributes(G, 'stage').values()))
        return stages
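The fragment above reads metric values through _read_metric, whose full implementation appears in Example #13 below: for csv/tsv files the xpath is a "col,row" address into the parsed table. A minimal standalone sketch of that addressing scheme (read_csv_metric and the sample data are hypothetical):

import csv
import io

def read_csv_metric(text, xpath, delimiter=","):
    # xpath is "col,row", mirroring _read_metric_xsv in Example #13
    col, row = xpath.split(",")
    reader = list(csv.reader(io.StringIO(text), delimiter=delimiter))
    return reader[int(row)][int(col)]

print(read_csv_metric("0.91,0.88\n0.93,0.90\n", "1,0"))  # -> 0.88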
Example #7
 def setUp(self):
     super(TestGitSubmoduleTree, self).setUp()
     self.scm = SCM(self._root_dir)
     self.tree = GitTree(self.git, "master")
     self._pushd(self._root_dir)
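pipelines() (shown in the fragment above and again in Example #13) splits the stage DAG into weakly connected components and keeps only the components that contain an "output stage", i.e. a node with in-degree 0. A minimal sketch of that logic with networkx; the stage names are hypothetical:

import networkx as nx

G = nx.DiGraph()
# edges point from a stage to the stage that produces its dependency
G.add_edge("B.dvc", "A.dvc")
G.add_edge("C.dvc", "B.dvc")
G.add_node("D.dvc")  # a separate single-stage pipeline

ends = [node for node, in_degree in G.in_degree() if in_degree == 0]
pipelines = [
    G.subgraph(c)
    for c in nx.weakly_connected_components(G)
    if any(node in c for node in ends)
]
print([sorted(p.nodes()) for p in pipelines])
# -> [['A.dvc', 'B.dvc', 'C.dvc'], ['D.dvc']]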
Example #8
File: test_scm.py Project: yk/dvc
 def test_git(self):
     Repo.init(os.curdir)
     self.assertIsInstance(SCM(self._root_dir), Git)
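test_git passes because SCM() is a factory: it inspects the directory and returns a Git wrapper once the project is under git. An illustrative sketch of that dispatch, not DVC's actual implementation:

import os

class NoSCM:
    def __init__(self, root_dir):
        self.root_dir = root_dir

class Git(NoSCM):
    pass

def SCM(root_dir, no_scm=False):
    # return a Git wrapper if a .git directory exists, else a no-op SCM
    if not no_scm and os.path.isdir(os.path.join(root_dir, ".git")):
        return Git(root_dir)
    return NoSCM(root_dir)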
Example #9
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.ls import ls
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.utils.fs import makedirs

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    @property
    def tree(self):
        return self._tree

    @tree.setter
    def tree(self, tree):
        self._tree = tree if isinstance(tree, CleanTree) else CleanTree(tree)
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "{}: '{}'".format(self.__class__.__name__, self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        root_dir = os.path.realpath(root or os.curdir)

        if not os.path.isdir(root_dir):
            raise NotDvcRepoError("directory '{}' does not exist".format(root))

        while True:
            dvc_dir = os.path.join(root_dir, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root_dir
            if os.path.ismount(root_dir):
                break
            root_dir = os.path.dirname(root_dir)

        message = ("you are not inside of a DVC repository "
                   "(checked up to mount point '{}')").format(root_dir)
        raise NotDvcRepoError(message)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force, subdir=subdir)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = ([self.config.files["local"], updater.updater_file] +
                 [self.lock.lockfile, updater.lock.lockfile, self.tmp_dir] +
                 self.state.files)

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        self._collect_graph(self.stages + new_stages)

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        import networkx as nx
        from dvc.stage import Stage

        G = graph or self.graph

        if not target:
            return list(G)

        target = os.path.abspath(target)

        if recursive and os.path.isdir(target):
            stages = nx.dfs_postorder_nodes(G)
            return [stage for stage in stages if path_isin(stage.path, target)]

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        pipeline = get_pipeline(get_pipelines(G), stage)
        return list(nx.dfs_postorder_nodes(pipeline, stage))

    def collect_granular(self, target, *args, **kwargs):
        if not target:
            return [(stage, None) for stage in self.stages]

        try:
            (out, ) = self.find_outs_by_path(target, strict=False)
            filter_info = PathInfo(os.path.abspath(target))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            stages = self.collect(target, *args, **kwargs)
            return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches`/`all_tags`/`all_commits` to expand the scope.

        Returns:
            A dictionary with Schemes (representing output's location) as keys,
            and a list with the outputs' `dumpd` as values.
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
                all_branches=all_branches,
                all_tags=all_tags,
                all_commits=all_commits,
        ):
            targets = targets or [None]

            pairs = cat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps)
                for target in targets)

            suffix = "({})".format(branch) if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        return cache

    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages used to build the graph; if None,
                collect all stages in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        stages = [stage for stage in stages if stage]
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in stages:
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                           "overlap. To avoid unpredictable behaviour, "
                           "rerun command with non overlapping outs paths."
                           ).format(
                               str(parent),
                               parent.stage.relpath,
                               str(overlapping),
                               overlapping.stage.relpath,
                           )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                overlapping = list(n.value for n in outs.prefixes(dep_key))
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        return self._collect_graph()

    @cached_property
    def pipelines(self):
        return get_pipelines(self.graph)

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related to
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                stages.append(stage)

                for out in stage.outs:
                    if out.scheme == "local":
                        outs.add(out.fspath)

            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]

        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        path = os.path.join(self.root_dir, relpath)
        (out, ) = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        cause = None
        try:
            out = self.find_out_by_relpath(path)
        except OutputNotFoundError as exc:
            out = None
            cause = exc

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as exc:
                raise FileMissingError(path) from exc

        abs_path = os.path.join(self.root_dir, path)
        if os.path.exists(abs_path):
            with open(abs_path, mode=mode, encoding=encoding) as fd:
                yield fd
            return

        raise FileMissingError(path) from cause

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return open(cache_file, mode=mode, encoding=encoding)

    def close(self):
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        return self._fetch(*args, **kwargs)

    def _reset(self):
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)
Example #10
 def setUp(self):
     super().setUp()
     self.scm = SCM(self._root_dir)
     self._pushd(self._root_dir)
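Both collect in Example #9 and _reproduce_stages in Example #13 walk the graph with nx.dfs_postorder_nodes, so a stage's dependencies are always visited before the stage itself. A minimal sketch (stage names are hypothetical):

import networkx as nx

G = nx.DiGraph()
G.add_edge("C.dvc", "B.dvc")  # C depends on B's output
G.add_edge("B.dvc", "A.dvc")  # B depends on A's output

# post-order from C yields dependencies first, which is the order
# reproduce/checkout process stages in
print(list(nx.dfs_postorder_nodes(G, "C.dvc")))
# -> ['A.dvc', 'B.dvc', 'C.dvc']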
Example #11
File: test_scm.py Project: yk/dvc
 def test_none(self):
     self.assertIsInstance(SCM(self._root_dir), NoSCM)
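Example #13 below also contains DVC's "unprotect" routine: a file that is hard- or symlinked into the cache is moved aside, re-materialized as a real copy, and made writable again. A minimal sketch of the same idea, handling only symlinks (DVC detects hardlinks too, via its System helper):

import os
import shutil
import stat
import uuid

def unprotect_file(path):
    if os.path.islink(path):
        # move the link out of the way, then copy the data back in
        # as a regular, private file
        tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))
        shutil.move(path, tmp)
        shutil.copyfile(tmp, path)
        os.remove(tmp)
    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)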
Example #12
File: __init__.py Project: surono24/dvc
    def __init__(self, root_dir=None, scm=None, rev=None):
        from dvc.state import State, StateNoop
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.plots import Plots
        from dvc.repo.params import Params
        from dvc.tree.local import LocalRemoteTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        if scm:
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = scm.get_tree(
                rev, use_dvcignore=True, dvcignore_root=self.root_dir
            )
            self.state = StateNoop()
        else:
            root_dir = self.find_root(root_dir)
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))
            self.tree = LocalRemoteTree(
                self,
                {"url": self.root_dir},
                use_dvcignore=True,
                dvcignore_root=self.root_dir,
            )

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir, tree=self.tree)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        self._ignore()
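Every version on this page locates the repository root the same way: start from a directory, walk upwards until one contains .dvc, and give up at the filesystem mount point (compare find_root in Example #9 and _find_root in Example #13). A minimal standalone sketch, with RuntimeError standing in for DVC's NotDvcRepoError:

import os

def find_root(root=None, dvc_dir=".dvc"):
    root_dir = os.path.realpath(root or os.curdir)
    while True:
        if os.path.isdir(os.path.join(root_dir, dvc_dir)):
            return root_dir
        if os.path.ismount(root_dir):
            break
        root_dir = os.path.dirname(root_dir)
    raise RuntimeError(
        "you are not inside of a DVC repository "
        "(checked up to mount point '{}')".format(root_dir)
    )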
Example #13
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir=None):
        from dvc.logger import logger
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.prompt import Prompt

        root_dir = self._find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, project=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config._config)

        core = self.config._config[Config.SECTION_CORE]
        self.logger = logger
        self.logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)
        self.prompt = Prompt()

        self._files_to_git_add = []

        self._ignore()

        self.updater.check()

    @staticmethod
    def _find_root(root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, Project.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcProjectError(root)

    @staticmethod
    def _find_dvc_dir(root=None):
        root_dir = Project._find_root(root)
        return os.path.join(root_dir, Project.DVC_DIR)

    def _remind_to_git_add(self):
        if len(self._files_to_git_add) == 0:
            return

        msg = '\nTo track the changes with git run:\n\n'
        msg += '\tgit add ' + " ".join(self._files_to_git_add)

        self.logger.info(msg)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """
        Initialize a DVC project in the given directory.

        Args:
            root_dir: Path to the project's root directory.
            no_scm: Initialize without a supported SCM tool.
            force: Overwrite an existing '.dvc' directory.

        Returns:
            Project instance.

        Raises:
            InitError: if the directory is not tracked by a supported
                SCM tool (unless no_scm is set), or if a '.dvc'
                directory already exists and force is not set.
        """
        import colorama
        import shutil
        from dvc.scm import SCM, Base
        from dvc.config import Config
        from dvc.logger import logger

        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git)."
            raise InitError(msg.format(root_dir))

        if os.path.isdir(dvc_dir):
            if not force:
                msg = "'{}' exists. Use '-f' to force."
                raise InitError(msg.format(os.path.relpath(dvc_dir)))
            shutil.rmtree(dvc_dir)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        logger.info('\nYou can now commit the changes to git.')

        logger.info(
            "\n"
            "{yellow}What's next?{nc}\n"
            "{yellow}------------{nc}\n"
            "- Check out the documentation: {blue}https://dvc.org/doc{nc}\n"
            "- Get help and share ideas: {blue}https://dvc.org/chat{nc}\n"
            "- Star us on GitHub: {blue}https://github.com/iterative/dvc{nc}"

            .format(yellow=colorama.Fore.YELLOW,
                    blue=colorama.Fore.BLUE,
                    green=colorama.Fore.GREEN,
                    nc=colorama.Fore.RESET)
        )

        return proj

    @staticmethod
    def load_all(projects_paths):
        """
        Instantiate all projects in the given list of paths.

        Args:
            projects_paths: List of paths to projects.

        Returns:
            List of Project instances in the same order as the given paths.
        """
        return [Project(path) for path in projects_paths]

    def destroy(self):
        import shutil

        for stage in self.stages():
            stage.remove()

        shutil.rmtree(self.dvc_dir)

    def _ignore(self):
        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            self.updater.updater_file,
            self.updater.lock.lock_file,
        ] + self.state.temp_files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def _check_cwd_specified_as_output(self, cwd, stages):
        from dvc.exceptions import WorkingDirectoryAsOutputError

        cwd_path = os.path.abspath(os.path.normpath(cwd))

        for stage in stages:
            for output in stage.outs:
                if os.path.isdir(output.path) and output.path == cwd_path:
                    raise WorkingDirectoryAsOutputError(cwd, stage.relpath)

    def _check_output_duplication(self, outs, stages):
        from dvc.exceptions import OutputDuplicationError

        for stage in stages:
            for o in stage.outs:
                for out in outs:
                    if o.path == out.path and o.stage.path != out.stage.path:
                        stages = [o.stage.relpath, out.stage.relpath]
                        raise OutputDuplicationError(o.path, stages)

    def _check_circular_dependency(self, deps, outs):
        from dvc.exceptions import CircularDependencyError

        circular_dependencies = (
            set(file.path for file in deps) &
            set(file.path for file in outs)
        )

        if circular_dependencies:
            raise CircularDependencyError(circular_dependencies.pop())

    def add(self, fname, recursive=False):
        from dvc.stage import Stage

        fnames = []
        if recursive and os.path.isdir(fname):
            for root, dirs, files in os.walk(fname):
                for f in files:
                    path = os.path.join(root, f)
                    if Stage.is_stage_file(path):
                        continue
                    if os.path.basename(path) == self.scm.ignore_file():
                        continue
                    if self.scm.is_tracked(path):
                        continue
                    fnames.append(path)
        else:
            fnames = [fname]

        all_stages = self.stages()
        stages = []
        self._files_to_git_add = []
        with self.state:
            for f in fnames:
                stage = Stage.loads(project=self,
                                    outs=[f],
                                    add=True)

                if stage is None:
                    stages.append(stage)
                    continue

                self._check_output_duplication(stage.outs, all_stages)

                stage.save()
                stage.dump()
                stages.append(stage)

        self._remind_to_git_add()

        return stages

    def remove(self, target, outs_only=False):
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        stage.locked = not unlock
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        import dvc.output as Output
        from dvc.stage import Stage

        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        found = False
        self._files_to_git_add = []
        with self.state:
            for stage in self.stages():
                for out in stage.outs:
                    if out.path != from_out.path:
                        continue

                    if not stage.is_data_source:
                        raise MoveNotDataSourceError(stage.relpath)

                    found = True
                    to_out = Output.loads_from(out.stage,
                                               [to_path],
                                               out.cache,
                                               out.metric)[0]
                    out.move(to_out)

                    stage_base = os.path.basename(stage.path)
                    stage_base = stage_base.rstrip(Stage.STAGE_FILE_SUFFIX)

                    stage_dir = os.path.dirname(stage.path)
                    from_base = os.path.basename(from_path)
                    to_base = os.path.basename(to_path)
                    if stage_base == from_base:
                        os.unlink(stage.path)
                        path = to_base + Stage.STAGE_FILE_SUFFIX
                        stage.path = os.path.join(stage_dir, path)

                stage.dump()

        self._remind_to_git_add()

        if not found:
            msg = 'Unable to find dvcfile with output \'{}\''
            raise DvcException(msg.format(from_path))

    def _unprotect_file(self, path):
        import stat
        import uuid
        from dvc.system import System
        from dvc.utils import copyfile, move, remove

        if System.is_symlink(path) or System.is_hardlink(path):
            self.logger.debug("Unprotecting '{}'".format(path))

            tmp = os.path.join(os.path.dirname(path), '.' + str(uuid.uuid4()))
            move(path, tmp)

            copyfile(tmp, path)

            remove(tmp)
        else:
            self.logger.debug("Skipping copying for '{}', since it is not "
                              "a symlink or a hardlink.".format(path))

        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

    def _unprotect_dir(self, path):
        for root, dirs, files in os.walk(path):
            for f in files:
                path = os.path.join(root, f)
                self._unprotect_file(path)

    def unprotect(self, path):
        if not os.path.exists(path):
            raise DvcException("Can't unprotect non-existing "
                               "data '{}'".format(path))

        if os.path.isdir(path):
            self._unprotect_dir(path)
        else:
            self._unprotect_file(path)

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            metrics_no_cache=[],
            fname=None,
            cwd=os.curdir,
            no_exec=False,
            overwrite=False,
            ignore_build_cache=False,
            remove_outs=False):
        from dvc.stage import Stage

        with self.state:
            stage = Stage.loads(project=self,
                                fname=fname,
                                cmd=cmd,
                                cwd=cwd,
                                outs=outs,
                                outs_no_cache=outs_no_cache,
                                metrics_no_cache=metrics_no_cache,
                                deps=deps,
                                overwrite=overwrite,
                                ignore_build_cache=ignore_build_cache,
                                remove_outs=remove_outs)

        if stage is None:
            return None

        all_stages = self.stages()

        self._check_cwd_specified_as_output(cwd, all_stages)
        self._check_output_duplication(stage.outs, all_stages)
        self._check_circular_dependency(stage.deps, stage.outs)

        self._files_to_git_add = []
        with self.state:
            if not no_exec:
                stage.run()

        stage.dump()

        self._remind_to_git_add()

        return stage

    def imp(self, url, out):
        from dvc.stage import Stage

        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[url],
                            outs=[out])

        if stage is None:
            return None

        self._check_output_duplication(stage.outs, self.stages())
        self._check_circular_dependency(stage.deps, stage.outs)

        self._files_to_git_add = []
        with self.state:
            stage.run()

        stage.dump()

        self._remind_to_git_add()

        return stage

    def _reproduce_stage(self, stages, node, force, dry, interactive):
        stage = stages[node]

        if stage.locked:
            msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                  'going to be reproduced.'
            self.logger.warn(msg.format(stage.relpath))

        stage = stage.reproduce(force=force, dry=dry, interactive=interactive)
        if not stage:
            return []

        if not dry:
            stage.dump()

        return [stage]

    def reproduce(self,
                  target=None,
                  recursive=True,
                  force=False,
                  dry=False,
                  interactive=False,
                  pipeline=False,
                  all_pipelines=False):
        from dvc.stage import Stage

        if target is None and not all_pipelines:
            raise ValueError()

        if not interactive:
            config = self.config
            core = config._config[config.SECTION_CORE]
            interactive = core.get(config.SECTION_CORE_INTERACTIVE, False)

        targets = []
        if pipeline or all_pipelines:
            if pipeline:
                stage = Stage.load(self, target)
                node = os.path.relpath(stage.path, self.root_dir)
                pipelines = [self._get_pipeline(node)]
            else:
                pipelines = self.pipelines()

            for G in pipelines:
                for node in G.nodes():
                    if G.in_degree(node) == 0:
                        targets.append(os.path.join(self.root_dir, node))
        else:
            targets.append(target)

        self._files_to_git_add = []

        ret = []
        with self.state:
            for target in targets:
                stages = self._reproduce(target,
                                         recursive=recursive,
                                         force=force,
                                         dry=dry,
                                         interactive=interactive)
                ret.extend(stages)

        self._remind_to_git_add()

        return ret

    def _reproduce(self,
                   target,
                   recursive=True,
                   force=False,
                   dry=False,
                   interactive=False):
        import networkx as nx
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        G = self.graph()[1]
        stages = nx.get_node_attributes(G, 'stage')
        node = os.path.relpath(stage.path, self.root_dir)

        if recursive:
            ret = self._reproduce_stages(G,
                                         stages,
                                         node,
                                         force,
                                         dry,
                                         interactive)
        else:
            ret = self._reproduce_stage(stages,
                                        node,
                                        force,
                                        dry,
                                        interactive)

        return ret

    def _reproduce_stages(self, G, stages, node, force, dry, interactive):
        import networkx as nx

        result = []
        for n in nx.dfs_postorder_nodes(G, node):
            try:
                result += self._reproduce_stage(stages,
                                                n,
                                                force,
                                                dry,
                                                interactive)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _cleanup_unused_links(self, all_stages):
        used = []
        for stage in all_stages:
            for out in stage.outs:
                used.append(out.path)
        self.state.remove_unused_links(used)

    def checkout(self, target=None, with_deps=False, force=False):
        all_stages = self.active_stages()
        stages = all_stages

        if target:
            stages = self._collect(target, with_deps=with_deps)

        with self.state:
            self._cleanup_unused_links(all_stages)

            for stage in stages:
                if stage.locked:
                    msg = 'DVC file \'{}\' is locked. Its dependencies are ' \
                          'not going to be checked out.'
                    self.logger.warn(msg.format(stage.relpath))

                stage.checkout(force=force)

    def _get_pipeline(self, node):
        pipelines = list(filter(lambda g: node in g.nodes(),
                                self.pipelines()))
        assert len(pipelines) == 1
        return pipelines[0]

    def _collect(self, target, with_deps=False):
        import networkx as nx
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)
        stages = nx.get_node_attributes(G, 'stage')

        ret = [stage]
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(stages[n])

        return ret

    def _collect_dir_cache(self,
                           out,
                           branch=None,
                           remote=None,
                           force=False,
                           jobs=None):
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_MD5]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(ret,
                                jobs=jobs,
                                remote=remote,
                                show_checksums=False)
            except DvcException as exc:
                msg = "Failed to pull cache for '{}': {}"
                self.logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = "Missing cache for directory '{}'. " \
                  "Cache for files inside will be lost. " \
                  "Would you like to continue? Use '-f' to force. "
            if not (force or self.prompt.prompt(msg, False)):
                raise DvcException("Unable to fully collect "
                                   "used cache without cache "
                                   "for directory '{}'".format(out))
            else:
                return ret

        for i in self.cache.local.load_dir_cache(md5):
            i['branch'] = branch
            i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                           i[r.PARAM_RELPATH])
            ret.append(i)

        return ret

    def _collect_used_cache(self,
                            out,
                            branch=None,
                            remote=None,
                            force=False,
                            jobs=None):
        if not out.use_cache or not out.info:
            return []

        info = out.dumpd()
        info['branch'] = branch
        ret = [info]

        if out.path_info['scheme'] != 'local':
            return ret

        md5 = info[out.remote.PARAM_MD5]
        cache = self.cache.local.get(md5)
        if not out.remote.is_dir_cache(cache):
            return ret

        return self._collect_dir_cache(out,
                                       branch=branch,
                                       remote=remote,
                                       force=force,
                                       jobs=jobs)

    def _used_cache(self,
                    target=None,
                    all_branches=False,
                    active=True,
                    with_deps=False,
                    all_tags=False,
                    remote=None,
                    force=False,
                    jobs=None):
        cache = {}
        cache['local'] = []
        cache['s3'] = []
        cache['gs'] = []
        cache['hdfs'] = []
        cache['ssh'] = []
        cache['azure'] = []

        for branch in self.scm.brancher(all_branches=all_branches,
                                        all_tags=all_tags):
            if target:
                stages = self._collect(target,
                                       with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    msg = 'DVC file \'{}\' is locked. Its dependencies are ' \
                          'not going to be pushed/pulled/fetched.'
                    self.logger.warn(msg.format(stage.relpath))

                for out in stage.outs:
                    scheme = out.path_info['scheme']
                    cache[scheme] += self._collect_used_cache(out,
                                                              branch=branch,
                                                              remote=remote,
                                                              force=force,
                                                              jobs=jobs)

        return cache

    @staticmethod
    def merge_cache_lists(clists):
        merged_cache = collections.defaultdict(list)

        for cache_list in clists:
            for scheme, cache in cache_list.items():
                for item in cache:
                    if item not in merged_cache[scheme]:
                        merged_cache[scheme].append(item)

        return merged_cache

    @staticmethod
    def load_all_used_cache(projects,
                            target=None,
                            all_branches=False,
                            active=True,
                            with_deps=False,
                            all_tags=False,
                            remote=None,
                            force=False,
                            jobs=None):
        clists = []

        for project in projects:
            with project.state:
                project_clist = project._used_cache(target=None,
                                                    all_branches=all_branches,
                                                    active=False,
                                                    with_deps=with_deps,
                                                    all_tags=all_tags,
                                                    remote=remote,
                                                    force=force,
                                                    jobs=jobs)

                clists.append(project_clist)

        return clists

    def _do_gc(self, typ, func, clist):
        removed = func(clist)
        if not removed:
            self.logger.info("No unused {} cache to remove.".format(typ))

    def gc(self,
           all_branches=False,
           cloud=False,
           remote=None,
           with_deps=False,
           all_tags=False,
           force=False,
           jobs=None,
           projects=None):

        all_projects = [self]

        if projects is not None and len(projects) > 0:
            all_projects.extend(Project.load_all(projects))

        all_clists = Project.load_all_used_cache(all_projects,
                                                 target=None,
                                                 all_branches=all_branches,
                                                 active=False,
                                                 with_deps=with_deps,
                                                 all_tags=all_tags,
                                                 remote=remote,
                                                 force=force,
                                                 jobs=jobs)

        if len(all_clists) > 1:
            clist = Project.merge_cache_lists(all_clists)
        else:
            clist = all_clists[0]

        with self.state:
            self._do_gc('local', self.cache.local.gc, clist)

            if self.cache.s3:
                self._do_gc('s3', self.cache.s3.gc, clist)

            if self.cache.gs:
                self._do_gc('gs', self.cache.gs.gc, clist)

            if self.cache.ssh:
                self._do_gc('ssh', self.cache.ssh.gc, clist)

            if self.cache.hdfs:
                self._do_gc('hdfs', self.cache.hdfs.gc, clist)

            if self.cache.azure:
                self._do_gc('azure', self.cache.azure.gc, clist)

            if cloud:
                self._do_gc('remote', self.cloud._get_cloud(remote,
                                                            'gc -c').gc, clist)

    def push(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False,
             with_deps=False,
             all_tags=False):
        with self.state:
            used = self._used_cache(target,
                                    all_branches=all_branches,
                                    all_tags=all_tags,
                                    with_deps=with_deps,
                                    force=True,
                                    remote=remote,
                                    jobs=jobs)['local']
            self.cloud.push(used,
                            jobs,
                            remote=remote,
                            show_checksums=show_checksums)

    def fetch(self,
              target=None,
              jobs=1,
              remote=None,
              all_branches=False,
              show_checksums=False,
              with_deps=False,
              all_tags=False):
        with self.state:
            used = self._used_cache(target,
                                    all_branches=all_branches,
                                    all_tags=all_tags,
                                    with_deps=with_deps,
                                    force=True,
                                    remote=remote,
                                    jobs=jobs)['local']
            self.cloud.pull(used,
                            jobs,
                            remote=remote,
                            show_checksums=show_checksums)

    def pull(self,
             target=None,
             jobs=1,
             remote=None,
             all_branches=False,
             show_checksums=False,
             with_deps=False,
             all_tags=False,
             force=False):
        self.fetch(target,
                   jobs,
                   remote=remote,
                   all_branches=all_branches,
                   all_tags=all_tags,
                   show_checksums=show_checksums,
                   with_deps=with_deps)
        self.checkout(target=target, with_deps=with_deps, force=force)

    def _local_status(self, target=None, with_deps=False):
        status = {}

        if target:
            stages = self._collect(target,
                                   with_deps=with_deps)
        else:
            stages = self.active_stages()

        for stage in stages:
            if stage.locked:
                msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \
                      'going to be shown in the status output.'
                self.logger.warn(msg.format(stage.relpath))

            status.update(stage.status())

        return status

    def _cloud_status(self,
                      target=None,
                      jobs=1,
                      remote=None,
                      show_checksums=False,
                      all_branches=False,
                      with_deps=False,
                      all_tags=False):
        import dvc.remote.base as cloud

        used = self._used_cache(target,
                                all_branches=all_branches,
                                all_tags=all_tags,
                                with_deps=with_deps,
                                force=True,
                                remote=remote,
                                jobs=jobs)['local']

        status = {}
        for md5, ret in self.cloud.status(used,
                                          jobs,
                                          remote=remote,
                                          show_checksums=show_checksums):
            if ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_NEW: 'new',
            }

            status[md5] = prefix_map[ret]

        return status

    def status(self,
               target=None,
               jobs=1,
               cloud=False,
               remote=None,
               show_checksums=False,
               all_branches=False,
               with_deps=False,
               all_tags=False):
        with self.state:
            if cloud:
                return self._cloud_status(target,
                                          jobs,
                                          remote=remote,
                                          show_checksums=show_checksums,
                                          all_branches=all_branches,
                                          with_deps=with_deps,
                                          all_tags=all_tags)
            return self._local_status(target,
                                      with_deps=with_deps)

    def _read_metric_json(self, fd, json_path):
        import json
        from jsonpath_rw import parse

        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_xsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_hxsv(self, fd, hxsv_path, delimiter):
        import csv

        col, row = hxsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric_xsv(self, fd, xsv_path, delimiter):
        import csv

        col, row = xsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter=delimiter))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric(self, path, typ=None, xpath=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, 'r') as fd:
                if typ == 'json':
                    ret = self._read_metric_json(fd, xpath)
                elif typ == 'csv':
                    ret = self._read_metric_xsv(fd, xpath, ',')
                elif typ == 'tsv':
                    ret = self._read_metric_xsv(fd, xpath, '\t')
                elif typ == 'hcsv':
                    ret = self._read_metric_hxsv(fd, xpath, ',')
                elif typ == 'htsv':
                    ret = self._read_metric_hxsv(fd, xpath, '\t')
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def _find_output_by_path(self, path, outs=None):
        from dvc.exceptions import OutputDuplicationError

        if not outs:
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

        abs_path = os.path.abspath(path)
        matched = [out for out in outs if out.path == abs_path]
        stages = [out.stage.relpath for out in matched]
        if len(stages) > 1:
            raise OutputDuplicationError(path, stages)

        return matched[0] if matched else None

    def metrics_show(self,
                     path=None,
                     typ=None,
                     xpath=None,
                     all_branches=False,
                     all_tags=False):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches,
                                        all_tags=all_tags):
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

            if path:
                out = self._find_output_by_path(path, outs=outs)
                stage = out.stage.path if out else None
                if out and all([out.metric,
                                not typ,
                                isinstance(out.metric, dict)]):
                    entries = [(path,
                                out.metric.get(out.PARAM_METRIC_TYPE, None),
                                out.metric.get(out.PARAM_METRIC_XPATH, None))]
                else:
                    entries = [(path, typ, xpath)]
            else:
                metrics = filter(lambda o: o.metric, outs)
                stage = None
                entries = []
                for o in metrics:
                    if not typ and isinstance(o.metric, dict):
                        t = o.metric.get(o.PARAM_METRIC_TYPE, typ)
                        x = o.metric.get(o.PARAM_METRIC_XPATH, xpath)
                    else:
                        t = typ
                        x = xpath
                    entries.append((o.path, t, x))

            for fname, t, x in entries:
                if stage:
                    self.checkout(stage, force=True)

                rel = os.path.relpath(fname)
                metric = self._read_metric(fname,
                                           typ=t,
                                           xpath=x)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            if all_branches or all_tags:
                self.logger.info('{}:'.format(branch))
            for fname, metric in val.items():
                self.logger.info('\t{}: {}'.format(fname, metric))

        if res:
            return res

        if path:
            msg = 'File \'{}\' does not exist'.format(path)
        else:
            msg = 'No metric files in this repository. ' \
                  'Use \'dvc metrics add\' to add a metric file to track.'
        raise DvcException(msg)

    def _metrics_modify(self, path, typ=None, xpath=None, delete=False):
        out = self._find_output_by_path(path)
        if not out:
            msg = 'Unable to find file \'{}\' in the pipeline'.format(path)
            raise DvcException(msg)

        if out.path_info['scheme'] != 'local':
            msg = 'Output \'{}\' scheme \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.path, out.path_info['scheme']))

        if out.use_cache:
            msg = 'Cached output \'{}\' is not supported for metrics'
            raise DvcException(msg.format(out.rel_path))

        if typ:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_TYPE] = typ

        if xpath:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_XPATH] = xpath

        if delete:
            out.metric = None

        out._verify_metric()

        out.stage.dump()

    def metrics_modify(self, path=None, typ=None, xpath=None):
        self._metrics_modify(path, typ, xpath)

    def metrics_add(self, path, typ=None, xpath=None):
        if not typ:
            typ = 'raw'
        self._metrics_modify(path, typ, xpath)

    def metrics_remove(self, path):
        self._metrics_modify(path, delete=True)

    def graph(self):
        import networkx as nx
        from dvc.exceptions import OutputDuplicationError

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = self.stages()

        outs = []
        outs_by_path = {}
        for stage in stages:
            for o in stage.outs:
                existing = outs_by_path.get(o.path, None)
                if existing is not None:
                    stages = [o.stage.relpath, existing.stage.relpath]
                    raise OutputDuplicationError(o.path, stages)
                outs.append(o)
                outs_by_path[o.path] = o

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if out.path != dep.path \
                       and not dep.path.startswith(out.path + out.sep) \
                       and not out.path.startswith(dep.path + dep.sep):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        return G, G_active

    def pipelines(self):
        import networkx as nx

        G, G_active = self.graph()

        if len(G.nodes()) == 0:
            return []

        # find pipeline ends aka "output stages"
        ends = [node for node, in_degree in G.in_degree() if in_degree == 0]

        # filter out subgraphs that didn't exist in original G
        pipelines = []
        for c in nx.weakly_connected_components(G_active):
            H = G_active.subgraph(c)
            found = False
            for node in ends:
                if node in H:
                    found = True
                    break
            if found:
                pipelines.append(H)

        return pipelines

    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`) or DVC itself (`.dvc`).

        NOTE: For large projects, this could be an expensive
              operation.  Consider using some memorization.
        """
        from dvc.stage import Stage

        stages = []
        outs = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    outs.append(out.path + out.sep)
                stages.append(stage)

            def filter_dirs(dname):
                path = os.path.join(root, dname)
                if path == self.dvc_dir or path == self.scm.dir:
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))

        return stages

    def active_stages(self):
        import networkx as nx

        stages = []
        for G in self.pipelines():
            stages.extend(list(nx.get_node_attributes(G, 'stage').values()))
        return stages
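
For orientation, a minimal usage sketch for the metrics API above (not part of the scraped source; the instantiation, file name, type and xpath are hypothetical):

repo = Repo()                          # locate the enclosing repository
repo.metrics_add('eval.json', typ='json', xpath='train.accuracy')

# metrics_show() returns {branch: {relative metric path: value}}:
res = repo.metrics_show(path='eval.json', all_branches=True)
for branch, metrics in res.items():
    print(branch, metrics)
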
Example #14
0
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher

    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.repo.pkg import Pkg

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.tree = WorkingTree(self.root_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.metrics = Metrics(self)
        self.tag = Tag(self)
        self.pkg = Pkg(self)

        self._ignore()

        self.updater.check()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @staticmethod
    def find_root(root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, Repo.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @staticmethod
    def find_dvc_dir(root=None):
        root_dir = Repo.find_root(root)
        return os.path.join(root_dir, Repo.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        path_info = {"schema": "local", "path": target}
        return self.cache.local.unprotect(path_info)

    def _ignore(self):
        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            self.updater.updater_file,
            self.updater.lock.lock_file,
        ] + self.state.temp_files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    @staticmethod
    def _check_cyclic_graph(graph):
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def _get_pipeline(self, node):
        pipelines = [i for i in self.pipelines() if i.has_node(node)]
        assert len(pipelines) == 1
        return pipelines[0]

    def collect(self, target, with_deps=False, recursive=False):
        import networkx as nx
        from dvc.stage import Stage

        if not target or recursive:
            return self.active_stages(target)

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)

        ret = []
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(G.node[n]["stage"])

        return ret

    def _collect_dir_cache(self,
                           out,
                           branch=None,
                           remote=None,
                           force=False,
                           jobs=None):
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_CHECKSUM]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(ret,
                                jobs=jobs,
                                remote=remote,
                                show_checksums=False)
            except DvcException as exc:
                msg = "Failed to pull cache for '{}': {}"
                logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = ("Missing cache for directory '{}'. "
                   "Cache for files inside will be lost. "
                   "Would you like to continue? Use '-f' to force. ")
            if not force and not prompt.confirm(msg.format(out)):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(out))
            else:
                return ret

        for i in out.dir_cache:
            i["branch"] = branch
            i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                           i[r.PARAM_RELPATH])
            ret.append(i)

        return ret

    def _collect_used_cache(self,
                            out,
                            branch=None,
                            remote=None,
                            force=False,
                            jobs=None):
        if not out.use_cache or not out.info:
            if not out.info:
                logger.warning("Output '{}'({}) is missing version "
                               "info. Cache for it will not be collected. "
                               "Use dvc repro to get your pipeline up to "
                               "date.".format(out, out.stage))
            return []

        info = out.dumpd()
        info["branch"] = branch
        ret = [info]

        if out.scheme != "local":
            return ret

        if not out.is_dir_checksum:
            return ret

        return self._collect_dir_cache(out,
                                       branch=branch,
                                       remote=remote,
                                       force=force,
                                       jobs=jobs)

    def used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.brancher(all_branches=all_branches,
                                    all_tags=all_tags):
            if target:
                if recursive:
                    stages = self.stages(target)
                else:
                    stages = self.collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath))

                for out in stage.outs:
                    scheme = out.path_info["scheme"]
                    cache[scheme].extend(
                        self._collect_used_cache(
                            out,
                            branch=branch,
                            remote=remote,
                            force=force,
                            jobs=jobs,
                        ))

        return cache

    def graph(self, stages=None, from_directory=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, use the ones
                on the `from_directory`.

            from_directory (str): directory where to look at for stages, if
                None is given, use the current working directory

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = []
                for o in outs:
                    if o.path == out.path:
                        existing.append(o.stage)

                    in_o_dir = out.path.startswith(o.path + o.sep)
                    in_out_dir = o.path.startswith(out.path + out.sep)
                    if in_o_dir or in_out_dir:
                        raise OverlappingOutputPathsError(o, out)

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(out.path, stages)

                outs.append(out)

        for stage in stages:
            path_dir = os.path.dirname(stage.path) + os.sep
            for out in outs:
                if path_dir.startswith(out.path + os.sep):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if (out.path != dep.path
                            and not dep.path.startswith(out.path + out.sep)
                            and not out.path.startswith(dep.path + dep.sep)):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        import networkx as nx

        G, G_active = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    def stages(self, from_directory=None, check_dag=True):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related to
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`).

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir
        elif not os.path.isdir(from_directory):
            raise TargetNotDirectoryError(from_directory)

        stages = []
        outs = []

        ignore_file_handler = DvcIgnoreFileHandler(self.tree)
        for root, dirs, files in self.tree.walk(
                from_directory, ignore_file_handler=ignore_file_handler):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    outs.append(out.path + out.sep)
                stages.append(stage)

            def filter_dirs(dname, root=root):
                path = os.path.join(root, dname)
                if path in (self.dvc_dir, self.scm.dir):
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))

        if check_dag:
            self.check_dag(stages)

        return stages

    def active_stages(self, from_directory=None):
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False):
        if not outs:
            # there is no `from_directory=path` argument because some data
            # files might be generated in a directory above `path`, so we
            # need to look at every stage in the project (root_dir)
            stages = self.stages()
            outs = [out for stage in stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.path == abs_path:
                return True

            if is_dir and recursive and out.path.startswith(abs_path + os.sep):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts
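
As a quick illustration (not from the original source), the graph returned by graph() above is a plain networkx DiGraph whose nodes are stage paths relative to the repo root, so it can be traversed directly; the repository and its stages are whatever happens to exist on disk:

import networkx as nx

repo = Repo()
G, G_active = repo.graph()

# An edge points from a stage to the stage producing one of its deps,
# so a DFS postorder visits producers before their consumers.
stages = nx.get_node_attributes(G, 'stage')
for node in nx.dfs_postorder_nodes(G):
    print(node, stages[node])
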
Example #15
0
File: __init__.py Project: nik123/dvc
    def __init__(
        self,
        root_dir=None,
        fs=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
        config=None,
        url=None,
        repo_factory=None,
    ):
        from dvc.config import Config
        from dvc.data.db import ODBManager
        from dvc.data_cloud import DataCloud
        from dvc.fs.git import GitFileSystem
        from dvc.fs.local import localfs
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.live import Live
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop

        self.url = url
        self._fs_conf = {"repo_factory": repo_factory}
        self._fs = fs or localfs
        self._scm = None

        if rev and not fs:
            self._scm = SCM(root_dir or os.curdir)
            self._fs = GitFileSystem(scm=self._scm, rev=rev)

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, fs=self.fs, uninitialized=uninitialized)

        self.config = Config(self.dvc_dir, fs=self.fs, config=config)
        self._uninitialized = uninitialized

        # used by RepoFileSystem to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        if isinstance(self.fs, GitFileSystem) or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
            self.odb = ODBManager(self)
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            state_db_dir = self._get_database_dir("state")
            self.state = State(self.root_dir, state_db_dir, self.dvcignore)
            self.odb = ODBManager(self)

            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        self.live = Live(self)

        self.stage_collection_error_handler: Optional[Callable[
            [str, Exception], None]] = None
        self._lock_depth = 0
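
A hedged sketch of how this constructor is meant to be driven: passing rev without an explicit fs makes the repo read everything through a GitFileSystem pinned to that revision (the revision below is hypothetical):

# Working-tree view of the repository:
repo = Repo('.')

# Read-only view pinned to a Git revision; per __init__ above, locking
# and state become no-ops (LockNoop/StateNoop) in this mode:
old = Repo('.', rev='HEAD~1')
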
Example #16
0
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir)

        self.tree = WorkingTree(self.root_dir)

        self.lock = Lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
        )
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    @property
    def tree(self):
        return self._tree

    @tree.setter
    def tree(self, tree):
        self._tree = tree
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = (
            [self.config.config_local_file, updater.updater_file]
            + self.state.files
            + self.lock.files
            + updater.lock.files
        )

        if self.cache.local.cache_dir.startswith(self.root_dir + os.sep):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        self._collect_graph(self.stages + new_stages)

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        import networkx as nx
        from dvc.stage import Stage

        G = graph or self.graph

        if not target:
            return get_stages(G)

        target = os.path.abspath(target)

        if recursive and os.path.isdir(target):
            attrs = nx.get_node_attributes(G, "stage")
            nodes = [node for node in nx.dfs_postorder_nodes(G)]

            ret = []
            for node in nodes:
                stage = attrs[node]
                if stage.path.startswith(target + os.sep):
                    ret.append(stage)
            return ret

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = relpath(stage.path, self.root_dir)
        pipeline = get_pipeline(get_pipelines(G), node)

        return [
            pipeline.node[n]["stage"]
            for n in nx.dfs_postorder_nodes(pipeline, node)
        ]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A NamedCache instance with the collected cache, grouped by
            the outputs' scheme.
        """
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        ):
            if targets:
                stages = []
                for target in targets:
                    collected = self.collect(
                        target, recursive=recursive, with_deps=with_deps
                    )
                    stages.extend(collected)
            else:
                stages = self.stages

            for stage in stages:
                if stage.is_repo_import:
                    dep, = stage.deps
                    cache.external[dep.repo_pair].add(dep.def_path)
                    continue

                for out in stage.outs:
                    used_cache = out.get_used_cache(
                        remote=remote, force=force, jobs=jobs
                    )
                    suffix = "({})".format(branch) if branch else ""
                    cache.update(used_cache, suffix=suffix)

        return cache

    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.collect_stages()
        stages = [stage for stage in stages if stage]
        outs = {}

        for stage in stages:
            for out in stage.outs:
                if out.path_info in outs:
                    stages = [stage.relpath, outs[out.path_info].stage.relpath]
                    raise OutputDuplicationError(str(out), stages)
                outs[out.path_info] = out

        for stage in stages:
            for out in stage.outs:
                for p in out.path_info.parents:
                    if p in outs:
                        raise OverlappingOutputPathsError(outs[p], out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for p in chain([stage_path_info], stage_path_info.parents):
                if p in outs:
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                for out in outs:
                    if (
                        out == dep.path_info
                        or dep.path_info.isin(out)
                        or out.isin(dep.path_info)
                    ):
                        dep_stage = outs[out].stage
                        dep_node = relpath(dep_stage.path, self.root_dir)
                        G.add_node(dep_node, stage=dep_stage)
                        G.add_edge(node, dep_node)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        return self._collect_graph()

    @cached_property
    def pipelines(self):
        return get_pipelines(self.graph)

    @staticmethod
    def _filter_out_dirs(dirs, outs, root_dir):
        def filter_dirs(dname):
            path = os.path.join(root_dir, dname)
            for out in outs:
                if path == os.path.normpath(out):
                    return False
            return True

        return list(filter(filter_dirs, dirs))

    def collect_stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related to
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`).

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        stages = []
        outs = []

        for root, dirs, files in self.tree.walk(
            self.root_dir, dvcignore=self.dvcignore
        ):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            dirs[:] = self._filter_out_dirs(dirs, outs, root)

        return stages

    @cached_property
    def stages(self):
        return get_stages(self.graph)

    def find_outs_by_path(self, path, outs=None, recursive=False):
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.scheme == "local" and out.fspath == abs_path:
                return True

            if is_dir and recursive and out.path_info.isin(abs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def find_out_by_relpath(self, relpath):
        path = os.path.join(self.root_dir, relpath)
        out, = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        cause = None
        try:
            out, = self.find_outs_by_path(path)
        except OutputNotFoundError as e:
            out = None
            cause = e

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as e:
                raise FileMissingError(relpath(path, self.root_dir), cause=e)

        if self.tree.exists(path):
            with self.tree.open(path, mode, encoding) as fd:
                yield fd
            return

        raise FileMissingError(relpath(path, self.root_dir), cause=cause)

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return _open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return _open(cache_file, mode=mode, encoding=encoding)

    @cached_property
    def dvcignore(self):
        return DvcIgnoreFilter(self.root_dir, self.tree)

    def close(self):
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        return self._fetch(*args, **kwargs)

    def _reset(self):
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)
Example #17
0
File: test_tree.py Project: bgheneti/dvc
    def setUp(self):
        super().setUp()
        self.scm = SCM(self._root_dir)
        self.tree = GitTree(self.git, "master")
        self._pushd(self._root_dir)
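
For context, a hedged sketch of the kind of test this setUp enables; it assumes the exists()/open() methods of the tree interface used by the other examples here, and the file name is hypothetical:

    def test_open(self):
        # The tree reflects the repository as committed on "master",
        # not the current working tree.
        self.assertTrue(self.tree.exists('code.py'))
        with self.tree.open('code.py') as fd:
            self.assertNotEqual(fd.read(), '')
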
Example #18
0
def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """
    Creates an empty repo in the given directory -- basically a
    `.dvc` directory with subdirectories for configuration and cache.

    The directory should be tracked by an SCM, or the `--no-scm` flag
    should be used.

    If the given directory is not empty, you must use the `--force`
    flag to override it.

    Args:
        root_dir: Path to repo's root directory.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: both `no_scm` and `subdir` were given.
        InitError: the directory is not tracked by a supported SCM tool,
            `.dvc` is ignored by the SCM, or `.dvc` already exists and
            `force` was not given.
    """

    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`")

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        raise InitError(
            f"{root_dir} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository.")

    if scm.is_ignored(dvc_dir):
        raise InitError(f"{dvc_dir} is ignored by your SCM tool. \n"
                        "Make sure that it's tracked, "
                        "for example, by adding '!.dvc' to .gitignore.")

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError(f"'{relpath(dvc_dir)}' exists. Use `-f` to force.")

        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    proj.plots.templates.init()

    with proj.scm_context(autostage=True) as context:
        files = [
            config.files["repo"],
            dvcignore,
            proj.plots.templates.templates_dir,
        ]
        ignore_file = context.scm.ignore_file
        if ignore_file:
            files.extend([os.path.join(dvc_dir, ignore_file)])
        proj.scm_context.track_file(files)

    logger.info("Initialized DVC repository.\n")
    if not no_scm:
        logger.info("You can now commit the changes to git.\n")
    return proj
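
A minimal sketch of driving init() above (not part of the source; all paths are hypothetical):

# Inside an existing Git repository:
repo = init('/path/to/project')

# Without any SCM; this records core.no_scm = True in the config:
repo = init('/tmp/plain-dir', no_scm=True)

# In a subdirectory of a larger Git repository:
repo = init('/path/to/monorepo/subproject', subdir=True)
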
Example #19
0
File: project.py Project: sergeyenin/dvc
class Project(object):
    DVC_DIR = ".dvc"

    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, project=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.files_to_git_add = []

        self._ignore()

        self.updater.check()

    def __repr__(self):
        return "Project: '{root_dir}'".format(root_dir=self.root_dir)

    @staticmethod
    def find_root(root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, Project.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcProjectError(root)

    @staticmethod
    def find_dvc_dir(root=None):
        root_dir = Project.find_root(root)
        return os.path.join(root_dir, Project.DVC_DIR)

    def _remind_to_git_add(self):
        if not self.files_to_git_add:
            return

        logger.info(
            "\n"
            "To track the changes with git run:\n"
            "\n"
            "\tgit add {files}".format(files=" ".join(self.files_to_git_add)))

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        """
        Creates an empty project in the given directory -- basically a
        `.dvc` directory with subdirectories for configuration and cache.

        The directory should be tracked by an SCM, or the `--no-scm` flag
        should be used.

        If the given directory is not empty, you must use the `--force`
        flag to override it.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            InitError: the directory is not tracked by a supported SCM tool
                and `no_scm` was not given, or `.dvc` already exists and
                `force` was not given.
        """
        import shutil
        from dvc.scm import SCM, Base
        from dvc.config import Config

        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            raise InitError(
                "{project} is not tracked by any supported scm tool"
                " (e.g. git). Use '--no-scm' if you don't want to use any scm."
                .format(project=root_dir))

        if os.path.isdir(dvc_dir):
            if not force:
                raise InitError(
                    "'{project}' exists. Use '-f' to force.".format(
                        project=os.path.relpath(dvc_dir)))

            shutil.rmtree(dvc_dir)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])

        if scm.ignore_file:
            scm.add([os.path.join(dvc_dir, scm.ignore_file)])
            logger.info("\nYou can now commit the changes to git.\n")

        proj._welcome_message()

        return proj

    def destroy(self):
        import shutil

        for stage in self.stages():
            stage.remove()

        shutil.rmtree(self.dvc_dir)

    def _ignore(self):
        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            self.updater.updater_file,
            self.updater.lock.lock_file,
        ] + self.state.temp_files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    @staticmethod
    def _check_cyclic_graph(graph):
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def add(self, fname, recursive=False):
        from dvc.stage import Stage

        fnames = []
        if recursive and os.path.isdir(fname):
            for root, dirs, files in os.walk(fname):
                for f in files:
                    path = os.path.join(root, f)
                    if Stage.is_stage_file(path):
                        continue
                    if os.path.basename(path) == self.scm.ignore_file:
                        continue
                    if self.scm.is_tracked(path):
                        continue
                    fnames.append(path)
        else:
            fnames = [fname]

        stages = []
        self.files_to_git_add = []
        with self.state:
            for f in fnames:
                stage = Stage.create(project=self, outs=[f], add=True)

                if stage is None:
                    continue

                stage.save()
                stages.append(stage)

        self._check_dag(self.stages() + stages)

        for stage in stages:
            stage.dump()

        self._remind_to_git_add()

        return stages

    def _check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    def remove(self, target, outs_only=False):
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        stage.locked = not unlock
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        """
        Renames an output file and modifies its associated stage
        to reflect the change in the pipeline.

        If the output has the same name as its stage, the
        corresponding stage file is renamed as well.

        E.g.
              Having: (hello, hello.dvc)

              $ dvc move hello greetings

              Result: (greetings, greetings.dvc)

        It only works with outputs generated by `add` or `import`,
        also known as data sources.
        """
        import dvc.output as Output
        from dvc.stage import Stage

        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        to_path = self._expand_target_path(from_path, to_path)

        try:
            stage, out = next((stage, out) for stage in self.stages()
                              for out in stage.outs
                              if from_out.path == out.path)
        except StopIteration:
            raise DvcException(
                "unable to find stage file with output '{path}'".format(
                    path=from_path))

        if not stage.is_data_source:
            raise MoveNotDataSourceError(stage.relpath)

        stage_name = os.path.splitext(os.path.basename(stage.path))[0]
        from_name = os.path.basename(from_out.path)
        if stage_name == from_name:
            os.unlink(stage.path)

            stage.path = os.path.join(
                os.path.dirname(to_path),
                os.path.basename(to_path) + Stage.STAGE_FILE_SUFFIX,
            )

            stage.cwd = os.path.join(self.root_dir, os.path.dirname(to_path))

        to_out = Output.loads_from(stage, [os.path.basename(to_path)],
                                   out.cache, out.metric)[0]

        with self.state:
            out.move(to_out)

        stage.dump()

        self._remind_to_git_add()

    def _unprotect_file(self, path):
        import stat
        import uuid
        from dvc.system import System
        from dvc.utils import copyfile, move, remove

        if System.is_symlink(path) or System.is_hardlink(path):
            logger.debug("Unprotecting '{}'".format(path))

            tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))
            move(path, tmp)

            copyfile(tmp, path)

            remove(tmp)
        else:
            logger.debug("Skipping copying for '{}', since it is not "
                         "a symlink or a hardlink.".format(path))

        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

    def _unprotect_dir(self, path):
        for root, dirs, files in os.walk(path):
            for f in files:
                path = os.path.join(root, f)
                self._unprotect_file(path)

    def unprotect(self, path):
        if not os.path.exists(path):
            raise DvcException(
                "can't unprotect non-existing data '{}'".format(path))

        if os.path.isdir(path):
            self._unprotect_dir(path)
        else:
            self._unprotect_file(path)
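
    # Illustration (not in the original source): unprotect() restores normal
    # write semantics to outputs that were checked out as hardlinks or
    # symlinks into the cache, e.g.
    #
    #     project = Project('.')
    #     project.unprotect('data/model.pkl')   # hypothetical path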

    def run(
        self,
        cmd=None,
        deps=[],
        outs=[],
        outs_no_cache=[],
        metrics_no_cache=[],
        fname=None,
        cwd=os.curdir,
        no_exec=False,
        overwrite=False,
        ignore_build_cache=False,
        remove_outs=False,
    ):
        from dvc.stage import Stage

        with self.state:
            stage = Stage.create(
                project=self,
                fname=fname,
                cmd=cmd,
                cwd=cwd,
                outs=outs,
                outs_no_cache=outs_no_cache,
                metrics_no_cache=metrics_no_cache,
                deps=deps,
                overwrite=overwrite,
                ignore_build_cache=ignore_build_cache,
                remove_outs=remove_outs,
            )

        if stage is None:
            return None

        self._check_dag(self.stages() + [stage])

        self.files_to_git_add = []
        with self.state:
            if not no_exec:
                stage.run()

        stage.dump()

        self._remind_to_git_add()

        return stage

    def imp(self, url, out, resume=False):
        from dvc.stage import Stage

        stage = Stage.create(project=self, cmd=None, deps=[url], outs=[out])

        if stage is None:
            return None

        self._check_dag(self.stages() + [stage])

        self.files_to_git_add = []
        with self.state:
            stage.run(resume=resume)

        stage.dump()

        self._remind_to_git_add()

        return stage

    def _reproduce_stage(self, stages, node, force, dry, interactive):
        stage = stages[node]

        if stage.locked:
            logger.warning(
                "DVC file '{path}' is locked. Its dependencies are"
                " not going to be reproduced.".format(path=stage.relpath))

        stage = stage.reproduce(force=force, dry=dry, interactive=interactive)
        if not stage:
            return []

        if not dry:
            stage.dump()

        return [stage]

    def reproduce(
        self,
        target=None,
        recursive=True,
        force=False,
        dry=False,
        interactive=False,
        pipeline=False,
        all_pipelines=False,
        ignore_build_cache=False,
    ):
        from dvc.stage import Stage

        if not target and not all_pipelines:
            raise ValueError()

        if not interactive:
            config = self.config
            core = config.config[config.SECTION_CORE]
            interactive = core.get(config.SECTION_CORE_INTERACTIVE, False)

        targets = []
        if pipeline or all_pipelines:
            if pipeline:
                stage = Stage.load(self, target)
                node = os.path.relpath(stage.path, self.root_dir)
                pipelines = [self._get_pipeline(node)]
            else:
                pipelines = self.pipelines()

            for G in pipelines:
                for node in G.nodes():
                    if G.in_degree(node) == 0:
                        targets.append(os.path.join(self.root_dir, node))
        else:
            targets.append(target)

        self.files_to_git_add = []

        ret = []
        with self.state:
            for target in targets:
                stages = self._reproduce(
                    target,
                    recursive=recursive,
                    force=force,
                    dry=dry,
                    interactive=interactive,
                    ignore_build_cache=ignore_build_cache,
                )
                ret.extend(stages)

        self._remind_to_git_add()

        return ret

    def _reproduce(
        self,
        target,
        recursive=True,
        force=False,
        dry=False,
        interactive=False,
        ignore_build_cache=False,
    ):
        import networkx as nx
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        G = self.graph()[1]
        stages = nx.get_node_attributes(G, "stage")
        node = os.path.relpath(stage.path, self.root_dir)

        if recursive:
            ret = self._reproduce_stages(G, stages, node, force, dry,
                                         interactive, ignore_build_cache)
        else:
            ret = self._reproduce_stage(stages, node, force, dry, interactive)

        return ret

    def _reproduce_stages(self, G, stages, node, force, dry, interactive,
                          ignore_build_cache):
        import networkx as nx

        result = []
        for n in nx.dfs_postorder_nodes(G, node):
            try:
                ret = self._reproduce_stage(stages, n, force, dry, interactive)

                if len(ret) == 0 and ignore_build_cache:
                    # NOTE: we are walking our pipeline from the top to the
                    # bottom. If one stage is changed, it will be reproduced,
                    # which tells us that we should force reproducing all of
                    # the other stages down below, even if their direct
                    # dependencies didn't change.
                    force = True

                result += ret
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _cleanup_unused_links(self, all_stages):
        used = []
        for stage in all_stages:
            for out in stage.outs:
                used.append(out.path)
        self.state.remove_unused_links(used)

    def checkout(self,
                 target=None,
                 with_deps=False,
                 force=False,
                 recursive=False):

        if target and not recursive:
            all_stages = self.active_stages()
            stages = self._collect(target, with_deps=with_deps)
        else:
            all_stages = self.active_stages(target)
            stages = all_stages

        with self.state:
            self._cleanup_unused_links(all_stages)

            for stage in stages:
                if stage.locked:
                    logger.warning(
                        "DVC file '{path}' is locked. Its dependencies are"
                        " not going to be checked out.".format(
                            path=stage.relpath))

                stage.checkout(force=force)

    def _get_pipeline(self, node):
        pipelines = list(filter(lambda g: node in g.nodes(), self.pipelines()))
        assert len(pipelines) == 1
        return pipelines[0]

    def _collect(self, target, with_deps=False):
        import networkx as nx
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)
        stages = nx.get_node_attributes(G, "stage")

        ret = [stage]
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(stages[n])

        return ret

    def _collect_dir_cache(self,
                           out,
                           branch=None,
                           remote=None,
                           force=False,
                           jobs=None):
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_CHECKSUM]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(ret,
                                jobs=jobs,
                                remote=remote,
                                show_checksums=False)
            except DvcException as exc:
                msg = "Failed to pull cache for '{}': {}"
                logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = ("Missing cache for directory '{}'. "
                   "Cache for files inside will be lost. "
                   "Would you like to continue? Use '-f' to force. ")
            if not force and not prompt.confirm(msg):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(out))
            else:
                return ret

        for i in self.cache.local.load_dir_cache(md5):
            i["branch"] = branch
            i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                           i[r.PARAM_RELPATH])
            ret.append(i)

        return ret

    def _collect_used_cache(self,
                            out,
                            branch=None,
                            remote=None,
                            force=False,
                            jobs=None):
        if not out.use_cache or not out.info:
            if not out.info:
                logger.warning("Output '{}'({}) is missing version "
                               "info. Cache for it will not be collected. "
                               "Use dvc repro to get your pipeline up to "
                               "date.".format(out, out.stage))
            return []

        info = out.dumpd()
        info["branch"] = branch
        ret = [info]

        if out.scheme != "local":
            return ret

        md5 = info[out.remote.PARAM_CHECKSUM]
        cache = self.cache.local.get(md5)
        if not out.remote.is_dir_cache(cache):
            return ret

        return self._collect_dir_cache(out,
                                       branch=branch,
                                       remote=remote,
                                       force=force,
                                       jobs=jobs)

    def _used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
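        # Used cache entries are bucketed per output scheme so that each
        # backend (local, s3, gs, hdfs, ssh, azure) can be pushed, pulled,
        # or garbage collected independently.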
        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.scm.brancher(all_branches=all_branches,
                                        all_tags=all_tags):
            if target:
                if recursive:
                    stages = self.stages(target)
                else:
                    stages = self._collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath))

                for out in stage.outs:
                    scheme = out.path_info["scheme"]
                    cache[scheme] += self._collect_used_cache(
                        out,
                        branch=branch,
                        remote=remote,
                        force=force,
                        jobs=jobs,
                    )

        return cache

    @staticmethod
    def merge_cache_lists(clists):
        merged_cache = collections.defaultdict(list)

        for cache_list in clists:
            for scheme, cache in cache_list.items():
                for item in cache:
                    if item not in merged_cache[scheme]:
                        merged_cache[scheme].append(item)

        return merged_cache

    @staticmethod
    def load_all_used_cache(
        projects,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
    ):
        clists = []

        for project in projects:
            with project.state:
                project_clist = project._used_cache(
                    target=None,
                    all_branches=all_branches,
                    active=False,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )

                clists.append(project_clist)

        return clists

    def _do_gc(self, typ, func, clist):
        removed = func(clist)
        if not removed:
            logger.info("No unused {} cache to remove.".format(typ))

    def gc(
        self,
        all_branches=False,
        cloud=False,
        remote=None,
        with_deps=False,
        all_tags=False,
        force=False,
        jobs=None,
        projects=None,
    ):

        all_projects = [self]

        if projects:
            all_projects.extend(Project(path) for path in projects)

        all_clists = Project.load_all_used_cache(
            all_projects,
            target=None,
            all_branches=all_branches,
            active=False,
            with_deps=with_deps,
            all_tags=all_tags,
            remote=remote,
            force=force,
            jobs=jobs,
        )

        if len(all_clists) > 1:
            clist = Project.merge_cache_lists(all_clists)
        else:
            clist = all_clists[0]

        with self.state:
            self._do_gc("local", self.cache.local.gc, clist)

            if self.cache.s3:
                self._do_gc("s3", self.cache.s3.gc, clist)

            if self.cache.gs:
                self._do_gc("gs", self.cache.gs.gc, clist)

            if self.cache.ssh:
                self._do_gc("ssh", self.cache.ssh.gc, clist)

            if self.cache.hdfs:
                self._do_gc("hdfs", self.cache.hdfs.gc, clist)

            if self.cache.azure:
                self._do_gc("azure", self.cache.azure.gc, clist)

            if cloud:
                self._do_gc("remote",
                            self.cloud._get_cloud(remote, "gc -c").gc, clist)

    def push(
        self,
        target=None,
        jobs=1,
        remote=None,
        all_branches=False,
        show_checksums=False,
        with_deps=False,
        all_tags=False,
        recursive=False,
    ):
        with self.state:
            used = self._used_cache(
                target,
                all_branches=all_branches,
                all_tags=all_tags,
                with_deps=with_deps,
                force=True,
                remote=remote,
                jobs=jobs,
                recursive=recursive,
            )["local"]
            self.cloud.push(used,
                            jobs,
                            remote=remote,
                            show_checksums=show_checksums)

    def fetch(
        self,
        target=None,
        jobs=1,
        remote=None,
        all_branches=False,
        show_checksums=False,
        with_deps=False,
        all_tags=False,
        recursive=False,
    ):
        with self.state:
            used = self._used_cache(
                target,
                all_branches=all_branches,
                all_tags=all_tags,
                with_deps=with_deps,
                force=True,
                remote=remote,
                jobs=jobs,
                recursive=recursive,
            )["local"]
            self.cloud.pull(used,
                            jobs,
                            remote=remote,
                            show_checksums=show_checksums)

    def pull(
        self,
        target=None,
        jobs=1,
        remote=None,
        all_branches=False,
        show_checksums=False,
        with_deps=False,
        all_tags=False,
        force=False,
        recursive=False,
    ):
        self.fetch(
            target,
            jobs,
            remote=remote,
            all_branches=all_branches,
            all_tags=all_tags,
            show_checksums=show_checksums,
            with_deps=with_deps,
            recursive=recursive,
        )
        self.checkout(
            target=target,
            with_deps=with_deps,
            force=force,
            recursive=recursive,
        )

    def _local_status(self, target=None, with_deps=False):
        status = {}

        if target:
            stages = self._collect(target, with_deps=with_deps)
        else:
            stages = self.active_stages()

        for stage in stages:
            if stage.locked:
                logger.warning(
                    "DVC file '{path}' is locked. Its dependencies are"
                    " not going to be shown in the status output.".format(
                        path=stage.relpath))

            status.update(stage.status())

        return status

    def _cloud_status(
        self,
        target=None,
        jobs=1,
        remote=None,
        show_checksums=False,
        all_branches=False,
        with_deps=False,
        all_tags=False,
    ):
        import dvc.remote.base as cloud

        used = self._used_cache(
            target,
            all_branches=all_branches,
            all_tags=all_tags,
            with_deps=with_deps,
            force=True,
            remote=remote,
            jobs=jobs,
        )["local"]

        ret = {}
        status_info = self.cloud.status(used,
                                        jobs,
                                        remote=remote,
                                        show_checksums=show_checksums)
        for md5, info in status_info.items():
            name = info["name"]
            status = info["status"]
            if status == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: "deleted",
                cloud.STATUS_NEW: "new",
            }

            ret[name] = prefix_map[status]

        return ret

    def status(
        self,
        target=None,
        jobs=1,
        cloud=False,
        remote=None,
        show_checksums=False,
        all_branches=False,
        with_deps=False,
        all_tags=False,
    ):
        with self.state:
            if cloud:
                return self._cloud_status(
                    target,
                    jobs,
                    remote=remote,
                    show_checksums=show_checksums,
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                )
            return self._local_status(target, with_deps=with_deps)

    def _read_metric_json(self, fd, json_path):
        import json
        from jsonpath_rw import parse

        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_xsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_hxsv(self, fd, hxsv_path, delimiter):
        import csv

        col, row = hxsv_path.split(",")
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter=builtin_str(delimiter)))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric_xsv(self, fd, xsv_path, delimiter):
        import csv

        col, row = xsv_path.split(",")
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter=builtin_str(delimiter)))
        return self._do_read_metric_xsv(reader, row, col)

    def _read_metric(self, path, typ=None, xpath=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, "r") as fd:
                if typ == "json":
                    ret = self._read_metric_json(fd, xpath)
                elif typ == "csv":
                    ret = self._read_metric_xsv(fd, xpath, ",")
                elif typ == "tsv":
                    ret = self._read_metric_xsv(fd, xpath, "\t")
                elif typ == "hcsv":
                    ret = self._read_metric_hxsv(fd, xpath, ",")
                elif typ == "htsv":
                    ret = self._read_metric_hxsv(fd, xpath, "\t")
                else:
                    ret = fd.read().strip()
        except Exception:
            logger.error("unable to read metric in '{}'".format(path))

        return ret

    def _find_output_by_path(self, path, outs=None, recursive=False):
        from dvc.exceptions import OutputDuplicationError

        if not outs:
            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

        abs_path = os.path.abspath(path)
        if os.path.isdir(abs_path) and recursive:
            matched = [
                out for out in outs
                if os.path.abspath(out.path) == abs_path
                or os.path.abspath(out.path).startswith(abs_path + os.sep)
            ]
        else:
            matched = [out for out in outs if out.path == abs_path]
            stages = [out.stage.relpath for out in matched]
            if len(stages) > 1:
                raise OutputDuplicationError(path, stages)

        return matched

    def metrics_show(
        self,
        path=None,
        typ=None,
        xpath=None,
        all_branches=False,
        all_tags=False,
        recursive=False,
    ):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches,
                                        all_tags=all_tags):

            astages = self.active_stages()
            outs = [out for stage in astages for out in stage.outs]

            if path:
                outs = self._find_output_by_path(path,
                                                 outs=outs,
                                                 recursive=recursive)
                stages = [out.stage.path for out in outs]
                entries = []
                for out in outs:
                    if all([out.metric, not typ,
                            isinstance(out.metric, dict)]):
                        entries += [(
                            out.path,
                            out.metric.get(out.PARAM_METRIC_TYPE, None),
                            out.metric.get(out.PARAM_METRIC_XPATH, None),
                        )]
                    else:
                        entries += [(out.path, typ, xpath)]
                if not entries:
                    if os.path.isdir(path):
                        logger.warning(
                            "Path '{path}' is a directory. "
                            "Consider running with '-R'.".format(path=path))
                        return {}

                    else:
                        entries += [(path, typ, xpath)]

            else:
                metrics = filter(lambda o: o.metric, outs)
                stages = None
                entries = []
                for o in metrics:
                    if not typ and isinstance(o.metric, dict):
                        t = o.metric.get(o.PARAM_METRIC_TYPE, typ)
                        x = o.metric.get(o.PARAM_METRIC_XPATH, xpath)
                    else:
                        t = typ
                        x = xpath
                    entries.append((o.path, t, x))

            for fname, t, x in entries:
                if stages:
                    for stage in stages:
                        self.checkout(stage, force=True)

                rel = os.path.relpath(fname)
                metric = self._read_metric(fname, typ=t, xpath=x)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            if all_branches or all_tags:
                logger.info("{}:".format(branch))
            for fname, metric in val.items():
                logger.info("\t{}: {}".format(fname, metric))

        if res:
            return res

        if path and os.path.isdir(path):
            return res

        if path:
            msg = "file '{}' does not exist".format(path)
        else:
            msg = ("no metric files in this repository."
                   " use 'dvc metrics add' to add a metric file to track.")

        raise DvcException(msg)

    def _metrics_modify(self, path, typ=None, xpath=None, delete=False):
        outs = self._find_output_by_path(path)

        if not outs:
            msg = "unable to find file '{}' in the pipeline".format(path)
            raise DvcException(msg)

        if len(outs) != 1:
            msg = ("-R not yet supported for metrics modify. "
                   "Make sure only one metric is referred to by '{}'".format(
                       path))
            raise DvcException(msg)

        out = outs[0]

        if out.scheme != "local":
            msg = "output '{}' scheme '{}' is not supported for metrics"
            raise DvcException(msg.format(out.path, out.path_info["scheme"]))

        if out.use_cache:
            msg = "cached output '{}' is not supported for metrics"
            raise DvcException(msg.format(out.rel_path))

        if typ:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_TYPE] = typ

        if xpath:
            if not isinstance(out.metric, dict):
                out.metric = {}
            out.metric[out.PARAM_METRIC_XPATH] = xpath

        if delete:
            out.metric = None

        out._verify_metric()

        out.stage.dump()

    def metrics_modify(self, path=None, typ=None, xpath=None):
        self._metrics_modify(path, typ, xpath)

    def metrics_add(self, path, typ=None, xpath=None):
        if not typ:
            typ = "raw"
        self._metrics_modify(path, typ, xpath)

    def metrics_remove(self, path):
        self._metrics_modify(path, delete=True)

    def graph(self, stages=None, from_directory=None):
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            WorkingDirectoryAsOutputError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = [o.stage for o in outs if o.path == out.path]

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(out.path, stages)

                outs.append(out)

        for stage in stages:
            for out in outs:
                overlaps = stage.cwd == out.path or stage.cwd.startswith(
                    out.path + os.sep)

                if overlaps:
                    raise WorkingDirectoryAsOutputError(
                        stage.cwd, stage.relpath)

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
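                # A dependency matches an output when their paths are equal
                # or one is nested under the other; the separator is added
                # before the prefix check to avoid false matches such as
                # 'data' vs 'data2'.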
                for out in outs:
                    if (out.path != dep.path
                            and not dep.path.startswith(out.path + out.sep)
                            and not out.path.startswith(dep.path + dep.sep)):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        import networkx as nx

        G, G_active = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    def stages(self, from_directory=None):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large projects, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir

        stages = []
        outs = []
        for root, dirs, files in os.walk(from_directory):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    outs.append(out.path + out.sep)
                stages.append(stage)

            def filter_dirs(dname, root=root):
                path = os.path.join(root, dname)
                if path in (self.dvc_dir, self.scm.dir):
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))
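            # NOTE: assigning to dirs[:] above mutates os.walk's list in
            # place, pruning descent into SCM/DVC internals and any
            # DVC-tracked data directories collected so far.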

        return stages

    def active_stages(self, from_directory=None):
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def _welcome_message(self):
        import colorama

        logger.box(
            "DVC has enabled anonymous aggregate usage analytics.\n"
            "Read the analytics documentation (and how to opt-out) here:\n"
            "{blue}https://dvc.org/doc/user-guide/analytics{nc}".format(
                blue=colorama.Fore.BLUE, nc=colorama.Fore.RESET),
            border_color="red",
        )

        logger.info(
            "{yellow}What's next?{nc}\n"
            "{yellow}------------{nc}\n"
            "- Check out the documentation: {blue}https://dvc.org/doc{nc}\n"
            "- Get help and share ideas: {blue}https://dvc.org/chat{nc}\n"
            "- Star us on GitHub: {blue}https://github.com/iterative/dvc{nc}".
            format(
                yellow=colorama.Fore.YELLOW,
                blue=colorama.Fore.BLUE,
                nc=colorama.Fore.RESET,
            ))

    def _expand_target_path(self, from_path, to_path):
        if os.path.isdir(to_path) and not os.path.isdir(from_path):
            return os.path.join(to_path, os.path.basename(from_path))
        return to_path
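
A hedged usage sketch of the Project API above; the constructor call, metric file name, and xpath are illustrative assumptions that do not appear in the original snippet:

# Hypothetical driver code for the API shown above.
proj = Project(".")                      # assumed constructor signature

# Download the used cache from the default remote and link it into place.
proj.pull(jobs=4)

# Per-stage workspace changes; an empty dict means everything is up to date.
print(proj.status())

# Read one metric across every branch; typ and xpath are placeholders.
proj.metrics_show(path="metrics.json", typ="json",
                  xpath="train.accuracy", all_branches=True)
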
Example #20
File: __init__.py Project: databill86/dvc
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.pkg import install_pkg

    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.repo.metrics import Metrics

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.metrics = Metrics(self)

        self._ignore()

        self.updater.check()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @staticmethod
    def find_root(root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, Repo.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @staticmethod
    def find_dvc_dir(root=None):
        root_dir = Repo.find_root(root)
        return os.path.join(root_dir, Repo.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    @staticmethod
    def unprotect(target):
        from dvc.repo.unprotect import unprotect

        return unprotect(target)

    def _ignore(self):
        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            self.updater.updater_file,
            self.updater.lock.lock_file,
        ] + self.state.temp_files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    @staticmethod
    def _check_cyclic_graph(graph):
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def _get_pipeline(self, node):
        pipelines = list(filter(lambda g: node in g.nodes(), self.pipelines()))
        assert len(pipelines) == 1
        return pipelines[0]

    def collect(self, target, with_deps=False):
        import networkx as nx
        from dvc.stage import Stage

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)
        stages = nx.get_node_attributes(G, "stage")

        ret = [stage]
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(stages[n])

        return ret

    def _collect_dir_cache(
        self, out, branch=None, remote=None, force=False, jobs=None
    ):
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_CHECKSUM]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(
                    ret, jobs=jobs, remote=remote, show_checksums=False
                )
            except DvcException as exc:
                msg = "Failed to pull cache for '{}': {}"
                logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = (
                "Missing cache for directory '{}'. "
                "Cache for files inside will be lost. "
                "Would you like to continue? Use '-f' to force. "
            )
            if not force and not prompt.confirm(msg):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(out)
                )
            else:
                return ret

        for i in self.cache.local.load_dir_cache(md5):
            i["branch"] = branch
            i[r.PARAM_PATH] = os.path.join(
                info[r.PARAM_PATH], i[r.PARAM_RELPATH]
            )
            ret.append(i)

        return ret

    def _collect_used_cache(
        self, out, branch=None, remote=None, force=False, jobs=None
    ):
        if not out.use_cache or not out.info:
            if not out.info:
                logger.warning(
                    "Output '{}'({}) is missing version "
                    "info. Cache for it will not be collected. "
                    "Use dvc repro to get your pipeline up to "
                    "date.".format(out, out.stage)
                )
            return []

        info = out.dumpd()
        info["branch"] = branch
        ret = [info]

        if out.scheme != "local":
            return ret

        md5 = info[out.remote.PARAM_CHECKSUM]
        cache = self.cache.local.get(md5)
        if not out.remote.is_dir_cache(cache):
            return ret

        return self._collect_dir_cache(
            out, branch=branch, remote=remote, force=force, jobs=jobs
        )

    def used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.scm.brancher(
            all_branches=all_branches, all_tags=all_tags
        ):
            if target:
                if recursive:
                    stages = self.stages(target)
                else:
                    stages = self.collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath
                        )
                    )

                for out in stage.outs:
                    scheme = out.path_info["scheme"]
                    cache[scheme] += self._collect_used_cache(
                        out,
                        branch=branch,
                        remote=remote,
                        force=force,
                        jobs=jobs,
                    )

        return cache

    def graph(self, stages=None, from_directory=None):
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = [o.stage for o in outs if o.path == out.path]

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(out.path, stages)

                outs.append(out)

        for stage in stages:
            path_dir = os.path.dirname(stage.path) + os.sep
            for out in outs:
                if path_dir.startswith(out.path + os.sep):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        # collect the whole DAG
        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if (
                        out.path != dep.path
                        and not dep.path.startswith(out.path + out.sep)
                        and not out.path.startswith(dep.path + dep.sep)
                    ):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        import networkx as nx

        G, G_active = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    def stages(self, from_directory=None, check_dag=True):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir

        stages = []
        outs = []
        for root, dirs, files in os.walk(str(from_directory)):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    outs.append(out.path + out.sep)
                stages.append(stage)

            def filter_dirs(dname, root=root):
                path = os.path.join(root, dname)
                if path in (self.dvc_dir, self.scm.dir):
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))

        if check_dag:
            self.check_dag(stages)

        return stages

    def active_stages(self, from_directory=None):
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False):
        if not outs:
            outs = [out for stage in self.stages() for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = os.path.isdir(abs_path)

        def func(out):
            if out.path == abs_path:
                return True

            if is_dir and recursive and out.path.startswith(abs_path + os.sep):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts
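
A hedged sketch of locating a repository and resolving tracked outputs with the helpers above; the 'data' path is a placeholder:

root = Repo.find_root()          # walks upward until a .dvc dir is found
repo = Repo(root)
try:
    outs = repo.find_outs_by_path("data", recursive=True)
except OutputNotFoundError:      # raised when nothing under 'data' matches
    outs = []
print([out.path for out in outs])
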
Example #21
File: base.py Project: phdtanvir/dvc
 def scm(self):
     return SCM(self.root_dir)
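
These two lines read like the body of a lazily cached property; a minimal reconstruction, assuming a cached_property decorator and an enclosing class that the excerpt does not show:

from funcy import cached_property  # assumption: not visible in the excerpt

class BaseRepo(object):  # hypothetical enclosing class
    def __init__(self, root_dir):
        self.root_dir = root_dir

    @cached_property
    def scm(self):
        # Build the SCM wrapper (e.g. Git) on first access and reuse it.
        return SCM(self.root_dir)
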
Example #22
File: analytics.py Project: zeta1999/dvc
def _scm_in_use():
    try:
        scm = SCM(root_dir=Repo.find_root())
        return type(scm).__name__
    except NotDvcRepoError:
        pass
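
A hedged example of how this helper behaves; the payload dict is an assumption for illustration:

# Inside a git-backed DVC repo this yields "Git"; outside any DVC repo the
# NotDvcRepoError is swallowed and the call falls through to None.
payload = {"scm_class": _scm_in_use()}
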
Example #23
def test_init_git(tmp_dir):
    Repo.init(os.fspath(tmp_dir))
    assert isinstance(SCM(os.fspath(tmp_dir)), Git)
Example #24
File: project.py Project: rdwrt/dvc
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config)
        self.cloud = DataCloud(self.cache, self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        if not stages[node].changed():
            return []

        stages[node].reproduce(force=force)
        stages[node].dump()
        return [stages[node]]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _remove_untracked_hardlinks(self):
        untracked = self.scm.untracked_files()
        cache = dict((System.inode(c), c) for c in self.cache.all())
        for file in untracked:
            inode = System.inode(file)
            if inode not in cache.keys():
                continue

            Logger.info(u'Remove \'{}\''.format(file))
            os.remove(file)

            dir = os.path.dirname(file)
            if len(dir) != 0 and not os.listdir(dir):
                Logger.info(u'Remove empty directory \'{}\''.format(dir))
                os.removedirs(dir)

    def checkout(self):
        self._remove_untracked_hardlinks()
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if not out.use_cache:
                    continue
                cache_set |= set([out.cache])
                if out.is_dir_cache(out.cache) and os.path.isfile(out.cache):
                    dir_cache = out.dir_cache()
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1):
        return self.cloud.push(self._used_cache(target), jobs)

    def fetch(self, target=None, jobs=1):
        return self.cloud.pull(self._used_cache(target), jobs)

    def pull(self, target=None, jobs=1):
        ret = self.fetch(target, jobs)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1):
        status = {}
        for target, ret in self.cloud.status(self._used_cache(target), jobs):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False):
        if cloud:
            return self._cloud_status(target, jobs)
        return self._local_status(target)

    def graph(self):
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
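
A hedged end-to-end sketch against this early Project API; the file names and command are placeholders:

proj = Project.init()                  # creates .dvc/ in the cwd
proj.add('data.csv')                   # writes data.csv.dvc
proj.run(cmd='python train.py',
         deps=['data.csv'],
         outs=['model.pkl'])           # writes the default Dvcfile
proj.reproduce(Stage.STAGE_FILE)       # re-run the Dvcfile stage if changed
proj.push()                            # upload the used cache
print(proj.status())                   # {} when everything is fresh
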
Example #25
 def test_git_submodule(self):
     self.assertIsInstance(SCM(os.curdir), Git)
Example #26
    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import make_lock
        from dvc.repo.experiments import Experiments
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree
        from dvc.utils.fs import makedirs

        if scm:
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = scm.get_tree(rev,
                                     use_dvcignore=True,
                                     dvcignore_root=self.root_dir)
            self.state = StateNoop()
        else:
            try:
                root_dir = self.find_root(root_dir)
            except NotDvcRepoError:
                if not uninitialized:
                    raise
                root_dir = SCM(root_dir or os.curdir).root_dir
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))
            self.tree = LocalTree(
                self,
                {"url": self.root_dir},
                use_dvcignore=True,
                dvcignore_root=self.root_dir,
            )

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir, tree=self.tree)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )
        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        try:
            self.experiments = Experiments(self)
        except NotImplementedError:
            self.experiments = None

        self._ignore()
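
A hedged sketch of constructing this Repo pinned to a git revision via the scm/rev branch of the constructor; the revision string is a placeholder:

scm = SCM(os.curdir)                   # git wrapper for the current repo
repo = Repo(root_dir=os.curdir, scm=scm, rev="HEAD~1")
# repo.tree now reads file contents as of HEAD~1 instead of the workspace,
# and repo.state stays a StateNoop since nothing is written back.
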
Example #27
File: project.py Project: yustoris/dvc
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = ("{} is not tracked by any supported scm tool "
                   "(e.g. git).".format(root_dir))
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def _ignore(self):
        flist = [
            self.link_state.state_file, self.link_state._lock_file.lock_file,
            self.lock.lock_file, self.config.config_local_file,
            self.updater.updater_file
        ]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target, outs_only=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        if outs_only:
            stage.remove_outs()
        else:
            stage.remove()

        return stage

    def lock_stage(self, target, unlock=False):
        if not Stage.is_stage_file(target):
            raise NotDvcFileError(target)

        stage = Stage.load(self, target)
        stage.locked = not unlock
        stage.dump()

        return stage

    def move(self, from_path, to_path):
        from_out = Output.loads_from(Stage(self, cwd=os.curdir),
                                     [from_path])[0]

        found = False
        for stage in self.stages():
            for out in stage.outs:
                if out.path != from_out.path:
                    continue

                if not stage.is_data_source:
                    raise DvcException(
                        'Dvcfile \'{}\' is not a data source.'.format(
                            stage.rel_path))

                found = True
                to_out = Output.loads_from(out.stage, [to_path], out.cache,
                                           out.metric)[0]
                out.move(to_out)

                # NOTE: str.rstrip() strips a character set, not a suffix.
                stage_base = os.path.basename(stage.path)
                if stage_base.endswith(Stage.STAGE_FILE_SUFFIX):
                    stage_base = stage_base[:-len(Stage.STAGE_FILE_SUFFIX)]
                stage_dir = os.path.dirname(stage.path)
                from_base = os.path.basename(from_path)
                to_base = os.path.basename(to_path)
                if stage_base == from_base:
                    os.unlink(stage.path)
                    stage.path = os.path.join(
                        stage_dir, to_base + Stage.STAGE_FILE_SUFFIX)

            stage.dump()

        if not found:
            raise DvcException(
                'Unable to find dvcfile with output \'{}\''.format(from_path))

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            metrics_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            metrics_no_cache=metrics_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def imp(self, url, out):
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(out))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[url],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise NotDvcFileError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self, target=None):
        if target:
            if not Stage.is_stage_file(target):
                raise NotDvcFileError(target)
            stages = [Stage.load(self, target)]
        else:
            self.link_state.remove_all()
            stages = self.stages()

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None, all_branches=False):
        cache = {}
        cache['local'] = []
        cache['s3'] = []
        cache['gs'] = []
        cache['hdfs'] = []
        cache['ssh'] = []

        for branch in self.scm.brancher(all_branches=all_branches):
            if target:
                stages = [Stage.load(self, target)]
            else:
                stages = self.stages()

            for stage in stages:
                for out in stage.outs:
                    if not out.use_cache or not out.info:
                        continue

                    cache[out.path_info['scheme']] += [out.info]

        return cache

    def gc(self, all_branches=False):
        clist = self._used_cache(target=None, all_branches=all_branches)
        self.cache.local.gc(clist['local'])

        if self.cache.s3:
            self.cache.s3.gc(clist['s3'])

        if self.cache.gs:
            self.cache.gs.gc(clist['gs'])

        if self.cache.ssh:
            self.cache.ssh.gc(clist['ssh'])

        if self.cache.hdfs:
            self.cache.hdfs.gc(clist['hdfs'])

    def push(self, target=None, jobs=1, remote=None, all_branches=False):
        self.cloud.push(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote)

    def fetch(self, target=None, jobs=1, remote=None, all_branches=False):
        self.cloud.pull(self._used_cache(target, all_branches)['local'],
                        jobs,
                        remote=remote)

    def pull(self, target=None, jobs=1, remote=None, all_branches=False):
        self.fetch(target, jobs, remote=remote, all_branches=all_branches)
        self.checkout(target=target)

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        import dvc.remote.base as cloud

        status = {}
        for md5, ret in self.cloud.status(self._used_cache(target)['local'],
                                          jobs,
                                          remote=remote):
            if ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_NEW: 'new',
            }

            status[md5] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self,
                     path,
                     json_path=None,
                     tsv_path=None,
                     htsv_path=None):
        ret = None

        if not os.path.exists(path):
            return ret

        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.debug('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics_show(self,
                     path=None,
                     json_path=None,
                     tsv_path=None,
                     htsv_path=None,
                     all_branches=False):
        res = {}
        for branch in self.scm.brancher(all_branches=all_branches):
            metrics = filter(lambda o: o.metric, self.outs())
            fnames = [path] if path else map(lambda o: o.path, metrics)
            for fname in fnames:
                rel = os.path.relpath(fname)
                metric = self._read_metric(fname,
                                           json_path=json_path,
                                           tsv_path=tsv_path,
                                           htsv_path=htsv_path)
                if not metric:
                    continue

                if branch not in res:
                    res[branch] = {}

                res[branch][rel] = metric

        for branch, val in res.items():
            self.logger.info('{}:'.format(branch))
            for fname, metric in val.items():
                self.logger.info('\t{}: {}'.format(fname, metric))

        return res

    def _metrics_modify(self, path, val):
        apath = os.path.abspath(path)
        for stage in self.stages():
            for out in stage.outs:
                if apath != out.path:
                    continue

                if out.path_info['scheme'] != 'local':
                    msg = 'Output \'{}\' scheme \'{}\' is not supported for metrics'
                    raise DvcException(
                        msg.format(out.path, out.path_info['scheme']))

                if out.use_cache:
                    msg = 'Cached output \'{}\' is not supported for metrics'
                    raise DvcException(msg.format(out.rel_path))

                out.metric = val

            stage.dump()

    def metrics_add(self, path):
        self._metrics_modify(path, True)

    def metrics_remove(self, path):
        self._metrics_modify(path, False)

    def graph(self):
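        # Nodes are stage-file paths relative to the repo root; an edge
        # points from a stage to the stage that produces one of its
        # dependencies (locked stages contribute no edges).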
        G = nx.DiGraph()
        stages = self.stages()
        outs = self.outs()

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            if stage.locked:
                continue
            for dep in stage.deps:
                for out in outs:
                    if out.path != dep.path and not dep.path.startswith(
                            out.path + out.sep):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
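
A minimal usage sketch tying the methods above together; `repo` stands
for an instance of this class opened on an existing project, and the
stage-file name is illustrative:

repo.status()                         # workspace vs. recorded checksums
repo.reproduce('model.pkl.dvc')       # rebuild a stage and its deps
repo.push(jobs=4)                     # upload all used local cache
repo.metrics_show(all_branches=True)  # print metrics across branches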
Example #28
0
File: init.py Project: zang3tsu/dvc
def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """
    Creates an empty repo on the given directory -- basically a
    `.dvc` directory with subdirectories for configuration and cache.

    The directory should be tracked by an SCM, or the `--no-scm` flag
    must be used.

    If a `.dvc` directory already exists, the `--force` flag must be
    used to overwrite it.

    A short usage sketch follows this function.

    Args:
        root_dir: Path to the repo's root directory.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: If both `no_scm` and `subdir` are given.
        InitError: If `root_dir` is not tracked by a supported SCM, or
            if a `.dvc` directory already exists and `force` is not set.
    """

    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`")

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        raise InitError(
            "{repo} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository.".format(repo=root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError("'{repo}' exists. Use `-f` to force.".format(
                repo=relpath(dvc_dir)))

        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    scm.add(
        [config.files["repo"], dvcignore, proj.plot_templates.templates_dir])

    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])
        logger.info("\nYou can now commit the changes to git.\n")

    _welcome_message()

    return proj
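
Hypothetical call sites for the function above (paths are illustrative):

repo = init()                             # current dir, must be git-tracked
repo = init('/tmp/project', no_scm=True)  # without any SCM
repo = init('/tmp/project', force=True)   # overwrite an existing .dvc dir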
Example #29
0
    def scm(self):
        from dvc.scm import SCM

        no_scm = self.config["core"].get("no_scm", False)
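        # Prefer an injected SCM instance; otherwise build one lazily,
        # honoring the `core.no_scm` config flag.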
        return self._scm if self._scm else SCM(self.root_dir, no_scm=no_scm)
Example #30
0
File: project.py Project: rizplate/dvc
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """
        Initialize a dvc project in the given directory.

        A short usage sketch follows this method.

        Args:
            root_dir: Path to the project's root directory.

        Returns:
            Project instance.

        Raises:
            InitError: If `root_dir` is not tracked by a supported SCM
                and `no_scm` is not set.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git).".format(
                root_dir)
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj
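
    # A minimal sketch of bootstrapping a project with `init` above (the
    # path and file name are illustrative):
    #
    #     proj = Project.init('/tmp/myproject')
    #     stage = proj.add('data.csv')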

    def _ignore(self):
        flist = [
            self.link_state.state_file, self.link_state._lock_file.lock_file,
            self.lock.lock_file, self.config.config_local_file,
            self.updater.updater_file
        ]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=None,
            outs=None,
            outs_no_cache=None,
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        # NOTE: None defaults instead of mutable lists avoid the shared
        # default-argument pitfall.
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs or [],
                            outs_no_cache=outs_no_cache or [],
                            deps=deps or [])
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self, target=None):
        if target:
            if not Stage.is_stage_file(target):
                raise StageNotFoundError(target)
            stages = [Stage.load(self, target)]
        else:
            self.link_state.remove_all()
            stages = self.stages()

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if out.path_info['scheme'] != 'local':
                    continue

                if not out.use_cache or not out.cache:
                    continue

                cache_set.add(out.cache)
                if self.cache.local.is_dir_cache(out.cache) and os.path.isfile(
                        out.cache):
                    dir_cache = self.cache.local.dir_cache(out.cache)
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
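        # Delete every local cache file that is no longer referenced by
        # any stage output.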
        clist = self._used_cache()
        for cache in self.cache.local.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for cache_path, ret in self.cloud.status(self._used_cache(target),
                                                 jobs,
                                                 remote=remote):
            if ret in (cloud.STATUS_UNKNOWN, cloud.STATUS_OK):
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(cache_path, self.cache.local.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self,
                     path,
                     json_path=None,
                     tsv_path=None,
                     htsv_path=None):
        ret = None
        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics(self, path, json_path=None, tsv_path=None, htsv_path=None):
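        # Read the metric on every branch by checking each one out in
        # turn, then restore the originally active branch.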
        res = {}
        saved = self.scm.active_branch()
        for branch in self.scm.list_branches():
            self.scm.checkout(branch)
            self.checkout()
            res[branch] = self._read_metric(path,
                                            json_path=json_path,
                                            tsv_path=tsv_path,
                                            htsv_path=htsv_path)
        self.scm.checkout(saved)
        self.checkout()
        return res

    def graph(self):
        G = nx.DiGraph()
        stages = self.stages()

        outs_map = {}
        for stage in stages:
            for o in stage.outs:
                outs_map[o.path] = stage

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = outs_map.get(dep.path, None)
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
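
A minimal end-to-end sketch of the Project class above (paths, file
names, and the remote name are illustrative):

proj = Project('/tmp/myproject')          # open an initialized project
proj.add('data.csv')                      # creates data.csv.dvc
proj.run(cmd='python train.py',
         deps=['train.py', 'data.csv'],
         outs=['model.pkl'])              # run and record a pipeline stage
proj.push(remote='myremote')              # upload used cache entries
print(proj.status())                      # per-stage local status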