Example #1
File: base.py  Project: shaunirwin/dvc
    def __init__(self, repo, config):
        self.repo = repo
        deps_ok = all(self.REQUIRES.values())
        if not deps_ok:
            missing = [k for k, v in self.REQUIRES.items() if v is None]
            url = config.get(Config.SECTION_REMOTE_URL,
                             "{}://".format(self.scheme))
            msg = ("URL '{}' is supported but requires these missing "
                   "dependencies: {}. If you have installed dvc using pip, "
                   "choose one of these options to proceed: \n"
                   "\n"
                   "    1) Install specific missing dependencies:\n"
                   "        pip install {}\n"
                   "    2) Install dvc package that includes those missing "
                   "dependencies: \n"
                   "        pip install dvc[{}]\n"
                   "    3) Install dvc package with all possible "
                   "dependencies included: \n"
                   "        pip install dvc[all]\n"
                   "\n"
                   "If you have installed dvc from a binary package and you "
                   "are still seeing this message, please report it to us "
                   "using https://github.com/iterative/dvc/issues. Thank you!"
                   ).format(url, missing, " ".join(missing), self.scheme)
            raise RemoteMissingDepsError(msg)

        self.protected = False
        self.state = StateBase()

        self._dir_info = {}
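
The REQUIRES mapping that drives this check is filled in by each concrete remote: it maps a dependency name to the imported module, or to None when the import failed. A minimal sketch of that convention (RemoteSSH and paramiko serve purely as an illustration here):

    try:
        import paramiko
    except ImportError:
        paramiko = None

    class RemoteSSH(RemoteBASE):
        scheme = "ssh"
        REQUIRES = {"paramiko": paramiko}

    # all(REQUIRES.values()) is False when any import failed, so __init__
    # raises RemoteMissingDepsError listing the missing package names.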
Example #2
File: base.py  Project: shaunirwin/dvc
class RemoteBASE(object):
    scheme = None
    REGEX = None
    REQUIRES = {}
    JOBS = 4 * cpu_count()

    PARAM_RELPATH = "relpath"
    CHECKSUM_DIR_SUFFIX = ".dir"

    def __init__(self, repo, config):
        self.repo = repo
        deps_ok = all(self.REQUIRES.values())
        if not deps_ok:
            missing = [k for k, v in self.REQUIRES.items() if v is None]
            url = config.get(Config.SECTION_REMOTE_URL,
                             "{}://".format(self.scheme))
            msg = ("URL '{}' is supported but requires these missing "
                   "dependencies: {}. If you have installed dvc using pip, "
                   "choose one of these options to proceed: \n"
                   "\n"
                   "    1) Install specific missing dependencies:\n"
                   "        pip install {}\n"
                   "    2) Install dvc package that includes those missing "
                   "dependencies: \n"
                   "        pip install dvc[{}]\n"
                   "    3) Install dvc package with all possible "
                   "dependencies included: \n"
                   "        pip install dvc[all]\n"
                   "\n"
                   "If you have installed dvc from a binary package and you "
                   "are still seeing this message, please report it to us "
                   "using https://github.com/iterative/dvc/issues. Thank you!"
                   ).format(url, missing, " ".join(missing), self.scheme)
            raise RemoteMissingDepsError(msg)

        self.protected = False
        self.state = StateBase()

        self._dir_info = {}

    def __repr__(self):
        return "{class_name}: '{url}'".format(class_name=type(self).__name__,
                                              url=(self.url or "No url"))

    @staticmethod
    def compat_config(config):
        return config.copy()

    @classmethod
    def supported(cls, config):
        url = config[Config.SECTION_REMOTE_URL]
        return cls.match(url) is not None

    @staticmethod
    def _replace_sep(path, from_sep, to_sep):
        assert not ntpath.isabs(path)
        assert not posixpath.isabs(path)
        return path.replace(from_sep, to_sep)

    @classmethod
    def to_posixpath(cls, path):
        return cls._replace_sep(path, "\\", "/")

    @classmethod
    def to_ntpath(cls, path):
        return cls._replace_sep(path, "/", "\\")

    def to_ospath(self, path):
        if self.ospath == ntpath:
            return self.to_ntpath(path)
        elif self.ospath == posixpath:
            return self.to_posixpath(path)
        assert False

    @classmethod
    def match(cls, url):
        return re.match(cls.REGEX, url)

    def group(self, name):
        m = self.match(self.url)
        if not m:
            return None
        return m.group(name)

    @property
    def cache(self):
        return getattr(self.repo.cache, self.scheme)

    def get_file_checksum(self, path_info):
        raise NotImplementedError

    def _collect_dir(self, path_info):
        dir_info = []

        p_info = copy(path_info)
        dpath = p_info.path
        for root, dirs, files in self.walk(path_info):
            if len(files) > LARGE_DIR_SIZE:
                msg = ("Computing md5 for a large directory {}. "
                       "This is only done once.")
                relpath = self.ospath.relpath(root)
                logger.info(msg.format(relpath))
                files = progress(files, name=relpath)

            for fname in files:
                path = self.ospath.join(root, fname)
                p_info.path = path
                relpath = self.to_posixpath(self.ospath.relpath(path, dpath))

                checksum = self.get_file_checksum(p_info)
                dir_info.append({
                    self.PARAM_RELPATH: relpath,
                    self.PARAM_CHECKSUM: checksum,
                })

        # NOTE: sorting the list by path to ensure reproducibility
        return sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    def get_dir_checksum(self, path_info):
        dir_info = self._collect_dir(path_info)
        checksum, from_info, to_info = self._get_dir_info_checksum(
            dir_info, path_info)
        if self.cache.changed_cache_file(checksum):
            self.cache.move(from_info, to_info)

        self.state.save(path_info, checksum)
        self.state.save(to_info, checksum)

        return checksum

    def _get_dir_info_checksum(self, dir_info, path_info):
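        # Serialize the directory listing to a temporary JSON file, upload
        # it into the cache, and checksum the uploaded file; the ".dir"
        # suffix marks the result as a directory checksum.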
        to_info = copy(path_info)
        to_info.path = self.cache.ospath.join(self.cache.prefix, tmp_fname(""))

        tmp = tempfile.NamedTemporaryFile(delete=False).name
        with open(tmp, "w+") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        from_info = PathLOCAL(path=tmp)
        self.cache.upload([from_info], [to_info], no_progress_bar=True)

        checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
        from_info = copy(to_info)
        to_info.path = self.cache.checksum_to_path(checksum)
        return checksum, from_info, to_info

    def get_dir_cache(self, checksum):
        assert checksum

        dir_info = self._dir_info.get(checksum)
        if dir_info:
            return dir_info

        dir_info = self.load_dir_cache(checksum)
        self._dir_info[checksum] = dir_info
        return dir_info

    def load_dir_cache(self, checksum):
        path_info = self.checksum_to_path_info(checksum)

        fobj = tempfile.NamedTemporaryFile(delete=False)
        path = fobj.name
        to_info = PathLOCAL(path=path)
        self.cache.download([path_info], [to_info], no_progress_bar=True)

        try:
            with open(path, "r") as fobj:
                d = json.load(fobj)
        except ValueError:
            logger.exception("Failed to load dir cache '{}'".format(path_info))
            return []
        finally:
            os.unlink(path)

        if not isinstance(d, list):
            msg = "dir cache file format error '{}' [skipping the file]"
            logger.error(msg.format(os.path.relpath(path)))
            return []

        for info in d:
            info[self.PARAM_RELPATH] = self.to_ospath(info[self.PARAM_RELPATH])

        return d

    @classmethod
    def is_dir_checksum(cls, checksum):
        return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX)

    def get_checksum(self, path_info):
        if not self.exists(path_info):
            return None

        checksum = self.state.get(path_info)
        if checksum:
            return checksum

        if self.isdir(path_info):
            checksum = self.get_dir_checksum(path_info)
        else:
            checksum = self.get_file_checksum(path_info)

        if checksum:
            self.state.save(path_info, checksum)

        return checksum

    def save_info(self, path_info):
        assert path_info.scheme == self.scheme
        return {self.PARAM_CHECKSUM: self.get_checksum(path_info)}

    def changed(self, path_info, checksum_info):
        """Checks if data has changed.

        A file is considered changed if:
            - It doesn't exist on the working directory (was unlinked)
            - Checksum is not computed (saving a new file)
            - The checksum stored in the State is different from the given one
            - There's no file in the cache

        Args:
            path_info: dict with path information.
            checksum_info: dict with the expected checksum for this data.

        Returns:
            bool: True if data has changed, False otherwise.
        """

        logger.debug("checking if '{}'('{}') has changed.".format(
            path_info, checksum_info))

        if not self.exists(path_info):
            logger.debug("'{}' doesn't exist.".format(path_info))
            return True

        checksum = checksum_info.get(self.PARAM_CHECKSUM)
        if checksum is None:
            logger.debug("checksum for '{}' is missing.".format(path_info))
            return True

        if self.changed_cache(checksum):
            logger.debug("cache for '{}'('{}') has changed.".format(
                path_info, checksum))
            return True

        actual = self.save_info(path_info)[self.PARAM_CHECKSUM]
        if checksum != actual:
            logger.debug(
                "checksum '{}'(actual '{}') for '{}' has changed.".format(
                    checksum, actual, path_info))
            return True

        logger.debug("'{}' hasn't changed.".format(path_info))
        return False

    def link(self, from_info, to_info):
        self.copy(from_info, to_info)

    def _save_file(self, path_info, checksum, save_link=True):
        assert checksum

        cache_info = self.checksum_to_path_info(checksum)
        if self.changed_cache(checksum):
            self.move(path_info, cache_info)
        else:
            self.remove(path_info)

        self.link(cache_info, path_info)

        if save_link:
            self.state.save_link(path_info)

        # We need to save the state for both the path and the cache: with
        # reflink or copy cache types, moving the original file would
        # otherwise register as a change on the next executed command and
        # trigger an md5 recalculation.
        self.state.save(path_info, checksum)
        self.state.save(cache_info, checksum)

    def _save_dir(self, path_info, checksum):
        cache_info = self.checksum_to_path_info(checksum)
        dir_info = self.get_dir_cache(checksum)

        entry_info = copy(path_info)
        for entry in dir_info:
            entry_checksum = entry[self.PARAM_CHECKSUM]
            entry_info.path = self.ospath.join(path_info.path,
                                               entry[self.PARAM_RELPATH])

            self._save_file(entry_info, entry_checksum, save_link=False)

        self.state.save_link(path_info)
        self.state.save(cache_info, checksum)
        self.state.save(path_info, checksum)

    def is_empty(self, path_info):
        return False

    def isfile(self, path_info):
        raise NotImplementedError

    def isdir(self, path_info):
        return False

    def walk(self, path_info):
        raise NotImplementedError

    def protect(self, path_info):
        pass

    def save(self, path_info, checksum_info):
        if path_info.scheme != self.scheme:
            raise RemoteActionNotImplemented(
                "save {} -> {}".format(path_info.scheme, self.scheme),
                self.scheme,
            )

        checksum = checksum_info[self.PARAM_CHECKSUM]
        if not self.changed_cache(checksum):
            self._checkout(path_info, checksum)
            return

        self._save(path_info, checksum)

    def _save(self, path_info, checksum):
        to_info = self.checksum_to_path_info(checksum)
        logger.info("Saving '{}' to '{}'.".format(path_info, to_info))
        if self.isdir(path_info):
            self._save_dir(path_info, checksum)
            return
        self._save_file(path_info, checksum)

    def download(
        self,
        from_infos,
        to_infos,
        no_progress_bar=False,
        name=None,
        resume=False,
    ):
        raise RemoteActionNotImplemented("download", self.scheme)

    def upload(self, from_infos, to_infos, names=None, no_progress_bar=False):
        raise RemoteActionNotImplemented("upload", self.scheme)

    def remove(self, path_info):
        raise RemoteActionNotImplemented("remove", self.scheme)

    def move(self, from_info, to_info):
        self.copy(from_info, to_info)
        self.remove(from_info)

    def copy(self, from_info, to_info):
        raise RemoteActionNotImplemented("copy", self.scheme)

    def exists(self, path_info):
        raise NotImplementedError

    @classmethod
    def _verify_path_args(cls, from_infos, to_infos, names=None):
        assert isinstance(from_infos, list)
        assert isinstance(to_infos, list)
        assert len(from_infos) == len(to_infos)

        if not names:
            names = len(to_infos) * [None]
        else:
            assert isinstance(names, list)
            assert len(names) == len(to_infos)

        return names

    @property
    def ospath(self):
        return posixpath

    def checksum_to_path(self, checksum):
        assert checksum
        return self.ospath.join(self.prefix, checksum[0:2], checksum[2:])

    def path_to_checksum(self, path):
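        # Inverse of checksum_to_path: for "<prefix>/ab/cdef..." the dirname
        # ("ab") and the basename ("cdef...") re-join into "abcdef...".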
        relpath = self.ospath.relpath(path, self.prefix)
        return self.ospath.dirname(relpath) + self.ospath.basename(relpath)

    def checksum_to_path_info(self, checksum):
        path_info = copy(self.path_info)
        path_info.path = self.checksum_to_path(checksum)
        return path_info

    def md5s_to_path_infos(self, md5s):
        return [self.checksum_to_path_info(md5) for md5 in md5s]

    def list_cache_paths(self):
        raise NotImplementedError

    def all(self):
        # NOTE: The list might be way too big (e.g. 100M entries at 32 bytes
        # per md5, a ~3200 MB list) and we don't really need all of it at the
        # same time, so it makes sense to use a generator to gradually
        # iterate over it, without keeping all of it in memory.
        return (self.path_to_checksum(path)
                for path in self.list_cache_paths())

    def gc(self, cinfos):
        from dvc.remote.local import RemoteLOCAL

        used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]}

        if self.scheme != "":
            used |= {
                info[self.PARAM_CHECKSUM]
                for info in cinfos.get(self.scheme, [])
            }

        removed = False
        for checksum in self.all():
            if checksum in used:
                continue
            path_info = self.checksum_to_path_info(checksum)
            self.remove(path_info)
            removed = True
        return removed

    def changed_cache_file(self, checksum):
        """Compare the given checksum with the (corresponding) actual one.

        - Use `State` as a cache for computed checksums
            + The entries are invalidated by taking into account the following:
                * mtime
                * inode
                * size
                * checksum

        - Remove the file from cache if it doesn't match the actual checksum
        """
        cache_info = self.checksum_to_path_info(checksum)
        actual = self.get_checksum(cache_info)

        logger.debug("cache '{}' expected '{}' actual '{}'".format(
            str(cache_info), checksum, actual))

        if not checksum or not actual:
            return True

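        # Compare only the part before the first dot, so a ".dir" suffix
        # does not affect the comparison.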
        if actual.split(".")[0] == checksum.split(".")[0]:
            return False

        if self.exists(cache_info):
            logger.warning("corrupted cache file '{}'.".format(cache_info))
            self.remove(cache_info)

        return True

    def _changed_dir_cache(self, checksum):
        if self.changed_cache_file(checksum):
            return True

        for entry in self.get_dir_cache(checksum):
            checksum = entry[self.PARAM_CHECKSUM]
            if self.changed_cache_file(checksum):
                return True

        return False

    def changed_cache(self, checksum):
        if self.is_dir_checksum(checksum):
            return self._changed_dir_cache(checksum)
        return self.changed_cache_file(checksum)

    def cache_exists(self, checksums):
        # NOTE: The reason for such odd logic is that most remotes can
        # retrieve everything they have under a certain prefix (e.g. s3,
        # gs, ssh, hdfs) much faster than they can check files one by one.
        # Remotes that can check whether a particular file exists much
        # quicker use their own implementation of cache_exists (see http,
        # local).
        #
        # Result of all() might be way too big, so we should walk through
        # it in one pass.
        return list(filter(lambda checksum: checksum in checksums, self.all()))

    def already_cached(self, path_info):
        current = self.get_checksum(path_info)

        if not current:
            return False

        return not self.changed_cache(current)

    def safe_remove(self, path_info, force=False):
        if not self.exists(path_info):
            return

        if not force and not self.already_cached(path_info):
            msg = ("file '{}' is going to be removed."
                   " Are you sure you want to proceed?".format(str(path_info)))

            if not prompt.confirm(msg):
                raise ConfirmRemoveError(str(path_info))

        self.remove(path_info)

    def _checkout_file(self,
                       path_info,
                       checksum,
                       force,
                       progress_callback=None):
        cache_info = self.checksum_to_path_info(checksum)
        if self.exists(path_info):
            msg = "data '{}' exists. Removing before checkout."
            logger.warning(msg.format(str(path_info)))
            self.safe_remove(path_info, force=force)

        self.link(cache_info, path_info)
        self.state.save_link(path_info)
        self.state.save(path_info, checksum)
        if progress_callback:
            progress_callback.update(path_info.url)

    def makedirs(self, path_info):
        raise NotImplementedError

    def _checkout_dir(self,
                      path_info,
                      checksum,
                      force,
                      progress_callback=None):
        # Create dir separately so that dir is created
        # even if there are no files in it
        if not self.exists(path_info):
            self.makedirs(path_info)

        dir_info = self.get_dir_cache(checksum)

        logger.debug("Linking directory '{}'.".format(path_info))

        entry_info = copy(path_info)
        for entry in dir_info:
            relpath = entry[self.PARAM_RELPATH]
            entry_checksum = entry[self.PARAM_CHECKSUM]
            entry_cache_info = self.checksum_to_path_info(entry_checksum)
            entry_info.url = self.ospath.join(path_info.url, relpath)
            entry_info.path = self.ospath.join(path_info.path, relpath)

            entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum}
            if self.changed(entry_info, entry_checksum_info):
                if self.exists(entry_info):
                    self.safe_remove(entry_info, force=force)
                self.link(entry_cache_info, entry_info)
                self.state.save(entry_info, entry_checksum)
            if progress_callback:
                progress_callback.update(entry_info.url)

        self._remove_redundant_files(path_info, dir_info, force)

        self.state.save_link(path_info)
        self.state.save(path_info, checksum)

    def _remove_redundant_files(self, path_info, dir_info, force):
        existing_files = set(
            self.ospath.join(root, fname)
            for root, _, files in self.walk(path_info) for fname in files)

        needed_files = set(
            self.ospath.join(path_info.path, entry[self.PARAM_RELPATH])
            for entry in dir_info)

        delta = existing_files - needed_files

        d_info = copy(path_info)
        for path in delta:
            d_info.path = path
            self.safe_remove(d_info, force)

    def checkout(self,
                 path_info,
                 checksum_info,
                 force=False,
                 progress_callback=None):
        if (path_info.scheme not in ["local"]
                and path_info.scheme != self.scheme):
            raise NotImplementedError

        checksum = checksum_info.get(self.PARAM_CHECKSUM)
        if not checksum:
            msg = "No checksum info for '{}'."
            logger.debug(msg.format(str(path_info)))
            return

        if not self.changed(path_info, checksum_info):
            msg = "Data '{}' didn't change."
            logger.debug(msg.format(str(path_info)))
            return

        if self.changed_cache(checksum):
            msg = "Cache '{}' not found. File '{}' won't be created."
            logger.warning(msg.format(checksum, str(path_info)))
            self.safe_remove(path_info, force=force)
            return

        msg = "Checking out '{}' with cache '{}'."
        logger.debug(msg.format(str(path_info), checksum))

        return self._checkout(path_info, checksum, force, progress_callback)

    def _checkout(self,
                  path_info,
                  checksum,
                  force=False,
                  progress_callback=None):
        if not self.is_dir_checksum(checksum):
            return self._checkout_file(path_info,
                                       checksum,
                                       force,
                                       progress_callback=progress_callback)
        return self._checkout_dir(path_info,
                                  checksum,
                                  force,
                                  progress_callback=progress_callback)

    @staticmethod
    def unprotect(path_info):
        pass
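
A note on the cache layout used by checksum_to_path and path_to_checksum above: the cache is sharded by the first two characters of the checksum. A quick round-trip illustration (the prefix and checksum values are made up):

    import posixpath

    prefix = "/cache"
    checksum = "d41d8cd98f00b204e9800998ecf8427e"

    path = posixpath.join(prefix, checksum[0:2], checksum[2:])
    # '/cache/d4/1d8cd98f00b204e9800998ecf8427e'

    relpath = posixpath.relpath(path, prefix)  # 'd4/1d8cd9...'
    restored = posixpath.dirname(relpath) + posixpath.basename(relpath)
    assert restored == checksum

Directory checksums reuse the same layout but carry the ".dir" suffix, which is_dir_checksum tests for and changed_cache_file strips before comparing.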
Example #3
File: base.py  Project: sauravsrijan/dvc
class RemoteBASE(object):
    scheme = "base"
    path_cls = URLInfo
    REQUIRES = {}
    JOBS = 4 * cpu_count()

    PARAM_RELPATH = "relpath"
    CHECKSUM_DIR_SUFFIX = ".dir"

    def __init__(self, repo, config):
        self.repo = repo
        deps_ok = all(self.REQUIRES.values())
        if not deps_ok:
            missing = [k for k, v in self.REQUIRES.items() if v is None]
            url = config.get(Config.SECTION_REMOTE_URL,
                             "{}://".format(self.scheme))
            msg = ("URL '{}' is supported but requires these missing "
                   "dependencies: {}. If you have installed dvc using pip, "
                   "choose one of these options to proceed: \n"
                   "\n"
                   "    1) Install specific missing dependencies:\n"
                   "        pip install {}\n"
                   "    2) Install dvc package that includes those missing "
                   "dependencies: \n"
                   "        pip install dvc[{}]\n"
                   "    3) Install dvc package with all possible "
                   "dependencies included: \n"
                   "        pip install dvc[all]\n"
                   "\n"
                   "If you have installed dvc from a binary package and you "
                   "are still seeing this message, please report it to us "
                   "using https://github.com/iterative/dvc/issues. Thank you!"
                   ).format(url, missing, " ".join(missing), self.scheme)
            raise RemoteMissingDepsError(msg)

        core = config.get(Config.SECTION_CORE, {})
        self.checksum_jobs = core.get(Config.SECTION_CORE_CHECKSUM_JOBS,
                                      CHECKSUM_JOBS)

        self.protected = False
        self.no_traverse = config.get(Config.SECTION_REMOTE_NO_TRAVERSE)
        self.state = StateBase()
        self._dir_info = {}

    def __repr__(self):
        return "{class_name}: '{path_info}'".format(
            class_name=type(self).__name__,
            path_info=self.path_info or "No path",
        )

    @staticmethod
    def compat_config(config):
        return config.copy()

    @classmethod
    def supported(cls, config):
        if isinstance(config, basestring):
            url = config
        else:
            url = config[Config.SECTION_REMOTE_URL]

        # NOTE: unsupported remotes are silently skipped; calling code
        # should handle that
        parsed = urlparse(url)
        return parsed.scheme == cls.scheme

    @property
    def cache(self):
        return getattr(self.repo.cache, self.scheme)

    def get_file_checksum(self, path_info):
        raise NotImplementedError

    def _collect_dir(self, path_info):
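        # dir_info is keyed by the Future returned by executor.submit();
        # each future is resolved into its checksum once all files have
        # been submitted.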
        dir_info = {}

        with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor:
            for root, _dirs, files in self.walk(path_info):
                root_info = path_info / root

                for fname in files:

                    if fname == DvcIgnore.DVCIGNORE_FILE:
                        raise DvcIgnoreInCollectedDirError(root)

                    file_info = root_info / fname
                    relative_path = file_info.relative_to(path_info)
                    checksum = executor.submit(self.get_file_checksum,
                                               file_info)
                    dir_info[checksum] = {
                        # NOTE: this is a lossy transformation:
                        #   "hey\there" -> "hey/there"
                        #   "hey/there" -> "hey/there"
                        # The latter is a valid filename on Windows, but it
                        # will turn into dir/file on the reverse transform.
                        #
                        # Yes, this is a BUG, as long as we permit "/" in
                        # filenames on Windows and "\" on Unix.
                        self.PARAM_RELPATH:
                        relative_path.as_posix()
                    }

            checksums = as_completed(dir_info)
            if len(dir_info) > LARGE_DIR_SIZE:
                msg = ("Computing md5 for a large number of files. "
                       "This is only done once.")
                logger.info(msg)
                checksums = progress(checksums, total=len(dir_info))

            # Resolving futures
            for checksum in checksums:
                entry = dir_info[checksum]
                entry[self.PARAM_CHECKSUM] = checksum.result()

        # Sorting the list by path to ensure reproducibility
        return sorted(dir_info.values(), key=itemgetter(self.PARAM_RELPATH))

    def get_dir_checksum(self, path_info):
        dir_info = self._collect_dir(path_info)
        checksum, tmp_info = self._get_dir_info_checksum(dir_info)
        new_info = self.cache.checksum_to_path_info(checksum)
        if self.cache.changed_cache_file(checksum):
            self.cache.move(tmp_info, new_info)

        self.state.save(path_info, checksum)
        self.state.save(new_info, checksum)

        return checksum

    def _get_dir_info_checksum(self, dir_info):
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        with open(tmp, "w+") as fobj:
            json.dump(dir_info, fobj, sort_keys=True)

        from_info = PathInfo(tmp)
        to_info = self.cache.path_info / tmp_fname("")
        self.cache.upload(from_info, to_info, no_progress_bar=True)

        checksum = self.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX
        return checksum, to_info

    def get_dir_cache(self, checksum):
        assert checksum

        dir_info = self._dir_info.get(checksum)
        if dir_info:
            return dir_info

        dir_info = self.load_dir_cache(checksum)
        self._dir_info[checksum] = dir_info
        return dir_info

    def load_dir_cache(self, checksum):
        path_info = self.checksum_to_path_info(checksum)

        fobj = tempfile.NamedTemporaryFile(delete=False)
        path = fobj.name
        to_info = PathInfo(path)
        self.cache.download(path_info, to_info, no_progress_bar=True)

        try:
            with open(path, "r") as fobj:
                d = json.load(fobj)
        except ValueError:
            logger.exception("Failed to load dir cache '{}'".format(path_info))
            return []
        finally:
            os.unlink(path)

        if not isinstance(d, list):
            msg = "dir cache file format error '{}' [skipping the file]"
            logger.error(msg.format(relpath(path)))
            return []

        for info in d:
            # NOTE: here is a BUG, see comment to .as_posix() below
            relative_path = PathInfo.from_posix(info[self.PARAM_RELPATH])
            info[self.PARAM_RELPATH] = relative_path.fspath

        return d

    @classmethod
    def is_dir_checksum(cls, checksum):
        return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX)

    def get_checksum(self, path_info):
        if not self.exists(path_info):
            return None

        checksum = self.state.get(path_info)

        # If we have a dir checksum in the state db but the dir cache file
        # is lost, we need to recollect the dir via the .get_dir_checksum()
        # call below; see https://github.com/iterative/dvc/issues/2219 for
        # context.
        if (checksum and self.is_dir_checksum(checksum) and
                not self.exists(self.cache.checksum_to_path_info(checksum))):
            checksum = None

        if checksum:
            return checksum

        if self.isdir(path_info):
            checksum = self.get_dir_checksum(path_info)
        else:
            checksum = self.get_file_checksum(path_info)

        if checksum:
            self.state.save(path_info, checksum)

        return checksum

    def save_info(self, path_info):
        assert path_info.scheme == self.scheme
        return {self.PARAM_CHECKSUM: self.get_checksum(path_info)}

    def changed(self, path_info, checksum_info):
        """Checks if data has changed.

        A file is considered changed if:
            - It doesn't exist on the working directory (was unlinked)
            - Checksum is not computed (saving a new file)
            - The checksum stored in the State is different from the given one
            - There's no file in the cache

        Args:
            path_info: dict with path information.
            checksum_info: dict with the expected checksum for this data.

        Returns:
            bool: True if data has changed, False otherwise.
        """

        logger.debug("checking if '{}'('{}') has changed.".format(
            path_info, checksum_info))

        if not self.exists(path_info):
            logger.debug("'{}' doesn't exist.".format(path_info))
            return True

        checksum = checksum_info.get(self.PARAM_CHECKSUM)
        if checksum is None:
            logger.debug("checksum for '{}' is missing.".format(path_info))
            return True

        if self.changed_cache(checksum):
            logger.debug("cache for '{}'('{}') has changed.".format(
                path_info, checksum))
            return True

        actual = self.save_info(path_info)[self.PARAM_CHECKSUM]
        if checksum != actual:
            logger.debug(
                "checksum '{}'(actual '{}') for '{}' has changed.".format(
                    checksum, actual, path_info))
            return True

        logger.debug("'{}' hasn't changed.".format(path_info))
        return False

    def link(self, from_info, to_info, link_type=None):
        self.copy(from_info, to_info)

    def _save_file(self, path_info, checksum, save_link=True):
        assert checksum

        cache_info = self.checksum_to_path_info(checksum)
        if self.changed_cache(checksum):
            self.move(path_info, cache_info)
        else:
            self.remove(path_info)

        self.link(cache_info, path_info)

        if save_link:
            self.state.save_link(path_info)

        # We need to save the state for both the path and the cache: with
        # reflink or copy cache types, moving the original file would
        # otherwise register as a change on the next executed command and
        # trigger an md5 recalculation.
        self.state.save(path_info, checksum)
        self.state.save(cache_info, checksum)

    def _save_dir(self, path_info, checksum):
        cache_info = self.checksum_to_path_info(checksum)
        dir_info = self.get_dir_cache(checksum)

        for entry in dir_info:
            entry_info = path_info / entry[self.PARAM_RELPATH]
            entry_checksum = entry[self.PARAM_CHECKSUM]
            self._save_file(entry_info, entry_checksum, save_link=False)

        self.state.save_link(path_info)
        self.state.save(cache_info, checksum)
        self.state.save(path_info, checksum)

    def is_empty(self, path_info):
        return False

    def isfile(self, path_info):
        raise NotImplementedError

    def isdir(self, path_info):
        return False

    def walk(self, path_info):
        raise NotImplementedError

    @staticmethod
    def protect(path_info):
        pass

    def save(self, path_info, checksum_info):
        if path_info.scheme != self.scheme:
            raise RemoteActionNotImplemented(
                "save {} -> {}".format(path_info.scheme, self.scheme),
                self.scheme,
            )

        checksum = checksum_info[self.PARAM_CHECKSUM]
        if not self.changed_cache(checksum):
            self._checkout(path_info, checksum)
            return

        self._save(path_info, checksum)

    def _save(self, path_info, checksum):
        to_info = self.checksum_to_path_info(checksum)
        logger.info("Saving '{}' to '{}'.".format(path_info, to_info))
        if self.isdir(path_info):
            self._save_dir(path_info, checksum)
            return
        self._save_file(path_info, checksum)

    def upload(self, from_info, to_info, name=None, no_progress_bar=False):
        if not hasattr(self, "_upload"):
            raise RemoteActionNotImplemented("upload", self.scheme)

        if to_info.scheme != self.scheme:
            raise NotImplementedError

        if from_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(from_info, to_info))

        name = name or from_info.name

        if not no_progress_bar:
            progress.update_target(name, 0, None)

        try:
            self._upload(
                from_info.fspath,
                to_info,
                name=name,
                no_progress_bar=no_progress_bar,
            )
        except Exception:
            msg = "failed to upload '{}' to '{}'"
            logger.exception(msg.format(from_info, to_info))
            return 1  # 1 fail

        if not no_progress_bar:
            progress.finish_target(name)

        return 0

    def download(self, from_info, to_info, name=None, no_progress_bar=False):
        if not hasattr(self, "_download"):
            raise RemoteActionNotImplemented("download", self.scheme)

        if from_info.scheme != self.scheme:
            raise NotImplementedError

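        # A transfer within the same non-local scheme is a remote-side copy.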
        if to_info.scheme == self.scheme != "local":
            self.copy(from_info, to_info)
            return 0

        if to_info.scheme != "local":
            raise NotImplementedError

        logger.debug("Downloading '{}' to '{}'".format(from_info, to_info))

        name = name or to_info.name

        if not no_progress_bar:
            # Real progress is not always available,
            # so let's at least show start and finish.
            progress.update_target(name, 0, None)

        makedirs(fspath_py35(to_info.parent), exist_ok=True)
        tmp_file = tmp_fname(to_info)

        try:
            self._download(from_info,
                           tmp_file,
                           name=name,
                           no_progress_bar=no_progress_bar)
        except Exception:
            msg = "failed to download '{}' to '{}'"
            logger.exception(msg.format(from_info, to_info))
            return 1  # 1 fail

        move(tmp_file, fspath_py35(to_info))

        if not no_progress_bar:
            progress.finish_target(name)

        return 0

    def remove(self, path_info):
        raise RemoteActionNotImplemented("remove", self.scheme)

    def move(self, from_info, to_info):
        self.copy(from_info, to_info)
        self.remove(from_info)

    def copy(self, from_info, to_info):
        raise RemoteActionNotImplemented("copy", self.scheme)

    def exists(self, path_info):
        raise NotImplementedError

    def path_to_checksum(self, path):
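        # The last two path parts are the 2-char shard and the remainder,
        # e.g. ".../d4/1d8cd9..." -> "d41d8cd9...".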
        return "".join(self.path_cls(path).parts[-2:])

    def checksum_to_path_info(self, checksum):
        return self.path_info / checksum[0:2] / checksum[2:]

    def list_cache_paths(self):
        raise NotImplementedError

    def all(self):
        # NOTE: The list might be way too big (e.g. 100M entries at 32 bytes
        # per md5, a ~3200 MB list) and we don't really need all of it at the
        # same time, so it makes sense to use a generator to gradually
        # iterate over it, without keeping all of it in memory.
        return (self.path_to_checksum(path)
                for path in self.list_cache_paths())

    def gc(self, cinfos):
        used = self.extract_used_local_checksums(cinfos)

        if self.scheme != "":
            used |= {
                info[self.PARAM_CHECKSUM]
                for info in cinfos.get(self.scheme, [])
            }

        removed = False
        for checksum in self.all():
            if checksum in used:
                continue
            path_info = self.checksum_to_path_info(checksum)
            self.remove(path_info)
            removed = True
        return removed

    def changed_cache_file(self, checksum):
        """Compare the given checksum with the (corresponding) actual one.

        - Use `State` as a cache for computed checksums
            + The entries are invalidated by taking into account the following:
                * mtime
                * inode
                * size
                * checksum

        - Remove the file from cache if it doesn't match the actual checksum
        """
        cache_info = self.checksum_to_path_info(checksum)
        actual = self.get_checksum(cache_info)

        logger.debug("cache '{}' expected '{}' actual '{}'".format(
            str(cache_info), checksum, actual))

        if not checksum or not actual:
            return True

        if actual.split(".")[0] == checksum.split(".")[0]:
            return False

        if self.exists(cache_info):
            logger.warning("corrupted cache file '{}'.".format(cache_info))
            self.remove(cache_info)

        return True

    def _changed_dir_cache(self, checksum):
        if self.changed_cache_file(checksum):
            return True

        if not self._changed_unpacked_dir(checksum):
            return False

        for entry in self.get_dir_cache(checksum):
            entry_checksum = entry[self.PARAM_CHECKSUM]
            if self.changed_cache_file(entry_checksum):
                return True

        self._update_unpacked_dir(checksum)
        return False

    def changed_cache(self, checksum):
        if self.is_dir_checksum(checksum):
            return self._changed_dir_cache(checksum)
        return self.changed_cache_file(checksum)

    def cache_exists(self, checksums):
        """Check if the given checksums are stored in the remote.

        There are two ways of performing this check:

        - Traverse: Get a list of all the files in the remote
            (traversing the cache directory) and compare it with
            the given checksums.

        - No traverse: For each given checksum, run the `exists`
            method and filter the checksums that aren't on the remote.
            This is done in parallel threads.
            It also shows a progress bar when performing the check.

        The reason for such odd logic is that most remotes can retrieve
        everything they have under a certain prefix (e.g. s3, gs, ssh,
        hdfs) much faster than they can check files one by one. Remotes
        that can check whether a particular file exists much quicker use
        their own implementation of cache_exists (see http, local).

        Returns:
            A list with checksums that were found in the remote
        """
        progress_callback = ProgressCallback(len(checksums))

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=progress_callback)

        if self.no_traverse and hasattr(self, "batch_exists"):
            with ThreadPoolExecutor(max_workers=self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                progress_callback.finish("")
                return ret

        return list(set(checksums) & set(self.all()))

    def already_cached(self, path_info):
        current = self.get_checksum(path_info)

        if not current:
            return False

        return not self.changed_cache(current)

    def safe_remove(self, path_info, force=False):
        if not self.exists(path_info):
            return

        if not force and not self.already_cached(path_info):
            msg = ("file '{}' is going to be removed."
                   " Are you sure you want to proceed?".format(str(path_info)))

            if not prompt.confirm(msg):
                raise ConfirmRemoveError(str(path_info))

        self.remove(path_info)

    def _checkout_file(self,
                       path_info,
                       checksum,
                       force,
                       progress_callback=None):
        cache_info = self.checksum_to_path_info(checksum)
        if self.exists(path_info):
            msg = "data '{}' exists. Removing before checkout."
            logger.warning(msg.format(str(path_info)))
            self.safe_remove(path_info, force=force)

        self.link(cache_info, path_info)
        self.state.save_link(path_info)
        self.state.save(path_info, checksum)
        if progress_callback:
            progress_callback.update(str(path_info))

    def makedirs(self, path_info):
        raise NotImplementedError

    def _checkout_dir(self,
                      path_info,
                      checksum,
                      force,
                      progress_callback=None):
        # Create dir separately so that dir is created
        # even if there are no files in it
        if not self.exists(path_info):
            self.makedirs(path_info)

        dir_info = self.get_dir_cache(checksum)

        logger.debug("Linking directory '{}'.".format(path_info))

        for entry in dir_info:
            relative_path = entry[self.PARAM_RELPATH]
            entry_checksum = entry[self.PARAM_CHECKSUM]
            entry_cache_info = self.checksum_to_path_info(entry_checksum)
            entry_info = path_info / relative_path

            entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum}
            if self.changed(entry_info, entry_checksum_info):
                if self.exists(entry_info):
                    self.safe_remove(entry_info, force=force)
                self.link(entry_cache_info, entry_info)
                self.state.save(entry_info, entry_checksum)
            if progress_callback:
                progress_callback.update(str(entry_info))

        self._remove_redundant_files(path_info, dir_info, force)

        self.state.save_link(path_info)
        self.state.save(path_info, checksum)

    def _remove_redundant_files(self, path_info, dir_info, force):
        existing_files = set(path_info / root / fname
                             for root, _, files in self.walk(path_info)
                             for fname in files)

        needed_files = {
            path_info / entry[self.PARAM_RELPATH]
            for entry in dir_info
        }

        for path in existing_files - needed_files:
            self.safe_remove(path, force)

    def checkout(self,
                 path_info,
                 checksum_info,
                 force=False,
                 progress_callback=None):
        if path_info.scheme not in ["local", self.scheme]:
            raise NotImplementedError

        checksum = checksum_info.get(self.PARAM_CHECKSUM)
        if not checksum:
            logger.warning("No checksum info found for '{}'. "
                           "It won't be created.".format(str(path_info)))
            self.safe_remove(path_info, force=force)
            return

        if not self.changed(path_info, checksum_info):
            msg = "Data '{}' didn't change."
            logger.debug(msg.format(str(path_info)))
            return

        if self.changed_cache(checksum):
            msg = "Cache '{}' not found. File '{}' won't be created."
            logger.warning(msg.format(checksum, str(path_info)))
            self.safe_remove(path_info, force=force)
            return

        msg = "Checking out '{}' with cache '{}'."
        logger.debug(msg.format(str(path_info), checksum))

        return self._checkout(path_info, checksum, force, progress_callback)

    def _checkout(self,
                  path_info,
                  checksum,
                  force=False,
                  progress_callback=None):
        if not self.is_dir_checksum(checksum):
            return self._checkout_file(path_info,
                                       checksum,
                                       force,
                                       progress_callback=progress_callback)
        return self._checkout_dir(path_info,
                                  checksum,
                                  force,
                                  progress_callback=progress_callback)

    @staticmethod
    def unprotect(path_info):
        pass

    def _get_unpacked_dir_names(self, checksums):
        return set()

    def extract_used_local_checksums(self, cinfos):
        from dvc.remote import RemoteLOCAL

        used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]}
        unpacked = self._get_unpacked_dir_names(used)
        return used | unpacked

    def _changed_unpacked_dir(self, checksum):
        return True

    def _update_unpacked_dir(self, checksum, progress_callback=None):
        pass
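
The no_traverse branch of cache_exists is easier to see in isolation: chunk the candidate checksums, query each chunk for existence in a thread pool, then keep the checksums whose flag came back set. A self-contained sketch of that pattern (batch_exists and remote_contents are stand-ins, not the dvc API):

    import itertools
    from concurrent.futures import ThreadPoolExecutor

    JOBS = 4
    remote_contents = {"aa", "cc"}  # stand-in for what the remote stores

    def batch_exists(chunk):
        # One existence flag per checksum, preserving chunk order.
        return [checksum in remote_contents for checksum in chunk]

    def cache_exists(checksums):
        # Contiguous chunks keep the flags aligned with `checksums`.
        size = max(1, len(checksums) // JOBS)
        chunks = [checksums[i:i + size]
                  for i in range(0, len(checksums), size)]
        with ThreadPoolExecutor(max_workers=JOBS) as executor:
            results = executor.map(batch_exists, chunks)
            in_remote = itertools.chain.from_iterable(results)
            # compress() pairs each checksum with its flag, keeping hits.
            return list(itertools.compress(checksums, in_remote))

    print(cache_exists(["aa", "bb", "cc"]))  # ['aa', 'cc']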