def _collect_dir(self, path_info):
    dir_info = []

    p_info = copy(path_info)
    dpath = p_info.path

    for root, dirs, files in self.walk(path_info):
        if len(files) > LARGE_DIR_SIZE:
            msg = (
                "Computing md5 for a large directory {}. "
                "This is only done once."
            )
            relpath = self.ospath.relpath(root)
            logger.info(msg.format(relpath))
            files = progress(files, name=relpath)

        for fname in files:
            path = self.ospath.join(root, fname)
            p_info.path = path
            relpath = self.to_posixpath(self.ospath.relpath(path, dpath))
            checksum = self.get_file_checksum(p_info)
            dir_info.append(
                {
                    self.PARAM_RELPATH: relpath,
                    self.PARAM_CHECKSUM: checksum,
                }
            )

    # NOTE: sorting the list by path to ensure reproducibility
    return sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

def _collect_dir(self, path_info):
    dir_info = []

    for root, dirs, files in self.walk(path_info):
        if len(files) > LARGE_DIR_SIZE:
            msg = (
                "Computing md5 for a large directory {}. "
                "This is only done once."
            )
            title = str(self.path_cls(root))
            logger.info(msg.format(title))
            files = progress(files, name=title)

        for fname in files:
            file_info = self.path_cls(root) / fname
            relative_path = file_info.relative_to(path_info)
            dir_info.append(
                {
                    # NOTE: this is a lossy transformation:
                    #   "hey\there" -> "hey/there"
                    #   "hey/there" -> "hey/there"
                    # The latter is a valid filename on Windows, which
                    # will transform to dir/file on the back transform.
                    #
                    # Yes, this is a BUG, as long as we permit "/" in
                    # filenames on Windows and "\" on Unix.
                    self.PARAM_RELPATH: relative_path.as_posix(),
                    self.PARAM_CHECKSUM: self.get_file_checksum(file_info),
                }
            )

    # NOTE: sorting the list by path to ensure reproducibility
    return sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

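# A minimal standalone illustration (assuming standard pathlib semantics,
# not DVC's actual path_cls) of the lossy as_posix() behaviour the NOTE
# above describes: a Windows path written with "\" and one written with
# "/" serialize to the same string, so the original separator cannot be
# recovered on the back transform.
from pathlib import PureWindowsPath

print(PureWindowsPath("hey\\there").as_posix())  # hey/there
print(PureWindowsPath("hey/there").as_posix())   # hey/there
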
def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info): self.makedirs(unpacked_dir_info) for entry in progress(dir_info, name="Created unpacked dir"): entry_cache_info = self.checksum_to_path_info( entry[self.PARAM_CHECKSUM]) relpath = entry[self.PARAM_RELPATH] self.link(entry_cache_info, unpacked_dir_info / relpath, "hardlink") self.state.save(unpacked_dir_info, checksum)
def _calculate_checksums(self, file_infos):
    file_infos = list(file_infos)
    with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor:
        tasks = executor.map(self.get_file_checksum, file_infos)

        if len(file_infos) > LARGE_DIR_SIZE:
            msg = (
                "Computing md5 for a large number of files. "
                "This is only done once."
            )
            logger.info(msg)
            tasks = progress(tasks, total=len(file_infos))

        checksums = {
            file_infos[index]: task for index, task in enumerate(tasks)
        }
    return checksums

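# A standalone sketch (hypothetical helper and paths, not DVC's API) of
# the pattern used in _calculate_checksums: ThreadPoolExecutor.map yields
# results in input order, so pairing each input with its result by
# position is safe.
from concurrent.futures import ThreadPoolExecutor
import hashlib

def file_md5(path):
    # stand-in for self.get_file_checksum
    md5 = hashlib.md5()
    with open(path, "rb") as fobj:
        for chunk in iter(lambda: fobj.read(1024 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest()

paths = ["data/a.txt", "data/b.txt"]  # hypothetical inputs
with ThreadPoolExecutor(max_workers=4) as executor:
    checksums = dict(zip(paths, executor.map(file_md5, paths)))
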
def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info): self.makedirs(unpacked_dir_info) for entry in progress(dir_info, name="Created unpacked dir"): entry_cache_info = self.checksum_to_path_info( entry[self.PARAM_CHECKSUM]) relative_path = entry[self.PARAM_RELPATH] # In shared cache mode some cache files might not be owned by the # user, so we need to use symlinks because, unless # /proc/sys/fs/protected_hardlinks is disabled, the user is not # allowed to create hardlinks to files that he doesn't own. link_types = ["hardlink", "symlink"] self._link(entry_cache_info, unpacked_dir_info / relative_path, link_types) self.state.save(unpacked_dir_info, checksum)
def _collect_dir(self, path_info):
    dir_info = {}

    with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor:
        for root, _dirs, files in self.walk(path_info):
            root_info = path_info / root

            for fname in files:
                if fname == DvcIgnore.DVCIGNORE_FILE:
                    raise DvcIgnoreInCollectedDirError(root)

                file_info = root_info / fname
                relative_path = file_info.relative_to(path_info)
                checksum = executor.submit(
                    self.get_file_checksum, file_info
                )
                dir_info[checksum] = {
                    # NOTE: this is a lossy transformation:
                    #   "hey\there" -> "hey/there"
                    #   "hey/there" -> "hey/there"
                    # The latter is a valid filename on Windows, which
                    # will transform to dir/file on the back transform.
                    #
                    # Yes, this is a BUG, as long as we permit "/" in
                    # filenames on Windows and "\" on Unix.
                    self.PARAM_RELPATH: relative_path.as_posix()
                }

        checksums = as_completed(dir_info)
        if len(dir_info) > LARGE_DIR_SIZE:
            msg = (
                "Computing md5 for a large number of files. "
                "This is only done once."
            )
            logger.info(msg)
            checksums = progress(checksums, total=len(dir_info))

        # NOTE: resolving futures
        for checksum in checksums:
            entry = dir_info[checksum]
            entry[self.PARAM_CHECKSUM] = checksum.result()

    # NOTE: sorting the list by path to ensure reproducibility
    return sorted(dir_info.values(), key=itemgetter(self.PARAM_RELPATH))

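# A standalone sketch (hypothetical names) of the submit/as_completed
# pattern used above: keep a mapping from each Future back to its entry,
# then fill in results as the futures finish, in completion order rather
# than submission order.
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_len(word):
    return len(word)  # stand-in for an expensive per-file checksum

words = ["alpha", "beta", "gamma"]
entries = {}
with ThreadPoolExecutor(max_workers=2) as executor:
    for word in words:
        future = executor.submit(slow_len, word)
        entries[future] = {"name": word}
    for future in as_completed(entries):
        entries[future]["length"] = future.result()
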
def _get_plans(self, download, remote, status_info, status):
    cache = []
    path_infos = []
    names = []
    for md5, info in progress(status_info.items(), name="Analysing status"):
        if info["status"] == status:
            cache.append(self.checksum_to_path_info(md5))
            path_infos.append(remote.checksum_to_path_info(md5))
            names.append(info["name"])

    if download:
        to_infos = cache
        from_infos = path_infos
    else:
        to_infos = path_infos
        from_infos = cache

    return from_infos, to_infos, names

def cache_exists(self, md5s):
    return [
        checksum
        for checksum in progress(md5s)
        if not self.changed_cache_file(checksum)
    ]

def cache_exists(self, checksums, jobs=None):
    return [
        checksum
        for checksum in progress(checksums)
        if not self.changed_cache_file(checksum)
    ]