import os
import os.path as op
from os.path import exists, relpath

from datalad.support.annexrepo import AnnexRepo
from datalad.support.digests import Digester
from datalad.utils import all_same


def get_file_id(rec):
    """Returns a suitable '@id' of a file metadata from a status result

    Prefer a present annex key, but fall back on the Git shasum that is
    always around. Identify the GITSHA as such, in a manner similar to
    git-annex's key style.

    Any ID string is prefixed with 'datalad:' to identify it as a
    DataLad-recognized ID. This prefix is defined in the main JSON-LD
    context definition.
    """
    id_ = rec['key'] if 'key' in rec else 'SHA1-s{}--{}'.format(
        # report size 0 for a symlink, otherwise stat the file
        rec['bytesize'] if 'bytesize' in rec
        else 0 if rec['type'] == 'symlink'
        else os.stat(rec['path']).st_size,
        # prefer a readily available gitshasum, otherwise compute sha1
        rec['gitshasum'] if 'gitshasum' in rec
        else Digester(digests=['sha1'])(rec['path'])['sha1'])
    return 'datalad:{}'.format(id_)
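
# A minimal usage sketch (illustrative helper, not part of DataLad; the
# records are hypothetical, not the output of an actual status call): an
# annexed file is identified by its annex key, while a git-tracked symlink
# without a key falls back on a git-annex-style SHA1 ID.
def _demo_get_file_id():
    annexed = {'key': 'MD5E-s4--d3b07384d113edec49eaa6238ad5ff00.txt'}
    symlinked = {'type': 'symlink',
                 'gitshasum': 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'}
    assert get_file_id(annexed) == \
        'datalad:MD5E-s4--d3b07384d113edec49eaa6238ad5ff00.txt'
    assert get_file_id(symlinked) == \
        'datalad:SHA1-s0--e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'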
def get_mtimes_and_digests(target_path):
    """Return digests (md5) and mtimes for all the files under target_path"""
    from datalad.utils import find_files
    digester = Digester(['md5'])

    # bother only with existing ones for this test, i.e. skip annexed
    # files without content
    target_files = [
        f for f in find_files('.*',
                              topdir=target_path,
                              exclude_vcs=False,
                              exclude_datalad=False)
        if exists(f)
    ]
    # let's leave only relative paths for easier analysis
    target_files_ = [relpath(f, target_path) for f in target_files]
    digests = {frel: digester(f)
               for f, frel in zip(target_files, target_files_)}
    mtimes = {frel: os.stat(f).st_mtime
              for f, frel in zip(target_files, target_files_)}
    return digests, mtimes
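
# A minimal usage sketch (illustrative helper, not part of DataLad):
# snapshot a throwaway directory and inspect the returned mappings, which
# are keyed by paths relative to target_path.
def _demo_get_mtimes_and_digests():
    import tempfile
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, 'data.txt'), 'w') as f:
            f.write('hello')
        digests, mtimes = get_mtimes_and_digests(tmpdir)
        # e.g. {'data.txt': {'md5': '5d41402abc4b2a76b9719d911017c592'}}
        assert digests['data.txt']['md5'] == \
            '5d41402abc4b2a76b9719d911017c592'
        assert isinstance(mtimes['data.txt'], float)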
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (with content present or not) is identical across datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    relpath: str
      Path within the datasets
    *dss: Dataset
      Datasets to compare across

    Returns
    -------
    bool or None
      True if identical, False if not, None if it cannot be decided
      (e.g. different git-annex backends used)
    """
    from datalad.utils import md5sum, unique

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- all exist and content is the same.
    # Even if content is just a symlink file on Windows, the same-content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find the problematic ones, which are annexed and
    # have no content locally, and inspect their keys
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            if 'key' not in annexprops:
                # For now the rest (e.g. not tracked) remains an error
                continue
            key = annexprops['key']
            if not annexprops['has_content']:
                present = False
                # the backend is the leading component of the annex key,
                # e.g. 'MD5E' in 'MD5E-s4--<checksum>.dat'
                backends.append(key.split('-', 1)[0])
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, \
        "Since not all are present, some must be under annex, " \
        "and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at
        # least some of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(
                p for (p, present) in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    if backend.endswith('e'):
        # strip the extension-aware marker, e.g. 'md5e' -> 'md5'
        backend = backend[:-1]
    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to perform content check for backend %s"
            % backend
        )
    # the checksum is the portion of the key after '--', with any file
    # extension (as carried by the *E backends) stripped off
    checksums = [
        op.splitext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if present
    )
    if len(thechecksum) > 1:
        # Different checksums (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError(
            "We must have had at least one key, since prior logic "
            "showed that not all files have content here")
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to compute the checksum of each present file's
        # content and compare it to the one extracted from the key
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
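
# A usage sketch (illustrative helper, not part of DataLad; assumes a
# working DataLad and git-annex installation): two fresh datasets receive
# an identical file under the same relative path, so the comparison should
# succeed via the md5 fast path.
def _demo_the_same_across_datasets():
    import tempfile
    from datalad.api import Dataset
    with tempfile.TemporaryDirectory() as tmp:
        dss = []
        for name in ('a', 'b'):
            ds = Dataset(op.join(tmp, name)).create()
            with open(op.join(ds.path, 'file.dat'), 'w') as f:
                f.write('same content')
            ds.save()
            dss.append(ds)
        assert _the_same_across_datasets('file.dat', *dss)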