Example #1
import os

from datalad.support.digests import Digester


def get_file_id(rec):
    """Returns a suitable '@id' of a file metadata from a status result

    Prefer a present annex key, but fall back on the Git shasum that is
    always around. Identify the GITSHA as such, and in a similar manner
    to git-annex's style.

    Any ID string is prefixed with 'datalad:' to identify it as a
    DataLad-recognized ID. This prefix is defined in the main JSON-LD
    context definition.
    """
    # Use the annex key when available; otherwise synthesize a key-like ID
    # from the file size and the Git shasum (or a freshly computed SHA1)
    id_ = rec['key'] if 'key' in rec else 'SHA1-s{}--{}'.format(
        rec['bytesize'] if 'bytesize' in rec else
        0 if rec['type'] == 'symlink' else os.stat(rec['path']).st_size,
        rec['gitshasum'] if 'gitshasum' in rec else Digester(
            digests=['sha1'])(rec['path'])['sha1'])
    return 'datalad:{}'.format(id_)
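A minimal usage sketch, assuming a status record shaped like the ones produced by DataLad's status machinery (the record and key below are hypothetical):

rec = {
    'type': 'file',
    'path': '/tmp/ds/data.bin',
    'key': 'MD5E-s1024--0123456789abcdef0123456789abcdef.bin',
}
print(get_file_id(rec))
# -> datalad:MD5E-s1024--0123456789abcdef0123456789abcdef.bin

For a file tracked directly by git (no annex key), the same call falls back to a synthetic 'SHA1-s<size>--<gitshasum>' identifier built from the record's 'bytesize' and 'gitshasum' fields, or from the file on disk if those are absent.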
Example #2
def get_mtimes_and_digests(target_path):
    """Return digests (md5) and mtimes for all the files under target_path"""
    import os
    from os.path import exists, relpath
    from datalad.utils import find_files
    from datalad.support.digests import Digester
    digester = Digester(['md5'])

    # bother only with existing ones for this test, i.e. skip annexed files without content
    target_files = [
        f for f in find_files('.*', topdir=target_path, exclude_vcs=False, exclude_datalad=False)
        if exists(f)
    ]
    # let's leave only relative paths for easier analysis
    target_files_ = [relpath(f, target_path) for f in target_files]

    digests = {frel: digester(f) for f, frel in zip(target_files, target_files_)}
    mtimes = {frel: os.stat(f).st_mtime for f, frel in zip(target_files, target_files_)}
    return digests, mtimes
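A brief usage sketch, assuming target_path points at an existing directory (the path below is hypothetical):

digests, mtimes = get_mtimes_and_digests('/tmp/some-dataset')
for relname in sorted(digests):
    print(relname, digests[relname]['md5'], mtimes[relname])

Each value in digests is the dict returned by Digester(['md5']) for that file, mapping the digest name ('md5') to its hex checksum, while mtimes maps the same relative path to the file's st_mtime.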
Example #3
def _the_same_across_datasets(relpath, *dss):
    """Check if a file (content present locally or not) is identical across datasets

    Compares files by content whenever the content is locally available,
    and falls back to comparing git-annex key checksums otherwise.

    Parameters
    ----------
    relpath: str
        Path of the file within each dataset
    *dss: Dataset
        Datasets to compare

    Returns
    -------
    bool
      True if identical, False if not. Raises an error if it cannot be
      decided (e.g. content is missing and different git-annex backends
      are used).
    """
    import os.path as op
    from datalad.utils import md5sum, unique, all_same
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- all files exist and their content is the same.
    # Even if the "content" is just a symlink file (as on Windows),
    # identical content still implies identical files
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find the problematic ones, which are annexed and
    # have no content locally, and take note of their keys and backends
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            # A file without a key is tracked directly by git; for now the
            # rest (e.g. not tracked at all) remains an error
            if 'key' in annexprops:
                key = annexprops['key']
                if not annexprops['has_content']:
                    present = False
                    backends.append(repo.get_key_backend(key))
        # Keep keys/presents aligned with paths for the zip()-based checks below
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least some
        # of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, b) in zip(paths, presents) if not b)
        )
    backend = backends[0].lower()
    # Extension-aware backends (e.g. MD5E) compute the same digest as their
    # plain counterparts, so strip the trailing 'e' to get the digest name
    if backend.endswith('e'):
        backend = backend[:-1]

    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    # An annex key has the form BACKEND-s<SIZE>--<CHECKSUM>[.<ext>];
    # extract the checksum portion of each available key
    checksums = [
        key.split('--', 1)[1].split('.', 1)[0] if key else key
        for key in keys
    ]
    # Checksums recorded in the keys of files whose content is not present
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if not present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    thechecksum = thechecksum.pop()
    if any(presents):
        # Verify that the content of each locally present file matches the
        # checksum recorded in the keys of the files that are not present
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
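A hedged usage sketch; the datasets and the file name below are hypothetical, and both datasets are assumed to track the same file:

from datalad.api import Dataset

ds_a = Dataset('/tmp/original')
ds_b = Dataset('/tmp/clone')
if _the_same_across_datasets('data.bin', ds_a, ds_b):
    print("data.bin is identical in both datasets")

If content is missing in some datasets and the annex backends differ (or are not among Digester's known digests), the helper raises its internal NeedContentError (a RuntimeError) rather than guessing.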