Example #1
File: addurls.py  Project: datalad/datalad
def get_file_parts(filename, prefix="name"):
    """Assign a name to various parts of a file.

    Parameters
    ----------
    filename : str
        A file name (no leading path is permitted).
    prefix : str
        Prefix to prepend to the key names.

    Returns
    -------
    A dict mapping each part to a value.
    """
    root, ext = split_ext(filename)
    root_py, ext_py = os.path.splitext(filename)

    return {prefix: filename,
            prefix + "_root": root,
            prefix + "_ext": ext,
            prefix + "_root_py": root_py,
            prefix + "_ext_py": ext_py}
Example #2
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across two datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    *ds: Datasets
    relpath: str
        path within datasets

    Returns
    -------
    bool or None
      True if identical, False if not, None if cannot be decided
      (e.g. different git-annex backend used)
    """
    import os.path as op

    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.digests import Digester
    from datalad.utils import all_same, md5sum, unique
    # split_ext is the helper shown in Example #1 (assumed importable from addurls)
    from datalad.plugin.addurls import split_ext

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- the file exists in all datasets and the
    # content is the same.  Even if the "content" is just a symlink file on
    # Windows, the same-content condition remains correct.
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # First find the problematic ones -- annexed but without content locally --
    # and take note of their keys and backends
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            if 'key' in annexprops:
                key = annexprops['key']
                # For now the rest (e.g. not tracked) remains an error
                if not annexprops['has_content']:
                    present = False
                    backends.append(repo.get_key_backend(key))
        # Always append so these lists stay aligned with `paths`
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or otherwise signal that we just need to get at least some
        # of those files in order to do the check...
        raise NeedContentError(
            "The following paths are missing content and use different annex "
            "backends: %s. Cannot determine here whether they are the same!"
            % ", ".join(p for p, present in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    if backend.endswith('e'):
        # Strip the extension-aware backend marker (e.g. sha256e -> sha256)
        backend = backend[:-1]

    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    checksums = [
        # The checksum is the portion of the key after '--', with any
        # preserved extension (the *E backends) split off first
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
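
The checksum extraction above relies on the layout of git-annex keys, e.g. SHA256E-s4--<hexdigest>.txt: the part after the first "--" is the content digest, and for the extension-preserving *E backends it also carries the file extension, which is why split_ext is applied first. A standalone sketch of that parsing (the key below is constructed for illustration):

# Illustrative git-annex key for a 4-byte file under the SHA256E backend
key = "SHA256E-s4--9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08.txt"
backend = key.split("-", 1)[0]           # "SHA256E"
digest = key.split("--", 1)[1]           # digest, plus extension for *E backends
if backend.endswith("E"):
    digest = digest.rsplit(".", 1)[0]    # crude extension strip, for the sketch only
print(backend, digest)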
Example #3
File: aggregate.py  Project: hanke/datalad
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across two datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    *ds: Datasets
    relpath: str
        path within datasets

    Returns
    -------
    bool or None
      True if identical, False if not, None if cannot be decided
      (e.g. different git-annex backend used)
    """
    import os.path as op

    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.digests import Digester
    from datalad.support.exceptions import FileInGitError
    from datalad.utils import all_same, md5sum, unique
    # split_ext is the helper shown in Example #1 (assumed importable from addurls)
    from datalad.plugin.addurls import split_ext

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- the file exists in all datasets and the
    # content is the same.  Even if the "content" is just a symlink file on
    # Windows, the same-content condition remains correct.
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # First find the problematic ones -- annexed but without content locally --
    # and take note of their keys and backends
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            try:
                key = repo.get_file_key(relpath)
                if not key:
                    raise ValueError(
                        "Must have got a key, unexpectedly got %r for %s within %s"
                        % (key, relpath, ds)
                    )
            except FileInGitError:
                # Tracked directly by git -- no annex key; keep key=None so the
                # lists below stay aligned with `paths`
                key = None
            if key:
                # For now the rest (e.g. not tracked) remains an error
                if not repo.file_has_content(relpath):
                    present = False
                    backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or otherwise signal that we just need to get at least some
        # of those files in order to do the check...
        raise NeedContentError(
            "The following paths are missing content and use different annex "
            "backends: %s. Cannot determine here whether they are the same!"
            % ", ".join(p for p, present in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    if backend.endswith('e'):
        # Strip the extension-aware backend marker (e.g. sha256e -> sha256)
        backend = backend[:-1]

    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    checksums = [
        # The checksum is the portion of the key after '--', with any
        # preserved extension (the *E backends) split off first
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    return False
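
A hypothetical invocation of the function above; the dataset locations and the relative path are made up. Per the docstring, a caller that wants the bool-or-None contract can map the "cannot decide" error onto None:

# Hypothetical usage; dataset paths and the file path are invented
from datalad.api import Dataset

ds_a = Dataset("/data/study-a")
ds_b = Dataset("/data/study-b")
try:
    same = _the_same_across_datasets("sub-01/anat.nii.gz", ds_a, ds_b)
except RuntimeError:
    # raised when content is absent and backends differ or are unsupported
    same = None
print(same)  # True, False, or None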