Example #1
def test_backup_archive(path, objtree, archivremote):
    """Similar to test_archive_layout(), but not focused on
    compatibility with the directory-type special remote. Instead,
    it tests building a second RIA remote from an existing one, e.g.
    for backup purposes.
    """
    ds = create(path)
    setup_archive_remote(ds.repo, objtree)
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # copy files into the RIA archive
    ds.repo.copy_to('.', 'archive')

    targetpath = Path(archivremote) / ds.id[:3] / ds.id[3:] / 'archives'
    targetpath.mkdir(parents=True)
    subprocess.run(
        ['7z', 'u', str(targetpath / 'archive.7z'), '.'],
        cwd=str(Path(objtree) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects'),
    )
    initexternalremote(ds.repo,
                       '7z',
                       'ria',
                       config={'base-path': archivremote})
    # wipe out the initial RIA remote (just for testing if the upcoming
    # one can fully take over)
    shutil.rmtree(objtree)
    # fsck to make git-annex aware of the loss
    assert_status('error', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='archive', fast=True)
    ])
    # now only available "here"
    eq_(len(ds.repo.whereis('one.txt')), 1)

    # make the backup archive known
    initexternalremote(ds.repo,
                       'backup',
                       'ria',
                       config={'base-path': archivremote})
    # now fsck the new remote to get the new special remote indexed
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='backup', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])
Example #2
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from unittest.mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        with patch.object(ds.repo, "always_commit", False):
            res = ds.repo.add(filename)
            res_status = 'notneeded' if not res \
                else 'ok' if res.get('success', False) \
                else 'error'

            yield dict(
                action='add',
                # decorator dies with Path()
                path=str(ds.pathobj / filename),
                type='file',
                status=res_status,
                parentds=ds.path,
            )

            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
Example #3
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        with patch.object(ds.repo, "always_commit", False):
            res = ds.repo.add(filename)
            res_status = 'notneeded' if not res \
                else 'ok' if res.get('success', False) \
                else 'error'

            yield dict(
                action='add',
                # decorator dies with Path()
                path=text_type(ds.pathobj / filename),
                type='file',
                status=res_status,
                parentds=ds.path,
            )

            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
Example #4
def test_annexjson2result(dspath):
    # no explicit success means 'error'
    eq_(annexjson2result(dict(), None), dict(status='error'))
    # unrecognized -> error
    eq_(annexjson2result(dict(success='random'), None), dict(status='error'))
    # success is possible ;-)
    eq_(annexjson2result(dict(success=True), None), dict(status='ok'))

    # path handling
    # needs a dataset
    ds = Dataset(dspath)
    eq_(annexjson2result(dict(file='file1'), ds),
        dict(status='error', path=str(ds.pathobj / 'file1')))
    # on all platforms, paths are reported in platform conventions
    # although git-annex reports in posix
    eq_(annexjson2result(dict(file='dir1/file1'), ds),
        dict(status='error', path=str(ds.pathobj / 'dir1' / 'file1')))
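For orientation: the test above pins down the basic contract of annexjson2result, so here is a minimal sketch of that call pattern. The import location (datalad.interface.results), the dataset path, and the example record are assumptions for illustration only; extra keyword arguments such as type or action appear to be folded into the returned result dict, which is what the surrounding examples rely on.

from datalad.api import Dataset
from datalad.interface.results import annexjson2result  # assumed import location

# hypothetical dataset location, used only for illustration
ds = Dataset('/tmp/example-dataset')

# a single record as emitted by `git annex <command> --json`
annex_record = {'success': True, 'file': 'dir1/file1'}

# convert it into a DataLad result dict; 'file' is resolved against the
# dataset root and 'success' is mapped to status 'ok'/'error'
res = annexjson2result(annex_record, ds, type='file')
# roughly: {'status': 'ok', 'path': str(ds.pathobj / 'dir1' / 'file1'), 'type': 'file'}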
Example #5
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res,
            ds,
            type='file',
            **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds,
            paths,
            respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
Example #6
def _get_targetpaths(ds, content, refds_path, source, jobs):
    # not ready for Path instances...
    content = [str(c) for c in content]
    # hand over to git-annex to get file content; files tracked directly
    # in git are reported as 'notneeded' to get
    ds_repo = ds.repo
    # needs to be an annex to get content
    if not isinstance(ds_repo, AnnexRepo):
        for r in results_from_paths(
                content,
                status='notneeded',
                message="no dataset annex, content already present",
                action='get',
                type='file',
                logger=lgr,
                refds=refds_path):
            yield r
        return
    respath_by_status = {}
    try:
        results = ds_repo.get(content,
                              options=['--from=%s' % source] if source else [],
                              jobs=jobs)
    except CommandError as exc:
        results = exc.kwargs.get("stdout_json")
        if not results:
            raise

    for res in results:
        res = annexjson2result(res,
                               ds,
                               type='file',
                               logger=lgr,
                               refds=refds_path)
        success = success_status_map[res['status']]
        # TODO: in case of some failed commands (e.g. get) there might
        # be no path in the record.  yoh has only vague idea of logic
        # here so just checks for having 'path', but according to
        # results_from_annex_noinfo, then it would be assumed that
        # `content` was acquired successfully, which is not the case
        if 'path' in res:
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
        yield res

    for r in results_from_annex_noinfo(
            ds,
            content,
            respath_by_status,
            dir_fail_msg='could not get some content in %s %s',
            noinfo_dir_msg='nothing to get from %s',
            noinfo_file_msg='already present',
            action='get',
            logger=lgr,
            refds=refds_path):
        yield r
Example #7
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding metadata to %s in %s", filename, ds.path)
        for a in ds.repo.set_metadata(filename, add=row["meta_args"]):
            res = annexjson2result(a, ds, type="file", logger=lgr)
            # Don't show all added metadata for the file because that
            # could quickly flood the output.
            del res["message"]
            yield res
Example #8
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
Example #9
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]

        with patch.object(ds.repo, "always_commit", False):
            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
Example #10
def _postproc_result(res, respath_by_status, ds, **kwargs):
    res = annexjson2result(
        # annex reports are always about files
        res,
        ds,
        type='file',
        **kwargs)
    success = success_status_map[res['status']]
    respath_by_status[success] = \
        respath_by_status.get(success, []) + [res['path']]
    if res["status"] == "error" and res["action"] == "drop":
        msg = res["message"]
        if isinstance(msg, str) and "Use --force to" in msg:
            # Avoid confusing datalad-drop callers with git-annex-drop's
            # suggestion to use --force.
            res["message"] = msg.replace("--force", "--nocheck")
    return res
Example #11
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from unittest.mock import patch

    # OPT: group by dataset first so as not to patch/unpatch always_commit
    # for each file, of which there could be thousands
    for ds, ds_rows in itertools.groupby(rows, itemgetter("ds")):
        with patch.object(ds.repo, "always_commit", False):
            for row in ds_rows:
                filename = row["ds_filename"]
                lgr.debug("Adding metadata to %s in %s", filename, ds.path)
                for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                    res = annexjson2result(a, ds, type="file", logger=lgr)
                    # Don't show all added metadata for the file because that
                    # could quickly flood the output.
                    del res["message"]
                    yield res
Example #12
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding URL to %s in %s", filename, ds.path)

        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)

        try:
            out_json = ds.repo.add_url_to_file(filename,
                                               row["url"],
                                               batch=True,
                                               options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json,
                               ds,
                               action="addurls",
                               type="file",
                               logger=lgr)
Example #13
def _postproc_annexdrop_result(res, respath_by_status, ds, **kwargs):
    res = annexjson2result(
        # annex reports are always about files
        res,
        ds,
        type='file',
        **kwargs)
    success = success_status_map[res['status']]
    respath_by_status[success] = \
        respath_by_status.get(success, []) + [res['path']]
    if res["status"] == "error" and res["action"] == "drop":
        msg = res.get("message", None)
        if isinstance(msg, str) and "Use --force to" in msg:
            # Avoid confusing datalad-drop callers with git-annex-drop's
            # suggestion to use --force.
            # Just mention reckless itself, do not go into the details
            # of which mode. This is likely changing over time and
            # adjusting this replacement will be forgotten.
            res["message"] = msg.replace("--force", "--reckless availability")
    return res
Example #14
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding URL to %s in %s", filename, ds.path)

        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)

        try:
            out_json = ds.repo.add_url_to_file(filename, row["url"],
                                               batch=True, options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json, ds, action="addurls",
                               type="file", logger=lgr)
Example #15
    def __call__(
            path=None,
            dataset=None,
            # support passing this through on a path-by-path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # if it was somehow already discovered
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go first,
        # because we know most about them, and the subsequent annotation call skips
        # the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disabled so that things work correctly, at the cost of a
                # little slowdown
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked')
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when, with `add('.')`, annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. There is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Example #16
def _test_bare_git_version_1(host, dspath, store):
    # This test should take a dataset and create a bare repository at the remote
    # end from it.
    # Given that it is placed correctly within a tree of datasets, that remote
    # should then be usable as an ora-remote as well as a git-type remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    #       (dirhashlower vs. -mixed).
    #       For version 1 (lower) upload and consumption should be
    #       interchangeable. It doesn't matter which remote is used for what
    #       direction.
    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of it
    # should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 1 (dirhash lower):
    create_ds_in_store(io, store, ds.id, '1', '1')

    # Now, let's have the bare repo as a git remote and use it with annex
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')

    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])

    # Now, add the ora remote:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # Now move content from git-remote to local and see it not being available
    # via bare-git anymore.
    ds.repo.call_annex(['move', '--all', '--from=bare-git'])
    # ora-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # But after fsck it does:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, subdir/two\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    eq_(len(ds.repo.whereis('one.txt')), 1)
    # and the other way around: upload via ora-remote and have it available via
    # git-remote:
    ds.repo.copy_to('.', 'ora-remote')
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
Example #17
def _test_bare_git_version_2(host, dspath, store):
    # Similarly to test_bare_git_version_1, this should ensure a bare git repo
    # at the store location for a dataset doesn't conflict with the ORA remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    #       (dirhashlower vs. -mixed).
    #       For version 2 (mixed) upload via ORA and consumption via git should
    #       work. But not the other way around, since git-annex uses
    #       dirhashlower with bare repos.

    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of it
    # should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 2 (dirhash mixed):
    create_ds_in_store(io, store, ds.id, '2', '1')

    # Now, let's have the bare repo as a git remote
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')
    # and the ORA remote in addition:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # upload keys via ORA:
    ds.repo.copy_to('.', 'ora-remote')
    # bare-git doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
    ds.drop('.')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # actually consumable via git remote:
    ds.repo.call_annex(['move', 'one.txt', '--from', 'bare-git'])
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now, move back via git - shouldn't be consumable via ORA
    ds.repo.call_annex(['move', 'one.txt', '--to', 'bare-git'])
    # fsck to make availability known, but there's nothing from POV of ORA:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res, 1, status='ok')
    eq_(len(fsck_res), 2)
    eq_(len(ds.repo.whereis('one.txt')), 1)
Example #18
    def __call__(
            path=None,
            dataset=None,
            # support passing this through on a path-by-path basis
            to_git=None,
            save=True,
            message=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # if it was somehow already discovered
                to_add.append(ap)
            # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
            if dataset and ap.get('type', None) == 'dataset':
                # duplicates not possible, annotated_paths returns unique paths
                subds_to_add[ap['path']] = ap
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go first,
        # because we know most about them, and the subsequent annotation call skips
        # the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                # check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                if not subds.repo.get_hexsha():
                    yield get_status_dict(
                        ds=subds,
                        status='impossible',
                        message='cannot add subdataset with no commits',
                        **dict(common_report, **ap))
                    continue
                subds_relpath = relpath(ap['path'], ds_path)
                # make an attempt to configure a submodule source URL based on the
                # discovered remote configuration
                remote, branch = subds.repo.get_tracking_branch()
                subds_url = subds.repo.get_remote_url(
                    remote) if remote else None
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath,
                                          url=subds_url,
                                          name=None)
                except CommandError as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=e.stderr,
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disabled so that things work correctly, at the cost of a
                # little slowdown
                #ap['staged'] = True
                to_save.append(ap)
                _fixup_submodule_dotgit_setup(ds, subds_relpath)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add(list(torepoadd.keys()),
                                git=to_git if is_annex else True,
                                **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked')
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here with `add('.')` when annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. There is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
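
The pattern above hinges on annexjson2result() translating raw `git annex --json` records into DataLad result dictionaries. As a rough, hypothetical sketch of that mapping (the field names 'command', 'success', 'file', and 'note' are standard git-annex JSON output; this is not DataLad's actual implementation):

# Hypothetical sketch only: approximates the JSON-record-to-result mapping
# assumed by the examples above.
import os.path as op

def sketch_annexjson2result(j, ds, **kwargs):
    res = dict(kwargs)
    # carry over the annex command name as the result action
    res['action'] = j.get('command', 'unknown')
    # git-annex reports per-record success as a boolean
    res['status'] = 'ok' if j.get('success') else 'error'
    if 'file' in j:
        # annex paths are relative to the repository root
        res['path'] = op.join(ds.path, j['file'])
    if j.get('note'):
        res['message'] = j['note']
    return res
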
Example #19
0
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get(
                    'raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap,
                           status='notneeded',
                           logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath,
                      ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(ds,
                                                      ap['path'],
                                                      reckless,
                                                      refds_path,
                                                      description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset'
                        and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get(
                        'raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was not
                    # explicitly requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] ==
                                'dataset' else ap['parentds'])
                lgr.info("Installing %s%s recursively", subds,
                         (" underneath %s" %
                          ap['path'] if subds.path != ap['path'] else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset'
                            and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok'
                            and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {
                        k: v
                        for k, v in res.items() if not k == 'status'
                    }
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content,
                        status='notneeded',
                        message="no dataset annex, content already present",
                        action='get',
                        logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(content,
                                   options=['--from=%s' %
                                            source] if source else [],
                                   jobs=jobs):
                res = annexjson2result(res,
                                       ds,
                                       type='file',
                                       logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only a vague idea of the
                # logic here, so we just check for 'path'; but according to
                # results_from_annex_noinfo it would then be assumed that
                # `content` was acquired successfully, which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
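
A caller consuming a result generator like the one above typically buckets the records by status before deciding how to proceed. A minimal sketch (generic Python, not part of DataLad):

from collections import defaultdict

def bucket_by_status(results):
    # Group result records by their 'status' field; the statuses follow the
    # 'ok'/'notneeded'/'impossible'/'error' convention used above.
    buckets = defaultdict(list)
    for res in results:
        buckets[res.get('status', 'unknown')].append(res.get('path'))
    return dict(buckets)

# e.g. abort if bucket_by_status(results).get('error') is non-empty
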
def test_create_as_bare(origin, remote_base_path, remote_base_url, public,
                        consumer, tmp_location):

    # Note/TODO: Do we need things like:
    #    git config receive.denyCurrentBranch updateInstead
    #    mv .hooks/post-update.sample hooks/post-update
    #    git update-server-info

    # Test how we build a riaremote from an existing dataset that is a bare git repo and can be accessed as a git-type
    # remote as well. This should basically outline how to publish to that kind of structure as a data store that is
    # autoenabled, so we can publish to github/gitlab and make that storage known.

    remote_base_path = Path(remote_base_path)

    ds = create(origin)
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # add the ria remote:
    # Note: For serve_path_via_http to work (which we need later), the directory needs to already exist.
    #       But by default RIARemote will refuse to create the remote structure in an already existing directory
    #       that wasn't created by itself (it lacks a ria-layout-version file).
    #       So, we can either configure force-write here or put a version file in it beforehand.
    #       However, this is specific to the test environment!
    with open(str(remote_base_path / 'ria-layout-version'), 'w') as f:
        f.write('1')
    initexternalremote(ds.repo,
                       'riaremote',
                       'ria',
                       config={'base-path': str(remote_base_path)})
    # pretty much any annex command that talks to that remote should now trigger the actual creation on the remote end:
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='riaremote', fast=True)
    ])

    remote_dataset_path = remote_base_path / ds.id[:3] / ds.id[3:]

    assert remote_base_path.exists()
    assert remote_dataset_path.exists()
    ds.repo.copy_to('.', 'riaremote')

    # Now, let's make the remote end a valid, bare git repository
    eq_(
        subprocess.run(['git', 'init', '--bare'],
                       cwd=str(remote_dataset_path)).returncode, 0)

    #subprocess.run(['mv', 'hooks/post-update.sample', 'hooks/post-update'], cwd=remote_dataset_path)
    #subprocess.run(['git', 'update-server-info'], cwd=remote_dataset_path)

    # TODO: we might need "mv .hooks/post-update.sample hooks/post-update", "git update-server-info" as well
    # add as git remote and push everything
    eq_(
        subprocess.run(
            ['git', 'remote', 'add', 'bare-git',
             str(remote_dataset_path)],
            cwd=origin).returncode, 0)
    # Note: "--mirror" does the job for this test, while it might not be a good default for some kind of
    # datalad-create-sibling. However, those things need to be configurable for the actual publish/creation routine anyway
    eq_(
        subprocess.run(['git', 'push', '--mirror', 'bare-git'],
                       cwd=origin).returncode, 0)

    # annex doesn't know the bare-git remote yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # But after enableremote and a fsck it does:
    eq_(
        subprocess.run(['git', 'annex', 'enableremote', 'bare-git'],
                       cwd=origin).returncode, 0)
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # we can drop and get again via 'bare-git' remote:
    ds.drop('.')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    eq_(
        subprocess.run(
            ['git', 'annex', 'get', 'one.txt', '--from', 'bare-git'],
            cwd=origin).returncode, 0)
    eq_(len(ds.repo.whereis('one.txt')), 3)
    # let's get the other one from riaremote
    eq_(len(ds.repo.whereis(op.join('subdir', 'two'))), 2)
    eq_(
        subprocess.run([
            'git', 'annex', 'get',
            op.join('subdir', 'two'), '--from', 'riaremote'
        ],
                       cwd=origin).returncode, 0)
    eq_(len(ds.repo.whereis(op.join('subdir', 'two'))), 3)

    raise SkipTest("NOT YET DONE")
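
Both this test and the neighbouring ones rely on the same store layout: the dataset UUID is split into a three-character prefix directory and the remainder. A small hypothetical helper capturing that convention, mirroring the path arithmetic used above:

from pathlib import Path

def dataset_store_path(base_path, dataset_id):
    # Mirror the ds.id[:3] / ds.id[3:] layout used by the tests above.
    return Path(base_path) / dataset_id[:3] / dataset_id[3:]

# e.g. dataset_store_path('/store', 'abc123...') -> /store/abc/123...
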
def test_bare_git(origin, remote_base_path):

    remote_base_path = Path(remote_base_path)

    # This test should take a dataset and create a bare repository at the remote end from it.
    # Given that it is placed correctly within a tree of datasets, that remote should then be usable as a
    # ria-remote as well as a git-type remote

    ds = create(origin)
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # Use git to make sure the remote end is what git thinks a bare clone of it should look like
    bare_repo_path = remote_base_path / ds.id[:3] / ds.id[3:]
    subprocess.run(['git', 'clone', '--bare', origin, str(bare_repo_path)])

    # Now, let's have the bare repo as a git remote and use it with annex
    eq_(
        subprocess.run(
            ['git', 'remote', 'add', 'bare-git',
             str(bare_repo_path)],
            cwd=origin).returncode, 0)
    eq_(
        subprocess.run(['git', 'annex', 'enableremote', 'bare-git'],
                       cwd=origin).returncode, 0)
    eq_(
        subprocess.run(['git', 'annex', 'testremote', 'bare-git'],
                       cwd=origin).returncode, 0)
    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])

    # Since we created the remote this particular way instead of letting ria-remote create it, we need to put
    # ria-layout-version files into it. Then we should be able to also add it as a ria-remote.
    with open(str(remote_base_path / 'ria-layout-version'), 'w') as f:
        f.write('1')
    with open(str(bare_repo_path / 'ria-layout-version'), 'w') as f:
        f.write('1')

    # Now, add the ria remote:
    initexternalremote(ds.repo,
                       'riaremote',
                       'ria',
                       config={'base-path': str(remote_base_path)})
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='riaremote', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # Now move content from git-remote to local and see it not being available via bare-git anymore
    eq_(
        subprocess.run(['git', 'annex', 'move', '--all', '--from=bare-git'],
                       cwd=origin).returncode, 0)
    # ria-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # But after fsck it does:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='riaremote', fast=True)
    ]
    assert_result_count(
        fsck_res,
        1,
        status='error',
        message=
        '** Based on the location log, one.txt\n** was expected to be present, '
        'but its content is missing.')
    assert_result_count(
        fsck_res,
        1,
        status='error',
        message=
        '** Based on the location log, subdir/two\n** was expected to be present, '
        'but its content is missing.')

    eq_(len(ds.repo.whereis('one.txt')), 1)
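
The fsck-then-assert pattern recurs throughout these tests; factored into a small helper it would look roughly like this (a sketch that assumes the same annexjson2result/assert_status helpers the tests already import):

def assert_fsck(ds, expected_status, **fsck_kwargs):
    # Run `git annex fsck` through the repo abstraction and assert that every
    # resulting record carries the expected status, as done inline above.
    results = [annexjson2result(r, ds) for r in ds.repo.fsck(**fsck_kwargs)]
    assert_status(expected_status, results)
    return results

# e.g. assert_fsck(ds, 'ok', remote='riaremote', fast=True)
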
Example #22
0
def _push_data(ds,
               target,
               content,
               data,
               force,
               jobs,
               res_kwargs,
               got_path_arg=False):
    if ds.config.getbool('remote.{}'.format(target), 'annex-ignore', False):
        lgr.debug(
            "Target '%s' is set to annex-ignore, exclude from data-push.",
            target,
        )
        return

    ds_repo = ds.repo

    res_kwargs['target'] = target
    if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
        # this remote either isn't an annex,
        # or hasn't been properly initialized
        # given that there was no annex-ignore, let's try to init it
        # see https://github.com/datalad/datalad/issues/5143 for the story
        ds_repo.localsync(target)

        if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
            # still nothing
            # rather than barfing tons of messages for each file, do one
            # for the entire dataset
            yield dict(
                res_kwargs,
                action='copy',
                status='impossible'
                if force in ('all', 'checkdatapresent') else 'notneeded',
                message=("Target '%s' does not appear to be an annex remote",
                         target))
            return

    # it really looks like we will transfer files, get info on what annex
    # has in store
    # paths must be recoded to a dataset REPO root (in case of a symlinked
    # location)
    annex_info_init = \
        {ds_repo.pathobj / Path(c['path']).relative_to(ds.pathobj): c
         for c in content} if ds.pathobj != ds_repo.pathobj else \
        {Path(c['path']): c for c in content}
    content = ds.repo.get_content_annexinfo(
        # paths are taken from `annex_info_init`
        paths=None,
        init=annex_info_init,
        ref='HEAD',
        # this is an expensive operation that is only needed
        # to perform a warning below, and for more accurate
        # progress reporting (exclude unavailable content).
        # limit to cases with explicit paths provided
        eval_availability=True if got_path_arg else False,
    )
    # figure out which of the reported content (after evaluating
    # `since` and `path` arguments) needs transport
    to_transfer = [
        c for c in content.values()
        # by force
        if ((
            force in ('all', 'checkdatapresent') or
            # or by modification report
            c.get('state', None) not in ('clean', 'deleted'))
            # only consider annex'ed files
            and 'key' in c)
    ]
    if got_path_arg:
        for c in [c for c in to_transfer if not c.get('has_content', False)]:
            yield dict(
                res_kwargs,
                type=c['type'],
                path=c['path'],
                action='copy',
                status='impossible',
                message='Slated for transport, but no content present',
            )

    cmd = ['copy', '--batch', '-z', '--to', target]

    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    # Since we got here - we already have some data != "nothing"
    if (data == 'auto') or \
        (
            (data == 'auto-if-wanted') and
            ds_repo.get_preferred_content('wanted', target)
        ):
        lgr.debug("Invoking copy --auto")
        cmd.append('--auto')

    if force not in ('all', 'checkdatapresent'):
        # when not forcing, we trust local knowledge and skip the presence checks via --fast
        cmd.append('--fast')

    lgr.debug("Push data from %s to '%s'", ds, target)

    # input has type=dataset, but now it is about files
    res_kwargs.pop('type', None)

    # A set and an OrderedDict is used to track files pointing to the
    # same key.  The set could be dropped, using a single dictionary
    # that has an entry for every seen key and a (likely empty) list
    # of redundant files, but that would mean looping over potentially
    # many keys to yield likely few if any notneeded results.
    seen_keys = set()
    repkey_paths = OrderedDict()

    # produce final path list. use knowledge that annex command will
    # run in the root of the dataset and compact paths to be relative
    # to this location
    file_list = b''
    nbytes = 0
    for c in to_transfer:
        key = c['key']
        if key in seen_keys:
            repkey_paths.setdefault(key, []).append(c['path'])
        else:
            file_list += bytes(Path(c['path']).relative_to(ds.pathobj))
            file_list += b'\0'
            nbytes += c['bytesize']
            seen_keys.add(key)
    lgr.debug('Counted %d bytes of annex data to transfer', nbytes)

    # and go
    res = ds_repo._call_annex_records(
        cmd,
        stdin=file_list,
        progress=True,
        # tailor the progress protocol with the total number of files
        # to be transferred
        total_nbytes=nbytes)
    for j in res:
        yield annexjson2result(j, ds, type='file', **res_kwargs)

    for annex_key, paths in repkey_paths.items():
        for path in paths:
            yield dict(res_kwargs,
                       action='copy',
                       type='file',
                       status='notneeded',
                       path=path,
                       annexkey=annex_key,
                       message='Another file points to the same key')
    return
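
The seen_keys/repkey_paths bookkeeping above ensures each annex key is transferred only once, while additional files pointing to the same key are merely reported as 'notneeded'. Stripped down to its core (hypothetical records with 'key' and 'path' fields):

from collections import OrderedDict

def split_by_key(records):
    # Queue the first file per annex key for transfer; remember any further
    # file with the same key so it can be reported instead of re-copied.
    seen_keys = set()
    to_copy, duplicates = [], OrderedDict()
    for rec in records:
        if rec['key'] in seen_keys:
            duplicates.setdefault(rec['key'], []).append(rec['path'])
        else:
            seen_keys.add(rec['key'])
            to_copy.append(rec['path'])
    return to_copy, duplicates
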
Example #23
0
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            #git_opts=None,
            #annex_opts=None,
            #annex_get_opts=None,
            jobs='auto',
            verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap, status='notneeded', logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath, ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(
                    ds, ap['path'], reckless, refds_path, description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible',
                          message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was not
                    # explicitly requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds'])
                lgr.info(
                    "Installing %s%s recursively",
                    subds,
                    (" underneath %s" % ap['path']
                     if subds.path != ap['path']
                     else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {k: v for k, v in res.items()
                              if not k == 'status'}
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert(not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content, status='notneeded',
                        message="no dataset annex, content already present",
                        action='get', logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res, ds, type='file', logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only a vague idea of the
                # logic here, so we just check for 'path'; but according to
                # results_from_annex_noinfo it would then be assumed that
                # `content` was acquired successfully, which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
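
Before handing paths to git-annex, the code above sorts annotated paths into their containing datasets via annotated2content_by_ds(). A much simplified, hypothetical stand-in for that grouping step:

def group_content_by_dataset(annotated_paths):
    # Simplified sketch: group annotated path records by the dataset that
    # holds them, keeping dataset records under their own path.
    by_ds = {}
    for ap in annotated_paths:
        ds_path = ap['path'] if ap.get('type') == 'dataset' else ap.get('parentds')
        if ds_path is None:
            continue
        by_ds.setdefault(ds_path, []).append(ap)
    return by_ds
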
Example #24
0
def _push_data(ds,
               target,
               content,
               data,
               force,
               jobs,
               res_kwargs,
               got_path_arg=False):
    if ds.config.getbool('remote.{}'.format(target), 'annex-ignore', False):
        lgr.debug(
            "Target '%s' is set to annex-ignore, exclude from data-push.",
            target,
        )
        return
    res_kwargs['target'] = target
    if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
        # this remote either isn't an annex,
        # or hasn't been properly initialized
        # rather than barfing tons of messages for each file, do one
        # for the entire dataset
        yield dict(
            res_kwargs,
            action='copy',
            status='impossible'
            if force in ('all', 'checkdatapresent') else 'notneeded',
            message=("Target '%s' does not appear to be an annex remote",
                     target))
        return

    # it really looks like we will transfer files, get info on what annex
    # has in store
    ds_repo = ds.repo
    # paths must be recoded to a dataset REPO root (in case of a symlinked
    # location)
    annex_info_init = \
        {ds_repo.pathobj / Path(c['path']).relative_to(ds.pathobj): c
         for c in content} if ds.pathobj != ds_repo.pathobj else \
        {Path(c['path']): c for c in content}
    content = ds.repo.get_content_annexinfo(
        # paths are taken from `annex_info_init`
        paths=None,
        init=annex_info_init,
        ref='HEAD',
        # this is an expensive operation that is only needed
        # to perform a warning below, and for more accurate
        # progress reporting (exclude unavailable content).
        # limit to cases with explicit paths provided
        eval_availability=True if got_path_arg else False,
    )
    # figure out which of the reported content (after evaluating
    # `since` and `path` arguments) needs transport
    to_transfer = [
        c for c in content.values()
        # by force
        if ((
            force in ('all', 'checkdatapresent') or
            # or by modification report
            c.get('state', None) not in ('clean', 'deleted'))
            # only consider annex'ed files
            and 'key' in c)
    ]
    if got_path_arg:
        for c in [c for c in to_transfer if not c.get('has_content', False)]:
            yield dict(
                res_kwargs,
                type=c['type'],
                path=c['path'],
                action='copy',
                status='impossible',
                message='Slated for transport, but no content present',
            )

    cmd = [
        'git', 'annex', 'copy', '--batch', '-z', '--to', target, '--json',
        '--json-error-messages', '--json-progress'
    ]

    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    # Since we got here - we already have some data != "nothing"
    if (data == 'auto') or \
        (
            (data == 'auto-if-wanted') and
            ds_repo.get_preferred_content('wanted', target)
        ):
        lgr.debug("Invoking copy --auto")
        cmd.append('--auto')

    if force not in ('all', 'checkdatapresent'):
        # when not forcing, we trust local knowledge and skip the presence checks via --fast
        cmd.append('--fast')

    lgr.debug("Push data from %s to '%s'", ds, target)

    # input has type=dataset, but now it is about files
    res_kwargs.pop('type', None)

    # produce final path list. use knowledge that annex command will
    # run in the root of the dataset and compact paths to be relative
    # to this location
    # XXX must not be a SpooledTemporaryFile -- dunno why, but doesn't work
    # otherwise
    with TemporaryFile() as file_list:
        nbytes = 0
        for c in to_transfer:
            file_list.write(bytes(Path(c['path']).relative_to(ds.pathobj)))
            file_list.write(b'\0')
            nbytes += c['bytesize']

        # rewind stdin buffer
        file_list.seek(0)

        # tailor the progress protocol with the total number of files
        # to be transferred
        class TailoredPushAnnexJsonProtocol(AnnexJsonProtocol):
            total_nbytes = nbytes

        # and go
        # TODO try-except and yield what was captured before the crash
        #res = GitWitlessRunner(
        res = GitWitlessRunner(cwd=ds.path, ).run(
            cmd,
            # TODO report how many in total, and give global progress too
            protocol=TailoredPushAnnexJsonProtocol,
            stdin=file_list)
        for c in ('stdout', 'stderr'):
            if res[c]:
                lgr.debug('Received unexpected %s from `annex copy`: %s', c,
                          res[c])
        for j in res['stdout_json']:
            yield annexjson2result(j, ds, type='file', **res_kwargs)
    return
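
Both variants of _push_data() feed `git annex copy --batch -z` a NUL-separated list of dataset-relative paths on stdin. A minimal sketch of preparing such a batch file (using a real temporary file, since -- as noted above -- a spooled one did not work):

from pathlib import Path
from tempfile import TemporaryFile

def build_batch_file(paths, dataset_root):
    # Write dataset-root-relative paths, NUL-separated, into a temporary file
    # that can serve as stdin for `git annex copy --batch -z`.
    batch = TemporaryFile()
    for p in paths:
        batch.write(bytes(Path(p).relative_to(dataset_root)))
        batch.write(b'\0')
    batch.seek(0)
    return batch
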