Example #1
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(op.join('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.meta_aggregate(recursive=True, into='all')
    assert_repo_status(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    assert_repo_status(super.path)
    clone = Dataset(op.join(super.path, "base"))
    assert_repo_status(clone.path)
    objpath = PurePosixPath('.datalad/metadata/objects')
    objs = [
        o
        for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
        if objpath in PurePosixPath(o).parents
    ]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)

    # now aggregate should get those metadata objects
    super.meta_aggregate(recursive=True, into='all')
    eq_(all(clone.repo.file_has_content(objs)), True)
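The object selection above hinges on PurePosixPath.parents, which enumerates every ancestor of a path. A minimal standalone sketch of that membership test (the candidate paths below are invented, not taken from the test):

from pathlib import PurePosixPath

objpath = PurePosixPath('.datalad/metadata/objects')
# .parents yields every ancestor of a path, so the membership test keeps
# only entries that live somewhere below the objects directory
candidates = [
    '.datalad/metadata/objects/ab/cd.xz',  # hypothetical annexed object
    '.datalad/config',
]
selected = [c for c in candidates if objpath in PurePosixPath(c).parents]
print(selected)  # ['.datalad/metadata/objects/ab/cd.xz']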
Example #2
def _yield_dsmeta(ds):
    srcfiles, cfg_srcfiles = _get_dsmeta_srcfiles(ds)
    dsmeta = {}
    for srcfile in srcfiles:
        abssrcfile = ds.pathobj / PurePosixPath(srcfile)
        # TODO get annexed files, or do in a central place?
        if not abssrcfile.exists():
            # nothing to load
            # warn if this was configured
            if srcfile in cfg_srcfiles:
                yield dict(
                    path=ds.path,
                    type='dataset',
                    status='impossible',
                    message=(
                        'configured custom metadata source is not '
                        'available in %s: %s',
                        ds, srcfile),
                )
                # no further operation on half-broken metadata
                return
            # not configured: simply skip this absent source file
            continue
        lgr.debug('Load custom metadata from %s', abssrcfile)
        meta = jsonload(text_type(abssrcfile))
        dsmeta.update(meta)
    if dsmeta:
        yield dict(
            path=ds.path,
            metadata=dsmeta,
            type='dataset',
            status='ok',
        )
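The ds.pathobj / PurePosixPath(srcfile) join above is the recurring idiom for turning a POSIX-style relative path (as recorded in configuration) into a platform-native absolute path. A small sketch with a stand-in base directory instead of a real dataset:

from pathlib import Path, PurePosixPath

base = Path.cwd()                    # stands in for ds.pathobj
srcfile = '.metadata/dataset.json'   # hypothetical configured source file

# joining a platform Path with a PurePosixPath splits on '/' and renders
# the result with native separators (e.g. backslashes on Windows)
abssrcfile = base / PurePosixPath(srcfile)
print(abssrcfile)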
Example #3
def get_refcommit(ds):
    """Get most recent commit that changes any metadata-relevant content.

    This function should be executed in a clean dataset, with no uncommitted
    changes (untracked is OK).

    Returns
    -------
    str or None
      None if there is no matching commit, a hexsha otherwise.
    """
    exclude_paths = [
        ds.repo.pathobj / PurePosixPath(e) for e in exclude_from_metadata
    ]
    count = 0
    diff_cache = {}
    precommit = False
    while True:
        cur = 'HEAD~{:d}'.format(count)
        try:
            # get the diff between the next pair of previous commits
            diff = {
                p.relative_to(ds.repo.pathobj): props
                for p, props in iteritems(ds.repo.diffstatus(
                    PRE_INIT_COMMIT_SHA
                    if precommit
                    else 'HEAD~{:d}'.format(count + 1),
                    cur,
                    # superfluous, but here to state the obvious
                    untracked='no',
                    # this should be OK, unit test covers the cases
                    # of subdataset addition, modification and removal;
                    # refcommit evaluation only makes sense in a clean
                    # dataset, and if that is true, any change in the
                    # submodule record will be visible in the parent
                    # already
                    eval_submodule_state='no',
                    # boost performance, we don't care about file types
                    # here
                    eval_file_type=False,
                    _cache=diff_cache))
                if props.get('state', None) != 'clean' \
                and p not in exclude_paths \
                and not any(e in p.parents for e in exclude_paths)
            }
        except ValueError as e:
            # likely ran out of commits to check
            if precommit:
                # end of things
                return None
            else:
                # one last round, taking in the entire history
                precommit = True
                continue
        if diff:
            return ds.repo.get_hexsha(cur)
        # next pair
        count += 1
Example #4
def _get_fmeta_objpath(ds, expr, rec):
    fpath = Path(rec['path'])
    if rec.get('type', None) != 'file':  # pragma: no cover
        # nothing else in here
        return
    # build associated metadata file path from POSIX
    # pieces and convert to platform conventions at the end
    return text_type(ds.pathobj / PurePosixPath(
        expr.format(freldir=fpath.relative_to(ds.pathobj).parent.as_posix(),
                    fname=fpath.name)))
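To make the template expansion concrete, here is a self-contained sketch; the dataset root and the expression string are made up, only the path arithmetic mirrors the function above:

from pathlib import Path, PurePosixPath

ds_root = Path('/tmp/ds')                          # hypothetical dataset root
fpath = ds_root / 'code' / 'analysis.py'
expr = '.metadata/content/{freldir}/{fname}.json'  # hypothetical expression

# assemble the metadata file path from POSIX pieces, then let the platform
# Path render it with native conventions
objpath = str(ds_root / PurePosixPath(
    expr.format(freldir=fpath.relative_to(ds_root).parent.as_posix(),
                fname=fpath.name)))
print(objpath)  # /tmp/ds/.metadata/content/code/analysis.py.json on POSIX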
Example #5
def annexjson2result(d, ds, **kwargs):
    """Helper to convert an annex JSON result to a datalad result dict

    Info from annex is rather heterogeneous, partly because some of it
    is faked by our support functions.

    This helper should be extended with all needed special cases to
    homogenize the information.

    Parameters
    ----------
    d : dict
      Annex info dict.
    ds : Dataset instance
      Used to determine absolute paths for `file` results. This dataset
      is not used to set `refds` in the result, pass this as a separate
      kwarg if needed.
    **kwargs
      Passed as-is to `get_status_dict`. Must not contain `refds`.
    """
    lgr.debug('received JSON result from annex: %s', d)
    messages = []
    res = get_status_dict(**kwargs)
    res['status'] = 'ok' if d.get('success', False) is True else 'error'
    # we cannot rely on any of these to be available as the feed from
    # git annex (or its wrapper) is not always homogeneous
    if d.get('file'):
        res['path'] = str(ds.pathobj / PurePosixPath(d['file']))
    if 'command' in d:
        res['action'] = d['command']
    if 'key' in d:
        res['annexkey'] = d['key']
    if 'fields' in d:
        # this is annex metadata, filter out timestamps
        res['metadata'] = {
            k: v[0] if isinstance(v, list) and len(v) == 1 else v
            for k, v in d['fields'].items() if not k.endswith('lastchanged')
        }
    if d.get('error-messages', None):
        res['error_message'] = '\n'.join(m.strip()
                                         for m in d['error-messages'])
    # avoid meaningless standard messages, and collision with actual error
    # messages
    elif 'note' in d:
        note = "; ".join(
            ln for ln in d['note'].splitlines()
            if ln != 'checksum...' and not ln.startswith('checking file'))
        if note:
            messages.append(translate_annex_notes.get(note, note))
    if messages:
        res['message'] = '\n'.join(m.strip() for m in messages)
    return res
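The 'fields' handling above flattens the single-element value lists that git-annex reports for metadata fields. The same comprehension in isolation, with invented field values:

# hypothetical annex metadata fields; values always arrive as lists
raw = {'author': ['Jane Doe'], 'tags': ['a', 'b']}
meta = {
    k: v[0] if isinstance(v, list) and len(v) == 1 else v
    for k, v in raw.items()
}
print(meta)  # {'author': 'Jane Doe', 'tags': ['a', 'b']}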
Example #6
def _parse_gitmodules(ds):
    # TODO read .gitmodules from Git blob?
    gitmodules = ds.pathobj / '.gitmodules'
    if not gitmodules.exists():
        return {}
    # pull out file content
    out, err = ds.repo._git_custom_command(
        '', ['git', 'config', '-z', '-l', '--file', '.gitmodules'])
    # abuse our config parser
    db, _ = _parse_gitconfig_dump(out, {}, None, True)
    mods = {}
    for k, v in iteritems(db):
        if not k.startswith('submodule.'):
            # we don't know what this is
            lgr.debug("Skip unrecognized .gitmodule specification: %s=%s", k,
                      v)
            continue
        k_l = k.split('.')
        # module name is everything after 'submodule.' that is not the variable
        # name
        mod_name = '.'.join(k_l[1:-1])
        mod = mods.get(mod_name, {})
        # variable name is the last 'dot-free' segment in the key
        mod[k_l[-1]] = v
        mods[mod_name] = mod

    out = {}
    # bring into traditional shape
    for name, props in iteritems(mods):
        if 'path' not in props:
            lgr.debug("Failed to get '%s.path', skipping section", name)
            continue
        modprops = {
            'gitmodule_{}'.format(k): v
            for k, v in iteritems(props)
            if not (k.startswith('__') or k == 'path')
        }
        modpath = ds.pathobj / PurePosixPath(props['path'])
        modprops['gitmodule_name'] = name
        out[modpath] = modprops
    return out
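Submodule names may themselves contain dots, which is why the code joins all middle segments of a submodule.<name>.<var> key. A tiny standalone sketch with fabricated keys:

# hypothetical flattened output of 'git config -l --file .gitmodules'
db = {
    'submodule.sub/ds.one.path': 'sub/ds.one',
    'submodule.sub/ds.one.url': './sub/ds.one',
}
mods = {}
for k, v in db.items():
    k_l = k.split('.')
    mod_name = '.'.join(k_l[1:-1])  # everything between 'submodule.' and the var
    mods.setdefault(mod_name, {})[k_l[-1]] = v
print(mods)  # {'sub/ds.one': {'path': 'sub/ds.one', 'url': './sub/ds.one'}}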
Example #7
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset

        repo = ds.repo  # OPT: .repo could be relatively expensive
        if not isinstance(repo, AnnexRepo):
            # nothing to be done
            return

        if process_type not in ('all', 'content'):
            return

        # no progress bar, we are only making a one-shot call to
        # annex, the rest is pretty much instantaneous

        # limit query to paths that are annexed
        query_paths = [
            # go relative to minimize cmdline footprint of annex call
            text_type(Path(s['path']).relative_to(ds.pathobj))
            for s in status
            # anything that looks like an annexed file
            if s.get('type', None) == 'file' \
            and s.get('key', None) is not None
        ]

        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s',
            ds,
            total=len(query_paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        for fpath, meta in repo.get_metadata(
                query_paths,
                # no timestamps, we are describing the status quo
                timestamps=False,
                # because we have filtered the query to only contain
                # annexed files, we can use batch mode and deal with
                # many files
                batch=True):
            log_progress(lgr.info,
                         'extractorannex',
                         'Extracted annex metadata from %s',
                         fpath,
                         update=1,
                         increment=True)
            meta = {
                k: v[0] if isinstance(v, list) and len(v) == 1 else v
                for k, v in meta.items()
            }
            if not meta:
                # only talk about files that actually carry metadata
                continue
            yield dict(
                # git annex reports the path in POSIX conventions
                path=PurePosixPath(fpath),
                metadata=meta,
                type='file',
                status='ok',
            )
        log_progress(
            lgr.info,
            'extractorannex',
            'Finished annex metadata extraction from %s',
            ds,
        )
Example #8
def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed; copying data to it via git-annex (if cloned via
    # ssh) would make it see a bare repo and establish a hashdir lower annex
    # object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so its
    # objects could be moved into archives (the main point of a RIA store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its subdatasets
    # may live there too. Place a subdataset source candidate config that makes
    # get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have to
        # generate a complicated name from the actual source specification.
        # we pick a cost of 200 to sort it before datalad's default candidates
        # for non-RIA URLs, because they prioritize hierarchical layouts that
        # cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL up to the fragment, plus the dataset ID
        # placeholder; this should make things work with any store setup we
        # support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have repos)
    # make this function a generator
    ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # no ORA remote autoenabled, but configuration known about at least one.
        # Let's check origin's config for datalad.ora-remote.uuid as stored by
        # create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
                  "up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and then
        # get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that part to
            # have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))

        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # 3. And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but happens
            #       to work and would read from stdin. Make sure we know this
            #       works for required git versions and on all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored, we
        # wouldn't end up here, so enable with store URL as suggested by the URL
        # we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this when
                #         true?
                #       - What if still fails? -> Annex shouldn't change config
                #         in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)]
                                       )
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication dependency
                    # below
                    ora_remotes = [s for s in
                                   ds.siblings('query',
                                               result_renderer='disabled')
                                   if s.get('annex-externaltype', None) ==
                                   'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)
    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. Consider "
                        "running 'datalad siblings configure -s origin "
                        "--publish-depends ORAREMOTENAME' to set publication "
                        "dependency manually.",
                        [r['name'] for r in ora_remotes])
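The subdataset source candidate configured above is derived by stripping any fragment from the original clone URL and appending the dataset-ID placeholder. Illustrated with an invented RIA URL:

# hypothetical clone source recorded by the clone operation
source = 'ria+ssh://store.example.org/data/store#6d69ca68-7e85-11e6-904b-002590f97d84'
candidate = source.split('#', maxsplit=1)[0] + '#{id}'
print(candidate)  # ria+ssh://store.example.org/data/store#{id}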
Example #9
def _get_contained_objs(ds):
    root = ds.pathobj / '.datalad' / 'metadata' / 'objects'
    return set(f for f in ds.repo.get_indexed_files()
               if root in (ds.pathobj / PurePosixPath(f)).parents)
Example #10
def test_url_samples():
    _check_ri("http://example.com", URL, scheme='http', hostname="example.com")
    # "complete" one for classical http
    _check_ri("http://*****:*****@example.com:8080/p/sp?p1=v1&p2=v2#frag",
              URL,
              scheme='http',
              hostname="example.com",
              port=8080,
              username='******',
              password='******',
              path='/p/sp',
              query='p1=v1&p2=v2',
              fragment='frag')

    # sample one for ssh with specifying the scheme
    # XXX? might be useful?  https://github.com/FriendCode/giturlparse.py
    _check_ri("ssh://host/path/sp1",
              URL,
              scheme='ssh',
              hostname='host',
              path='/path/sp1')
    _check_ri("user@host:path/sp1",
              SSHRI,
              hostname='host',
              path='path/sp1',
              username='******')
    _check_ri("host:path/sp1", SSHRI, hostname='host', path='path/sp1')
    _check_ri("host:path", SSHRI, hostname='host', path='path')
    _check_ri("host:/path", SSHRI, hostname='host', path='/path')
    _check_ri("user@host", SSHRI, hostname='host', username='******')
    # TODO!!!  should this be a legit URL like this?
    # _check_ri("host", SSHRI, hostname='host'))
    eq_(repr(RI("host:path")), "SSHRI(hostname='host', path='path')")

    # And now perspective 'datalad', implicit=True urls pointing to the canonical center location
    _check_ri("///", DataLadRI)
    _check_ri("///p/s1", DataLadRI, path='p/s1')
    # could be considered by someone as "URI reference" relative to scheme
    _check_ri("//a/", DataLadRI, remote='a')
    _check_ri("//a/data", DataLadRI, path='data', remote='a')

    # here we will do custom magic allowing only schemes with + in them, such as dl+archive
    # or not so custom as
    _check_ri("hg+https://host/user/proj",
              URL,
              scheme="hg+https",
              hostname='host',
              path='/user/proj')
    # "old" style
    _check_ri("dl+archive:KEY/path/sp1#size=123",
              URL,
              scheme='dl+archive',
              path='KEY/path/sp1',
              fragment='size=123')
    # "new" style
    _check_ri("dl+archive:KEY#path=path/sp1&size=123",
              URL,
              scheme='dl+archive',
              path='KEY',
              fragment='path=path/sp1&size=123')
    # actually above one is probably wrong since we need to encode the path
    _check_ri("dl+archive:KEY#path=path%2Fbsp1&size=123",
              URL,
              scheme='dl+archive',
              path='KEY',
              fragment='path=path%2Fbsp1&size=123')

    #https://en.wikipedia.org/wiki/File_URI_scheme
    _check_ri("file://host", URL, scheme='file', hostname='host')
    _check_ri("file://host/path/sp1",
              URL,
              scheme='file',
              hostname='host',
              path='/path/sp1')
    # stock libraries of Python aren't quite ready for ipv6
    ipv6address = '2001:db8:85a3::8a2e:370:7334'
    _check_ri("file://%s/path/sp1" % ipv6address,
              URL,
              scheme='file',
              hostname=ipv6address,
              path='/path/sp1')
    for lh in ('localhost', '::1', '', '127.3.4.155'):
        _check_ri("file://%s/path/sp1" % lh,
                  URL,
                  localpath='/path/sp1',
                  scheme='file',
                  hostname=lh,
                  path='/path/sp1')
    _check_ri('http://[1fff:0:a88:85a3::ac1f]:8001/index.html',
              URL,
              scheme='http',
              hostname='1fff:0:a88:85a3::ac1f',
              port=8001,
              path='/index.html')
    _check_ri("file:///path/sp1",
              URL,
              localpath='/path/sp1',
              scheme='file',
              path='/path/sp1')
    # we don't do any magical comprehension for home paths/drives for windows
    # of file:// urls, thus leaving /~ and /c: for now:
    _check_ri("file:///~/path/sp1",
              URL,
              localpath='/~/path/sp1',
              scheme='file',
              path='/~/path/sp1')
    _check_ri("file:///%7E/path/sp1",
              URL,
              localpath='/~/path/sp1',
              scheme='file',
              path='/~/path/sp1',
              exact_str=False)
    # not sure but let's check
    _check_ri("file:///c:/path/sp1",
              URL,
              localpath='/c:/path/sp1',
              scheme='file',
              path='/c:/path/sp1',
              exact_str=False)

    # and now implicit paths or actually they are also "URI references"
    _check_ri("f", PathRI, localpath='f', path='f')
    _check_ri("f/s1", PathRI, localpath='f/s1', path='f/s1')
    _check_ri(PurePosixPath("f"), PathRI, localpath='f', path='f')
    _check_ri(PurePosixPath("f/s1"), PathRI, localpath='f/s1', path='f/s1')
    # colons are problematic and might cause confusion into SSHRI
    _check_ri("f/s:1", PathRI, localpath='f/s:1', path='f/s:1')
    _check_ri("f/s:", PathRI, localpath='f/s:', path='f/s:')
    _check_ri("/f", PathRI, localpath='/f', path='/f')
    _check_ri("/f/s1", PathRI, localpath='/f/s1', path='/f/s1')

    # some github ones, just to make sure
    _check_ri("git://host/user/proj",
              URL,
              scheme="git",
              hostname="host",
              path="/user/proj")
    _check_ri("git@host:user/proj",
              SSHRI,
              hostname="host",
              path="user/proj",
              username='******')

    _check_ri('weired:/', SSHRI, hostname='weired', path='/')
    # since the scheme does not allow some symbols, we need an additional check
    _check_ri('weired_url:/', SSHRI, hostname='weired_url', path='/')
    _check_ri('example.com:/', SSHRI, hostname='example.com', path='/')
    _check_ri('example.com:path/sp1',
              SSHRI,
              hostname='example.com',
              path='path/sp1')
    _check_ri(r'example.com/path/sp1\:fname',
              PathRI,
              localpath=r'example.com/path/sp1\:fname',
              path=r'example.com/path/sp1\:fname')
    # ssh is as stupid as us, so we will stay "Consistently" dumb
    """
    $> ssh example.com/path/sp1:fname
    ssh: Could not resolve hostname example.com/path/sp1:fname: Name or service not known

    edit 20190516 yoh: but this looks like a perfectly valid path.
    SSH knows that it is not a path but its SSHRI so it can stay dumb.
    We are trying to be smart and choose between RIs (even when we know that
    it is e.g. a file).
    """
    _check_ri('e.com/p/sp:f',
              PathRI,
              localpath='e.com/p/sp:f',
              path='e.com/p/sp:f')
    _check_ri('[email protected]/mydir',
              PathRI,
              localpath='[email protected]/mydir',
              path='[email protected]/mydir')

    # SSHRIs have .port, but it is empty
    eq_(SSHRI(hostname='example.com').port, '')

    # check that we are getting a warning logged when url can't be reconstructed
    # precisely
    # actually failed to come up with one -- becomes late here
    #_check_ri("http://host///..//p", scheme='http', path='/..//p')

    # actually this one is good enough to trigger a warning and I still don't know
    # what it should exactly be!?
    with swallow_logs(new_level=logging.DEBUG) as cml:
        weired_str = 'weired://'
        weired_url = RI(weired_str)
        repr(weired_url)
        cml.assert_logged('Parsed version of SSHRI .weired:/. '
                          'differs from original .weired://.')
        # but we store original str
        eq_(str(weired_url), weired_str)
        neq_(weired_url.as_str(), weired_str)

    raise SkipTest(
        "TODO: file://::1/some does complain about parsed version dropping ::1"
    )
Example #11
    def __call__(urls,
                 *,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.http import HTTPDownloader
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='download urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or str(path).endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=
                    ("When specifying multiple urls, --path should point to "
                     "a directory target (with a trailing separator). Got %r",
                     path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        need_datalad_remote = False
        for url in urls:
            # somewhat "ugly"
            downloader = providers.get_provider(url).get_downloader(url)
            try:
                downloaded_path = downloader.download(url,
                                                      path=path,
                                                      overwrite=overwrite)
            except Exception as e:
                ce = CapturedException(e)
                yield get_status_dict(status="error",
                                      message=str(ce),
                                      type="file",
                                      path=path,
                                      exception=ce,
                                      **common_report)
            else:
                if not need_datalad_remote \
                   and (downloader.authenticator or downloader.credential or
                        type(downloader) != HTTPDownloader):
                    need_datalad_remote = True
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_renderer='disabled',
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            ds_repo = ds.repo
            if isinstance(ds_repo, AnnexRepo):
                if need_datalad_remote:
                    from datalad.customremotes.base import (
                        ensure_datalad_remote, )
                    ensure_datalad_remote(ds_repo,
                                          autoenable=True,
                                          encryption=None)

                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds_repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds_repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, CapturedException(exc))

                    if archive:
                        for path in annex_paths:
                            yield from ds.add_archive_content(
                                path,
                                delete=True,
                                on_failure='ignore',
                                return_type='generator',
                                result_renderer='disabled')
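When archive extraction is requested, the code above preserves the full multi-part extension of the URL target via PurePosixPath.suffixes. Just that step, sketched with an invented URL path and target:

from pathlib import Path, PurePosixPath

url_path = '/pub/data/release.tar.gz'  # hypothetical path component of the URL
path = 'downloads/release'             # hypothetical download target

suffixes = PurePosixPath(url_path).suffixes  # ['.tar', '.gz']
if Path(path).suffixes != suffixes:
    path += ''.join(suffixes)
print(path)  # downloads/release.tar.gz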
Example #12
def _p(rpath):
    return str(Path(PurePosixPath(rpath)))
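_p() simply re-renders a POSIX-style relative path with the local platform's separators; on POSIX it is effectively a no-op. For example:

from pathlib import Path, PurePosixPath

def _p(rpath):
    return str(Path(PurePosixPath(rpath)))

print(_p('sub/dir/file.txt'))
# prints sub/dir/file.txt on POSIX and sub\dir\file.txt on Windows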
Example #13
    def __call__(dataset=None,
                 path=None,
                 sources=None,
                 process_type=None,
                 format='native'):
        ds = require_dataset(dataset or curdir,
                             purpose="extract metadata",
                             check_installed=not path)

        # check what extractors we want as sources, and whether they are
        # available
        if not sources:
            sources = ['metalad_core', 'metalad_annex'] \
                + assure_list(get_metadata_type(ds))
        # keep local, who knows what some extractors might pull in
        from pkg_resources import iter_entry_points  # delayed heavy import
        extractors = {}
        for ep in iter_entry_points('datalad.metadata.extractors'):
            if ep.name not in sources:
                # not needed here
                continue
            rec = dict(entrypoint=ep)
            if ep.name in extractors:  # pragma: no cover
                # potential conflict
                if extractors[
                        ep.name]['entrypoint'].dist.project_name == 'datalad':
                    # this is OK, just state it is happening
                    lgr.debug('Extractor %s overrides datalad-core variant',
                              ep)
                    extractors[ep.name] = rec
                elif ep.dist.project_name == 'datalad':
                    # also OK
                    lgr.debug('Prefer extractor %s over datalad-core variant',
                              ep)
                else:
                    msg = ('At least two DataLad extensions provide metadata '
                           'extractor %s: %s vs. %s', ep.name, ep.dist,
                           extractors[ep.name]['entrypoint'].dist)
                    if ep.name in sources:
                        # this extractor is required -> blow hard
                        raise RuntimeError(msg[0] % msg[1:])
                    else:
                        # still moan
                        lgr.warning(*msg)
                    # ignore the newcomer, it is listed second in sys.path
            else:
                # this one is fresh and unique
                extractors[ep.name] = rec
        for msrc in sources:
            if msrc not in extractors:
                # we said that we want to fail, rather than just moan about
                # less metadata
                raise ValueError(
                    "Enabled metadata extractor '{}' not available".format(
                        msrc), )
            # load extractor implementation
            rec = extractors[msrc]
            rec['process_type'] = process_type \
                if process_type and not process_type == 'extractors' \
                else ds.config.obtain(
                    'datalad.metadata.extract-from-{}'.format(
                        msrc.replace('_', '-')),
                    default='all')
            # load the extractor class, no instantiation yet
            try:
                rec['class'] = rec['entrypoint'].load()
            except Exception as e:  # pragma: no cover
                msg = ('Failed %s metadata extraction from %s: %s', msrc, ds,
                       exc_str(e))
                log_progress(lgr.error, 'metadataextractors', *msg)
                raise ValueError(msg[0] % msg[1:])

        res_props = dict(
            action='meta_extract',
            logger=lgr,
        )

        # build report on extractors and their state info
        if process_type == 'extractors':
            for ename, eprops in iteritems(extractors):
                state = {}
                # do not trip over old extractors
                if hasattr(eprops['class'], 'get_state'):
                    state.update(eprops['class']().get_state(ds))

                yield dict(action='meta_extract',
                           path=ds.path,
                           status='ok',
                           logger=lgr,
                           extractor=ename,
                           state=dict(
                               state,
                               process_type=eprops['process_type'],
                           ))
            return

        # build a representation of the dataset's content (incl subds
        # records)
        # go through a high-level command (not just the repo methods) to
        # get all the checks and sanitization of input arguments
        # this call is relatively expensive, but already anticipates
        # demand for information by our core extractors that always run
        # unconditionally, hence no real slowdown here
        # TODO this could be a dict, but MIH cannot think of an access
        # pattern that does not involve iteration over all items
        status = []
        exclude_paths = [
            ds.pathobj / PurePosixPath(e) for e in
            (list(exclude_from_metadata) +
             assure_list(ds.config.get('datalad.metadata.exclude-path', [])))
        ]
        if ds.is_installed():
            # we can make use of status
            res_props.update(refds=ds.path)

            for r in ds.status(
                    # let status sort out all path arg handling
                    # but this will likely make it impossible to use this
                    # command to just process an individual file independent
                    # of a dataset
                    path=path,
                    # it is safe to ask for annex info even when a dataset is
                    # plain Git
                    # NOTE changing to 'annex=availability' has substantial
                    # performance costs, as it involved resolving each annex
                    # symlink on the file-system, which can be really slow
                    # depending on the FS and the number of annexed files
                    annex='basic',
                    # TODO we never want to aggregate metadata from untracked
                    # content, but we might just want to see what we can get
                    # from a file
                    untracked='no',
                    # this command cannot and will not work recursively
                    recursive=False,
                    result_renderer='disabled'):
                # path reports are always absolute and anchored on the dataset
                # (no repo) path
                p = Path(r['path'])
                if p in exclude_paths or \
                        any(e in p.parents for e in exclude_paths):
                    # this needs to be ignored for any further processing
                    continue
                # strip useless context information
                status.append({
                    k: v
                    for k, v in iteritems(r)
                    if (k not in ('refds', 'parentds', 'action',
                                  'status') and not k.startswith('prev_'))
                })

            # determine the commit that we are describing
            refcommit = get_refcommit(ds)
            if refcommit is None or not len(status):
                # this seems extreme, but without a single commit there is
                # nothing we can have, or describe -> blow
                yield dict(
                    res_props,
                    status='error',
                    message=\
                    'No metadata-relevant repository content found. ' \
                    'Cannot determine reference commit for metadata ID',
                    type='dataset',
                    path=ds.path,
                )
                return
            # stamp every result
            res_props['refcommit'] = refcommit
        else:
            # no dataset at hand, take path arg at face value and hope
            # for the best
            # TODO we have to resolve the given path to make it match what
            # status is giving (abspath with ds (not repo) anchor)
            status = [dict(path=p, type='file') for p in assure_list(path)]
            # just for compatibility, mandatory argument list below
            refcommit = None

        if ds.is_installed():
            # check availability requirements and obtain data as needed
            needed_paths = set()
            for rec in extractors.values():
                if hasattr(rec['class'], 'get_required_content'):
                    needed_paths.update(
                        # new extractors do not need any instantiation args
                        s['path'] for s in rec['class']().get_required_content(
                            ds, rec['process_type'], status))
            if needed_paths:
                for r in ds.get(path=needed_paths,
                                return_type='generator',
                                result_renderer='disabled'):
                    if success_status_map.get(
                            r['status'],
                            False) != 'success':  # pragma: no cover
                        # only complain when something goes wrong
                        yield r

        contexts = {}
        nodes_by_context = {}
        try:
            for res in _proc(ds, refcommit, sources, status, extractors,
                             process_type):
                if format == 'native':
                    # that is what we pass around internally
                    res.update(**res_props)
                    yield res
                elif format == 'jsonld':
                    collect_jsonld_metadata(ds.pathobj, res, nodes_by_context,
                                            contexts)
        finally:
            # extractors can come from any source with no guarantee for
            # proper implementation. Let's make sure that we bring the
            # dataset back into a sane state (e.g. no batch processes
            # hanging around). We should do this here, as it is not
            # clear whether extraction results will be saved to the
            # dataset (which would have a similar sanitization effect)
            if ds.repo:
                ds.repo.precommit()
        if format == 'jsonld':
            yield dict(status='ok',
                       type='dataset',
                       path=ds.path,
                       metadata=format_jsonld_metadata(nodes_by_context),
                       **res_props)