Example #1
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # the log file is created upon logger initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    regex = r"\[ERROR\]"
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(cfg.get('datalad.log.vmem', False)):
        regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += r"(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)
    # Close all handlers so windows is happy -- apparently not closed fast enough
    for handler in lgr.handlers:
        handler.close()
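The EnsureBool()(cfg.get('datalad.log.timestamp', False)) calls above coerce config values, which typically arrive as strings such as "yes" or "0", into real booleans before branching on them. Below is a minimal, self-contained sketch of that coercion step; ensure_bool is a hypothetical stand-in for DataLad's EnsureBool constraint, and a plain dict stands in for the config manager.

def ensure_bool(value):
    # Pass booleans through and map common git-config style spellings;
    # anything unrecognized raises rather than silently counting as False.
    if isinstance(value, bool):
        return value
    mapping = {"1": True, "yes": True, "on": True, "true": True,
               "0": False, "no": False, "off": False, "false": False}
    try:
        return mapping[str(value).lower()]
    except KeyError:
        raise ValueError("Cannot interpret %r as a boolean" % (value,))

cfg_values = {"datalad.log.timestamp": "yes"}  # stand-in for cfg
if ensure_bool(cfg_values.get("datalad.log.timestamp", False)):
    print("timestamps will be included in the log lines")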
Example #2
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # the log file is created upon logger initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    regex = r"\[ERROR\]"
    if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
        regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += r"(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)
    # Close all handlers so windows is happy -- apparently not closed fast enough
    for handler in lgr.handlers:
        handler.close()
Example #3
def test_global_config():

    # from within tests, global config should be read from faked $HOME (see
    # setup_package)
    glb_cfg_file = Path(os.environ['HOME']) / '.gitconfig'
    assert any(glb_cfg_file.samefile(Path(p)) for p in dl_cfg._cfgfiles)
    assert_equal(dl_cfg.get("user.name"), "DataLad Tester")
    assert_equal(dl_cfg.get("user.email"), "*****@*****.**")
Example #4
def test_global_config():

    # from within tests, global config should be read from faked $HOME (see
    # setup_package)
    glb_cfg_file = Path(os.path.expanduser('~')) / '.gitconfig'
    assert any(
        glb_cfg_file.samefile(Path(p)) for p in dl_cfg._stores['git']['files'])
    assert_equal(dl_cfg.get("user.name"), "DataLad Tester")
    assert_equal(dl_cfg.get("user.email"), "*****@*****.**")
Example #5
def _ok_metadata(res, msrc, ds, loc):
    restype = res.get('type', None)
    if restype not in ('dataset', 'file'):  # pragma: no cover
        # untested, would need broken extractor
        lgr.error(
            'metadata report for something other than a file or dataset: %s',
            restype)
        return False

    meta = res.get('metadata', None)
    if meta is None or isinstance(meta, dict):
        return True
    else:  # pragma: no cover
        # untested, needs a broken extractor
        msg = ("Metadata extractor '%s' yielded something other than a "
               "dictionary for dataset %s%s -- this is likely a bug, "
               "please consider reporting it. "
               "This type of native metadata will be ignored. Got: %s",
               msrc, ds, '' if loc is None else ' content {}'.format(loc),
               repr(meta))
        if cfg.get('datalad.runtime.raiseonerror'):
            raise RuntimeError(*msg)

        lgr.error(*msg)
        return False
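_ok_metadata shows a pattern that recurs throughout this page: problems are logged by default, but the datalad.runtime.raiseonerror config switch promotes them to hard failures. A stripped-down, self-contained sketch of that toggle follows; report_problem and the plain dict of config values are invented for illustration.

import logging

lgr = logging.getLogger(__name__)

def report_problem(cfg_values, msg, *args):
    # Escalate to an exception when the switch is on, otherwise just log.
    if cfg_values.get("datalad.runtime.raiseonerror"):
        raise RuntimeError(msg % args)
    lgr.error(msg, *args)

report_problem({}, "extractor %s returned unusable metadata", "bids")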
Example #6
def test_install_dataset_from_just_source(src_repo=None, path=None):

    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:

        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
Example #7
def test_ExtractedArchive(path):
    archive = op.join(path, fn_archive_obscure_ext)
    earchive = ExtractedArchive(archive)
    assert_false(op.exists(earchive.path))
    # no longer the case -- just using hash for now
    # assert_in(os.path.basename(archive), earchive.path)

    fpath = op.join(
        fn_archive_obscure,  # lead directory
        fn_in_archive_obscure)
    extracted = earchive.get_extracted_filename(fpath)
    eq_(extracted, op.join(earchive.path, fpath))
    assert_false(op.exists(extracted))  # not yet

    extracted_ = earchive.get_extracted_file(fpath)
    eq_(extracted, extracted_)
    assert_true(op.exists(extracted))  # now it should

    extracted_files = earchive.get_extracted_files()
    ok_generator(extracted_files)
    eq_(
        sorted(extracted_files),
        sorted([
            # ['bbc/3.txt', 'bbc/abc']
            op.join(fn_archive_obscure, fn_in_archive_obscure),
            op.join(fn_archive_obscure, '3.txt')
        ]))

    earchive.clean()
    if not dl_cfg.get('datalad.tests.temp.keep'):
        assert_false(op.exists(earchive.path))
Example #8
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # the log file is created upon logger initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        ok_(
            re.match(
                r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} \[ERROR\](\s+\S+\s*)? %s"
                % msg, line))
    else:
        ok_(re.match(r"\[ERROR\](\s+\S+\s*)? %s" % msg, line))
Example #9
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # the log file is created upon logger initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        ok_(re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} \[ERROR\](\s+\S+\s*)? %s" % msg,
                    line))
    else:
        ok_(re.match(r"\[ERROR\](\s+\S+\s*)? %s" % msg,
                    line))
Example #10
def load_extensions():
    """Load entrypoint for any configured extension package

    Log a warning in case a requested extension is not available, or if
    a requested extension fails on load.

    Extensions to load are taken from the 'datalad.extensions.load'
    configuration item.
    """
    from datalad import cfg
    load_extensions = cfg.get('datalad.extensions.load', get_all=True)
    if load_extensions:
        from datalad.utils import ensure_list
        exts = {
            ename: eload
            for ename, _, eload in iter_entrypoints('datalad.extensions')
        }
        for el in ensure_list(load_extensions):
            if el not in exts:
                lgr.warning('Requested extension %r is not available', el)
                continue
            try:
                exts[el]()
            except Exception as e:
                ce = CapturedException(e)
                lgr.warning('Could not load extension %r: %s', el, ce)
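load_extensions ties a multi-valued config item (note the get_all=True lookup) to Python entry points. A rough, standard-library-only sketch of the same pattern follows; the myapp.plugins group and the requested names are made up, and importlib.metadata.entry_points(group=...) needs Python 3.10 or later.

import logging
from importlib.metadata import entry_points

lgr = logging.getLogger(__name__)

def load_requested_plugins(requested):
    # Index the available entry points by name, then load only the requested
    # ones, warning (rather than failing) on anything missing or broken.
    available = {ep.name: ep for ep in entry_points(group="myapp.plugins")}
    for name in requested:
        if name not in available:
            lgr.warning("Requested plugin %r is not available", name)
            continue
        try:
            available[name].load()
        except Exception as e:
            lgr.warning("Could not load plugin %r: %s", name, e)

load_requested_plugins(["nonexistent"])  # warns, does not raise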
Example #11
 def newfunc(*args, **kwargs):
     if on_windows:
         raise SkipTest("SSH currently not available on windows.")
     from datalad import cfg
     test_ssh = cfg.get("datalad.tests.ssh", '')
     if test_ssh in ('', '0', 'false', 'no'):
         raise SkipTest("Run this test by setting DATALAD_TESTS_SSH")
     return func(*args, **kwargs)
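This wrapper is the body of a decorator that skips SSH-dependent tests unless datalad.tests.ssh (settable through the DATALAD_TESTS_SSH environment variable) is switched on. A generic, self-contained version of such an opt-in gate is sketched below; the environment variable name is only an example.

import functools
import os
from unittest import SkipTest

def skip_unless_enabled(envvar):
    """Skip the decorated test unless envvar is set to a truthy value."""
    def decorator(func):
        @functools.wraps(func)
        def newfunc(*args, **kwargs):
            if os.environ.get(envvar, "").lower() in ("", "0", "false", "no"):
                raise SkipTest("Run this test by setting %s" % envvar)
            return func(*args, **kwargs)
        return newfunc
    return decorator

@skip_unless_enabled("MYPROJECT_TESTS_SSH")
def test_something_over_ssh():
    pass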
Example #12
def test_update_fetch_all(path=None):
    path = Path(path)
    remote_1 = str(path / "remote_1")
    remote_2 = str(path / "remote_2")

    ds = Dataset(path / "src").create()
    src = ds.repo.path

    ds_rmt1 = clone(source=src, path=remote_1)
    ds_rmt2 = clone(source=src, path=remote_2)

    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    (ds_rmt1.pathobj / "first.txt").write_text("some file load")
    ds_rmt1.save()

    # TODO: Modify an already present file!

    (ds_rmt2.pathobj / "second.txt").write_text("different file load")
    ds_rmt2.save()

    # Let's init some special remote which we couldn't really update/fetch
    if not dl_cfg.get('datalad.tests.dataladremote'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])
    # fetch all remotes
    assert_result_count(ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/" + DEFAULT_BRANCH))
    assert_in("second.txt", ds.repo.get_files("sibling_2/" + DEFAULT_BRANCH))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(ds.update(sibling='sibling_1', merge=True),
                        1,
                        action='update',
                        status='ok',
                        type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    annexprops = ds.repo.get_file_annexinfo("first.txt",
                                            eval_availability=True)
    annexprops['key']  # blows if unknown
    eq_(False, annexprops['has_content'])
Example #13
    def get_connection(self,
                       url,
                       use_remote_annex_bundle=True,
                       force_ip=False):
        """Get a singleton, representing a shared ssh connection to `url`

        Parameters
        ----------
        url: str
          ssh url
        force_ip : {False, 4, 6}
          Force the use of IPv4 or IPv6 addresses.

        Returns
        -------
        SSHConnection
        """
        # parse url:
        from datalad.support.network import RI, is_ssh
        if isinstance(url, RI):
            sshri = url
        else:
            if ':' not in url and '/' not in url:
                # it is just a hostname
                lgr.debug("Assuming %r is just a hostname for ssh connection",
                          url)
                url += ':'
            sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError("Unsupported SSH URL: '{0}', use "
                             "ssh://host/path or host:path syntax".format(url))

        from datalad import cfg
        identity_file = cfg.get("datalad.ssh.identityfile")

        conhash = get_connection_hash(
            sshri.hostname,
            port=sshri.port,
            identity_file=identity_file or "",
            username=sshri.username,
            bundled=use_remote_annex_bundle,
            force_ip=force_ip,
        )
        # determine control master:
        ctrl_path = self.socket_dir / conhash

        # do we know it already?
        if ctrl_path in self._connections:
            return self._connections[ctrl_path]
        else:
            c = SSHConnection(ctrl_path,
                              sshri,
                              identity_file=identity_file,
                              use_remote_annex_bundle=use_remote_annex_bundle,
                              force_ip=force_ip)
            self._connections[ctrl_path] = c
            return c
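get_connection keeps one SSHConnection per control path, so repeated requests for the same target reuse the multiplexed connection. The bookkeeping reduces to a "one instance per key" cache; a generic sketch of just that part follows, with the factory callable standing in for SSHConnection construction.

class ConnectionPool:
    """Hand out at most one connection object per cache key."""

    def __init__(self, factory):
        self._factory = factory      # callable that builds a new connection
        self._connections = {}       # cache key -> connection object

    def get(self, hostname, port=None, username=None):
        key = (hostname, port, username)
        conn = self._connections.get(key)
        if conn is None:
            conn = self._factory(hostname, port=port, username=username)
            self._connections[key] = conn
        return conn

pool = ConnectionPool(
    lambda host, port=None, username=None: "connection to %s" % host)
assert pool.get("example.com") is pool.get("example.com")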
Example #14
 def _get_format(self, log_name=False, log_pid=False):
     from datalad import cfg
     from datalad.config import anything2bool
     show_timestamps = anything2bool(cfg.get('datalad.log.timestamp', False))
     return (("" if not show_timestamps else "$BOLD%(asctime)-15s$RESET ") +
             ("%(name)-15s " if log_name else "") +
             ("{%(process)d}" if log_pid else "") +
             "[%(levelname)s] "
             "%(message)s ")
Example #15
File: log.py Project: leej3/datalad
 def _get_format(self, log_name=False, log_pid=False):
     from datalad import cfg
     from datalad.config import anything2bool
     show_timestamps = anything2bool(cfg.get('datalad.log.timestamp', False))
     return (("" if not show_timestamps else "$BOLD%(asctime)-15s$RESET ") +
             ("%(name)-15s " if log_name else "") +
             ("{%(process)d}" if log_pid else "") +
             "[%(levelname)s] "
             "%(message)s ")
Example #16
def test_update_fetch_all(src, remote_1, remote_2):
    rmt1 = AnnexRepo.clone(src, remote_1)
    rmt2 = AnnexRepo.clone(src, remote_2)

    ds = Dataset(src)
    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    with open(opj(remote_1, "first.txt"), "w") as f:
        f.write("some file load")
    rmt1.add("first.txt")
    rmt1.commit()
    # TODO: Modify an already present file!

    with open(opj(remote_2, "second.txt"), "w") as f:
        f.write("different file load")
    rmt2.add("second.txt", git=True)
    rmt2.commit(msg="Add file to git.")

    # Let's init some special remote which we couldn't really update/fetch
    if not dl_cfg.get('datalad.tests.dataladremote'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])
    # fetch all remotes
    assert_result_count(
        ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/" + DEFAULT_BRANCH))
    assert_in("second.txt", ds.repo.get_files("sibling_2/" + DEFAULT_BRANCH))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(
        ds.update(sibling='sibling_1', merge=True),
        1, action='update', status='ok', type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt",
              ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    ds.repo.get_file_key("first.txt")  # raises if unknown
    eq_([False], ds.repo.file_has_content(["first.txt"]))
Example #17
        def _get_plugin_specs(param_key=None, cfg_key=None):
            spec = common_params.get(param_key, None)
            if spec is not None:
                # this is already a list of lists
                return spec

            spec = dlcfg.get(cfg_key, None)
            if spec is None:
                return
            elif not isinstance(spec, tuple):
                spec = [spec]
            return [shlex.split(s) for s in spec]
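_get_plugin_specs accepts either an explicit list of specs or one or more string values read from the config, which it tokenizes with shlex.split so that quoted arguments survive. For instance (the spec values are made up):

import shlex

spec = ("setup_yoda_dataset", "run 'my script.py' --verbose")
print([shlex.split(s) for s in spec])
# [['setup_yoda_dataset'], ['run', 'my script.py', '--verbose']]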
Example #18
def _get_github_entity(gh, cred, github_user, github_passwd, github_organization):
    # figure out authentication
    if not (github_user and github_passwd):
        # access to the system secrets
        if github_user:
            # check that the keystore knows about this user
            if github_user != cred.get('user', github_user):
                # there is a mismatch, we need to ask
                creds = cred.enter_new()
                github_user = creds['user']
                github_passwd = creds['password']

        # if a user is provided, go with it, don't even ask any store
        if github_user is None and not cred.is_known:
            # let's figure out authentication
            if github_user is None:
                # check if there is an oauth token from
                # https://github.com/sociomantic/git-hub
                github_user = cfg.get('hub.oauthtoken', None)

        if github_user is None:
            # still nothing, ask if necessary
            creds = cred()
            github_user = creds['user']
            github_passwd = creds['password']

    if not github_user:
        raise gh.BadCredentialsException(403, 'no user specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # XXX make sure to wipe out known credentials if that happens
    authed_gh = gh.Github(
        github_user,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                                 github_organization,
                                 exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
Example #19
def _run_extractor(extractor_cls, name, ds, refcommit, status, process_type):
    """Helper to control extractor using the right API

    Central switch to deal with alternative/future APIs is inside
    """
    try:
        # detect supported API and interface as needed
        if issubclass(extractor_cls, MetadataExtractor):
            # new-style, command-like extractors
            extractor = extractor_cls()
            for r in extractor(dataset=ds,
                               refcommit=refcommit,
                               status=status,
                               process_type=process_type):
                yield r
        elif hasattr(extractor_cls, 'get_metadata'):  # pragma: no cover
            # old-style, keep around for a while, but don't sweat over it much
            for res in _yield_res_from_pre2019_extractor(
                    ds,
                    name,
                    extractor_cls,
                    process_type,
                    # old extractors only take a list of relative paths
                    # and cannot benefit from outside knowledge
                    # TODO avoid is_installed() call
                [
                    text_type(Path(p['path']).relative_to(ds.pathobj))
                    if ds.is_installed() else p['path'] for p in status
                ]):
                yield res
        else:  # pragma: no cover
            raise RuntimeError(
                '{} does not have a recognised extractor API'.format(
                    extractor_cls))
    except Exception as e:  # pragma: no cover
        if cfg.get('datalad.runtime.raiseonerror'):
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                name,
                ds,
            )
            raise
        yield get_status_dict(
            ds=ds,
            # any errors will have been reported before
            status='error',
            message=('Failed to get %s metadata (%s): %s', ds, name,
                     exc_str(e)),
        )
Example #20
def _get_github_entity(gh, cred, github_login, github_passwd, github_organization):
    if github_login == 'disabledloginfortesting':
        raise gh.BadCredentialsException(403, 'no login specified')
    if not (github_login and github_passwd):
        # we don't have both
        # check if there is an oauth token from
        # https://github.com/sociomantic/git-hub
        token = False
        if not cred.is_known:
            if not github_login:
                # try to find a token as login
                github_login = cfg.get('hub.oauthtoken', None)
                token = True
            if not (github_login and (github_passwd or token)):
                # still at least one missing, utilize the credential store
                # to get auth info, pass potential passwd value along
                cred.enter_new(
                    user=github_login,
                    password=github_passwd)
        # now we should really have it
        creds = cred()
        github_login = creds['user']
        github_passwd = creds['password']

    if not github_login:
        raise gh.BadCredentialsException(403, 'no login specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # and in this case, known credentials are wiped out again below
    authed_gh = gh.Github(
        github_login,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                                 github_organization,
                                 exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
Example #21
    def test_addurls(self, path):
        ds = Dataset(path).create(force=True)

        def get_annex_commit_counts():
            return len(ds.repo.get_revisions("git-annex"))

        n_annex_commits = get_annex_commit_counts()

        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(op.join(ds.path, fname))

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta, {"subdir": [subdir], "name": [fname]})

        # Ignore this check if we're faking dates because that disables
        # batch mode.
        if not dl_cfg.get('datalad.fake-dates'):
            # We should have two new commits on the git-annex: one for the
            # added urls and one for the added metadata.
            eq_(n_annex_commits + 2, get_annex_commit_counts())

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(ds.addurls(self.json_file,
                                     "{url}",
                                     "{name}",
                                     ifexists="skip"),
                          action="addurls",
                          status="notneeded")

        # Adding to already existing links works, as long as the content is the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open(op.join(ds.path, "a"), "w") as ofh:
            ofh.write("changed")
        ds.save("a")

        assert_raises(IncompleteResultsError, ds.addurls, self.json_file,
                      "{url}", "{name}")
Example #22
def _get_github_entity(gh, cred, github_login, github_passwd, github_organization):
    if github_login == 'disabledloginfortesting':
        raise gh.BadCredentialsException(403, 'no login specified')
    if not (github_login and github_passwd):
        # we don't have both
        # check if there is an oauth token from
        # https://github.com/sociomantic/git-hub
        token = False
        if not cred.is_known:
            if not github_login:
                # try to find a token as login
                github_login = cfg.get('hub.oauthtoken', None)
                token = True
            if not (github_login and (github_passwd or token)):
                # still at least one missing, utilize the credential store
                # to get auth info, pass potential passwd value along
                cred.enter_new(
                    user=github_login,
                    password=github_passwd)
        # now we should really have it
        creds = cred()
        github_login = creds['user']
        github_passwd = creds['password']

    if not github_login:
        raise gh.BadCredentialsException(403, 'no login specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # and in this case, known credentials are wiped out again below
    authed_gh = gh.Github(
        github_login,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                                 github_organization,
                                 exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
Example #23
def _ok_metadata(meta, mtype, ds, loc):
    if meta is None or isinstance(meta, dict):
        return True

    msg = ("Metadata extractor '%s' yielded something other than a dictionary "
           "for dataset %s%s -- this is likely a bug, please consider "
           "reporting it. "
           "This type of native metadata will be ignored. Got: %s", mtype, ds,
           '' if loc is None else ' content {}'.format(loc), repr(meta))
    if cfg.get('datalad.runtime.raiseonerror'):
        raise RuntimeError(*msg)

    lgr.error(*msg)
    return False
Example #24
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about HTTP headers etc., which could provide
    information to caches/proxy servers on how long to retain the content.

    TODO: theoretically this is not network-specific at all -- just a memoize
    pattern -- but at some point we may make it treat headers etc. correctly.
    ATM it would support any URL we support via providers/downloaders

    Parameters
    ----------
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for.  <0 - would retain forever.  If None -
       would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:

        fage = (time.time() - os.stat(doc_fname).st_mtime) / (24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s", url,
                          doc_fname)
                doc = pickle.load(open(doc_fname, 'rb'))
            except Exception as e:  # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        pickle.dump(doc, open(doc_fname, 'wb'))
        lgr.debug("stored result of request to '{}' in {}".format(
            url, doc_fname))
    return doc
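Given the signature above, a call with a custom fetcher might look like the following sketch, assuming get_cached_url_content is importable from your code; the URL and cache name are invented. Per the docstring, maxage=1 keeps a cached copy for one day, 0 forces a re-download, and a negative value keeps it forever.

import urllib.request

def fetch(url):
    with urllib.request.urlopen(url) as response:
        return response.read()

doc = get_cached_url_content(
    "https://example.com/listing.json",   # hypothetical document URL
    name="example-listing",
    fetcher=fetch,
    maxage=1,
)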
Example #25
def test_get_resolved_values():
    from datalad.tests.utils_pytest import _get_resolved_flavors
    flavors = ['networkish', 'local']
    eq_(([] if dl_cfg.get('datalad.tests.nonetwork') else ['networkish']) +
        ['local'], _get_resolved_flavors(flavors))

    with patch_config({'datalad.tests.nonetwork': '1'}):
        eq_(_get_resolved_flavors(flavors), ['local'])

        # and one more to see the exception being raised if there is nothing to test on
        @with_testrepos(flavors=['network'])
        def magical():
            raise AssertionError("Must not be ran")

        assert_raises(Skipped, magical)
Example #26
def test_with_tempfile_mkdir():
    dnames = []  # just to store the name within the decorated function

    @with_tempfile(mkdir=True)
    def check_mkdir(d1):
        ok_(os.path.exists(d1))
        ok_(os.path.isdir(d1))
        dnames.append(d1)
        eq_(glob(os.path.join(d1, '*')), [])
        # Create a file to ensure we can later remove the temporary load
        with open(os.path.join(d1, "test.dat"), "w") as f:
            f.write("TEST LOAD")

    check_mkdir()
    if not dl_cfg.get('datalad.tests.temp.keep'):
        ok_(not os.path.exists(dnames[0]))  # got removed
Example #27
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about HTTP headers etc., which could provide
    information to caches/proxy servers on how long to retain the content.

    TODO: theoretically this is not network-specific at all -- just a memoize
    pattern -- but at some point we may make it treat headers etc. correctly.
    ATM it would support any URL we support via providers/downloaders

    Parameters
    ----------
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for.  <0 - would retain forever.  If None -
       would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:

        fage = (time.time() - os.stat(doc_fname).st_mtime)/(24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s", url, doc_fname)
                doc = pickle.load(open(doc_fname, 'rb'))
            except Exception as e:  # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        pickle.dump(doc, open(doc_fname, 'wb'))
        lgr.debug("stored result of request to '{}' in {}".format(url, doc_fname))
    return doc
Example #28
    def _get_cnmeta(self, bids):
        # TODO any custom handling of participants infos should eventually
        # be done by pybids in one way or another
        path_props = {}
        participants_fname = opj(self.ds.path, 'participants.tsv')
        if exists(participants_fname):
            try:
                for rx, info in yield_participant_info(participants_fname):
                    path_props[rx] = info
            except Exception as exc:
                lgr.warning(
                    "Failed to load participants info due to: %s. Skipping the rest of the file",
                    exc_str(exc))

        # now go over all files in the dataset and query pybids for its take
        # on each of them
        for f in self.paths:
            # BIDS carries a substantial portion of its metadata in JSON
            # sidecar files. we ignore them here completely
            # this might yield some false-negatives in theory, but
            # this case has not been observed in practice yet, hence
            # doing it cheap for now
            if f.endswith('.json'):
                continue
            md = {}
            try:
                md.update({
                    'bids:{}'.format(k): v
                    for k, v in bids.get_metadata(opj(self.ds.path,
                                                      f)).items()
                    # no nested structures for now (can be monstrous when DICOM
                    # metadata is embedded)
                    if not isinstance(v, dict)
                })
            except Exception as e:
                lgr.debug('no usable BIDS metadata for %s in %s: %s', f,
                          self.ds, exc_str(e))
                if cfg.get('datalad.runtime.raiseonerror'):
                    raise

            # now check all props from other sources and apply them
            for rx in path_props:
                if rx.match(f):
                    md.update(path_props[rx])
            yield f, md
Example #29
def _ok_metadata(meta, mtype, ds, loc):
    if meta is None or isinstance(meta, dict):
        return True

    msg = (
        "Metadata extractor '%s' yielded something other than a dictionary "
        "for dataset %s%s -- this is likely a bug, please consider "
        "reporting it. "
        "This type of native metadata will be ignored. Got: %s",
        mtype,
        ds,
        '' if loc is None else ' content {}'.format(loc),
        repr(meta))
    if cfg.get('datalad.runtime.raiseonerror'):
        raise RuntimeError(*msg)

    lgr.error(*msg)
    return False
Example #30
def test_with_testrepos():
    repos = []

    @with_testrepos
    def check_with_testrepos(repo):
        repos.append(repo)

    check_with_testrepos()

    eq_(len(repos),
        2 if on_windows  # TODO -- would fail now in DATALAD_TESTS_NONETWORK mode
          else (15 if dl_cfg.get('datalad.tests.nonetwork') else 16))  # local, local-url, clone, network

    for repo in repos:
        if not (repo.startswith('git://') or repo.startswith('http')):
            # either it is a "local" or a removed clone
            ok_(exists(opj(repo, '.git'))
                or
                not exists(opj(repo, '.git', 'remove-me')))
Example #31
    def _prep_connection_args(self, url):
        # parse url:
        from datalad.support.network import RI, is_ssh
        if isinstance(url, RI):
            sshri = url
        else:
            if ':' not in url and '/' not in url:
                # it is just a hostname
                lgr.debug("Assuming %r is just a hostname for ssh connection",
                          url)
                url += ':'
            sshri = RI(url)

        if not is_ssh(sshri):
            raise ValueError("Unsupported SSH URL: '{0}', use "
                             "ssh://host/path or host:path syntax".format(url))

        from datalad import cfg
        identity_file = cfg.get("datalad.ssh.identityfile")
        return sshri, identity_file
Example #32
    def __init__(self, path=None, puke_if_exists=True):
        if not path:
            path = \
                tempfile.mktemp(**get_tempfile_kwargs(
                    {'dir': dl_cfg.get("datalad.tests.temp.dir")},
                    prefix='testrepo'))
            # to be removed upon teardown
            _TEMP_PATHS_GENERATED.append(path)
        if puke_if_exists and exists(path):
            raise RuntimeError("Directory %s for test repo already exists" %
                               path)
        # swallow logs so we don't print all those about crippled FS etc
        with swallow_logs():
            self.repo = self.REPO_CLASS(path)
            # For additional testing that our datalad remote does not interfere
            # and manages to handle all http urls and requests:
            if self.REPO_CLASS is AnnexRepo and \
                    os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
                init_datalad_remote(self.repo, 'datalad', autoenable=True)

        self._created = False
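The constructor above creates throwaway repositories under a directory that can be redirected via datalad.tests.temp.dir and records each path for teardown. A reduced, standard-library version of that arrangement follows; the registry list and prefix mirror the snippet but are otherwise illustrative.

import tempfile

_TEMP_PATHS_GENERATED = []

def make_test_dir(base_dir=None, prefix="testrepo"):
    # Honor a configured base directory when given; otherwise fall back to
    # the system default temp location.
    path = tempfile.mkdtemp(prefix=prefix, dir=base_dir or None)
    _TEMP_PATHS_GENERATED.append(path)   # to be removed upon teardown
    return path

print(make_test_dir())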
Example #33
def test_keeptemp_via_env_variable():

    if dl_cfg.get('datalad.tests.temp.keep'):  # pragma: no cover
        pytest.skip("An env variable is set to preserve tempfiles")

    files = []

    @with_tempfile()
    def check(f):
        open(f, 'w').write("LOAD")
        files.append(f)

    with patch.dict('os.environ', {}):
        check()

    with patch.dict('os.environ', {'DATALAD_TESTS_TEMP_KEEP': '1'}):
        check()

    eq_(len(files), 2)
    ok_(not exists(files[0]), msg="File %s still exists" % files[0])
    ok_(exists(files[1]), msg="File %s does not exist" % files[1])

    rmtemp(files[-1])
Example #34
    def eval_func(wrapped, instance, args, kwargs):
        # for result filters and pre/post procedures
        # we need to produce a dict with argname/argvalue pairs for all args
        # incl. defaults and args given as positionals
        allkwargs = get_allargs_as_kwargs(wrapped, args, kwargs)
        # determine class, the __call__ method of which we are decorating:
        # Ben: Note that this is a bit dirty in PY2 and imposes restrictions on
        # when and how to use eval_results as well as on how to name a command's
        # module and class. As of now, we are inline with these requirements as
        # far as I'm aware.
        mod = sys.modules[wrapped.__module__]
        if PY2:
            # we rely on:
            # - decorated function is method of a subclass of Interface
            # - the name of the class matches the last part of the module's name
            #   if converted to lower
            # for example:
            # ..../where/ever/mycommand.py:
            # class MyCommand(Interface):
            #     @eval_results
            #     def __call__(..)
            command_class_names = \
                [i for i in mod.__dict__
                 if type(mod.__dict__[i]) == type and
                 issubclass(mod.__dict__[i], Interface) and
                 i.lower().startswith(wrapped.__module__.split('.')[-1].replace('datalad_', '').replace('_', ''))]
            assert len(command_class_names) == 1, (command_class_names, mod.__name__)
            command_class_name = command_class_names[0]
        else:
            command_class_name = wrapped.__qualname__.split('.')[-2]
        _func_class = mod.__dict__[command_class_name]
        lgr.debug("Determined class of decorated function: %s", _func_class)

        # retrieve common options from kwargs, and fall back on the command
        # class attributes, or general defaults if needed
        kwargs = kwargs.copy()  # we will pop, which might cause side-effect
        common_params = {
            p_name: kwargs.pop(
                p_name,
                getattr(_func_class, p_name, eval_defaults[p_name]))
            for p_name in eval_params}
        # short cuts and configured setup for common options
        on_failure = common_params['on_failure']
        return_type = common_params['return_type']
        # resolve string labels for transformers too
        result_xfm = common_params['result_xfm']
        if result_xfm in known_result_xfms:
            result_xfm = known_result_xfms[result_xfm]
        result_renderer = common_params['result_renderer']
        # TODO remove this conditional branch entirely, done outside
        if not result_renderer:
            result_renderer = dlcfg.get('datalad.api.result-renderer', None)
        # wrap the filter into a helper to be able to pass additional arguments
        # if the filter supports it, but at the same time keep the required interface
        # as minimal as possible. Also do this here, in order to avoid this test
        # to be performed for each return value
        result_filter = common_params['result_filter']
        _result_filter = result_filter
        if result_filter:
            if isinstance(result_filter, Constraint):
                _result_filter = result_filter.__call__
            if (PY2 and inspect.getargspec(_result_filter).keywords) or \
                    (not PY2 and inspect.getfullargspec(_result_filter).varkw):

                def _result_filter(res):
                    return result_filter(res, **allkwargs)

        def _get_procedure_specs(param_key=None, cfg_key=None, ds=None):
            spec = common_params.get(param_key, None)
            if spec is not None:
                # this is already a list of lists
                return spec

            from datalad.distribution.dataset import Dataset
            ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None
            spec = (ds.config if ds and ds.is_installed()
                    else dlcfg).get(cfg_key, None)
            if spec is None:
                return
            elif not isinstance(spec, tuple):
                spec = [spec]
            return [shlex.split(s) for s in spec]

        # query cfg for defaults
        cmdline_name = cls2cmdlinename(_func_class)
        dataset_arg = allkwargs.get('dataset', None)
        proc_pre = _get_procedure_specs(
            'proc_pre',
            'datalad.{}.proc-pre'.format(cmdline_name),
            ds=dataset_arg)
        proc_post = _get_procedure_specs(
            'proc_post',
            'datalad.{}.proc-post'.format(cmdline_name),
            ds=dataset_arg)

        # this internal helper function actually drives the command
        # generator-style, it may generate an exception if desired,
        # on incomplete results
        def generator_func(*_args, **_kwargs):
            # flag whether to raise an exception
            incomplete_results = []
            # track what actions were performed how many times
            action_summary = {}

            if proc_pre and cmdline_name != 'run-procedure':
                from datalad.interface.run_procedure import RunProcedure
                for procspec in proc_pre:
                    lgr.debug('Running configured pre-procedure %s', procspec)
                    for r in _process_results(
                            RunProcedure.__call__(
                                procspec,
                                dataset=dataset_arg,
                                return_type='generator'),
                            _func_class, action_summary,
                            on_failure, incomplete_results,
                            result_renderer, result_xfm, result_filter,
                            **_kwargs):
                        yield r

            # if a custom summary is to be provided, collect the results
            # of the command execution
            results = []
            do_custom_result_summary = result_renderer == 'tailored' \
                and hasattr(_func_class, 'custom_result_summary_renderer')

            # process main results
            for r in _process_results(
                    wrapped(*_args, **_kwargs),
                    _func_class, action_summary,
                    on_failure, incomplete_results,
                    result_renderer, result_xfm, _result_filter, **_kwargs):
                yield r
                # collect if summary is desired
                if do_custom_result_summary:
                    results.append(r)

            if proc_post and cmdline_name != 'run-procedure':
                from datalad.interface.run_procedure import RunProcedure
                for procspec in proc_post:
                    lgr.debug('Running configured post-procedure %s', procspec)
                    for r in _process_results(
                            RunProcedure.__call__(
                                procspec,
                                dataset=dataset_arg,
                                return_type='generator'),
                            _func_class, action_summary,
                            on_failure, incomplete_results,
                            result_renderer, result_xfm, result_filter,
                            **_kwargs):
                        yield r

            # result summary before a potential exception
            # custom first
            if do_custom_result_summary:
                _func_class.custom_result_summary_renderer(results)
            elif result_renderer == 'default' and action_summary and \
                    sum(sum(s.values()) for s in action_summary.values()) > 1:
                # give a summary in default mode, when there was more than one
                # action performed
                ui.message("action summary:\n  {}".format(
                    '\n  '.join('{} ({})'.format(
                        act,
                        ', '.join('{}: {}'.format(status, action_summary[act][status])
                                  for status in sorted(action_summary[act])))
                                for act in sorted(action_summary))))

            if incomplete_results:
                raise IncompleteResultsError(
                    failed=incomplete_results,
                    msg="Command did not complete successfully")

        if return_type == 'generator':
            # hand over the generator
            return generator_func(*args, **kwargs)
        else:
            @wrapt.decorator
            def return_func(wrapped_, instance_, args_, kwargs_):
                results = wrapped_(*args_, **kwargs_)
                if inspect.isgenerator(results):
                    # unwind generator if there is one, this actually runs
                    # any processing
                    results = list(results)
                # render summaries
                if not result_xfm and result_renderer == 'tailored':
                    # cannot render transformed results
                    if hasattr(_func_class, 'custom_result_summary_renderer'):
                        _func_class.custom_result_summary_renderer(results)
                if return_type == 'item-or-list' and \
                        len(results) < 2:
                    return results[0] if results else None
                else:
                    return results

            return return_func(generator_func)(*args, **kwargs)
Example #35
File: log.py Project: leej3/datalad
 def _get_config(self, var, default=None):
     from datalad import cfg
     return cfg.get(self.name.lower() + '.log.' + var, default)
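_get_config derives the config key from the logger helper's name, so a helper named "datalad" ends up querying datalad.log.<var>. A small usage sketch of the same lookup done directly against the config manager; the datalad.log.level option name and its default are assumptions here.

from datalad import cfg

def get_log_setting(logger_name, var, default=None):
    # mirrors _get_config above: "<logger-name>.log.<var>"
    return cfg.get(logger_name.lower() + ".log." + var, default)

log_level = get_log_setting("datalad", "level", default="INFO")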
Example #36
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths), ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warn(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                           else [p for p, c, a in content_info if not c and a])
            )

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields',
        default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {ep.name: ep for ep in iter_entry_points('datalad.metadata.extractors')}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this installation',
                mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s",
                mtype, ds, exc_str(e))
            continue
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message actually
        #  seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in iteritems(val)}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(
                        v,
                        key=_unique_value_key)] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
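The unique-value bookkeeping in the loop above collects, per metadata key, the set of values seen across files so they can be reported once at the dataset level, while excluded keys are kept but stripped of their values. Reduced to its core, and assuming values are hashable (which _val2hashable takes care of in the snippet), the idea looks like this:

def collect_unique(per_file_metadata, exclude=()):
    """Collect, per metadata key, the sorted set of values seen across files."""
    unique = {}
    for meta in per_file_metadata:
        for key, value in meta.items():
            if key in exclude:
                # keep the key so its presence is known, but drop the values
                unique[key] = None
                continue
            unique.setdefault(key, set()).add(value)
    return {k: sorted(v) if v is not None else None
            for k, v in unique.items()}

print(collect_unique([{"species": "mouse"}, {"species": "rat"}]))
# -> {'species': ['mouse', 'rat']}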
Example #37
 def newfunc(*args, **kwargs):
     from datalad import cfg
     test_ssh = cfg.get("datalad.tests.ssh", '')
     if not test_ssh or test_ssh in ('0', 'false', 'no'):
         raise SkipTest("Run this test by setting DATALAD_TESTS_SSH")
     return func(*args, **kwargs)
Example #38
import logging

from unittest.mock import patch

from datalad import cfg
from datalad.support.annexrepo import AnnexRepo
from datalad.utils import swallow_logs
from datalad.distribution.dataset import Dataset

from .utils import with_tempfile
from .utils import skip_if_no_network
from .utils import with_testrepos
from .utils import on_windows
from .utils import SkipTest


if on_windows:
    raise SkipTest("Can't test direct mode switch, "
                   "if direct mode is forced by OS anyway.")

repo_version = cfg.get("datalad.repo.version", None)
if repo_version and int(repo_version) >= 6:
    raise SkipTest("Can't test direct mode switch, "
                   "if repository version 6 or later is enforced.")


@with_tempfile
@with_tempfile
@with_tempfile
@with_tempfile
def test_direct_cfg(path1, path2, path3, path4):
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        # create annex repo in direct mode:
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ar = AnnexRepo(path1, create=True)
            cml.assert_logged("Switching to direct mode",
Example #39
File: log.py Project: hanke/datalad
 def _get_config(self, var, default=None):
     from datalad import cfg
     return cfg.get(self.name.lower() + '.log.' + var, default)