Example #1
def test_our_metadataset_search(tdir):
    # smoke test for basic search operations on our super-megadataset
    # expensive operation but ok
    ds = install(path=tdir, source="///")
    assert list(ds.search('.', report='*', regex=True))
    assert list(ds.search('.', report='*'))
    assert list(ds.search('.', report_matched=True))

    # and we could also provide the output in different formats
    import simplejson
    from datalad.utils import swallow_outputs
    from datalad.api import search_
    with swallow_outputs() as cmo:
        assert list(search_('.', report='*', regex=True, format='json', dataset=ds))
        out = cmo.out
    # since this one just absorbs all output first, we can't go one by one
    assert simplejson.loads(out)

    try:
        import yaml
    except ImportError:
        raise SkipTest("no yaml module")
    with swallow_outputs() as cmo:
        assert list(search_('.', report='*', regex=True, format='yaml', dataset=ds))
        out = cmo.out
    assert yaml.load(out)
Example #2
def test_ls_s3():
    url = 's3://datalad-test0-versioned/'
    with swallow_outputs():
        # just to skip if no credentials
        get_test_providers(url)

    with swallow_outputs() as cmo:
        res = ls(url)
        assert_equal(len(res), 17)  # all the entries
        counts = Counter(map(lambda x: x.__class__.__name__, res))
        assert_equal(counts, {'Key': 14, 'DeleteMarker': 3})
        assert_in('Bucket info:', cmo.out)
Example #3
def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    ok_clean_git(dss[0], annex=False)
Example #4
def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    ok_clean_git(dss[0], annex=False)
Example #5
def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    assert_repo_status(dss[0], annex=False)
Example #6
def test_new_relpath(topdir):
    from datalad.api import create_test_dataset
    with swallow_logs(), chpwd(topdir), swallow_outputs():
        dss = create_test_dataset('testds', spec='1')
    eq_(dss[0], opj(topdir, 'testds'))
    eq_(len(dss), 2)  # 1 top + 1 sub-dataset as demanded
    for ds in dss:
        ok_clean_git(ds, annex=False)
Example #7
def test_new_relpath(topdir):
    from datalad.api import create_test_dataset
    with swallow_logs(), chpwd(topdir), swallow_outputs():
        dss = create_test_dataset('testds', spec='1')
    eq_(dss[0], opj(topdir, 'testds'))
    eq_(len(dss), 2)  # 1 top + 1 sub-dataset as demanded
    for ds in dss:
        ok_clean_git(ds, annex=False)
Example #8
def test_ls_uninstalled(path):
    ds = Dataset(path)
    ds.create()
    ds.create('sub')
    ds.uninstall('sub', check=False)
    with swallow_outputs() as cmo:
        ls([path], recursive=True)
        assert_in('not installed', cmo.out)
Example #9
def test_create_test_dataset():
    # rudimentary smoke test
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(spec='2/1-2')
    ok_(5 <= len(dss) <= 7)  # at least five - 1 top, two on top level, 1 in each
    for ds in dss:
        ok_clean_git(ds, annex=None)  # some of them are annex but we just don't check
        ok_(len(glob(opj(ds, 'file*'))))
Example #10
def test_create_test_dataset():
    # rudimentary smoke test
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(spec='2/1-2')
    ok_(5 <= len(dss) <= 7)  # at least five - 1 top, two on top level, 1 in each
    for ds in dss:
        assert_repo_status(ds, annex=None)  # some of them are annex but we just don't check
        ok_(len(glob(opj(ds, 'file*'))))
Example #11
def test_swallow_outputs():
    with swallow_outputs() as cm:
        eq_(cm.out, '')
        sys.stdout.write("out normal")
        sys.stderr.write("out error")
        eq_(cm.out, 'out normal')
        sys.stdout.write(" and more")
        eq_(cm.out, 'out normal and more')  # incremental
        eq_(cm.err, 'out error')
        eq_(cm.err, 'out error')  # the same value if read multiple times
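
For reference, a minimal distilled sketch of the capture pattern these examples build on. It assumes only the swallow_outputs behaviour already exercised in Example #11 above (.out/.err grow incrementally while the context is active); the function name is illustrative, not DataLad code.

import sys

from datalad.utils import swallow_outputs


def demo_swallow_outputs():
    # capture stdout/stderr in-process instead of letting them reach the terminal
    with swallow_outputs() as cmo:
        sys.stdout.write("hello")
        sys.stderr.write("oops")
        # .out and .err reflect everything written so far
        assert cmo.out == "hello"
        assert cmo.err == "oops"
        captured = cmo.out  # grab the text while still inside the context
    return captured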
Example #12
def test_check_dates_invalid_date():
    skip_if_no_module("dateutil")

    with swallow_outputs() as cmo:
        assert_raises(IncompleteResultsError,
                      check_dates, [],
                      reference_date="not a valid date",
                      return_type="list")
        out = cmo.out
    # The error makes it through the standard renderer.
    assert_in('"status": "error"', out)
Example #13
def test_check_dates_invalid_date():
    skip_if_no_module("dateutil")

    with swallow_outputs() as cmo:
        assert_raises(IncompleteResultsError,
                      check_dates, [],
                      reference_date="not a valid date",
                      return_type="list")
        out = cmo.out
    # The error makes it through the standard renderer.
    assert_in('"status": "error"', out)
Example #14
def test_ls_noarg(toppath):
    # smoke test pretty much
    AnnexRepo(toppath, create=True)

    # this test is pointless for now and until ls() actually returns
    # something
    with swallow_outputs():
        ls_out = ls(toppath)
        with chpwd(toppath):
            assert_equal(ls_out, ls([]))
            assert_equal(ls_out, ls('.'))
Example #15
def test_line_profile():
    skip_if_no_module('line_profiler')

    @line_profile
    def f(j):
        i = j + 1  # xyz
        return i

    with swallow_outputs() as cmo:
        assert_equal(f(3), 4)
        assert_equal(cmo.err, '')
        assert_in('i = j + 1  # xyz', cmo.out)
Example #16
def _test(*args_):
    #print args_
    for args in args_:
        for recursive in [False, True]:
            # in both cases shouldn't fail
            with swallow_outputs() as cmo:
                ls(args, recursive=recursive)
                assert_equal(len(cmo.out.rstrip().split('\n')), len(args))
                assert_in('[annex]', cmo.out)
                assert_in('[git]', cmo.out)
                assert_in(DEFAULT_BRANCH, cmo.out)
                if "bogus" in args:
                    assert_in('unknown', cmo.out)
Example #17
    def test_containers_run(self, path):
        if self.image_existed:
            raise SkipTest(
                "Not pulling with containers-run due to existing image: {}".
                format(self.image_name))

        from datalad.api import Dataset
        ds = Dataset(path).create(force=True)
        ds.save(path="foo")
        ds.containers_add("bb", url="dhub://" + self.image_name)
        with swallow_outputs() as out:
            ds.containers_run(["cat", "foo"], container_name="bb")
            assert_in("content", out.out)
Example #18
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')

    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        ok_clean_git(ds, annex=False)
        # each one should have 2 commits (except the last one) -- one for the file and
        # another one for the sub-dataset
        repo = GitRepo(ds)
        eq_(len(list(repo.get_branch_commits())), 1 + int(ids < 2))
Example #19
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')

    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        ok_clean_git(ds, annex=False)
        # each one should have 2 commits (except the last one) -- one for the file and
        # another one for the sub-dataset
        repo = GitRepo(ds)
        eq_(len(list(repo.get_branch_commits())), 1 + int(ids<2))
Example #20
def test_docker(path):  # Singularity's "docker://" scheme.
    ds = Dataset(path).create()
    ds.containers_add(
        "bb",
        url=("docker://busybox@sha256:"
             "7964ad52e396a6e045c39b5a44438424ac52e12e4d5a25d94895f2058cb863a0"
             ))

    img = op.join(ds.path, ".datalad", "environments", "bb", "image")
    assert_result_count(ds.containers_list(), 1, path=img, name="bb")
    ok_clean_git(path)

    with swallow_outputs():
        ds.containers_run(["ls", "/singularity"])
Example #21
def test_hierarchy(topdir):
    # GH 1178
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(topdir, spec='1/1')

    eq_(len(dss), 3)
    eq_(dss[0], topdir)
    for ids, ds in enumerate(dss):
        assert_repo_status(ds, annex=False)
        # each one should have 2 commits (except the last one) -- one for the file and
        # another one for the sub-dataset
        repo = repo_from_path(ds)
        if not hasattr(repo,
                       'is_managed_branch') or not repo.is_managed_branch():
            eq_(len(list(repo.get_branch_commits_())), 1 + int(ids < 2))
Example #22
def test_check_dates(path):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo],
                        reference_date=refdate,
                        return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
Example #23
def test_quoting(path):
    ds = Dataset(op.join(path, OBSCURE_FILENAME)).create(force=True)
    # Our custom procedure fails if it receives anything other than two
    # procedure arguments (so the script itself receives 3). Check a few cases
    # from the Python API and CLI.
    ds.config.add("datalad.locations.dataset-procedures",
                  "code",
                  where="dataset")
    with swallow_outputs():
        ds.run_procedure(spec=["just2args", "with ' sing", 'with " doub'])
        with assert_raises(CommandError):
            ds.run_procedure(spec=["just2args", "still-one arg"])

        runner = Runner(cwd=ds.path)
        runner.run(
            "datalad run-procedure just2args \"with ' sing\" 'with \" doub'")
        with assert_raises(CommandError):
            runner.run("datalad run-procedure just2args 'still-one arg'")
Example #24
def test_check_dates(path=None):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
Example #25
def test_push_custom_summary(path=None):
    path = Path(path)
    ds = Dataset(path / "ds").create()

    sib = mk_push_target(ds, "sib", str(path / "sib"), bare=False, annex=False)
    (sib.pathobj / "f1").write_text("f1")
    sib.save()

    (ds.pathobj / "f2").write_text("f2")
    ds.save()

    # These options are true by default and our tests usually run with a
    # temporary home, but set them to be sure.
    ds.config.set("advice.pushUpdateRejected", "true", scope="local")
    ds.config.set("advice.pushFetchFirst", "true", scope="local")
    with swallow_outputs() as cmo:
        ds.push(to="sib", result_renderer="default", on_failure="ignore")
        assert_in("Hints:", cmo.out)
        assert_in("action summary:", cmo.out)
Example #26
def _check_setup_exceptionhook(interactive):
    old_exceptionhook = sys.excepthook

    post_mortem_tb = []

    def our_post_mortem(tb):
        post_mortem_tb.append(tb)

    with patch('sys.excepthook'), \
            patch('datalad.utils.is_interactive', lambda: interactive), \
            patch('pdb.post_mortem', our_post_mortem):
        setup_exceptionhook()
        our_exceptionhook = sys.excepthook
        ok_(old_exceptionhook != our_exceptionhook)
        with swallow_logs() as cml, swallow_outputs() as cmo:
            # we need to call our_exceptionhook explicitly b/c nose
            # swallows all Exceptions and hook never gets executed
            try:
                raise RuntimeError
            except Exception as e:  # RuntimeError:
                type_, value_, tb_ = sys.exc_info()
            our_exceptionhook(type_, value_, tb_)
            # Happens under tox environment but not in manually crafted
            # ones -- not yet sure what it is about but --dbg does work
            # with python3 so letting it skip for now
            raise SkipTest(
                "TODO: Not clear why in PY3 calls cleanup if we try to "
                "access the beast"
            )
            #assert_in('Traceback (most recent call last)', cmo.err)
            #assert_in('in _check_setup_exceptionhook', cmo.err)
            #if interactive:
            #    assert_equal(post_mortem_tb[0], tb_)
            #else:
            #    assert_equal(post_mortem_tb, [])
            #    # assert_in('We cannot setup exception hook', cml.out)

    eq_(old_exceptionhook, sys.excepthook)
Example #27
def test_quoting(path=None):
    ds = Dataset(op.join(path, OBSCURE_FILENAME)).create(force=True)
    # Our custom procedure fails if it receives anything other than two
    # procedure arguments (so the script itself receives 3). Check a few cases
    # from the Python API and CLI.
    ds.config.add("datalad.locations.dataset-procedures",
                  "code",
                  scope='branch')
    with swallow_outputs():
        ds.run_procedure(spec=["just2args", "with ' sing", 'with " doub'])
        assert_in_results(ds.run_procedure(spec=["just2args", "still-one arg"],
                                           on_failure="ignore",
                                           result_renderer=None),
                          action="run",
                          status="error")

        runner = WitlessRunner(cwd=ds.path)
        runner.run(
            "datalad run-procedure just2args \"with ' sing\" 'with \" doub'",
            protocol=KillOutput)
        with assert_raises(CommandError):
            runner.run("datalad run-procedure just2args 'still-one arg'",
                       protocol=KillOutput)
Example #28
def test_setup_exceptionhook(interactive):
    old_exceptionhook = sys.excepthook

    post_mortem_tb = []

    def our_post_mortem(tb):
        post_mortem_tb.append(tb)

    with patch('sys.excepthook'), \
            patch('datalad.utils.is_interactive', lambda: interactive), \
            patch('pdb.post_mortem', our_post_mortem):
        setup_exceptionhook()
        our_exceptionhook = sys.excepthook
        ok_(old_exceptionhook != our_exceptionhook)
        with swallow_logs() as cml, swallow_outputs() as cmo:
            # we need to call our_exceptionhook explicitly b/c nose
            # swallows all Exceptions and hook never gets executed
            try:
                raise RuntimeError
            except Exception as e:  # RuntimeError:
                type_, value_, tb_ = sys.exc_info()
            our_exceptionhook(type_, value_, tb_)

    eq_(old_exceptionhook, sys.excepthook)
Example #29
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add('datalad.search.index-{}-documenttype'.format(m),
                      'all',
                      where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')), ):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    list(
        ds.repo.set_metadata(opj('stim', 'stim1.mp3'),
                             init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', ):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep', ':mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, leading : is stripped, it indicates "ALL FIELDS"
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # same as above, but with AND condition
            # get both matches
        ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'), {
            'type': 'file',
            'audio.format': 'mp3'
        }),
            # case insensitive search
        ('egrep', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # field selection by expression
        ('egrep', r'audio\.+:mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # random keyword query
        ('textblob', 'mp3', opj('stim', 'stim1.mp3'), {
            'meta': 'mp3'
        }),
            # report which field matched with auto-field
        ('autofield', 'mp3', opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            # "'mime:audio/mp3'",
            # opj('stim', 'stim1.mp3'),
            # 'audio.format', 'mime:audio/mp3'),
            # but this one works
        ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'), {
            'audio.format': 'mp3'
        }),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res,
            1,
            type='file',
            path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(res,
                                1,
                                type='dataset',
                                path=ds.path,
                                dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
Example #30
def test_within_ds_file_search(path):
    try:
        import nibabel
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),
                     ('nifti1.nii.gz',
                      opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
                     ('nifti1.nii.gz',
                      opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', 'bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys=True, mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.date
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.description
bids.fundedby
bids.license
bids.modality
bids.name
bids.participant.age(years)
bids.participant.gender
bids.participant.handedness
bids.participant.hearing_problems_current
bids.participant.id
bids.participant.language
bids.subject
bids.task
bids.type
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys=True)
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
        ('textblob', 'mp3', opj('stim', 'stim1.mp3'), 'meta', 'mp3'),
            # multi word query implies AND
        ('textblob', ['bold', 'male'],
         opj('sub-01', 'func',
             'sub-01_task-some_bold.nii.gz'), 'meta', 'male'),
            # report which field matched with auto-field
        ('autofield', 'mp3', opj('stim', 'stim1.mp3'), 'audio.format', 'mp3'),
        ('autofield', 'female',
         opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
         'bids.participant.gender', 'female'),
            # autofield multi-word query is also AND
        ('autofield', ['bids.type:bold', 'bids.participant.id:01'],
         opj('sub-01', 'func',
             'sub-01_task-some_bold.nii.gz'), 'bids.type', 'bold'),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            # "'mime:audio/mp3'",
            # opj('stim', 'stim1.mp3'),
            # 'audio.format', 'mime:audio/mp3'),
            # but this one works
        ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'),
         'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode)
        if mode == 'textblob':
            # 'textblob' does datasets only by default (but could be configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry metadata in
            # the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res,
                1,
                type='file',
                path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from, critical for
                # discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
Example #31
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'))
    annex.commit()
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo:
            repo = AnnexRepo(topdir)
            fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout should contain both the file and the subdirectory
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive])
            repo.precommit()  # to possibly stop batch process occupying the stdout

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        repo = AnnexRepo(topdir)
        fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([item for item in fs['nodes'] if item['name'] in ('gitdir', 'annexdir')], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([item for item in child['nodes'] if item['name'] in ('subgit', '.fgit')], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #32
from contextlib import contextmanager
from datalad.utils import swallow_outputs

@contextmanager
def _swallow_outputs(isatty=True):
    # wrap swallow_outputs and make the swallowed stdout report the desired isatty()
    with swallow_outputs() as cmo:
        stdout = cmo.handles[0]
        stdout.isatty = lambda: isatty
        yield cmo
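
A hedged usage sketch for the helper above; it relies only on behaviour shown elsewhere in these examples (cmo.out is updated incrementally, cmo.handles[0] is the swallowed stdout), and the test name is illustrative.

def test_non_tty_rendering():
    # pretend stdout is not a TTY while capturing whatever gets printed
    with _swallow_outputs(isatty=False) as cmo:
        print("plain output")
        assert not cmo.handles[0].isatty()
        assert "plain output" in cmo.out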
Example #33
        def run_cmd(dspath):
            ds = Dataset(dspath)
            status_rec = get_status_dict('foreach-dataset',
                                         ds=ds,
                                         path=ds.path,
                                         command=cmd)
            if not ds.is_installed():
                yield dict(status_rec,
                           status="impossible",
                           message="not installed")
                return
            # For consistent environment (Python) and formatting (command) similar to `run` one
            # But for Python command we provide actual ds and refds not paths
            placeholders = dict(
                pwd=pwd,
                # pass actual instances so .format could access attributes even for external commands
                ds=ds,  # if python else ds.path,
                dspath=ds.path,  # just for consistency with `run`
                refds=refds,  # if python else refds.path,
                # Check if the command contains "tmpdir" to avoid creating an
                # unnecessary temporary directory in most but not all cases.
                # Note: different from 'run' - not wrapping match within {} and doing str
                tmpdir=mkdtemp(
                    prefix="datalad-run-") if "tmpdir" in str(cmd) else "")
            try:
                if python:
                    if isinstance(cmd, str):
                        cmd_f, cmd_a, cmd_kw = _PYTHON_CMDS[cmd_type], (
                            cmd, placeholders), {}
                    else:
                        assert _is_callable(cmd)
                        # all placeholders are passed as kwargs to the function
                        cmd_f, cmd_a, cmd_kw = cmd, [], placeholders

                    cm = chpwd_cm(ds.path) if chpwd == 'ds' else nothing_cm()
                    with cm:
                        if output_streams == 'pass-through':
                            res = cmd_f(*cmd_a, **cmd_kw)
                            out = {}
                        elif output_streams == 'capture':
                            with swallow_outputs() as cmo:
                                res = cmd_f(*cmd_a, **cmd_kw)
                                out = {
                                    'stdout': cmo.out,
                                    'stderr': cmo.err,
                                }
                        else:
                            raise RuntimeError(output_streams)
                        if cmd_type == 'eval':
                            status_rec['result'] = res
                        else:
                            assert res is None
                else:
                    try:
                        cmd_expanded = format_command(cmd, **placeholders)
                    except KeyError as exc:
                        yield dict(
                            status_rec,
                            status='impossible',
                            message=(
                                'command has an unrecognized placeholder: %s',
                                exc))
                        return
                    out = ds.repo._git_runner.run(
                        cmd_expanded,
                        cwd=ds.path if chpwd == 'ds' else pwd,
                        protocol=protocol)
                if output_streams == 'capture':
                    status_rec.update(out)
                    # provide some feedback to user in default rendering
                    if any(out.values()):
                        status_rec['message'] = shortened_repr(out, 100)
                status_rec['status'] = 'ok'
                yield status_rec
            except Exception as exc:
                # get a better version with exception handling redoing the whole
                # status dict from scratch
                yield get_status_dict('foreach-dataset',
                                      ds=ds,
                                      path=ds.path,
                                      command=cmd,
                                      exception=exc,
                                      status='error',
                                      message=str(exc))
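
A stripped-down sketch of the capture/pass-through dispatch used in run_cmd above; the function name and signature are illustrative, and only swallow_outputs is actual DataLad API.

from datalad.utils import swallow_outputs


def call_with_streams(func, output_streams="capture"):
    # mirror run_cmd: either let output flow through untouched, or swallow it
    # and hand it back alongside the function's result
    if output_streams == "pass-through":
        return func(), {}
    elif output_streams == "capture":
        with swallow_outputs() as cmo:
            res = func()
            return res, {"stdout": cmo.out, "stderr": cmo.err}
    else:
        raise RuntimeError(output_streams)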
Example #34
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(
                new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir,
                             AnnexRepo(topdir),
                             recurse_directories=recursive,
                             json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout should contain both the file and the subdirectory
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir,
                         AnnexRepo(topdir),
                         recurse_directories=recursive,
                         json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([
            item for item in fs['nodes']
            if item['name'] in ('gitdir', 'annexdir')
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([
                item for item in child['nodes']
                if item['name'] in ('subgit', '.fgit')
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txt, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #35
def test_clean(d=None):
    AnnexRepo(d, create=True)
    ds = Dataset(d)
    assert_status('notneeded', clean(dataset=ds))

    archives_path = ds.pathobj / Path(ARCHIVES_TEMP_DIR)
    annex_tmp_path = ds.pathobj / Path(ANNEX_TEMP_DIR)
    annex_trans_path = ds.pathobj / Path(ANNEX_TRANSFER_DIR)
    index_path = ds.repo.dot_git / Path(SEARCH_INDEX_DOTGITDIR)

    # if we create some temp archives directory
    (archives_path / 'somebogus').mkdir(parents=True)
    res = clean(dataset=ds,
                return_type='item-or-list',
                result_filter=lambda x: x['status'] == 'ok')
    assert_equal(res['path'], str(archives_path))
    assert_equal(res['message'][0] % tuple(res['message'][1:]),
                 "Removed 1 temporary archive directory: somebogus")
    assert_false(archives_path.exists())

    # relative path
    (archives_path / 'somebogus').mkdir(parents=True)
    (archives_path / 'somebogus2').mkdir(parents=True)
    with chpwd(d), swallow_outputs() as cmo:
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(
            res['message'][0] % tuple(res['message'][1:]),
            "Removed 2 temporary archive directories: somebogus, "
            "somebogus2")
        assert_false(archives_path.exists())

    # and what about git annex temporary files?
    annex_tmp_path.mkdir(parents=True)
    (annex_tmp_path / "somebogus").write_text("load")
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_tmp_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed 1 temporary annex file: somebogus")
        assert_false(annex_tmp_path.exists())

    (annex_trans_path / 'somebogus').mkdir(parents=True, exist_ok=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_trans_path))
        assert_equal(
            res['message'][0] % tuple(res['message'][1:]),
            "Removed 1 annex temporary transfer directory: somebogus")
        assert_false(annex_trans_path.exists())

    # search index
    index_path.mkdir(parents=True)
    (index_path / "MAIN_r55n3hiyvxkdf1fi.seg, _MAIN_1.toc").write_text("noop")
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(index_path))
        assert_equal(
            res['message'][0] % tuple(res['message'][1:]),
            "Removed 1 metadata search index file: "
            "MAIN_r55n3hiyvxkdf1fi.seg, _MAIN_1.toc")
        assert_false(index_path.exists())

    # remove empty directories, too
    archives_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(archives_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty temporary archive directory")
        assert_false(archives_path.exists())

    annex_tmp_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_tmp_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty temporary annex directory")
        assert_false(annex_tmp_path.exists())

    annex_trans_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(annex_trans_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty annex temporary transfer directory")
        assert_false(annex_trans_path.exists())

    index_path.mkdir(parents=True)
    with chpwd(d):
        res = clean(return_type='item-or-list',
                    result_filter=lambda x: x['status'] == 'ok')
        assert_equal(res['path'], str(index_path))
        assert_equal(res['message'][0] % tuple(res['message'][1:]),
                     "Removed empty metadata search index directory")
        assert_false(index_path.exists())
Example #36
def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('nifti1.nii.gz',
                      opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
                     ('nifti1.nii.gz',
                      opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
             opj(path, dst))
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(),
                                 cmo.out.splitlines()))
        assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi word query implies AND
        ('textblob', ['bold', 'female'],
         opj('sub-03', 'func',
             'sub-03_task-some_bold.nii.gz'), 'meta', 'female'),
            # report which field matched with auto-field
        ('autofield', 'female',
         opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
         'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
        ('autofield', ['bids.suffix:bold', 'bids.subject.id:01'],
         opj('sub-01', 'func',
             'sub-01_task-some_bold.nii.gz'), 'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets only by default (but could be configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always has a file and the dataset, because they carry metadata in
            # the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res,
                1,
                type='file',
                path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from, critical for
                # discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
Example #37
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(name="first",
                             url=get_local_file_url(
                                 op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)
    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(subds.path, '.datalad', 'environments', 'first',
                          'image')
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # not installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a not present
    # subds:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset traversals
    # (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
Example #38
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep',
         ':mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, leading : is stripped, it indicates "ALL FIELDS"
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, but with AND condition
        # get both matches
        ('egrep',
         ['mp3', 'type:file'],
         opj('stim', 'stim1.mp3'),
         {'type': 'file', 'audio.format': 'mp3'}),
        # case insensitive search
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # field selection by expression
        ('egrep',
         r'audio\.+:mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # random keyword query
        ('textblob',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'meta': 'mp3'}),
        # report which field matched with auto-field
        ('autofield',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # XXX next one is not supported by current text field analyser
        # decomposes the mime type in [mime, audio, mp3]
        # ('autofield',
        # "'mime:audio/mp3'",
        # opj('stim', 'stim1.mp3'),
        # 'audio.format', 'mime:audio/mp3'),
        # but this one works
        ('autofield',
         "'mime audio mp3'",
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # TODO extend with more complex queries to test whoosh
        # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)