def test_aggregate_with_missing_or_duplicate_id(path):
    # Aggregate and search metadata in a three-level dataset hierarchy where
    # the middle dataset had its `.datalad/config` removed, and then again
    # after the middle dataset was installed a second time into the top one.

    # build the hierarchy: origin -> sub -> subsub
    top = Dataset(opj(path, 'origin')).create(force=True)
    middle = top.create('sub', force=True)
    # remove the config file of the middle dataset and verify it is gone
    middle.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(middle.path, '.datalad', 'config')))
    # the bottom dataset is only needed on disk, not as an object
    middle.create('subsub', force=True)

    # aggregate from bottom to top, guessing the native metadata types
    aggregate_metadata(top, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the
    # cache
    records = get_metadata(
        top, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # at least one record must carry the name of the bottom-most dataset
    wanted = assure_unicode('grandchild_äöü東')
    assert_true(sum([rec.get('name', '') == wanted for rec in records]))

    # searching must not fail and must yield something
    with swallow_outputs():
        hits = list(search_('.', regex=True, dataset=top))
    assert hits

    # install the very same dataset a second time under a different path and
    # make sure that searching still does not blow up (smoke test only, the
    # results are consumed but not inspected)
    top.install(source=middle.path, path="subds2")
    with swallow_outputs():
        list(search_('.', regex=True, dataset=top))
def test_aggregation(path):
    # End-to-end check of metadata aggregation over a three-level dataset
    # hierarchy: aggregate, read the records back from the cache (in the
    # original and in a clone), and exercise the search interface on the clone.

    # called with None as the dataset from within `path`, aggregation must
    # raise InsufficientArgumentsError
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph;
    # the assertion below pins the resulting number of records to 10
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the
    # cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema: every single record must use the schema.org vocabulary
    assert_equal(
        len(meta),
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    # NOTE(review): records are presumably ordered "first implicit, then two
    # natives, then aggregate"; that order is not asserted here -- confirm
    # whether a positional check on `meta` is wanted.

    # some record must link to the bottom-most dataset via 'dcterms:hasPart'
    # and report its location relative to the top dataset
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    # without this the loop above could silently match nothing
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh meta data, the implicit one for the top-most datasets should
    # differ, but the rest not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost come first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # NOTE(review): the implicit md of the clone is expected to list a dataset
    # ID for its subds although it has not been obtained; no assertion covers
    # that here -- confirm which record should be checked.

    # re-read the metadata of the clone, this time bypassing the cache
    # NOTE(review): an earlier comment spoke of obtaining a subdataset in the
    # clone before this step, but nothing is installed here -- confirm.
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # NOTE(review): "datasets are properly connected" is not asserted here.

    # query smoke test; skipped when DATALAD_TESTS_NONETWORK is set
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names: the first item of every result must be
    # the given name joined onto `path`, in the given order
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit out locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert all(not x for x in map(itemgetter(1), child_res))
    # but we would get all if asking for '*'
    assert all(len(x) >= 9
               for x in map(itemgetter(1),
                            clone.search('child', report='*')))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)]))
    # and the additional field we might have asked with report
    # NOTE(review): the `report` value is derived from the expected key set
    # on the last line of this assertion -- confirm.
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')]))
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()]))

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)