def test_aggregate_with_missing_or_duplicate_id(path):
    # Smoke test: metadata aggregation and search must keep working when the
    # middle dataset of a three-level hierarchy has lost its '.datalad/config'
    # and when the same subdataset is installed a second time.
    # `path` is the working directory handed in by the caller.
    # NOTE(review): presumably a fixture decorator outside this view
    # pre-populates `path` (hence force=True) -- confirm.

    # a hierarchy of three (super/sub)datasets
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    # remove the config file of the middle dataset and save that state
    # NOTE(review): going by the test name this is what leaves 'sub' without
    # a dataset ID -- confirm.
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    # only the dataset on disk matters here, the returned object is not used
    subds.create('subsub', force=True, if_dirty='ignore')

    # aggregate recursively, letting the native metadata type be guessed
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset; cached metadata may be used
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # the grandchild must still be present (by name) in the aggregate
    grandchild_name = assure_unicode('grandchild_äöü東')
    assert_true(any(s.get('name', '') == grandchild_name for s in meta))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see that we do not fail either if a dataset is a duplicate,
    # i.e. if we install the same dataset twice
    ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        # exhaust the generator; only "does not raise" is being checked
        list(search_('.', regex=True, dataset=ds))
def test_aggregate_with_missing_or_duplicate_id(path):
    # Smoke test: metadata aggregation and search must keep working when the
    # middle dataset of a three-level hierarchy has lost its '.datalad/config'
    # and when the same subdataset is installed a second time.
    # `path` is the working directory handed in by the caller.
    # NOTE(review): presumably a fixture decorator outside this view
    # pre-populates `path` (hence force=True) -- confirm.

    # a hierarchy of three (super/sub)datasets
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    # remove the config file of the middle dataset
    # NOTE(review): going by the test name this is what leaves 'sub' without
    # a dataset ID -- confirm.
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    # only the dataset on disk matters here, the returned object is not used
    subds.create('subsub', force=True)

    # aggregate recursively, letting the native metadata type be guessed
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset; cached metadata may be used
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # the grandchild must still be present (by name) in the aggregate
    grandchild_name = assure_unicode('grandchild_äöü東')
    assert_true(any(s.get('name', '') == grandchild_name for s in meta))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see that we do not fail either if a dataset is a duplicate,
    # i.e. if we install the same dataset twice
    ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        # exhaust the generator; only "does not raise" is being checked
        list(search_('.', regex=True, dataset=ds))
def test_aggregation(path):
    # End-to-end check of metadata aggregation over a three-level dataset
    # hierarchy, of reading the aggregate back from a clone, and of the
    # search interface on top of it.
    # `path` is the working directory handed in by the caller.

    # aggregate_metadata(None), called from within `path`, must be rejected
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)

    # build the hierarchy origin -> sub -> subsub
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')

    # aggregate recursively, letting the native metadata type be guessed
    aggregate_metadata(ds, guess_native_type=True, recursive=True)

    # now only ask the top superdataset; cached metadata may be used
    meta = get_metadata(
        ds,
        guess_type=False,
        ignore_subdatasets=False,
        ignore_cache=False)
    assert_equal(len(meta), 10)

    # every one of the 10 records uses the schema.org vocabulary
    schema_org_hits = sum(
        1 for record in meta
        if record.get('@context', {'@vocab': None})['@vocab']
        == 'http://schema.org/')
    assert_equal(10, schema_org_hits)

    # three different IDs
    distinct_ids = {record.get('@id') for record in meta}
    assert_equal(3, len(distinct_ids))

    # and all three datasets show up by name
    for expected in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(any(
            record.get('name', None) == assure_unicode(expected)
            for record in meta))

    # record order: first implicit, then two natives, then aggregate
    assert_equal(meta[3]['dcterms:hasPart']['@id'], subds.id)

    # some record must link to the grandchild, at its relative location
    found_grandchild = False
    for record in meta:
        part = record.get('dcterms:hasPart', {})
        if part.get('@id', None) != subsubds.id:
            continue
        assert_equal(opj('sub', 'subsub'), part.get('location', None))
        found_grandchild = True
    assert_true(found_grandchild)

    # save the toplevel dataset only
    ds.save('with aggregated meta data', auto_add_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # fresh metadata from the clone: only the implicit record of the
    # top-most dataset is allowed to differ
    clonemeta = get_metadata(
        clone,
        guess_type=False,
        ignore_subdatasets=False,
        ignore_cache=False)
    # the implicit record of the top-most dataset comes first
    top_record = clonemeta[0]
    assert_equal(top_record['@id'], clone.id)
    assert_equal(top_record['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(top_record['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the clone's metadata lists a dataset ID for its subdataset,
    # although it has not been obtained!
    assert_equal(clonemeta[3]['dcterms:hasPart']['@id'], subds.id)

    # now obtain a subdataset in the clone and re-read, bypassing the cache
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'], partial[2]['@id'])

    # query smoke test; skipped when DATALAD_TESTS_NONETWORK is set
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    # matching is case insensitive
    for query in ('mother', 'MoTHER'):
        assert_equal(len(list(clone.search(query))), 1)

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        found = [hit[0] for hit in res]
        wanted = [opj(path, n) for n in names]
        assert_equal(found, wanted)

    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # the result should be identical to invoking search_ directly,
    # and search_ should print the locations
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # an overarching search_ just for smoke testing of output processing
    # (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(
        clone.search('i', search=['name', 'keywords']),
        ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert all(not hit[1] for hit in child_res)
    # but we would get all if asking for '*'
    full_hits = list(clone.search('child', report='*'))
    assert all(len(hit[1]) >= 9 for hit in full_hits)
    # but we would get only the matching name if we ask for report_matched
    matched_keys = {
        tuple(hit[1].keys())
        for hit in clone.search('child', report_matched=True)}
    assert_equal(matched_keys, {('name',)})
    # and the additional field we might have asked with report
    matched_and_reported = {
        tuple(sorted(hit[1].keys()))
        for hit in clone.search(
            'child', report_matched=True, report=['schema:type'])}
    assert_equal(matched_and_reported, {('name', 'schema:type')})
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    empty_keys = {tuple(hit[1].keys()) for hit in child_res_empty}
    assert_equal(empty_keys, {()})

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # search from within the subdataset for a value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(
        clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    for terms, count in (
            (['child', 'bids'], 2),
            (['child', 'subsub'], 1),
            (['bids', 'sub'], 2)):
        assert_equal(len(list(clone.search(terms))), count)

    # with regex: one hit per dataset
    res = list(clone.search('.*', regex=True))
    assert_equal(len(res), 3)

    # we do search, not match
    for pattern, count in (
            ('randchild', 1),
            (['gr.nd', 'ch.ld'], 1),
            ('randchil.', 1),
            ('^randchild.*', 0),
            ('^grandchild.*', 1)):
        assert_equal(len(list(clone.search(pattern, regex=True))), count)
    assert_equal(len(list(clone.search('grandchild'))), 1)
def test_aggregation(path):
    # End-to-end check of metadata aggregation over a three-level dataset
    # hierarchy, of reading the aggregate back from a clone, and of the
    # search interface on top of it.
    # `path` is the working directory handed in by the caller.

    # aggregate_metadata(None), called from within `path`, must be rejected
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)

    # build the hierarchy origin -> sub -> subsub
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)

    # aggregate recursively, letting the native metadata type be guessed
    aggregate_metadata(ds, guess_native_type=True, recursive=True)

    # now only ask the top superdataset; cached metadata may be used
    meta = get_metadata(
        ds,
        guess_type=False,
        ignore_subdatasets=False,
        ignore_cache=False)
    assert_equal(len(meta), 10)

    # every one of the 10 records uses the schema.org vocabulary
    schema_org_hits = sum(
        1 for record in meta
        if record.get('@context', {'@vocab': None})['@vocab']
        == 'http://schema.org/')
    assert_equal(10, schema_org_hits)

    # three different IDs
    distinct_ids = {record.get('@id') for record in meta}
    assert_equal(3, len(distinct_ids))

    # and all three datasets show up by name
    for expected in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(any(
            record.get('name', None) == assure_unicode(expected)
            for record in meta))

    # record order: first implicit, then two natives, then aggregate
    assert_equal(meta[3]['dcterms:hasPart']['@id'], subds.id)

    # some record must link to the grandchild, at its relative location
    found_grandchild = False
    for record in meta:
        part = record.get('dcterms:hasPart', {})
        if part.get('@id', None) != subsubds.id:
            continue
        assert_equal(opj('sub', 'subsub'), part.get('location', None))
        found_grandchild = True
    assert_true(found_grandchild)

    # save the toplevel dataset only
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # fresh metadata from the clone: only the implicit record of the
    # top-most dataset is allowed to differ
    clonemeta = get_metadata(
        clone,
        guess_type=False,
        ignore_subdatasets=False,
        ignore_cache=False)
    # the implicit record of the top-most dataset comes first
    top_record = clonemeta[0]
    assert_equal(top_record['@id'], clone.id)
    assert_equal(top_record['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(top_record['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the clone's metadata lists a dataset ID for its subdataset,
    # although it has not been obtained!
    assert_equal(clonemeta[3]['dcterms:hasPart']['@id'], subds.id)

    # now obtain a subdataset in the clone and re-read, bypassing the cache
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'], partial[2]['@id'])

    # query smoke test; skipped when DATALAD_TESTS_NONETWORK is set
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    # matching is case insensitive
    for query in ('mother', 'MoTHER'):
        assert_equal(len(list(clone.search(query))), 1)

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        found = [hit[0] for hit in res]
        wanted = [opj(path, n) for n in names]
        assert_equal(found, wanted)

    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # the result should be identical to invoking search_ directly,
    # and search_ should print the locations
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # an overarching search_ just for smoke testing of output processing
    # (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(
        clone.search('i', search=['name', 'keywords']),
        ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert all(not hit[1] for hit in child_res)
    # but we would get all if asking for '*'
    full_hits = list(clone.search('child', report='*'))
    assert all(len(hit[1]) >= 9 for hit in full_hits)
    # but we would get only the matching name if we ask for report_matched
    matched_keys = {
        tuple(hit[1].keys())
        for hit in clone.search('child', report_matched=True)}
    assert_equal(matched_keys, {('name',)})
    # and the additional field we might have asked with report
    matched_and_reported = {
        tuple(sorted(hit[1].keys()))
        for hit in clone.search(
            'child', report_matched=True, report=['schema:type'])}
    assert_equal(matched_and_reported, {('name', 'schema:type')})
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    empty_keys = {tuple(hit[1].keys()) for hit in child_res_empty}
    assert_equal(empty_keys, {()})

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # search from within the subdataset for a value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(
        clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    for terms, count in (
            (['child', 'bids'], 2),
            (['child', 'subsub'], 1),
            (['bids', 'sub'], 2)):
        assert_equal(len(list(clone.search(terms))), count)

    # with regex: one hit per dataset
    res = list(clone.search('.*', regex=True))
    assert_equal(len(res), 3)

    # we do search, not match
    for pattern, count in (
            ('randchild', 1),
            (['gr.nd', 'ch.ld'], 1),
            ('randchil.', 1),
            ('^randchild.*', 0),
            ('^grandchild.*', 1)):
        assert_equal(len(list(clone.search(pattern, regex=True))), count)
    assert_equal(len(list(clone.search('grandchild'))), 1)