Example #1
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'dcterms:modified'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(get_metadata(ds))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta, _kill_time(get_metadata(ds)))
        # making it a submodule has no effect either
        ds.save(auto_add_changes=True)
        assert_equal(len(ds.get_subdatasets()), n_subm + 1)
        assert_equal(meta, _kill_time(get_metadata(ds)))
        n_subm += 1
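These snippets are test functions lifted from DataLad's metadata test suite, so their imports are not shown. The sketch below lists the names the examples rely on; the DataLad module paths are assumptions based on the layout of that era (the metadata and search APIs were reorganized in later releases), so treat them as a starting point rather than the exact imports:

import os
from operator import itemgetter
from os.path import join as opj, exists

from nose import SkipTest
from nose.tools import (
    assert_equal, assert_true, assert_false,
    assert_in, assert_not_in, assert_raises,
)

from datalad.api import Dataset, install
from datalad.support.gitrepo import GitRepo
from datalad.support.annexrepo import AnnexRepo
from datalad.support.exceptions import InsufficientArgumentsError
from datalad.utils import chpwd, swallow_outputs, assure_unicode
# assumed locations -- get_metadata, aggregate_metadata and search_ moved
# around between DataLad versions
from datalad.metadata import get_metadata
from datalad.api import aggregate_metadata, search_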
Example #2
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()), ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True, if_dirty='ignore')
    ds.save()
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()), [
        '@context', '@id', 'availableFrom', 'dcterms:conformsTo',
        'dcterms:modified', 'type', 'version'
    ])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
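Judging only from the assertions in this example (and the @context/version checks in Example #7 below), the records returned by get_metadata are JSON-LD-style dictionaries. A purely illustrative sketch of one such record, with placeholder values, might look like this:

# illustrative only -- keys are taken from the assertions above, values are placeholders
example_record = {
    '@context': {'@vocab': 'http://schema.org/'},
    '@id': '<dataset UUID>',
    'type': 'Dataset',
    'version': '<HEAD commit sha>',
    'dcterms:conformsTo': '<specification reference>',
    'dcterms:modified': '<modification timestamp>',
    'availableFrom': [{'@id': '<sibling dataset id>'}],
}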
Example #3
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'dcterms:modified'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(get_metadata(ds))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta, _kill_time(get_metadata(ds)))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.get_subdatasets()), n_subm + 1)
        assert_equal(meta, _kill_time(get_metadata(ds)))
        n_subm += 1
Example #4
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True)
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
Example #5
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(ds,
                        guess_type=False,
                        ignore_subdatasets=False,
                        ignore_cache=False)
    # and we still know about subsub
    for name in ('grandchild_äöü東', ):
        assert_true(
            sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and make sure we do not fail on a duplicate dataset ID when we
    # install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Example #6
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we still know about subsub
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and make sure we do not fail on a duplicate dataset ID when we
    # install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Example #7
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(ds,
                        guess_type=False,
                        ignore_subdatasets=False,
                        ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([
            s.get('@context',
                  {'@vocab': None})['@vocab'] == 'http://schema.org/'
            for s in meta
        ]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', auto_add_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh meta data, the implicit one for the top-most datasets should
    # differ, but the rest not
    clonemeta = get_metadata(clone,
                             guess_type=False,
                             ignore_subdatasets=False,
                             ignore_cache=False)

    # make sure the implicit md for the topmost come first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(clonemeta[3]['dcterms:hasPart']['@id'], subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'], partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])

    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit the locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert (all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert (all([
        len(x) >= 9
        for x in map(itemgetter(1), list(clone.search('child', report='*')))
    ]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(
            map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name', )]))
    # and the additional field we might have asked with report
    assert_equal(
        set(
            map(
                lambda x: tuple(sorted(x[1].keys())),
                clone.search('child',
                             report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')]))
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
                 set([tuple()]))

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
Example #8
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh meta data, the implicit one for the top-most datasets should
    # differ, but the rest not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost come first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit the locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert(all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert(all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)])
    )
    # and the additional field we might have asked with report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')])
    )
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()])
    )

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
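In DataLad's own test suite these functions typically receive their `path` argument from a decorator such as @with_tempfile; to try one of the snippets standalone you could, as a rough sketch, supply a temporary directory yourself:

import tempfile

# hypothetical standalone invocation -- DataLad's test runner normally provides
# `path` through its own decorators/fixtures
with tempfile.TemporaryDirectory() as path:
    test_basic_metadata(path)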