def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True)
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
def init_dataset(self, dsdir: Path, create_time: datetime) -> Dataset:
    ds = Dataset(str(dsdir))
    if not ds.is_installed():
        log.info("Creating Datalad dataset")
        with custom_commit_date(create_time):
            with envset("GIT_CONFIG_PARAMETERS",
                        f"'init.defaultBranch={DEFAULT_BRANCH}'"):
                ds.create(cfg_proc="text2git")
        if self.config.backup_remote is not None:
            ds.repo.init_remote(
                self.config.backup_remote,
                [
                    "type=external",
                    "externaltype=rclone",
                    "chunk=1GB",
                    f"target={self.config.backup_remote}",  # I made them matching
                    "prefix=dandi-dandisets/annexstore",
                    "embedcreds=no",
                    "uuid=727f466f-60c3-4778-90b2-b2332856c2f8",
                    "encryption=none",
                    # shared, initialized in 000003
                ],
            )
            ds.repo.call_annex(["untrust", self.config.backup_remote])
            ds.repo.set_preferred_content(
                "wanted",
                "(not metadata=distribution-restrictions=*)",
                remote=self.config.backup_remote,
            )
    return ds
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True, if_dirty='ignore')
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
def ensure_superdataset(self) -> Dataset:
    superds = Dataset(self.target_path)
    if not superds.is_installed():
        log.info("Creating Datalad superdataset")
        with envset("GIT_CONFIG_PARAMETERS",
                    f"'init.defaultBranch={DEFAULT_BRANCH}'"):
            superds.create(cfg_proc="text2git")
    return superds
def test_custom_commit_date(tmp_path: Path) -> None:
    ds = Dataset(tmp_path)
    ds.create(cfg_proc="text2git")
    (tmp_path / "file.txt").write_text("This is test text.\n")
    with custom_commit_date(datetime(2021, 6, 1, 12, 34, 56, tzinfo=timezone.utc)):
        ds.save(message="Add a file")
    repo = GitRepo(tmp_path)
    assert repo.get_commit_date("HEAD") == "2021-06-01T12:34:56+00:00"
    assert repo.get_commit_author("HEAD") == "DANDI User <*****@*****.**>"
def test_ephemeral(ds_path=None, store_path=None, clone_path=None):
    dspath = Path(ds_path)
    store = Path(store_path)
    file_test = Path('file1.txt')
    file_testsub = Path('sub') / 'other.txt'

    # create the original dataset
    ds = Dataset(dspath)
    ds.create(force=True)
    ds.save()

    # put into store:
    ds.create_sibling_ria("ria+{}".format(store.as_uri()), "riastore",
                          new_store_ok=True)
    ds.push(to="riastore", data="anything")

    # now, get an ephemeral clone from the RIA store:
    eph_clone = clone('ria+{}#{}'.format(store.as_uri(), ds.id), clone_path,
                      reckless="ephemeral")

    # ephemeral clone was properly linked (store has bare repos!):
    clone_annex = (eph_clone.repo.dot_git / 'annex')
    assert_true(clone_annex.is_symlink())
    assert_true(clone_annex.resolve().samefile(
        store / ds.id[:3] / ds.id[3:] / 'annex'))
    if not eph_clone.repo.is_managed_branch():
        # TODO: We can't properly handle adjusted branch yet
        # we don't need to get files in order to access them:
        assert_equal((eph_clone.pathobj / file_test).read_text(), "some")
        assert_equal((eph_clone.pathobj / file_testsub).read_text(), "other")

        # can we unlock those files?
        eph_clone.unlock(file_test)
        # change content
        (eph_clone.pathobj / file_test).write_text("new content")
        eph_clone.save()

        # new content should already be in store
        # (except the store doesn't know yet)
        res = eph_clone.repo.fsck(remote="riastore-storage", fast=True)
        assert_equal(len(res), 2)
        assert_result_count(res, 1, success=True, file=file_test.as_posix())
        assert_result_count(res, 1, success=True, file=file_testsub.as_posix())

        # push back git history
        eph_clone.push(to=DEFAULT_REMOTE, data="nothing")

        # get an update in origin
        ds.update(merge=True, reobtain_data=True)
        assert_equal((ds.pathobj / file_test).read_text(), "new content")
def annex_path(tmpdir_factory):
    path = tmpdir_factory.mktemp('annexes')
    ds_path = str(path.join(DATASET_ID))
    # Create an empty dataset for testing
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    with open(json_path, 'w') as f:
        json.dump(DATASET_DESCRIPTION, f, ensure_ascii=False)
    ds.add(json_path)
    ds.save(version_tag=SNAPSHOT_ID)
    # Setup a seed for any new_dataset uses
    random.seed(42)
    return path
def test_get_default_title(path):
    repo = GitRepo(path)
    ds = Dataset(path)
    # There is no dataset initialized yet, so only the path will be the title
    dirname = op.basename(path)
    eq_(_get_default_title(ds), dirname)

    # Initialize and get UUID
    ds.create(force=True)
    eq_(_get_default_title(ds), '{dirname}#{ds.id}'.format(**locals()))

    # Tag and get @version
    # cannot use ds.save since our tags are not annotated,
    # see https://github.com/datalad/datalad/issues/4139
    ds.repo.tag("0.1", message="important version")
    eq_(_get_default_title(ds), '{dirname}#{ds.id}@0.1'.format(**locals()))
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')
    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
                 # and environment variable being set/propagated by default
                 'name=$DATALAD_CONTAINER_NAME')
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    out = WitlessRunner(cwd=subds.path).run(
        ['datalad', 'containers-run', '-n', 'mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer',
              out['stdout'])

    out = WitlessRunner(cwd=ds.path).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=sub/righthere cmd=XXX img_dspath=sub', out['stdout'])

    # Test within a subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    out = WitlessRunner(cwd=subdir).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub',
              out['stdout'])
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                        ignore_cache=False)
    # and we know nothing subsub
    for name in ('grandchild_äöü東', ):
        assert_true(
            sum([s.get('name', '') == assure_unicode(name) for s in meta]))
    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now that we don't fail if the dataset is a duplicate,
    # i.e. if we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')
    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
                 # and environment variable being set/propagated by default
                 'name=$DATALAD_CONTAINER_NAME'
    )
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    with swallow_outputs() as cmo:
        subds.containers_run('XXX', container_name='mycontainer')
        assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer',
                  cmo.out)

    with swallow_outputs() as cmo:
        ds.containers_run('XXX', container_name='sub/mycontainer')
        assert_in('image=sub/righthere cmd=XXX img_dspath=sub', cmo.out)

    # Test within a subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    with chpwd(subdir):
        with swallow_outputs() as cmo:
            containers_run('XXX', container_name='sub/mycontainer')
            assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub',
                      cmo.out)
def new_dataset(annex_path):
    """Create a new dataset with a unique name for one test."""
    ds_path = str(annex_path.join(id_generator()))
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    dsdesc = {
        'BIDSVersion': '1.0.2',
        'License': 'This is not a real dataset',
        'Name': 'Test fixture new dataset',
    }
    with open(json_path, 'w') as f:
        json.dump(dsdesc, f, ensure_ascii=False)
    ds.add(json_path)
    return ds
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we know nothing subsub
    for name in ('grandchild_äöü東',):
        assert_true(
            sum([s.get('name', '') == assure_unicode(name) for s in meta]))
    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now that we don't fail if the dataset is a duplicate,
    # i.e. if we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
def test_dicom2spec(path):

    # ### SETUP ###
    dicoms = get_dicom_dataset('structural')
    ds = Dataset.create(path, cfg_proc=['hirni'])
    ds.install(source=dicoms, path='acq100')
    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    # TODO: Use get_raw_sd from above instead of this setup
    ds.meta_aggregate('acq100', into='top', recursive=True)
    # ### END SETUP ###

    # TODO: should it be specfile or acq/specfile? => At least doc needed,
    #       if not change
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')

    # check for actual location of spec_structural!
    # => studyds root!
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'spec_structural.json'))
    assert_result_count(res, 1, path=op.join(ds.path, '.gitattributes'))
    ok_clean_git(ds.path)

    # multiple execution shouldn't change .gitattributes again:
    from os import stat
    mtime = stat(op.join(ds.path, '.gitattributes')).st_mtime
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')
    assert_equal(stat(op.join(ds.path, '.gitattributes')).st_mtime, mtime)
def get_raw_dataset(self):
    # Note: This is lazy to avoid building at import time, since import is
    # part of nose's discovery and executed before the dependencies. This
    # leads to datalad's ui backend not yet being correctly set, which in
    # turn lets the cloning hang within progressbar generation.
    if not self._dspath:
        import tempfile
        kwargs = get_tempfile_kwargs()
        path = tempfile.mkdtemp(**kwargs)
        f_dicoms = get_dicom_dataset('functional')
        s_dicoms = get_dicom_dataset('structural')
        ds = Dataset.create(path, cfg_proc=['hirni'])
        ds.install(source=f_dicoms, path=op.join('func_acq', 'dicoms'))
        ds.install(source=s_dicoms, path=op.join('struct_acq', 'dicoms'))
        # Note: Recursive, since aggregation wasn't performed in the installed datasets
        ds.meta_aggregate([op.join('func_acq', 'dicoms'),
                           op.join('struct_acq', 'dicoms')],
                          into='top', recursive=True)

        # TODO: Figure out how to add it to things to be removed after tests ran
        self._dspath = ds.path

    return self._dspath
def test_annex_get_from_subdir(topdir=None):
    ds = Dataset(topdir)
    ds.create(force=True)
    ds.save('a.tar.gz')
    ds.add_archive_content('a.tar.gz', delete=True)
    fpath = op.join(topdir, 'a', 'd', fn_in_archive_obscure)

    with chpwd(op.join(topdir, 'a', 'd')):
        runner = WitlessRunner()
        # run git annex drop
        runner.run(['git', 'annex', 'drop', '--', fn_in_archive_obscure],
                   protocol=KillOutput)
        # and verify if file deleted from directory
        assert_false(ds.repo.file_has_content(fpath))
        # run git annex get
        runner.run(['git', 'annex', 'get', '--', fn_in_archive_obscure],
                   protocol=KillOutput)
        # and verify if file got into directory
        assert_true(ds.repo.file_has_content(fpath))
def test_dicom_metadata_aggregation(path):
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path)
    ds.install(source=dicoms, path='acq100')

    ds.aggregate_metadata(recursive=True)
    res = ds.metadata(get_aggregates=True)
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=opj(ds.path, 'acq100'))
def _get_nested_collections(path):
    ds = Dataset(path).create()
    c1 = ds.create(ds.pathobj / 'subdir' / 'collection1')
    c1s1 = c1.create('sub1')
    c1s2 = c1.create('sub2')
    c2 = ds.create('collection2')
    c2s1 = c2.create('sub1')
    c2s11 = c2s1.create('deepsub1')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # return a catalog
    return dict(
        root=ds,
        c1=c1,
        c1s1=c1s1,
        c1s2=c1s2,
        c2=c2,
        c2s1=c2s1,
        c2s11=c2s11,
    )
def test_dicom_metadata_aggregation(path):
    dicoms = get_dicom_dataset('structural')

    ds = Dataset.create(path)
    ds.install(source=dicoms, path='acq100')
    # Note: Recursive, since aggregation wasn't performed in the installed datasets
    ds.meta_aggregate('acq100', into='top', recursive=True)

    res = ds.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'acq100'))
def new_dataset(datalad_store):
    """Create a new dataset with a unique name for one test."""
    ds_path = str(os.path.join(datalad_store.annex_path, id_generator()))
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    dsdesc = {
        'BIDSVersion': '1.0.2',
        'License': 'This is not a real dataset',
        'Name': 'Test fixture new dataset',
    }
    with open(json_path, 'w') as f:
        json.dump(dsdesc, f, ensure_ascii=False)
    ds.save(json_path)
    changes_path = os.path.join(ds_path, 'CHANGES')
    with open(changes_path, 'w') as f:
        f.write(CHANGES)
    ds.save(changes_path)
    ds.close()
    return ds
def make_ds_hierarchy_with_metadata(path):
    """Test helper that returns the two datasets in the hierarchy

    The top-level dataset contains an annex'ed file with annex metadata.
    """
    ds = Dataset(path).create(force=True)
    create_tree(ds.path, {'file.dat': 'content'})
    ds.save()
    ds.repo.set_metadata('file.dat', reset={'tag': ['one', 'two']})
    subds = ds.create('sub')
    # we need one real piece of content for metadata extraction
    (subds.pathobj / 'real').write_text(text_type('real'))
    ds.save(recursive=True)
    return ds, subds
def test_list_contains(path):
    ds = Dataset(path).create()
    subds_a = ds.create("a")
    subds_b = ds.create("b")
    subds_a_c = subds_a.create("c")

    add_pyscript_image(subds_a_c, "in-c", "img")
    add_pyscript_image(subds_a, "in-a", "img")
    add_pyscript_image(subds_b, "in-b", "img")
    add_pyscript_image(ds, "in-top", "img")

    ds.save(recursive=True)

    assert_result_count(ds.containers_list(recursive=True, **RAW_KWDS), 4)

    assert_result_count(
        ds.containers_list(contains=["nowhere"], recursive=True, **RAW_KWDS),
        1, name="in-top", action='containers')

    res = ds.containers_list(contains=[subds_a.path], recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 3)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="a/in-a")
    assert_in_results(res, name="a/c/in-c")

    res = ds.containers_list(contains=[subds_a_c.path], recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 3)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="a/in-a")
    assert_in_results(res, name="a/c/in-c")

    res = ds.containers_list(contains=[subds_b.path], recursive=True,
                             **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, name="in-top")
    assert_in_results(res, name="b/in-b")
def _single_session_dicom2bids(label, path, toolbox_url):

    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds = Dataset.create(path, cfg_proc=['hirni'])

        subject = "02"
        acquisition = "{sub}_{label}".format(sub=subject, label=label)

        dicoms = get_dicom_dataset(label)
        ds.install(source=dicoms, path=op.join(acquisition, 'dicoms'))
        # Note: Recursive, since aggregation wasn't performed in the installed datasets
        ds.meta_aggregate(op.join(acquisition, 'dicoms'),
                          into='top', recursive=True)

        spec_file = 'spec_{label}.json'.format(label=label)
        ds.hirni_dicom2spec(path=op.join(acquisition, 'dicoms'),
                            spec=op.join(acquisition, spec_file))

        ds.hirni_spec2bids(op.join(acquisition, spec_file))
def test_dryrun(path=None):
    ds = Dataset(path).create()
    # see that the correct request would be made
    res = ds.create_sibling_gin('bogus', credential='some', dry_run=True)
    assert_result_count(res, 1)
    res = res[0]
    eq_(res['request_url'], 'https://gin.g-node.org/api/v1/user/repos')
    # we don't care much which user-agent, but there should be one
    assert_in('user-agent', res['request_headers'])
    # only a placeholder no-token makes it into the request
    assert_in('NO-TOKEN-AVAILABLE', res['request_headers']['authorization'])
    # correct name
    eq_(res['request_data']['name'], 'bogus')
    # public by default
    eq_(res['request_data']['private'], False)
    # it is important that we do not tell the portal to generate some
    # repo content
    eq_(res['request_data']['auto_init'], False)

    # org repo
    res = ds.create_sibling_gin('strangeorg/bogus', credential='some',
                                dry_run=True)
    assert_result_count(res, 1)
    res = res[0]
    eq_(res['request_data']['name'], 'bogus')
    eq_(res['request_url'],
        'https://gin.g-node.org/api/v1/org/strangeorg/repos')

    # recursive name building
    subds = ds.create('subds')
    res = ds.create_sibling_gin(
        'bogus', recursive=True, credential='some', dry_run=True)
    eq_(res[-1]['request_data']['name'], 'bogus-subds')

    # ignore unavailable datasets
    ds.drop('subds', what='all', reckless='kill', recursive=True)
    res = ds.create_sibling_gin(
        'bogus', recursive=True, credential='some', dry_run=True)
    eq_(len(res), 1)
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata, the implicit one for the top-most dataset should
    # differ, but the rest not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost comes first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest
    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit out locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert(all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert(all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)])
    )
    # and the additional field we might have asked with report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')])
    )
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()])
    )

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
def _test_create_store(host, base_path, ds_path, clone_path):
    ds = Dataset(ds_path).create(force=True)

    subds = ds.create('sub', force=True)
    subds2 = ds.create('sub2', force=True, annex=False)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-storage"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})
    sub2_siblings = subds2.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub2_siblings})

    # TODO: post-update hook was enabled

    # check bare repo:
    git_config = Path(base_path) / ds.id[:3] / ds.id[3:] / 'config'
    assert git_config.exists()
    content = git_config.read_text()
    assert_in("[datalad \"ora-remote\"]", content)
    super_uuid = ds.config.get(
        "remote.{}.annex-uuid".format('datastore-storage'))
    assert_in("uuid = {}".format(super_uuid), content)

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            #       change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                                recursive=True, existing='reconfigure')
    eq_(len(res), 3)
    assert_result_count(res, 1, path=str(ds.pathobj), status='ok',
                        action="create-sibling-ria")
    assert_result_count(res, 1, path=str(subds.pathobj), status='ok',
                        action="create-sibling-ria")
    assert_result_count(res, 1, path=str(subds2.pathobj), status='ok',
                        action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-storage', 'here'},
        {s['name'] for s in sub_siblings})
    # but no special remote in plain git subdataset:
    sub2_siblings = subds2.siblings(result_renderer=None)
    eq_({'datastore', 'here'},
        {s['name'] for s in sub2_siblings})

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                              existing='reconfigure', trust_level=trust)
        res = ds.repo.repo_info()
        assert_in('[datastore-storage]',
                  [r['description']
                   for r in res['{}ed repositories'.format(trust)]])
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(
        name="first",
        url=get_local_file_url(op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)
    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(subds.path,
                          '.datalad', 'environments', 'first', 'image')
    assert_result_count(
        res, 1, name='sub/first', type='file', action='containers',
        status='ok', path=target_path, parentds=subds.path)

    # not installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a not
    # present subds:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(
        res, 1, name='sub/first', type='file', action='containers',
        status='ok', path=target_path, parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset
    # traversals (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
def _test_create_store(host, base_path, ds_path, clone_path):
    skip_if_no_module("ria_remote")  # special remote needs to be installed

    ds = Dataset(ds_path).create(force=True)
    subds = ds.create('sub', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-ria"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})

    # TODO: post-update hook was enabled

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            #       change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                                recursive=True, existing='reconfigure')
    eq_(len(res), 2)
    assert_result_count(res, 2, status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'},
        {s['name'] for s in sub_siblings})

    # for testing trust_level parameter, redo for each label:
    for trust in ['trust', 'semitrust', 'untrust']:
        ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                              existing='reconfigure', trust_level=trust)
        res = ds.repo.repo_info()
        assert_in('[datastore-ria]',
                  [r['description']
                   for r in res['{}ed repositories'.format(trust)]])
def add_to_datalad(topdir, studydir, msg, bids):
    """Do all necessary preparations (if were not done before) and save"""
    import datalad.api as dl
    from datalad.api import Dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.external_versions import external_versions
    assert external_versions['datalad'] >= MIN_VERSION, (
        "Need datalad >= {}".format(MIN_VERSION))  # add to reqs

    studyrelpath = op.relpath(studydir, topdir)
    assert not studyrelpath.startswith(op.pardir)  # so we are under
    # now we need to test and initiate a DataLad dataset all along the path
    curdir_ = topdir
    superds = None
    subdirs = [''] + [d for d in studyrelpath.split(op.sep) if d != os.curdir]
    for isubdir, subdir in enumerate(subdirs):
        curdir_ = op.join(curdir_, subdir)
        ds = Dataset(curdir_)
        if not ds.is_installed():
            lgr.info("Initiating %s", ds)
            # would require annex > 20161018 for correct operation on annex v6
            # need to add .gitattributes first anyways
            ds_ = dl.create(
                curdir_,
                dataset=superds,
                force=True,
                # initiate annex only at the bottom repository
                annex=isubdir == (len(subdirs) - 1),
                fake_dates=True,
                # shared_access='all',
            )
            assert ds == ds_
        assert ds.is_installed()
        superds = ds

    # TODO: we need a helper (in DataLad ideally) to ease adding such
    # specifications
    gitattributes_path = op.join(studydir, '.gitattributes')
    # We will just make sure that all our desired rules are present in it
    desired_attrs = """\
* annex.largefiles=(largerthan=100kb)
*.json annex.largefiles=nothing
*.txt annex.largefiles=nothing
*.tsv annex.largefiles=nothing
*.nii.gz annex.largefiles=anything
*.tgz annex.largefiles=anything
*_scans.tsv annex.largefiles=anything
"""
    if op.exists(gitattributes_path):
        with open(gitattributes_path, 'rb') as f:
            known_attrs = [line.decode('utf-8').rstrip()
                           for line in f.readlines()]
    else:
        known_attrs = []
    for attr in desired_attrs.split('\n'):
        if attr not in known_attrs:
            known_attrs.append(attr)
    with open(gitattributes_path, 'wb') as f:
        f.write('\n'.join(known_attrs).encode('utf-8'))

    # ds might have memories of having ds.repo GitRepo
    superds = Dataset(topdir)
    assert op.realpath(ds.path) == op.realpath(studydir)
    assert isinstance(ds.repo, AnnexRepo)
    # Add doesn't have all the options of save such as msg and supers
    ds.save(path=['.gitattributes'], message="Custom .gitattributes",
            to_git=True)
    dsh = dsh_path = None
    if op.lexists(op.join(ds.path, '.heudiconv')):
        dsh_path = op.join(ds.path, '.heudiconv')
        dsh = Dataset(dsh_path)
        if not dsh.is_installed():
            # Previously we did not have it as a submodule, and since no
            # automagic migration is implemented, we just need to check first
            # if any path under .heudiconv is already under git control
            if any(x.startswith('.heudiconv/') for x in ds.repo.get_files()):
                lgr.warning(
                    "%s has .heudiconv not as a submodule from previous"
                    " versions of heudiconv. No automagic migration is "
                    "yet provided", ds)
            else:
                dsh = ds.create(
                    path='.heudiconv',
                    force=True,
                    # shared_access='all'
                )
        # Since .heudiconv could contain sensitive information
        # we place all files under annex and then add
        if create_file_if_missing(op.join(dsh_path, '.gitattributes'),
                                  """* annex.largefiles=anything"""):
            ds.save(
                '.heudiconv/.gitattributes',
                to_git=True,
                message="Added gitattributes to place all .heudiconv content"
                        " under annex")
    ds.save('.', recursive=True
            # not in effect! ?
            # annex_add_opts=['--include-dotfiles']
            )

    # TODO: filter for only changed files?
    # Provide metadata for sensitive information
    mark_sensitive(ds, 'sourcedata')
    mark_sensitive(ds, '*_scans.tsv')  # top level
    mark_sensitive(ds, '*/*_scans.tsv')  # within subj
    mark_sensitive(ds, '*/*/*_scans.tsv')  # within sess/subj
    mark_sensitive(ds, '*/anat')  # within subj
    mark_sensitive(ds, '*/*/anat')  # within ses/subj
    if dsh_path:
        mark_sensitive(ds, '.heudiconv')  # entire .heudiconv!
    superds.save(path=ds.path, message=msg, recursive=True)

    assert not ds.repo.dirty
    # TODO: they are still appearing as native annex symlinked beasts
def get_bids_dataset():
    srcrepo = get_sourcerepo()
    bids_ds = Dataset(path=opj(srcrepo.path, 'datalad_neuroimaging',
                               'tests', 'data', 'bids'))
    if bids_ds.is_installed():
        return bids_ds
    try:
        import heudiconv
    except ImportError:
        raise SkipTest
    # make one
    bids_ds.create()
    # place dicoms in the mandated shadow tree
    structdicom_ds = bids_ds.install(
        source=get_dicom_dataset('structural'),
        path=opj('sourcedata', 'sub-02', 'ses-structural'),
        reckless=True)
    funcdicom_ds = bids_ds.install(
        source=get_dicom_dataset('functional'),
        path=opj('sourcedata', 'sub-02', 'ses-functional'),
        reckless=True)
    # dicom dataset is preconfigured for metadata extraction
    # XXX this is the slowest step of the entire procedure
    # reading 5k dicoms of the functional data
    bids_ds.aggregate_metadata(recursive=True)
    # pull subject ID from metadata
    res = bids_ds.metadata(funcdicom_ds.path, reporton='datasets',
                           return_type='item-or-list',
                           result_renderer='disabled')
    subj_id = res['metadata']['dicom']['Series'][0]['PatientID']
    # prepare for incoming BIDS metadata that we will want to keep in
    # Git -- templates would be awesome!
    with open(opj(bids_ds.path, '.gitattributes'), 'a') as ga:
        # except for hand-picked global metadata, we want anything
        # to go into the annex to be able to retract files after
        # publication
        ga.write('** annex.largefiles=anything\n')
        for fn in ('CHANGES', 'README', 'dataset_description.json'):
            # but not these
            ga.write('{} annex.largefiles=nothing\n'.format(fn))
    bids_ds.add('.gitattributes', to_git=True,
                message='Initial annex entry configuration')
    ok_clean_git(bids_ds.path)
    # conversion of two DICOM datasets to one BIDS dataset
    for label, ds, scanlabel in (
            # structural needs to come first or else heudiconv
            # will try to rewrite the events.tsv for the functional
            # run, for some strange reason
            ('structural', structdicom_ds, 'anat'),
            ('functional', funcdicom_ds, 'func')):
        bids_ds.run([
            'heudiconv',
            '-f', 'reproin',
            # TODO fix DICOMs to not have a 'sub' prefix
            '-s', subj_id,
            '-c', 'dcm2niix',
            # TODO decide on the fate of .heudiconv/
            # but ATM we need to (re)move it:
            # https://github.com/nipy/heudiconv/issues/196
            '-o', opj(bids_ds.path, '.git', 'stupid', label),
            '-b',
            '-a', bids_ds.path,
            '-l', '',
            # avoid gory details provided by dcmstack, we have them in
            # the aggregated DICOM metadata already
            '--minmeta',
            '--files', opj(ds.path, 'dicoms')
        ], message="DICOM conversion of {} scans".format(label))
        # remove unwanted stuff that cannot be disabled ATM
        # https://github.com/nipy/heudiconv/issues/195
        # TODO should be removed eventually
        bids_ds.remove(
            [p for p in (opj('sourcedata', 'sub-02', scanlabel),
                         opj('sourcedata', 'README'))
             if op.lexists(opj(bids_ds.path, p))],
            check=False)

    bids_ds.config.add('datalad.metadata.nativetype', 'bids',
                       where='dataset', reload=False)
    bids_ds.config.add('datalad.metadata.nativetype', 'nifti1',
                       where='dataset', reload=True)
    # XXX need to `add` specifically to make it work in direct mode
    #bids_ds.save(message='Metadata type config')
    bids_ds.add('.', message='Metadata type config')
    # loose dicom datasets
    bids_ds.uninstall([structdicom_ds.path, funcdicom_ds.path], check=False)
    # no need for recursion, we already have the dicom dataset's
    # stuff on record
    bids_ds.aggregate_metadata(recursive=False, incremental=True)
    ok_clean_git(bids_ds.path)
    return bids_ds
def test_demo_raw_ds(path, toolbox_url):

    ds = Dataset(path)
    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds.create()
        # TODO: Maybe move to ds.create(cfg_proc='hirni') in demo
        ds.run_procedure('cfg_hirni')

    # clean repo with an annex:
    assert_repo_status(ds.repo, annex=True)

    # README, dataset_description.json and studyspec.json at toplevel and in git
    for f in ['README', 'studyspec.json', 'dataset_description.json']:
        ok_file_under_git(ds.path, f, annexed=False)

    # toolbox installed under code/hirni-toolbox
    subs = ds.subdatasets()
    assert_result_count(subs, 1)
    assert_result_count(subs, 1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-structural/archive/master.tar.gz',
        'acq1',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms'
    # within the acquisition dir
    for f in [op.join(ds.path, 'acq1'),
              op.join(ds.path, 'acq1', 'studyspec.json'),
              op.join(ds.path, 'acq1', 'dicoms')]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 2)
    assert_result_count(subs, 1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))

    # TODO: check actual spec? (Prob. sufficient to test for that in dedicated
    #       import-dcm/dcm2spec tests)
    # TODO: check dicom metadata

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-functional/archive/master.tar.gz',
        'acq2',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms'
    # within the acquisition dir
    for f in [op.join(ds.path, 'acq2'),
              op.join(ds.path, 'acq2', 'studyspec.json'),
              op.join(ds.path, 'acq2', 'dicoms')]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 3)
    assert_result_count(subs, 1,
                        path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq2', 'dicoms'))

    # Note from demo: The calls to `git annex addurl` and `datalad save`
    # currently replace a single call to `datalad download-url` due to a bug
    # in that command.
    events_file = op.join('acq2', 'events.tsv')
    ds.repo.add_url_to_file(
        file_=events_file,
        url='https://github.com/datalad/example-dicom-functional/raw/master/events.tsv')
    ds.save(message="Added stimulation protocol for acquisition 2")

    ok_file_under_git(ds.path, events_file, annexed=True)

    ds.hirni_spec4anything(
        events_file,
        properties='{"procedures": {"procedure-name": "copy-converter", "procedure-call": "bash {script} {{location}} '
                   '{ds}/sub-{{bids-subject}}/func/sub-{{bids-subject}}_task-{{bids-task}}_run-{{bids-run}}_events.tsv'
                   '"}, "type": "events_file"}')

    ok_file_under_git(ds.path, op.join('acq2', 'studyspec.json'), annexed=False)
    assert_repo_status(ds.repo, annex=True)
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0

    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key
            # dirs are read-only. For now just a workaround - make it all
            # writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')
        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after
        # https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicating setup but lazy to do different one for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since doesn't really allow to uninstall top level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
def test_basics(src, dst):
    # dataset with subdataset, no specific configuration
    ds = Dataset(src).create()
    (ds.pathobj / 'file1').write_text('some')
    ds.save()
    sub = ds.create('subds')
    # second one for a result_xfm test below
    ds.create('subds2')
    eq_(sub.config.get('datalad.metadata.nativetype'), None)

    # now clone the super
    clone = install(source=src, path=dst)
    # and configure it, such that it modifies each obtained subdataset
    # on install to have 'bids' listed as a metadata type
    clone.config.set(
        'datalad.result-hook.alwaysbids.call-json',
        # string substitutions based on the result record are supported
        'run_procedure {{"dataset":"{path}","spec":"cfg_metadatatypes bids"}}',
        where='local',
    )
    # config on which kind of results this hook should operate
    clone.config.set(
        'datalad.result-hook.alwaysbids.match-json',
        # any successfully installed dataset
        '{"type":"dataset","action":"install","status":["eq", "ok"]}',
        where='local',
    )
    # a smoke test to see if a hook definition without any call args works too
    clone.config.set(
        'datalad.result-hook.wtf.call-json',
        'wtf',
        where='local',
    )
    clone.config.set(
        'datalad.result-hook.wtf.match-json',
        '{"type":"dataset","action":"install","status":["eq", "ok"]}',
        where='local',
    )
    # configure another one that will unlock any obtained file
    # {dsarg} is substituted by the dataset arg of the command that
    # the eval_func() decorator belongs to
    # but it may not have any, as this is not the outcome of a
    # require_dataset(), but rather the verbatim input
    # it could be more useful to use {refds}
    clone.config.set(
        'datalad.result-hook.unlockfiles.call-json',
        'unlock {{"dataset":"{dsarg}","path":"{path}"}}',
        where='local',
    )
    clone.config.set(
        'datalad.result-hook.unlockfiles.match-json',
        '{"type":"file","action":"get","status":"ok"}',
        where='local',
    )
    if not on_windows:
        # and one that runs a shell command on any notneeded file-get
        clone.config.set(
            'datalad.result-hook.annoy.call-json',
            'run {{"cmd":"touch {path}_annoyed",'
            '"dataset":"{dsarg}","explicit":true}}',
            where='local',
        )
        clone.config.set(
            'datalad.result-hook.annoy.match-json',
            '{"type":["in", ["file"]],"action":"get","status":"notneeded"}',
            where='local',
        )
    # setup done, now see if it works
    clone.get('subds')
    clone_sub = Dataset(clone.pathobj / 'subds')
    eq_(clone_sub.config.get('datalad.metadata.nativetype'), 'bids')
    # now the same thing with a result_xfm, should make no difference
    clone.get('subds2')
    clone_sub2 = Dataset(clone.pathobj / 'subds2')
    eq_(clone_sub2.config.get('datalad.metadata.nativetype'), 'bids')
    # hook auto-unlocks the file
    if not on_windows:
        ok_((clone.pathobj / 'file1').is_symlink())
    res = clone.get('file1')
    if not on_windows:
        # we get to see the results from the hook too!
        assert_result_count(
            res, 1, action='unlock', path=str(clone.pathobj / 'file1'))
        ok_(not (clone.pathobj / 'file1').is_symlink())
    if not on_windows:
        # different hook places an annoying file next to a file that was
        # already present
        annoyed_file = clone.pathobj / 'file1_annoyed'
        ok_(not annoyed_file.exists())
        clone.get('file1')
        ok_(annoyed_file.exists())
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage', scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name']
                    == ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    # query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage', where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name']
                    == assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    # query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])