def test_result_filter():
    # ensure baseline without filtering
    assert_equal(
        [r['somekey'] for r in TestUtils().__call__(4)],
        [0, 1, 2, 3])
    # test two functionally equivalent ways to filter results
    # 1. Constraint-based -- filter by exception
    #    we have a full set of AND and OR operators for this
    # 2. custom filter function -- filter by boolean return value
    for filt in (
            EnsureKeyChoice('somekey', (0, 2)),
            lambda x: x['somekey'] in (0, 2)):
        assert_equal(
            [r['somekey'] for r in TestUtils().__call__(
                4,
                result_filter=filt)],
            [0, 2])
        # constraint returns full dict
        assert_dict_equal(
            TestUtils().__call__(
                4,
                result_filter=filt)[-1],
            {'action': 'off', 'path': 'some', 'status': 'ok', 'somekey': 2})

    # test more sophisticated filters that actually get to see the
    # API call's kwargs
    def greatfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), 'awesome')
        return True
    TestUtils().__call__(4, dataset='awesome', result_filter=greatfilter)

    def sadfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), None)
        return True
    TestUtils().__call__(4, result_filter=sadfilter)

def test_extract():
    info, subpaths = au.extract(
        json_stream(ST_DATA["rows"]), "json",
        url_format="{name}_{debut_season}.com",
        filename_format="{age_group}//{now_dead}//{name}.csv")
    eq_(subpaths,
        {"kid", "kid/no", "adult", "adult/yes", "adult/no"})
    eq_([d["url"] for d in info],
        ["will_1.com", "bob_2.com", "scott_1.com", "max_2.com"])
    eq_([d["filename"] for d in info],
        ["kid/no/will.csv", "adult/yes/bob.csv",
         "adult/no/scott.csv", "kid/no/max.csv"])
    expects = [{"name": "will", "age_group": "kid",
                "debut_season": "1", "now_dead": "no"},
               {"name": "bob", "age_group": "adult",
                "debut_season": "2", "now_dead": "yes"},
               {"name": "scott", "age_group": "adult",
                "debut_season": "1", "now_dead": "no"},
               {"name": "max", "age_group": "kid",
                "debut_season": "2", "now_dead": "no"}]
    for d, expect in zip(info, expects):
        assert_dict_equal(d["meta_args"], expect)
    eq_([d["subpath"] for d in info],
        ["kid/no", "adult/yes", "adult/no", "kid/no"])

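# A note on the templates used above: url_format/filename_format are plain
# str.format() templates filled from each row, and the double slash ("//") in
# filename_format appears to mark how much of the leading path is reported as
# the "subpath" (e.g. "kid/no" for "kid/no/will.csv"). A rough illustration of
# that convention (hypothetical helper, not the addurls implementation):
def _fill_filename_template(template, row):
    filled = template.format(**row)     # e.g. "kid//no//will.csv"
    parts = filled.split("//")          # ["kid", "no", "will.csv"]
    filename = "/".join(parts)          # "kid/no/will.csv"
    # everything up to the last "//" is the subpath
    subpath = "/".join(parts[:-1])      # "kid/no"
    return filename, subpath

# e.g. _fill_filename_template("{age_group}//{now_dead}//{name}.csv",
#                              {"age_group": "kid", "now_dead": "no",
#                               "name": "will"}) == ("kid/no/will.csv", "kid/no")
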
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing
    # because we can aggregate aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    assert_repo_status(base.path)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)

def test_get_file_parts():
    assert_dict_equal(au.get_file_parts("file.tar.gz", "prefix"),
                      {"prefix": "file.tar.gz",
                       "prefix_root_py": "file.tar",
                       "prefix_ext_py": ".gz",
                       "prefix_root": "file",
                       "prefix_ext": ".tar.gz"})

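# The expected dict above pins down the naming scheme: the "*_py" keys come
# from a single os.path.splitext() split, while the plain root/ext keys strip
# the full compound extension. A minimal sketch of such a helper (assumption:
# this is not the actual au.get_file_parts implementation):
def _file_parts_sketch(filename, prefix="prefix"):
    import os.path
    root_py, ext_py = os.path.splitext(filename)    # "file.tar", ".gz"
    pieces = filename.split(".", 1)                 # split at the first dot
    root = pieces[0]                                # "file"
    ext = "." + pieces[1] if len(pieces) > 1 else ""  # ".tar.gz"
    return {prefix: filename,
            prefix + "_root_py": root_py,
            prefix + "_ext_py": ext_py,
            prefix + "_root": root,
            prefix + "_ext": ext}
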
def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing
    # because we can aggregate aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    ok_clean_git(base.path)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)

def test_basic_aggregate(path):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True)
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing
    # because we can aggregate aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert not sub.is_installed()
    ok_clean_git(base.path)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)

def test_compare_content_info(path):
    # TODO remove when `create` is RF to return the new Dataset
    ds = RevolutionDataset(Dataset(path).create().path)
    assert_repo_status(path)
    # for a clean repo HEAD and worktree query should yield identical results
    wt = ds.repo.get_content_info(ref=None)
    assert_dict_equal(wt, ds.repo.get_content_info(ref='HEAD'))

def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return int(
            ds.repo.repo.git.rev_list("--count", "git-annex").strip())

    n_annex_commits = get_annex_commit_counts()

    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(fname)

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta,
                              {"subdir": [subdir], "name": [fname]})

        # Ignore this check if we're faking dates because that disables
        # batch mode.
        if not os.environ.get('DATALAD_FAKE__DATES'):
            # We should have two new commits on the git-annex: one for the
            # added urls and one for the added metadata.
            eq_(n_annex_commits + 2, get_annex_commit_counts())

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
            action="addurls",
            status="notneeded")

        # Add to already existing links works, as long as the content is
        # the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open("a", "w") as ofh:
            ofh.write("changed")
        ds.save("a")
        assert_raises(IncompleteResultsError,
                      ds.addurls,
                      self.json_file, "{url}", "{name}")

def test_basic_dsmeta(path):
    ds = Dataset(path).create()
    ok_clean_git(path)
    # ensure clean slate
    assert_result_count(ds.metadata(), 0)
    # init
    res = ds.metadata(init=['tag1', 'tag2'], dataset_global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # init again does nothing
    res = ds.metadata(init=['tag3'], dataset_global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # reset whole key
    res = ds.metadata(reset=['tag'], dataset_global=True)
    assert_result_count(ds.metadata(), 0)
    # add something arbitrary
    res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']),
                      dataset_global=True)
    eq_(res[0]['metadata']['dtype'], ['heavy'])
    # sorted!
    eq_(res[0]['metadata']['readme'], ['long', 'short'])
    # supply key definitions, no need for dataset_global
    res = ds.metadata(define_key=dict(mykey='truth'))
    eq_(res[0]['metadata']['definition'], {'mykey': u'truth'})
    # re-supply different key definitions -> error
    res = ds.metadata(define_key=dict(mykey='lie'), on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error',
        message=("conflicting definition for key '%s': '%s' != '%s'",
                 "mykey", "lie", "truth"))
    res = ds.metadata(define_key=dict(otherkey='altfact'))
    assert_dict_equal(
        res[0]['metadata']['definition'],
        {'mykey': u'truth', 'otherkey': 'altfact'})
    # 'definition' is a regular key, we can remove items
    res = ds.metadata(remove=dict(definition=['mykey']), dataset_global=True)
    assert_dict_equal(
        res[0]['metadata']['definition'],
        {'otherkey': u'altfact'})
    res = ds.metadata(remove=dict(definition=['otherkey']),
                      dataset_global=True)
    # when there are no items left, the key vanishes too
    assert 'definition' not in res[0]['metadata']
    # we still have metadata, so there is a DB file
    assert res[0]['metadata']
    db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
    assert exists(db_path)
    ok_clean_git(ds.path)
    # but if we remove it, the file is gone
    res = ds.metadata(reset=['readme', 'dtype'], dataset_global=True)
    eq_(res[0]['metadata'], {})
    assert not exists(db_path)
    ok_clean_git(ds.path)

def test_update_strategy(path):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(op.join('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.meta_aggregate()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default only updates
    # the top-level dataset with all objects, none of the leaf
    # or intermediate datasets gets touched
    base.meta_aggregate(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.meta_dump(reporton='aggregates', on_failure='ignore'))
    # get the full metadata report
    target_meta = _kill_time(base.meta_dump())

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.meta_aggregate(recursive=True, into='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.meta_dump(reporton='aggregates', on_failure='ignore'))
    # all of that has no impact on the reported metadata
    # minus the change in the refcommits
    for i in zip(target_meta, _kill_time(base.meta_dump())):
        assert_dict_equal(i[0], i[1])

def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'dicom.dcm'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of imageseries (one in this case, but a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    # the auto-uniquified bits are gone but the Series description stays
    assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])

def test_extract_exclude_autometa_regexp():
    info, _ = au.extract(
        json_stream(ST_DATA["rows"]), "json",
        url_format="{name}_{debut_season}.com",
        filename_format="{age_group}//{now_dead}//{name}.csv",
        exclude_autometa="ea")
    expects = [{"name": "will", "age_group": "kid"},
               {"name": "bob", "age_group": "adult"},
               {"name": "scott", "age_group": "adult"},
               {"name": "max", "age_group": "kid"}]
    for d, expect in zip(info, expects):
        assert_dict_equal(d["meta_args"], expect)

def _compare_metadata_helper(origres, compds):
    for ores in origres:
        rpath = relpath(ores['path'], ores['refds'])
        cres = compds.metadata(
            rpath,
            reporton='{}s'.format(ores['type']))
        if ores['type'] == 'file':
            # TODO implement file based lookup
            continue
        assert_result_count(cres, 1)
        cres = cres[0]
        assert_dict_equal(ores['metadata'], cres['metadata'])
        if ores['type'] == 'dataset':
            for i in ('dsid',):
                eq_(ores[i], cres[i])

def _compare_metadata_helper(origres, compds):
    for ores in origres:
        rpath = op.relpath(ores['path'], ores['refds'])
        cres = compds.meta_dump(
            rpath,
            reporton='{}s'.format(ores['type']))
        if ores['type'] == 'file':
            # TODO implement file based lookup
            continue
        assert_result_count(cres, 1)
        cres = cres[0]
        assert_dict_equal(ores['metadata'], cres['metadata'])
        if ores['type'] == 'dataset':
            eq_(_get_dsid_from_core_metadata(ores['metadata']['metalad_core']),
                _get_dsid_from_core_metadata(cres['metadata']['metalad_core']))

def test_discover_ds_trace(path, otherdir):
    ds = make_demo_hierarchy_datasets(
        path,
        {k: v for k, v in demo_hierarchy.items() if k in ['a', 'd']})
    a = opj(ds.path, 'a')
    aa = opj(a, 'aa')
    d = opj(ds.path, 'd')
    db = opj(d, 'db')
    # we have to check whether we get the correct hierarchy, as the test
    # subject is also involved in this
    assert_true(exists(opj(db, 'file_db')))
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # now two datasets which are not available locally, but we
    # know about them (e.g. from metadata)
    dba = opj(db, 'sub', 'dba')
    dbaa = opj(dba, 'subsub', 'dbaa')
    for input, eds, goal in (
            ([], None, {}),
            ([ds.path], None, {}),
            ([otherdir], None, {}),
            ([opj(ds.path, 'nothere')], None, {}),
            ([opj(d, 'nothere')], None, {}),
            ([opj(db, 'nothere')], None, {}),
            ([a], None,
             {ds.path: set([a])}),
            ([aa, a], None,
             {ds.path: set([a]), a: set([aa])}),
            ([db], None,
             {ds.path: set([d]), d: set([db])}),
            ([opj(db, 'file_db')], None,
             {ds.path: set([d]), d: set([db])}),
            # just a regular non-existing path
            ([dba], None, {}),
            # but if we inject this knowledge it must come back out
            # as the child of the closest existing dataset
            ([dba], [dba],
             {ds.path: set([d]), d: set([db]), db: set([dba])}),
            # regardless of the depth
            ([dbaa], [dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dbaa])}),
            ([dba, dbaa], [dba, dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dba, dbaa])}),
            # we can simply add existing and non-existing datasets to the
            # include list to get the desired result
            ([d, dba, dbaa], [d, dba, dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dba, dbaa])}),
    ):
        spec = {}
        discover_dataset_trace_to_targets(ds.path, input, [], spec,
                                          includeds=eds)
        assert_dict_equal(spec, goal)

def test_assert_dict_equal():
    assert_dict_equal({}, {})
    assert_dict_equal({"a": 3}, {"a": 3})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {1: 4})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 3})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 'a'})
    try:
        import numpy as np
    except:  # pragma: no cover
        raise SkipTest("need numpy for this tiny one")
    # one is scalar another one array
    assert_raises(AssertionError, assert_dict_equal,
                  {1: 0}, {1: np.arange(1)})
    assert_raises(AssertionError, assert_dict_equal,
                  {1: 0}, {1: np.arange(3)})

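# The checks above pin down the contract of assert_dict_equal: equal dicts
# pass silently, any key or value mismatch raises AssertionError, and values
# that cannot be compared as plain scalars (e.g. a numpy array vs. a scalar)
# must also fail rather than slip through. A minimal sketch with that
# behaviour (an assumption, not DataLad's actual implementation):
def _assert_dict_equal_sketch(d1, d2):
    if set(d1) != set(d2):
        raise AssertionError("keys differ: %r != %r" % (set(d1), set(d2)))
    for k in d1:
        v1, v2 = d1[k], d2[k]
        if type(v1) is not type(v2):
            # e.g. plain int vs. numpy array
            raise AssertionError("types for %r differ: %r != %r" % (k, v1, v2))
        try:
            same = bool(v1 == v2)
        except Exception:
            # ambiguous comparisons (multi-element arrays) count as unequal
            same = False
        if not same:
            raise AssertionError("values for %r differ: %r != %r" % (k, v1, v2))
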
def test_compare_content_info(path):
    # TODO remove when `create` is RF to return the new Dataset
    ds = Dataset(path).create()
    assert_repo_status(path)
    # for a clean repo HEAD and worktree query should yield identical results
    # minus a 'bytesize' report that is readily available for HEAD, but would
    # need a stat call per file for the worktree, and is not done ATM
    wt = ds.repo.get_content_info(ref=None)
    assert_dict_equal(
        wt,
        {f: {k: v for k, v in iteritems(p) if k != 'bytesize'}
         for f, p in iteritems(ds.repo.get_content_info(ref='HEAD'))}
    )

def test_compare_content_info(path):
    # TODO remove when `create` is RF to return the new Dataset
    ds = Dataset(path).create()
    assert_repo_status(path)
    # for a clean repo HEAD and worktree query should yield identical results
    # minus a 'bytesize' report that is readily available for HEAD, but would
    # need a stat call per file for the worktree, and is not done ATM
    wt = ds.repo.get_content_info(ref=None)
    assert_dict_equal(
        wt,
        {f: {k: v for k, v in p.items() if k != 'bytesize'}
         for f, p in ds.repo.get_content_info(ref='HEAD').items()}
    )

def test_get_url_parts():
    eq_(au.get_url_parts(""), {})
    assert_dict_equal(au.get_url_parts("http://datalad.org"),
                      {"_url_hostname": "datalad.org"})
    assert_dict_equal(
        au.get_url_parts("http://datalad.org/about.html"),
        {"_url_hostname": "datalad.org",
         "_url0": "about.html",
         "_url_basename": "about.html",
         "_url_basename_root_py": "about",
         "_url_basename_ext_py": ".html",
         "_url_basename_root": "about",
         "_url_basename_ext": ".html"})
    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      au.get_url_parts("http://datalad.org//about.html"))
    assert_dict_equal(
        au.get_url_parts("http://datalad.org/for/git-users"),
        {"_url_hostname": "datalad.org",
         "_url0": "for",
         "_url1": "git-users",
         "_url_basename": "git-users",
         "_url_basename_root_py": "git-users",
         "_url_basename_ext_py": "",
         "_url_basename_root": "git-users",
         "_url_basename_ext": ""})

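# The expected dicts above describe the scheme: "_url_hostname" is the netloc,
# each path component becomes "_url0", "_url1", ..., and the last component is
# additionally exposed as "_url_basename" plus the same root/ext fields used
# for file names. A rough Python 3 sketch of such a helper (assumptions: an
# empty URL maps to {}, empty components from "//" are dropped; this is not
# the actual au.get_url_parts implementation):
def _url_parts_sketch(url):
    import os.path
    from urllib.parse import urlparse

    if not url:
        return {}
    parsed = urlparse(url)
    parts = {"_url_hostname": parsed.netloc}
    comps = [c for c in parsed.path.split("/") if c]
    for idx, comp in enumerate(comps):
        parts["_url{}".format(idx)] = comp
    if comps:
        basename = comps[-1]
        root_py, ext_py = os.path.splitext(basename)
        pieces = basename.split(".", 1)
        parts.update({
            "_url_basename": basename,
            "_url_basename_root_py": root_py,
            "_url_basename_ext_py": ext_py,
            "_url_basename_root": pieces[0],
            "_url_basename_ext": "." + pieces[1] if len(pieces) > 1 else "",
        })
    return parts
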
def test_check_dates(path):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")

def test_rerun_commit_message_check():
    assert_raises(ValueError,
                  get_run_info,
                  None,
                  """\
[DATALAD RUNCMD] no command

=== Do not change lines below ===
{
 "pwd": ".",
 "exit": 0
}
^^^ Do not change lines above ^^^""")

    assert_raises(ValueError,
                  get_run_info,
                  None,
                  """\
[DATALAD RUNCMD] junk json

=== Do not change lines below ===
{
 "pwd": ".,
 "cmd": "echo ok >okfile",
 "exit": 0
}
^^^ Do not change lines above ^^^""")

    subject, info = get_run_info(
        None,
        """\
[DATALAD RUNCMD] fine

=== Do not change lines below ===
{
 "pwd": ".",
 "cmd": "echo ok >okfile",
 "exit": 0
}
^^^ Do not change lines above ^^^""")
    eq_(subject, "fine")
    assert_dict_equal(info,
                      {"pwd": ".", "cmd": "echo ok >okfile", "exit": 0})

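# get_run_info evidently splits a "[DATALAD RUNCMD] <subject>" commit message
# into the subject line and the JSON record fenced by the "Do not change"
# markers, raising ValueError when the JSON is broken or no command was
# recorded. A simplified parser with that contract (a hypothetical sketch
# with a different signature, not DataLad's get_run_info):
import json
import re

def _parse_runcmd_message(message):
    match = re.match(r"\[DATALAD RUNCMD\] (.*)", message)
    if not match:
        raise ValueError("not a DATALAD RUNCMD record")
    subject = match.group(1).strip()
    body = re.search(
        r"=== Do not change lines below ===\n(.*)\n"
        r"\^\^\^ Do not change lines above \^\^\^",
        message, flags=re.DOTALL)
    if not body:
        raise ValueError("no run record found")
    try:
        info = json.loads(body.group(1))
    except ValueError:
        raise ValueError("run record is not valid JSON")
    if "cmd" not in info:
        raise ValueError("run record contains no command")
    return subject, info
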
def test_addurls(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(fname)

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta,
                              {"subdir": [subdir], "name": [fname]})

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
            action="addurls",
            status="notneeded")

        # Add to already existing links works, as long as the content is
        # the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open("a", "w") as ofh:
            ofh.write("changed")
        ds.add("a")
        assert_raises(IncompleteResultsError,
                      ds.addurls,
                      self.json_file, "{url}", "{name}")

def test_nested_metadata(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    # BIDS returns participant info as a nested dict for each file in the
    # content metadata. On the dataset-level this should automatically
    # yield a sequence of participant info dicts, without any further action
    # or BIDS-specific configuration
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    for i in zip(
            sorted(
                meta['datalad_unique_content_properties']['bids']['subject'],
                key=lambda x: x['id']),
            sorted([
                {
                    "age(years)": "20-25",
                    "id": "03",
                    "gender": "female",
                    "handedness": "r",
                    "hearing_problems_current": "n",
                    "language": "english"
                },
                {
                    "age(years)": "30-35",
                    "id": "01",
                    "gender": 'n/a',
                    "handedness": "r",
                    "hearing_problems_current": "n",
                    "language": u"русский"
                }], key=lambda x: x['id'])):
        assert_dict_equal(i[0], i[1])
    # we can turn off this kind of auto-summary
    ds.config.add('datalad.metadata.generate-unique-bids', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    meta = ds.metadata('.', reporton='datasets',
                       return_type='item-or-list')['metadata']
    # protect next test a little, in case we enhance our core extractor in the
    # future to provide more info
    if 'datalad_unique_content_properties' in meta:
        assert_not_in('bids', meta['datalad_unique_content_properties'])

def test_target_ssh_simple(origin, src_path, target_rootpath):
    # prepare src
    source = install(src_path, source=origin,
                     result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost:22",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(text_type(cm.exception).startswith(
        "Target path %s already exists. And it fails to rmdir" % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME. To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert len(metafiles) >= 1
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content
        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'), 'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        # on elderly git we don't change receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git behavior has changed a bit
        # and index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)

def test_target_ssh_simple(origin, src_path, target_rootpath):
    # prepare src
    source = install(src_path, source=origin)

    target_path = opj(target_rootpath, "basic")
    # it will try to fetch it so would fail as well since sshurl is wrong
    with swallow_logs(new_level=logging.ERROR) as cml, \
            assert_raises(GitCommandError):
        create_sibling(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
        # is not actually happening on one of the two basic cases -- TODO figure it out
        # assert_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    eq_("ssh://localhost", source.repo.get_remote_url("local_target"))

    # should NOT be able to push now, since url isn't correct:
    # TODO: assumption is wrong if ~ does have .git! fix up!
    assert_raises(GitCommandError, publish, dataset=source, to="local_target")

    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # for some reason this was "correct"
        # eq_(local_target_cfg('annex-ignore'), 'false')
        # but after fixing creating siblings in
        # 21f6dd012b2c7b9c0b8b348dcfb3b0ace7e8b2ec it started to fail
        # I think it is legit since we are trying to fetch now before calling
        # annex.enable_remote so it doesn't set it up, and fails before
        assert_raises(Exception, local_target_cfg, 'annex-ignore')
        # hm, but ATM wouldn't get a uuid since url is wrong
        assert_raises(Exception, local_target_cfg, 'annex-uuid')

    # do it again without force:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path)
    eq_("Target directory %s already exists." % target_path,
        str(cm.exception))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        ok_endswith(target_description, target_path)

    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost" + target_path,
            existing='replace')
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)

                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert len(metafiles) >= 1
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))

                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content
        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'), 'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        if external_versions['cmd:system-git'] >= '2.4':
            # on elderly git we don't change receive setting
            ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        assert_set_equal(modified_files, ok_modified_files)

def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return len(ds.repo.get_revisions("git-annex"))

    n_annex_commits = get_annex_commit_counts()

    # Meanwhile also test that we can specify path relative
    # to the top of the dataset, as we generally treat paths in
    # Python API, and it will be the one saved in commit
    # message record
    json_file = op.relpath(self.json_file, ds.path)

    ds.addurls(json_file, "{url}", "{name}",
               exclude_autometa="(md5sum|size)")
    ok_startswith(
        ds.repo.format_commit('%b', DEFAULT_BRANCH),
        f"url_file='{json_file}'")

    filenames = ["a", "b", "c"]
    for fname in filenames:
        ok_exists(op.join(ds.path, fname))

    for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                     ["foo", "bar", "foo"]):
        assert_dict_equal(meta,
                          {"subdir": [subdir], "name": [fname]})

    # Ignore this check if we're faking dates because that disables
    # batch mode.
    # Also ignore if on Windows as it seems as if a git-annex bug
    # leads to separate meta data commits:
    # https://github.com/datalad/datalad/pull/5202#discussion_r535429704
    if not (dl_cfg.get('datalad.fake-dates') or on_windows):
        # We should have two new commits on the git-annex: one for the
        # added urls and one for the added metadata.
        eq_(n_annex_commits + 2, get_annex_commit_counts())

    # Add to already existing links, overwriting.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.addurls(self.json_file, "{url}", "{name}",
                   ifexists="overwrite")
        for fname in filenames:
            assert_in("Removing {}".format(os.path.join(path, fname)),
                      cml.out)

    # Add to already existing links, skipping.
    assert_in_results(
        ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
        action="addurls",
        status="notneeded")

    # Add to already existing links works, as long as the content is the same.
    ds.addurls(self.json_file, "{url}", "{name}")

    # But it fails if something has changed.
    ds.unlock("a")
    with open(op.join(ds.path, "a"), "w") as ofh:
        ofh.write("changed")
    ds.save("a")
    assert_raises(IncompleteResultsError,
                  ds.addurls,
                  self.json_file, "{url}", "{name}")

def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        op.join(op.dirname(op.dirname(op.dirname(__file__))),
                'tests', 'data', 'files', 'dicom.dcm'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert len(meta.keys()) > 70
    eq_(meta['SeriesDate'], '20070205')
    # Actually a tricky one of the dcm.multival.MultiValue type
    # which we should extract as a list
    # https://github.com/datalad/datalad-neuroimaging/issues/49
    eq_(meta['ImageType'], ['ORIGINAL', 'PRIMARY', 'EPI', 'NONE'])
    # make sure we have PatientName -- this is not using a basic data type, but
    # dicom.valuerep.PersonName3 -- conversion should have handled that
    # we can only test if the key is there, the source dicom has an empty
    # string as value
    eq_(meta['PatientName'], '')
    # now ask for the dataset metadata, which should have both the unique props
    # and a list of imageseries (one in this case, but a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    seriesmeta = dsmeta['Series']
    eq_(seriesmeta[0].pop('SeriesDirectory'), op.curdir)
    eq_(dsmeta['Series'], [meta])
    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})
    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')
    if not datalad_extracts_annex_key:
        # the auto-uniquified bits are gone but the Series description stays
        assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])

def test_basic_dsmeta(path):
    ds = Dataset(path).create()
    ok_clean_git(path)
    # ensure clean slate
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    _assert_metadata_empty(res[0]['metadata'])
    # init
    res = ds.metadata(init=['tag1', 'tag2'], apply2global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # init again does nothing
    res = ds.metadata(init=['tag3'], apply2global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # reset whole key
    ds.metadata(reset=['tag'], apply2global=True)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    _assert_metadata_empty(res[0]['metadata'])
    # add something arbitrary
    res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']),
                      apply2global=True, on_failure='ignore')
    # fails due to unknown keys
    assert_status('error', res)
    res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']),
                      define_key=dict(dtype='is_a_datatype',
                                      readme='is_readme_content'),
                      apply2global=True)
    eq_(res[0]['metadata']['dtype'], 'heavy')
    # sorted!
    eq_(res[0]['metadata']['readme'], ['long', 'short'])
    # check it reports common keys
    with swallow_outputs() as cmo:
        ds.metadata(show_keys=True)
        assert_in('license', cmo.out)
    # supply key definitions, no need for apply2global
    res = ds.metadata(define_key=dict(mykey='truth'))
    eq_(res[0]['metadata']['definition']['mykey'], u'truth')
    with swallow_outputs() as cmo:
        ds.metadata(show_keys=True)
        assert_in('mykey: truth (dataset: {})'.format(ds.path), cmo.out)
    # re-supply different key definitions -> error
    res = ds.metadata(define_key=dict(mykey='lie'), on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error',
        message=("conflicting definition for key '%s': '%s' != '%s'",
                 "mykey", "lie", "truth"))
    res = ds.metadata(define_key=dict(otherkey='altfact'))
    eq_(res[0]['metadata']['definition']['otherkey'], 'altfact')
    # 'definition' is a regular key, we can remove items
    res = ds.metadata(remove=dict(definition=['mykey']), apply2global=True)
    assert_dict_equal(
        res[0]['metadata']['definition'],
        {'otherkey': u'altfact',
         'readme': u'is_readme_content',
         'dtype': u'is_a_datatype'})
    res = ds.metadata(remove=dict(definition=['otherkey', 'readme', 'dtype']),
                      apply2global=True)
    # when there are no items left, the key vanishes too
    assert 'definition' not in res[0]['metadata']
    # we still have metadata, so there is a DB file
    assert res[0]['metadata']
    db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
    assert exists(db_path)
    ok_clean_git(ds.path)
    # but if we remove it, the file is gone
    res = ds.metadata(reset=['readme', 'dtype'], apply2global=True)
    eq_(res[0]['metadata'], {})
    assert not exists(db_path)
    ok_clean_git(ds.path)

def test_custom_native_merge(path):
    ds = Dataset(path).create(force=True)
    # no metadata, because nothing is committed
    _assert_metadata_empty(
        ds.metadata(
            reporton='datasets',
            result_xfm='metadata',
            return_type='item-or-list'))
    # enable BIDS metadata, BIDS metadata should become THE metadata
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.aggregate_metadata()
    # no metadata, because still nothing is committed
    _assert_metadata_empty(
        ds.metadata(
            reporton='datasets',
            result_xfm='metadata',
            return_type='item-or-list'))
    ds.add('.')
    ds.aggregate_metadata()
    meta = ds.metadata(
        reporton='datasets',
        result_xfm='metadata',
        return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'myds', 'author': ['one', 'two']}, meta)
    # now give the ds a custom name, must override the native one
    # but authors still come from BIDS
    ds.metadata(apply2global=True, add=dict(name='mycustom'))
    meta = ds.metadata(
        reporton='datasets',
        result_xfm='metadata',
        return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom', 'author': ['one', 'two']}, meta)
    # we can disable the merge
    meta = ds.metadata(
        reporton='datasets', merge_native='none',
        result_xfm='metadata', return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom'}, meta)
    # we can accumulate values
    meta = ds.metadata(
        reporton='datasets', merge_native='add',
        result_xfm='metadata', return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': ['mycustom', 'myds'],
                       'author': ['one', 'two']},
                      meta)
    # we can have native override custom (not sure when needed, though)
    # add one more custom to make visible
    ds.metadata(apply2global=True, init=dict(homepage='fresh'))
    meta = ds.metadata(
        reporton='datasets', merge_native='reset',
        result_xfm='metadata', return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'myds',
                       'author': ['one', 'two'],
                       'homepage': u'fresh'},
                      meta)
    # enable an additional metadata source
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    # we need to reaggregate after the config change
    ds.aggregate_metadata(merge_native='add')
    meta = ds.metadata(
        reporton='datasets', merge_native='add',
        result_xfm='metadata', return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': ['mycustom', 'myds', 'someother'],
                       'author': ['one', 'two'],
                       'homepage': u'fresh'},
                      meta)
