def test_create_1test_dataset():
    # and just a single dataset
    from datalad.api import create_test_dataset
    with swallow_outputs():
        dss = create_test_dataset()
    eq_(len(dss), 1)
    ok_clean_git(dss[0], annex=False)
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
def test_symlinked_relpath(path):
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    os.makedirs(opj(path, "origin"))
    dspath = opj(path, "linked")
    os.symlink('origin', dspath)
    ds = Dataset(dspath).create()
    create_tree(dspath, {
        "mike1": 'mike1',  # will be added from topdir
        "later": "later",  # later from within subdir
        "d": {
            "mike2": 'mike2',  # to be added within subdir
        }
    })

    # in the root of ds
    with chpwd(dspath):
        ds.repo.add("mike1", git=True)
        ds._save("committing", path="./mike1")

    # Let's also do in subdirectory
    with chpwd(opj(dspath, 'd')):
        ds.repo.add("mike2", git=True)
        ds._save("committing", path="./mike2")

        later = opj(pardir, "later")
        ds.repo.add(later, git=True)
        ds._save("committing", path=later)

    ok_clean_git(dspath)
def test_kill(path):
    # nested datasets with load
    ds = Dataset(path).create()
    testfile = opj(ds.path, "file.dat")
    with open(testfile, 'w') as f:
        f.write("load")
    ds.save("file.dat")
    subds = ds.create('deep1')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['deep1'])
    ok_clean_git(ds.path)

    # and we fail to remove since content can't be dropped
    res = ds.remove(on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error', path=testfile)
    # Following two assertions on message are relying on the actual error.
    # We have a second result with status 'impossible' for the ds, that we need
    # to filter out for those assertions:
    err_result = [r for r in res if r['status'] == 'error'][0]
    assert_result_values_cond(
        [err_result], 'message',
        lambda x: "configured minimum number of copies not found" in x or
                  "Could only verify the existence of 0 out of 1 necessary copies" in x
    )
    eq_(ds.remove(recursive=True, check=False, result_xfm='datasets'),
        [subds, ds])
    ok_(not exists(path))
def test_uninstall_without_super(path):
    # a parent dataset with a proper subdataset, and another dataset that
    # is just placed underneath the parent, but not an actual subdataset
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    nosub = create(opj(parent.path, 'nosub'))
    ok_clean_git(nosub.path)
    subreport = parent.subdatasets()
    assert_result_count(subreport, 1, path=sub.path)
    assert_result_count(subreport, 0, path=nosub.path)
    # it should be possible to uninstall the proper subdataset, even without
    # explicitly calling the uninstall methods of the parent -- things should
    # be figured out by datalad
    uninstall(sub.path)
    assert not sub.is_installed()
    # no present subdatasets anymore
    subreport = parent.subdatasets()
    assert_result_count(subreport, 1)
    assert_result_count(subreport, 1, path=sub.path, state='absent')
    assert_result_count(subreport, 0, path=nosub.path)
    # but we should fail on an attempt to uninstall the non-subdataset
    res = uninstall(nosub.path, on_failure='ignore')
    assert_result_count(
        res, 1, path=nosub.path, status='error',
        message="will not uninstall top-level dataset (consider `remove` command)")
def test_remove_file_handle_only(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ok_clean_git(ds.path)
    # make sure there is any key
    ok_(len(ds.repo.get_file_key('one')))
    # both files link to the same key
    eq_(ds.repo.get_file_key('one'),
        ds.repo.get_file_key('two'))
    rpath_one = realpath(opj(ds.path, 'one'))
    eq_(rpath_one, realpath(opj(ds.path, 'two')))
    path_two = opj(ds.path, 'two')
    ok_(exists(path_two))
    # remove one handle, should not affect the other
    ds.remove('two', check=False, message="custom msg")
    eq_(ds.repo.repo.head.commit.message.rstrip(), "custom msg")
    eq_(rpath_one, realpath(opj(ds.path, 'one')))
    ok_(exists(rpath_one))
    ok_(not exists(path_two))
    # remove file without specifying the dataset -- shouldn't fail
    with chpwd(path):
        remove('one', check=False)
        ok_(not exists("one"))
    # and we should be able to remove without saving
    ds.remove('three', check=False, save=False)
    ok_(ds.repo.dirty)
def test_drop_nocrash_absent_subds(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    parent.uninstall('sub')
    ok_clean_git(parent.path)
    with chpwd(path):
        assert_status('notneeded', drop('.', recursive=True))
def test_install_recursive_repeat(src, path):
    subsub_src = Dataset(opj(src, 'sub 1', 'subsub')).create(force=True)
    sub1_src = Dataset(opj(src, 'sub 1')).create(force=True)
    sub2_src = Dataset(opj(src, 'sub 2')).create(force=True)
    top_src = Dataset(src).create(force=True)
    top_src.add('.', recursive=True)
    ok_clean_git(top_src.path)

    # install top level:
    top_ds = install(path, source=src)
    ok_(top_ds.is_installed() is True)
    sub1 = Dataset(opj(path, 'sub 1'))
    ok_(sub1.is_installed() is False)
    sub2 = Dataset(opj(path, 'sub 2'))
    ok_(sub2.is_installed() is False)
    subsub = Dataset(opj(path, 'sub 1', 'subsub'))
    ok_(subsub.is_installed() is False)

    # install again, now with data and recursive, but recursion_limit 1:
    result = get(os.curdir, dataset=path, recursive=True, recursion_limit=1,
                 result_xfm='datasets')
    # top-level dataset was not reobtained
    assert_not_in(top_ds, result)
    assert_in(sub1, result)
    assert_in(sub2, result)
    assert_not_in(subsub, result)
    ok_(top_ds.repo.file_has_content('top_file.txt') is True)
    ok_(sub1.repo.file_has_content('sub1file.txt') is True)
    ok_(sub2.repo.file_has_content('sub2file.txt') is True)

    # install sub1 again, recursively and with data
    top_ds.install('sub 1', recursive=True, get_data=True)
    ok_(subsub.is_installed())
    ok_(subsub.repo.file_has_content('subsubfile.txt'))
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # test file has it, but uniques have it blanked out, because the extractor
    # considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # 'date' field carries no value, hence gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])
def check_renamed_file(recursive, no_annex, path):
    ds = Dataset(path).create(no_annex=no_annex)
    create_tree(path, {'old': ''})
    ds.add('old')
    ds.repo._git_custom_command(['old', 'new'], ['git', 'mv'])
    ds._save(recursive=recursive)
    ok_clean_git(path)
def test_publish_aggregated(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    base.create('sub', force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # create sibling and publish to it
    spath = opj(path, 'remote')
    base.create_sibling(
        name="local_target",
        sshurl="ssh://localhost",
        target_dir=spath)
    base.publish('.', to='local_target', transfer_data='all')
    remote = Dataset(spath)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # all object files are present in both datasets
    eq_(all(base.repo.file_has_content(objs)), True)
    eq_(all(remote.repo.file_has_content(objs)), True)
    # and we can squeeze the same metadata out
    eq_(
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in base.metadata('sub')],
        [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')}
         for i in remote.metadata('sub')],
    )
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    # no files tracked yet, so nothing changed
    eq_(repo.get_changed_files(), [])
    repo.add('.')
    # still no differences between worktree and staged
    eq_(repo.get_changed_files(), [])
    eq_(set(repo.get_changed_files(staged=True)),
        {'test1', op.join('deep', 'test2')})
    eq_(set(repo.get_changed_files(staged=True, diff_filter='AD')),
        {'test1', op.join('deep', 'test2')})
    eq_(repo.get_changed_files(staged=True, diff_filter='D'), [])
    repo.commit()
    eq_(repo.get_changed_files(), [])
    eq_(repo.get_changed_files(staged=True), [])
    ok_clean_git(path, annex=False)
    unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)
def test_GitRepo_add(src, path):
    gr = GitRepo.clone(src, path)
    filename = get_most_obscure_supported_name()
    with open(op.join(path, filename), 'w') as f:
        f.write("File to add to git")
    added = gr.add(filename)

    eq_(added, {'success': True, 'file': filename})
    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    # uncommitted:
    ok_(gr.dirty)

    filename = "another.txt"
    with open(op.join(path, filename), 'w') as f:
        f.write("Another file to add to git")

    # include committing:
    added2 = gr.add(filename)
    gr.commit(msg="Add two files.")
    eq_(added2, {'success': True, 'file': filename})

    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))

    ok_clean_git(path)
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]),
              os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path,
                                        repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
def test_GitRepo_ssh_pull(remote_path, repo_path):
    from datalad import ssh_manager

    remote_repo = GitRepo(remote_path, create=True)
    url = "ssh://localhost" + op.abspath(remote_path)
    socket_path = op.join(ssh_manager.socket_dir,
                          get_connection_hash('localhost'))
    repo = GitRepo(repo_path, create=True)
    repo.add_remote("ssh-remote", url)

    # modify remote:
    remote_repo.checkout("ssh-test", ['-b'])
    with open(op.join(remote_repo.path, "ssh_testfile.dat"), "w") as f:
        f.write("whatever")
    remote_repo.add("ssh_testfile.dat")
    remote_repo.commit("ssh_testfile.dat added.")

    # file is not locally known yet:
    assert_not_in("ssh_testfile.dat", repo.get_indexed_files())

    # pull changes:
    repo.pull(remote="ssh-remote", refspec=remote_repo.get_active_branch())
    ok_clean_git(repo.path, annex=False)

    # the connection is known to the SSH manager, since fetch() requested it:
    assert_in(socket_path, ssh_manager._connections)
    # and socket was created:
    ok_(op.exists(socket_path))

    # we actually pulled the changes
    assert_in("ssh_testfile.dat", repo.get_indexed_files())
def test_GitRepo_fetch(test_path, orig_path, clone_path):
    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    origin.checkout("new_branch", ['-b'])
    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")

    fetched = clone.fetch(remote='origin')
    # test FetchInfo list returned by fetch
    eq_([u'origin/' + clone.get_active_branch(), u'origin/new_branch'],
        [commit.name for commit in fetched])

    ok_clean_git(clone.path, annex=False)
    assert_in("origin/new_branch", clone.get_remote_branches())
    assert_in(filename, clone.get_files("origin/new_branch"))
    assert_false(op.exists(op.join(clone_path, filename)))  # not checked out

    # create a remote without an URL:
    origin.add_remote('not-available', 'git://example.com/not/existing')
    origin.config.unset('remote.not-available.url', where='local')

    # fetch without provided URL
    fetched = origin.fetch('not-available')
    # nothing was done, nothing returned:
    eq_([], fetched)
def test_remove_more_than_one(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ok_clean_git(path)
    # ensure #1912 stays resolved
    ds.remove(['one', 'two'], check=False)
    ok_clean_git(path)
def test_publish_gh1691(origin, src_path, dst_path):
    # prepare src; no subdatasets installed, but mount points present
    source = install(src_path, source=origin, recursive=False)
    ok_(exists(opj(src_path, "subm 1")))
    assert_false(Dataset(opj(src_path, "subm 1")).is_installed())

    # some content modification of the superdataset
    create_tree(src_path, {'probe1': 'probe1'})
    source.add('probe1')
    ok_clean_git(src_path)

    # create the target(s):
    source.create_sibling(
        'ssh://localhost:' + dst_path,
        name='target', recursive=True)

    # publish recursively, which silently ignores non-installed datasets
    results = source.publish(to='target', recursive=True)
    assert_result_count(results, 1)
    assert_result_count(results, 1, status='ok', type='dataset', path=source.path)

    # if, however, a non-installed subdataset is requested explicitly, it'll fail
    results = source.publish(path='subm 1', to='target', on_failure='ignore')
    assert_result_count(results, 1, status='impossible', type='dataset', action='publish')
def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.add_readme())
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(id=ds.id))

    # should skip on re-run
    assert_status('notneeded', ds.add_readme())
def test_dirty(path):
    for mode in _dirty_modes:
        # does nothing without a dataset
        handle_dirty_dataset(None, mode)
    # placeholder, but not yet created
    ds = Dataset(path)
    # unknown mode
    assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP')
    # not yet created is very dirty
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before')
    # should yield a clean repo
    ds.create()
    orig_state = ds.repo.get_hexsha()
    _check_all_clean(ds, orig_state)
    # tainted: untracked
    with open(opj(ds.path, 'something'), 'w') as f:
        f.write('some')
    # we don't want to auto-add untracked files by saving (anymore)
    assert_raises(AssertionError, _check_auto_save, ds, orig_state)
    # tainted: staged
    ds.repo.add('something', git=True)
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: submodule
    # not added to super on purpose!
    subds = ds.create('subds')
    _check_all_clean(subds, subds.repo.get_hexsha())
    ok_clean_git(ds.path)
    # subdataset must be added as a submodule!
    assert_equal(ds.subdatasets(result_xfm='relpaths'), ['subds'])
def test_add_subdataset(path, other):
    subds = create(opj(path, 'dir'), force=True)
    ds = create(path, force=True)
    ok_(subds.repo.dirty)
    ok_(ds.repo.dirty)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # without a base dataset the next is interpreted as "add everything
    # in subds to subds"
    add(subds.path)
    ok_clean_git(subds.path)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # but with a base directory we add the dataset subds as a subdataset
    # to ds
    ds.add(subds.path)
    assert_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # create another one
    other = create(other)
    # install into superdataset, but don't add
    other_clone = install(source=other.path, path=opj(ds.path, 'other'))
    ok_(other_clone.is_installed())
    assert_not_in('other', ds.subdatasets(result_xfm='relpaths'))
    # now add, it should pick up the source URL
    ds.add('other')
    # and that is why, we can reobtain it from origin
    ds.uninstall('other')
    ok_(not other_clone.is_installed())
    ds.get('other')
    ok_(other_clone.is_installed())
def test_new_relpath(topdir):
    from datalad.api import create_test_dataset
    with swallow_logs(), chpwd(topdir), swallow_outputs():
        dss = create_test_dataset('testds', spec='1')
    eq_(dss[0], opj(topdir, 'testds'))
    eq_(len(dss), 2)  # 1 top + 1 sub-dataset as demanded
    for ds in dss:
        ok_clean_git(ds, annex=False)
def test_install_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = install(source=url)

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_clean_git(ds.path, annex=False)
def test_bf2541(path):
    ds = create(path)
    subds = ds.create('sub')
    ok_clean_git(ds.path)
    os.symlink('sub', op.join(ds.path, 'symlink'))
    with chpwd(ds.path):
        res = add('.', recursive=True)
    ok_clean_git(ds.path)
def test_bf2043p2(path):
    ds = Dataset(path).create(force=True)
    ds.add('staged', save=False)
    ok_clean_git(ds.path, head_modified=['staged'], untracked=['untracked'])
    # plain save does not commit untracked content
    # this tests the second issue in #2043
    with chpwd(path):
        save()
    ok_clean_git(ds.path, untracked=['untracked'])
def test_install_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = install(source=url)

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    ok_clean_git(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
def test_clone_dataset_from_just_source(url, path):
    with chpwd(path, mkdir=True):
        ds = clone(url, result_xfm='datasets', return_type='item-or-list')

    ok_startswith(ds.path, path)
    ok_(ds.is_installed())
    ok_(GitRepo.is_valid_repo(ds.path))
    ok_clean_git(ds.path, annex=None)
    assert_in('INFO.txt', ds.repo.get_indexed_files())
def test_create_test_dataset():
    # rudimentary smoke test
    from datalad.api import create_test_dataset
    with swallow_logs(), swallow_outputs():
        dss = create_test_dataset(spec='2/1-2')
    ok_(5 <= len(dss) <= 7)  # at least five - 1 top, two on top level, 1 in each
    for ds in dss:
        ok_clean_git(ds, annex=None)  # some of them are annex but we just don't check
        ok_(len(glob(opj(ds, 'file*'))))
def test_GitRepo_add_submodule(source, path):
    top_repo = GitRepo(path, create=True)

    top_repo.add_submodule('sub', name='sub', url=source)
    top_repo.commit('submodule added')
    eq_([s.name for s in top_repo.get_submodules()], ['sub'])
    ok_clean_git(path)
    ok_clean_git(op.join(path, 'sub'))
def test_save(path):
    ds = Dataset(path)

    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds.save("add a new file", all_changes=False)
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds.save("modified new_file.tst", all_changes=True)
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(opj(path, fn), "w") as f:
            f.write(fn)

    ds.add([opj(path, f) for f in files])
    # superfluous call to save (add saved it already), should not fail
    # but report that nothing was saved
    assert_false(ds.save("set of new files"))
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(opj(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.add('.')
    ok_clean_git(subds.path, annex=isinstance(ds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # ensure modified subds is committed
    ds.save(all_changes=True)
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
def test_bf1886(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    ok_clean_git(parent.path)
    # create a symlink pointing down to the subdataset, and add it
    os.symlink('sub', opj(parent.path, 'down'))
    parent.add('down')
    ok_clean_git(parent.path)
    # now symlink pointing up
    os.makedirs(opj(parent.path, 'subdir', 'subsubdir'))
    os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up'))
    parent.add(opj('subdir', 'up'))
    ok_clean_git(parent.path)
    # now symlink pointing 2xup, as in #1886
    os.symlink(opj(pardir, pardir, 'sub'),
               opj(parent.path, 'subdir', 'subsubdir', 'upup'))
    parent.add(opj('subdir', 'subsubdir', 'upup'))
    ok_clean_git(parent.path)
    # simultaneously add a subds and a symlink pointing to it
    # create subds, but don't register it
    sub2 = create(opj(parent.path, 'sub2'))
    os.symlink(
        opj(pardir, pardir, 'sub2'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup2'))
    parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')])
    ok_clean_git(parent.path)
    # full replication of #1886: the above but be in subdir of symlink
    # with no reference dataset
    sub3 = create(opj(parent.path, 'sub3'))
    os.symlink(
        opj(pardir, pardir, 'sub3'),
        opj(parent.path, 'subdir', 'subsubdir', 'upup3'))
    # need to use absolute paths
    with chpwd(opj(parent.path, 'subdir', 'subsubdir')):
        add([opj(parent.path, 'sub3'),
             opj(parent.path, 'subdir', 'subsubdir', 'upup3')])
    # here is where we need to disagree with the repo in #1886
    # we would not expect that `add` registers sub3 as a subdataset
    # of parent, because no reference dataset was given and the
    # command cannot decide (with the current semantics) whether
    # it should "add anything in sub3 to sub3" or "add sub3 to whatever
    # sub3 is in"
    ok_clean_git(parent.path, untracked=['sub3/'])
def test_procedure_discovery(path, super_path):
    ps = run_procedure(discover=True)
    # there are a few procedures coming with datalad, needs to find them
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))

    # set up dataset with registered procedure (c&p from test_basics):
    ds = Dataset(path).create(force=True)
    # TODO: this procedure would leave a clean dataset, but `run` cannot handle dirty
    # input yet, so manual for now
    ds.add('code', to_git=True)
    ds.run_procedure('setup_yoda_dataset')
    ok_clean_git(ds.path)
    # configure dataset to look for procedures in its code folder
    ds.config.add(
        'datalad.locations.dataset-procedures',
        'code',
        where='dataset')
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    ds.add(op.join('.datalad', 'config'))

    # run discovery on the dataset:
    ps = ds.run_procedure(discover=True)

    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps, path=op.join(ds.path, 'code', 'datalad_test_proc.py'))

    # make it a subdataset and try again:
    super = Dataset(super_path).create()
    super.install('sub', source=ds.path)

    ps = super.run_procedure(discover=True)
    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(
        ps, path=op.join(super.path, 'sub', 'code', 'datalad_test_proc.py'))

    if not on_windows:  # no symlinks
        import os
        # create a procedure which is a broken symlink, but recognizable as a
        # python script:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'broken_link_proc.py'))
        # broken symlink at procedure location, but we can't tell, whether it is
        # an actual procedure without any guess on how to execute it:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'unknwon_broken_link'))
        ps = super.run_procedure(discover=True)
        # still needs to find procedures coming with datalad and the dataset
        # procedure registered before
        assert_true(len(ps) > 3)
        assert_in_results(
            ps,
            path=op.join(super.path, 'sub', 'code', 'broken_link_proc.py'),
            state='absent')
        assert_not_in_results(
            ps,
            path=op.join(super.path, 'sub', 'code', 'unknwon_broken_link'))
def test_uninstall_recursive(path):
    ds = Dataset(path).create(force=True)
    subds = ds.create('deep', force=True)
    # we add one file
    eq_(len(subds.add('.')), 1)
    # save all -> all clean
    ds.save(all_changes=True, recursive=True)
    ok_clean_git(subds.path)
    ok_clean_git(ds.path)
    # now uninstall in subdataset through superdataset
    target_fname = opj('deep', 'dir', 'test')
    # sane starting point
    ok_(exists(opj(ds.path, target_fname)))
    # doesn't have the minimum number of copies for a safe drop
    # TODO: better exception
    assert_raises(CommandError, ds.drop, target_fname, recursive=True)
    # this should do it
    ds.drop(target_fname, check=False, recursive=True)
    # link is dead
    lname = opj(ds.path, target_fname)
    ok_(not exists(lname))
    # entire hierarchy saved
    ok_clean_git(subds.path)
    ok_clean_git(ds.path)
    # now same with actual handle removal
    # content is dropped already, so no checks in place anyway
    ds.remove(target_fname, check=True, recursive=True)
    ok_(not exists(lname) and not lexists(lname))
    ok_clean_git(subds.path)
    ok_clean_git(ds.path)
def test_get_modified_subpaths(path):
    ds = Dataset(path).create(force=True)
    suba = ds.create('ba', force=True)
    subb = ds.create('bb', force=True)
    subsub = ds.create(opj('bb', 'bba', 'bbaa'), force=True)
    ds.add('.', recursive=True)
    ok_clean_git(path)

    orig_base_commit = ds.repo.repo.commit().hexsha

    # nothing was modified compared to the status quo, output must be empty
    eq_([],
        list(get_modified_subpaths(
            [dict(path=ds.path)],
            ds, orig_base_commit)))

    # modify one subdataset
    create_tree(subsub.path, {'added': 'test'})
    subsub.add('added')

    # it will replace the requested path with the path of the closest
    # submodule that is modified
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, orig_base_commit),
        1,
        type='dataset', path=subb.path)

    # make another one dirty
    create_tree(suba.path, {'added': 'test'})

    # now a single query path will result in the two modified subdatasets
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, orig_base_commit),
        2,
        type='dataset')

    # now save up top, this will save the new state of subb, but keep suba dirty
    ds.save(subb.path, recursive=True)
    # now if we ask for what was last saved, we only get the new state of subb
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, 'HEAD~1..HEAD'),
        1,
        type='dataset', path=subb.path)
    # comparing the working tree to HEAD will report the dirty suba instead
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, 'HEAD'),
        1,
        type='dataset', path=suba.path)

    # add/save everything, become clean
    ds.add('.', recursive=True)
    ok_clean_git(path)
    # nothing is reported as modified
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, 'HEAD'),
        0)
    # but looking all the way back, we find all changes
    assert_result_count(
        get_modified_subpaths(
            [dict(path=ds.path)],
            ds, orig_base_commit),
        2,
        type='dataset')

    # now we ask specifically for the file we added to subsub above
    query = [dict(path=opj(subsub.path, 'added'))]
    res = list(get_modified_subpaths(query, ds, orig_base_commit))
    # we only get this one result back, and not all the submodule state changes
    # that were also saved in the superdatasets
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, type='file', path=opj(subsub.path, 'added'), state='added')
    # but if we are only looking at the last saved change (suba), we will not
    # find our query return something
    res = get_modified_subpaths(query, ds, 'HEAD^')
    assert_result_count(res, 0)

    # deal with removal (force insufficient copies error)
    ds.remove(suba.path, check=False)
    ok_clean_git(path)
    res = list(get_modified_subpaths([dict(path=ds.path)], ds, 'HEAD~1..HEAD'))
    # removed submodule + .gitmodules update
    assert_result_count(res, 2)
    assert_result_count(
        res, 1,
        type_src='dataset', path=suba.path)
def test_annotate_paths(dspath, nodspath):
    # this test doesn't use API `remove` to avoid circularities
    ds = make_demo_hierarchy_datasets(dspath, demo_hierarchy)
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)

    with chpwd(dspath):
        # with and without an explicitly given path the result is almost the
        # same inside a dataset
        without_path = annotate_paths(on_failure='ignore')
        pwd_res = annotate_paths(path='.', on_failure='ignore')
        assert_result_count(
            without_path, 1, type='dataset', path=dspath)
        assert_result_count(
            pwd_res, 1, type='dataset', path=dspath, orig_request='.',
            raw_input=True)
        # make sure going into a subdataset vs giving it as a path has no
        # structural impact
        eq_(
            [{k: v for k, v in ap.items()
              if k not in ('registered_subds', 'raw_input', 'orig_request', 'refds')}
             for ap in annotate_paths(path='b', recursive=True)],
            [{k: v for k, v in ap.items()
              if k not in ('registered_subds', 'raw_input', 'orig_request', 'refds')}
             for ap in annotate_paths(dataset='b', recursive=True)])

    # now do it again, pointing to the ds directly
    res = ds.annotate_paths(on_failure='ignore')
    # no request, no refds, but otherwise the same
    eq_(len(res), len(pwd_res))
    eq_({k: pwd_res[0][k] for k in pwd_res[0]
         if k in ('path', 'type', 'action', 'status')},
        {k: res[0][k] for k in res[0]
         if k not in ('refds',)})

    # will refuse a path that is not a dataset as refds
    res = annotate_paths(dataset=nodspath, on_failure='ignore')
    assert_result_count(
        res, 1, status='error', path=nodspath,
        message='given reference dataset is not a dataset')

    # recursion with proper base dataset
    parentds = Dataset(opj(dspath, 'a'))
    base_res = parentds.annotate_paths(recursive=True)
    # needs to find 'aa' and the base
    assert_result_count(base_res, 2)
    assert_result_count(base_res, 2, type='dataset')
    assert_result_count(
        base_res, 1, type='dataset', state='clean', parentds=parentds.path,
        path=opj(parentds.path, 'aa'), status='')
    # same recursion but without a base dataset
    res = annotate_paths(path=opj(dspath, 'a'), recursive=True)
    # needs to find 'aa' and 'a' again
    assert_result_count(res, 2)
    eq_(res[-1],
        {k: base_res[-1][k] for k in base_res[-1]
         if k not in ('refds',)})
    assert_result_count(
        res, 1, type='dataset', status='',
        # it does not auto-discover parent datasets without force or a refds
        #parentds=parentds.path,
        path=parentds.path)
    # but we can force parent discovery
    res = parentds.annotate_paths(
        path=opj(dspath, 'a'), recursive=True,
        force_parentds_discovery=True)
    assert_result_count(res, 2)
    assert_result_count(
        res, 1, type='dataset', status='', parentds=dspath,
        path=parentds.path)

    # recursion with multiple disjoint seeds, no common base
    eq_([basename(p) for p in annotate_paths(
            path=[opj(dspath, 'a'), opj(dspath, 'b', 'bb', 'bba')],
            recursive=True, result_xfm='paths')],
        ['a', 'aa', 'bba', 'bbaa'])

    # recursion with partially overlapping seeds, no duplicate results
    eq_([basename(p) for p in annotate_paths(
            path=[opj(dspath, 'b'), opj(dspath, 'b', 'bb', 'bba')],
            recursive=True, result_xfm='paths')],
        ['b', 'ba', 'bb', 'bba', 'bbaa'])

    # get straight from a file
    fpath = opj('a', 'aa', 'file_aa')
    res = ds.annotate_paths(fpath)
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, orig_request=fpath, raw_input=True, type='file',
        path=opj(ds.path, fpath), parentds=opj(ds.path, 'a', 'aa'), status='')
    # now drop it
    dropres = ds.drop(fpath, check=False)
    assert_result_count(dropres, 1, path=res[0]['path'], status='ok')
    # ask for same file again, use 'notneeded' for unavailable to try trigger
    # any difference
    droppedres = ds.annotate_paths(fpath, unavailable_path_status='notneeded')
    # but we get the same result
    eq_(res, droppedres)

    # now try the same on an uninstalled dataset
    subdspath = opj('b', 'bb')
    # before
    before_res = ds.annotate_paths(subdspath, recursive=True,
                                   unavailable_path_status='error')
    assert_result_count(before_res, 3, status='', type='dataset')
    uninstall_res = ds.uninstall(subdspath, recursive=True, check=False)
    assert_result_count(uninstall_res, 3, status='ok', type='dataset')
    # after
    after_res = ds.annotate_paths(subdspath,
                                  unavailable_path_status='error',
                                  on_failure='ignore')
    # uninstall hides all low-level datasets
    assert_result_count(after_res, 1)
    # but for the top-most uninstalled one it merely reports absent state now
    assert_result_count(
        after_res, 1, state='absent',
        **{k: before_res[0][k] for k in before_res[0]
           if k not in ('state', 'status')})
    # however, this beauty doesn't come for free, so it can be disabled
    # which will make the uninstalled subdataset like a directory in the
    # parent (or even just a non-existing path, if the mountpoint dir isn't
    # present)
    after_res = ds.annotate_paths(subdspath, force_subds_discovery=False)
    assert_result_count(
        after_res, 1, type='directory',
        path=before_res[0]['path'],
        parentds=before_res[0]['parentds'])
    # feed annotated paths into annotate_paths, it shouldn't change things
    # upon second run
    # datasets and file
    res = ds.annotate_paths(['.', fpath], recursive=True)
    # make a copy, just to be sure
    orig_res = deepcopy(res)
    assert_result_count(res, 7)
    # and in again, no recursion this time
    res_again = ds.annotate_paths(res)
    # doesn't change a thing
    eq_(orig_res, res_again)
    # and in again, with recursion this time
    res_recursion_again = ds.annotate_paths(res, recursive=True)
    assert_result_count(res_recursion_again, 7)
    # doesn't change a thing
    eq_(orig_res, res_recursion_again)
def test_publish_simple(origin, src_path, dst_path):
    # prepare src
    source = install(path=src_path, source=origin, recursive=True)
    # TODO: For now, circumnavigate the detached head issue.
    # Figure out, what to do.
    for subds in source.get_dataset_handles(recursive=True):
        AnnexRepo(opj(src_path, subds), init=True, create=True).git_checkout("master")
    # forget we cloned it (provide no 'origin' anymore), which should lead to
    # setting tracking branch to target:
    source.repo.git_remote_remove("origin")

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.git_checkout("TMP", "-b")
    source.repo.git_remote_add("target", dst_path)

    res = publish(dataset=source, dest="target")
    eq_(res, source)

    ok_clean_git(src_path, annex=False)
    ok_clean_git(dst_path, annex=False)
    eq_(list(target.git_get_branch_commits("master")),
        list(source.repo.git_get_branch_commits("master")))

    # don't fail when doing it again
    res = publish(dataset=source, dest="target")
    eq_(res, source)

    ok_clean_git(src_path, annex=False)
    ok_clean_git(dst_path, annex=False)
    eq_(list(target.git_get_branch_commits("master")),
        list(source.repo.git_get_branch_commits("master")))
    eq_(list(target.git_get_branch_commits("git-annex")),
        list(source.repo.git_get_branch_commits("git-annex")))

    # 'target/master' should be tracking branch at this point, so
    # try publishing without `dest`:

    # some modification:
    with open(opj(src_path, 'test_mod_file'), "w") as f:
        f.write("Some additional stuff.")
    source.repo.git_add(opj(src_path, 'test_mod_file'))
    source.repo.git_commit("Modified.")
    ok_clean_git(src_path, annex=False)

    res = publish(dataset=source)
    eq_(res, source)

    ok_clean_git(dst_path, annex=False)
    eq_(list(target.git_get_branch_commits("master")),
        list(source.repo.git_get_branch_commits("master")))
    eq_(list(target.git_get_branch_commits("git-annex")),
        list(source.repo.git_get_branch_commits("git-annex")))
def test_diff_recursive(path):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    # look at the last change, and confirm a dataset was added
    res = ds.diff(revision='HEAD~1..HEAD')
    assert_result_count(
        res, 1, action='diff', state='added', path=sub.path, type='dataset')
    # now recursive
    res = ds.diff(recursive=True, revision='HEAD~1..HEAD')
    # we also get the entire diff of the subdataset from scratch
    assert_status('ok', res)
    ok_(len(res) > 3)
    # one specific test
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(sub.path, '.datalad', 'config'))

    # now we add a file to just the parent
    create_tree(
        ds.path,
        {'onefile': 'tobeadded',
         'sub': {'twofile': 'tobeadded'}})
    res = ds.diff(recursive=True, report_untracked='all')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, action='diff', state='untracked',
        path=opj(ds.path, 'onefile'), type='file')
    assert_result_count(
        res, 1, action='diff', state='modified',
        path=sub.path, type='dataset')
    assert_result_count(
        res, 1, action='diff', state='untracked',
        path=opj(sub.path, 'twofile'), type='file')
    # save sub
    sub.add('.')
    # save sub in parent
    ds.save()
    # save addition in parent
    ds.add('.')
    ok_clean_git(ds.path)
    # look at the last change, only one file was added
    res = ds.diff(revision='HEAD~1..HEAD')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(ds.path, 'onefile'), type='file')

    # now the exact same thing with recursion, must not be different from the
    # call above
    res = ds.diff(recursive=True, revision='HEAD~1..HEAD')
    assert_result_count(res, 1)
    # last change in parent
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(ds.path, 'onefile'), type='file')

    # one further back brings in the modified subdataset, and the added file
    # within it
    res = ds.diff(recursive=True, revision='HEAD~2..HEAD')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(ds.path, 'onefile'), type='file')
    assert_result_count(
        res, 1, action='diff', state='added',
        path=opj(sub.path, 'twofile'), type='file')
    assert_result_count(
        res, 1, action='diff', state='modified',
        path=sub.path, type='dataset')
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')), ):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    # yoh: CANNOT FIGURE IT OUT since in direct mode it gets added to git
    # directly BUT
    # - output reports key, so seems to be added to annex!
    # - when I do manually in cmdline - goes to annex
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, it indicates "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_update_simple(origin, src_path, dst_path):
    # prepare src
    source = install(src_path, source=origin, recursive=True)
    # forget we cloned it (provide no 'origin' anymore), which should lead to
    # setting tracking branch to target:
    source.repo.remove_remote("origin")

    # dataset without sibling will not need updates
    assert_status('notneeded', source.update())
    # deprecation message doesn't ruin things
    assert_status('notneeded', source.update(fetch_all=True))
    # but error if unknown sibling is given
    assert_status('impossible',
                  source.update(sibling='funky', on_failure='ignore'))

    # get a clone to update later on:
    dest = install(dst_path, source=src_path, recursive=True)
    # test setup done;
    # assert all fine
    ok_clean_git(dst_path)
    ok_clean_git(src_path)

    # update yields nothing => up-to-date
    assert_status('ok', dest.update())
    ok_clean_git(dst_path)

    # modify origin:
    with open(opj(src_path, "update.txt"), "w") as f:
        f.write("Additional content")
    source.save(path="update.txt", message="Added update.txt")
    ok_clean_git(src_path)

    # fail when asked to update a non-dataset
    assert_status(
        'impossible',
        source.update("update.txt", on_failure='ignore'))
    # fail when asked to update something non-existent
    assert_status(
        'impossible',
        source.update("nothere", on_failure='ignore'))

    # update without `merge` only fetches:
    assert_status('ok', dest.update())
    # modification is not known to active branch:
    assert_not_in("update.txt",
                  dest.repo.get_files(dest.repo.get_active_branch()))
    # modification is known to branch origin/master
    assert_in("update.txt", dest.repo.get_files("origin/master"))

    # merge:
    assert_status('ok', dest.update(merge=True))
    # modification is now known to active branch:
    assert_in("update.txt",
              dest.repo.get_files(dest.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    dest.repo.get_file_key("update.txt")  # raises if unknown
    eq_([False], dest.repo.file_has_content(["update.txt"]))

    # smoke-test if recursive update doesn't fail if submodule is removed
    # and that we can run it from within a dataset without providing it
    # explicitly
    assert_result_count(
        dest.remove('subm 1'), 1,
        status='ok', action='remove', path=opj(dest.path, 'subm 1'))
    with chpwd(dest.path):
        assert_result_count(
            update(recursive=True), 2,
            status='ok', type='dataset')
    assert_result_count(
        dest.update(merge=True, recursive=True), 2,
        status='ok', type='dataset')

    # and now test recursive update with merging in differences
    create_tree(opj(source.path, '2'), {'load.dat': 'heavy'})
    source.save(opj('2', 'load.dat'),
                message="saving changes within subm2",
                recursive=True)
    assert_result_count(
        dest.update(merge=True, recursive=True), 2,
        status='ok', type='dataset')
    # and now we can get new file
    dest.get('2/load.dat')
    ok_file_has_content(opj(dest.path, '2', 'load.dat'), 'heavy')
def test_update_volatile_subds(originpath, otherpath, destpath):
    origin = Dataset(originpath).create()
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    # as a submodule
    sname = 'subm 1'
    osm1 = origin.create(sname)
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    # nothing without a merge, no inappropriate magic
    assert_not_in(sname, ds.subdatasets(result_xfm='relpaths'))
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    # and we should be able to do update with recursive invocation
    assert_result_count(ds.update(merge=True, recursive=True), 1,
                        status='ok', type='dataset')
    # known, and placeholder exists
    assert_in(sname, ds.subdatasets(result_xfm='relpaths'))
    ok_(exists(opj(ds.path, sname)))

    # remove from origin
    origin.remove(sname)
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    # gone locally, wasn't checked out
    assert_not_in(sname, ds.subdatasets(result_xfm='relpaths'))
    assert_false(exists(opj(ds.path, sname)))

    # re-introduce at origin
    osm1 = origin.create(sname)
    create_tree(osm1.path, {'load.dat': 'heavy'})
    origin.save(opj(osm1.path, 'load.dat'))
    assert_result_count(ds.update(merge=True), 1, status='ok', type='dataset')
    # grab new content of uninstalled subdataset, right away
    ds.get(opj(ds.path, sname, 'load.dat'))
    ok_file_has_content(opj(ds.path, sname, 'load.dat'), 'heavy')

    # modify ds and subds at origin
    create_tree(origin.path, {'mike': 'this', sname: {'probe': 'little'}})
    origin.save(recursive=True)
    ok_clean_git(origin.path)

    # updates for both datasets should come down the pipe
    assert_result_count(ds.update(merge=True, recursive=True), 2,
                        status='ok', type='dataset')
    ok_clean_git(ds.path)

    # now remove just-installed subdataset from origin again
    origin.remove(sname, check=False)
    assert_not_in(sname, origin.subdatasets(result_xfm='relpaths'))
    assert_in(sname, ds.subdatasets(result_xfm='relpaths'))
    # merge should disconnect the installed subdataset, but leave the actual
    # ex-subdataset alone
    assert_result_count(ds.update(merge=True, recursive=True), 1, type='dataset')
    assert_not_in(sname, ds.subdatasets(result_xfm='relpaths'))
    ok_file_has_content(opj(ds.path, sname, 'load.dat'), 'heavy')
    ok_(Dataset(opj(ds.path, sname)).is_installed())

    # now remove the now disconnected subdataset for further tests
    # not using a bound method, not giving a parentds, should
    # not be needed to get a clean dataset
    remove(op.join(ds.path, sname), check=False)
    ok_clean_git(ds.path)

    # new separate subdataset, not within the origin dataset
    otherds = Dataset(otherpath).create()
    # install separate dataset as a submodule
    ds.install(source=otherds.path, path='other')
    create_tree(otherds.path, {'brand': 'new'})
    otherds.save()
    ok_clean_git(otherds.path)
    # pull in changes
    res = ds.update(merge=True, recursive=True)
    assert_result_count(
        res, 2, status='ok', action='update', type='dataset')
    # the next is to check for #2858
    ok_clean_git(ds.path)
def test_gh3421(path):
    # failed to add d/sub:file
    ds = Dataset(path).create(force=True)
    ds.add('top:file')
    ds.add(opj('d', 'sub:file'))
    ok_clean_git(ds.path)
def test_container_files(path, super_path):
    raise SkipTest('SingularityHub is gone for now')
    ds = Dataset(path).create()
    cmd = ['dir'] if on_windows else ['ls']

    # plug in a proper singularity image
    ds.containers_add(
        'mycontainer',
        url=testimg_url,
        image='righthere',
        # the next one is auto-guessed
        #call_fmt='singularity exec {img} {cmd}'
    )
    assert_result_count(
        ds.containers_list(), 1,
        path=op.join(ds.path, 'righthere'), name='mycontainer')
    ok_clean_git(path)

    def assert_no_change(res, path):
        # this command changed nothing
        #
        # Avoid specifying the action because it will change from "add" to
        # "save" in DataLad v0.12.
        assert_result_count(res, 1, status='notneeded', path=path, type='dataset')

    # now we can run stuff in the container
    # and because there is just one, we don't even have to name the container
    res = ds.containers_run(cmd)
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)
    # same thing as we specify the container by its name:
    res = ds.containers_run(cmd, container_name='mycontainer')
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)

    # we can also specify the container by its path:
    res = ds.containers_run(cmd, container_name=op.join(ds.path, 'righthere'))
    # container becomes an 'input' for `run` -> get request, but "notneeded"
    assert_result_count(
        res, 1, action='get', status='notneeded',
        path=op.join(ds.path, 'righthere'), type='file')
    assert_no_change(res, ds.path)

    # Now, test the same thing, but with this dataset being a subdataset of
    # another one:
    super_ds = Dataset(super_path).create()
    super_ds.install("sub", source=path)

    # When running, we don't discover containers in subdatasets
    with assert_raises(ValueError) as cm:
        super_ds.containers_run(cmd)
    assert_in("No known containers", text_type(cm.exception))
    # ... unless we need to specify the name
    res = super_ds.containers_run(cmd, container_name="sub/mycontainer")
    # container becomes an 'input' for `run` -> get request (needed this time)
    assert_result_count(
        res, 1, action='get', status='ok',
        path=op.join(super_ds.path, 'sub', 'righthere'), type='file')
    assert_no_change(res, super_ds.path)
def test_get_recurse_subdatasets(src, path):
    ds = install(
        path, source=src,
        result_xfm='datasets', return_type='item-or-list')

    # ask for the two subdatasets specifically. This will obtain them,
    # but not any content of any files in them
    subds1, subds2 = ds.get(['subm 1', '2'], get_data=False,
                            description="youcouldnotmakethisup",
                            result_xfm='datasets')
    for d in (subds1, subds2):
        eq_(d.repo.get_description(), 'youcouldnotmakethisup')

    # there are 3 files to get: test-annex.dat within each dataset:
    rel_path_sub1 = opj(basename(subds1.path), 'test-annex.dat')
    rel_path_sub2 = opj(basename(subds2.path), 'test-annex.dat')
    annexed_files = {'test-annex.dat',
                     rel_path_sub1,
                     rel_path_sub2}

    # None of them is currently present:
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    ok_clean_git(subds1.path)
    # explicitly given path in subdataset => implicit recursion:
    # MIH: Nope, we fulfill the dataset handle, but that doesn't
    # imply fulfilling all file handles
    result = ds.get(rel_path_sub1, recursive=True)
    # all good actions
    assert_status('ok', result)

    assert_in_results(result, path=opj(ds.path, rel_path_sub1), status='ok')
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)

    # drop it:
    subds1.repo.drop('test-annex.dat')
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)

    # now, with a path not explicitly pointing within a
    # subdataset, but recursive option:
    # get everything:
    result = ds.get(recursive=True,
                    result_filter=lambda x: x.get('type') != 'dataset')
    assert_status('ok', result)

    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        annexed_files)
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)
    ok_(subds2.repo.file_has_content('test-annex.dat') is True)

    # drop them:
    ds.repo.drop('test-annex.dat')
    subds1.repo.drop('test-annex.dat')
    subds2.repo.drop('test-annex.dat')
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    # now, the very same call, but without recursive:
    result = ds.get('.', recursive=False)
    assert_status('ok', result)
    # one report is on the requested dir
    eq_(len(result) - 1, 1)
    assert_result_count(
        result, 1, path=opj(ds.path, 'test-annex.dat'), status='ok')
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)
def get_baseline(p):
    ds = Dataset(p).create()
    sub = ds.create('sub', save=False)
    # subdataset saw another commit after becoming a submodule
    ok_clean_git(ds.path, index_modified=['sub'])
    return ds
def test_diff(path, norepo):
    with chpwd(norepo):
        assert_status('impossible', diff(on_failure='ignore'))
    ds = Dataset(path).create()
    ok_clean_git(ds.path)
    # reports stupid revision input
    assert_result_count(
        ds.diff(revision='WTF', on_failure='ignore'),
        1,
        status='impossible',
        message="fatal: bad revision 'WTF'")
    assert_result_count(ds.diff(), 0)
    # no diff
    assert_result_count(ds.diff(), 0)
    assert_result_count(ds.diff(revision='HEAD'), 0)
    # bogus path makes no difference
    assert_result_count(ds.diff(path='THIS', revision='HEAD'), 0)
    # comparing to a previous state we should get a diff in most cases
    # for this test, let's not care what exactly it is -- will do later
    assert len(ds.diff(revision='HEAD~1')) > 0
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.add('.', to_git=True)
    ok_clean_git(ds.path)
    res = ds.diff(revision='HEAD~1')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, action='diff', path=opj(ds.path, 'new'), state='added')
    # we can also find the diff without going through the dataset explicitly
    with chpwd(ds.path):
        assert_result_count(
            diff(revision='HEAD~1'), 1,
            action='diff', path=opj(ds.path, 'new'), state='added')
    # no diff against HEAD
    assert_result_count(ds.diff(), 0)
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    for diffy in (None, 'HEAD'):
        res = ds.diff(revision=diffy)
        assert_result_count(res, 1)
        assert_result_count(
            res, 1, action='diff', path=opj(ds.path, 'new'), state='modified')
    # but if we give another path, it doesn't show up
    assert_result_count(ds.diff('otherpath'), 0)
    # giving the right path must work though
    assert_result_count(
        ds.diff('new'), 1,
        action='diff', path=opj(ds.path, 'new'), state='modified')
    # stage changes
    ds.add('.', to_git=True, save=False)
    # no diff, because we staged the modification
    assert_result_count(ds.diff(), 0)
    # but we can get at it
    assert_result_count(
        ds.diff(staged=True), 1,
        action='diff', path=opj(ds.path, 'new'), state='modified')
    # OR
    assert_result_count(
        ds.diff(revision='HEAD'), 1,
        action='diff', path=opj(ds.path, 'new'), state='modified')
    ds.save()
    ok_clean_git(ds.path)
    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # a plain diff should report the untracked file
    # but not directly, because the parent dir is already unknown
    res = ds.diff()
    assert_result_count(res, 1)
    assert_result_count(
        res, 1, state='untracked', type='directory', path=opj(ds.path, 'deep'))
    # report of individual files is also possible
    assert_result_count(
        ds.diff(report_untracked='all'), 2, state='untracked', type='file')
    # an unmatching path will hide this result
    assert_result_count(ds.diff(path='somewhere'), 0)
    # perfect match and anything underneath will do
    assert_result_count(
        ds.diff(path='deep'), 1, state='untracked',
        path=opj(ds.path, 'deep'), type='directory')
    assert_result_count(
        ds.diff(path='deep'), 1, state='untracked',
        path=opj(ds.path, 'deep'))
    # now we stage one of the two files in deep
    ds.add(opj('deep', 'down2'), to_git=True, save=False)
    # without any reference it will ignore the staged stuff and report the
    # remaining untracked file
    assert_result_count(
        ds.diff(), 1, state='untracked',
        path=opj(ds.path, 'deep', 'down'), type='file')
    res = ds.diff(staged=True)
    assert_result_count(
        res, 1, state='untracked',
        path=opj(ds.path, 'deep', 'down'), type='file')
    assert_result_count(
        res, 1, state='added',
        path=opj(ds.path, 'deep', 'down2'), type='file')
def test_exif(path): ds = Dataset(path).create() ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset') copy(opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'), path) ds.add('.') ok_clean_git(ds.path) res = ds.aggregate_metadata() assert_status('ok', res) res = ds.metadata('exif.jpg') assert_result_count(res, 1) # compare full expected metadata set to catch any change of mind on the # side of the EXIF library assert_result_count( res, 1, metadata={ "exif:InteroperabilityVersion": "[48, 49, 48, 48]", "exif:ExifVersion": 221.0, "exif:FocalLengthIn35mmFilm": 38.0, "exif:CompressedBitsPerPixel": 5.0, "exif:GainControl": "None", "exif:Compression": "JPEG (old-style)", "exif:PrintIM": "[80, 114, 105, 110, 116, 73, 77, 0, 48, 51, 48, 48, 0, 0, 0, 5, 0, 1, 0, 22, 0, 22, 0, 2, 1, 0, 0, 0, 1, 0, 5, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 16, 131, 0, 0, 0]", "exif:Make": "CASIO COMPUTER CO.,LTD.", "exif:Sharpness": "Normal", "exif:Contrast": "Normal", "exif:ColorSpace": "sRGB", "exif:ExposureMode": "Auto Exposure", "exif:ExposureBiasValue": 0.0, "exif:ExifImageWidth": 4.0, "exif:ComponentsConfiguration": "YCbCr", "exif:DateTimeOriginal": "2011:03:13 16:36:02", "exif:MaxApertureValue": "14/5", "exif:DateTime": "2017:10:08 10:21:03", "exif:InteroperabilityOffset": 30412.0, "exif:InteroperabilityIndex": "R98", "exif:FileSource": "Digital Camera", "exif:ResolutionUnit": "Pixels/Inch", "exif:FNumber": "27/10", "exif:ExposureProgram": "Program Normal", "exif:DigitalZoomRatio": "0/0", "exif:LightSource": "Unknown", "exif:ExifImageLength": 3.0, "exif:FlashPixVersion": 100.0, "exif:CustomRendered": "Normal", "exif:Flash": "Flash fired, auto mode", "exif:WhiteBalance": "Auto", "exif:Orientation": "Horizontal (normal)", "exif:ExposureTime": "1/60", "exif:Software": "GIMP 2.8.20", "exif:Model": "EX-S600", "exif:FocalLength": "31/5", "exif:SceneCaptureType": "Standard", "exif:ExifOffset": 272.0, "exif:Saturation": "Normal", "exif:YCbCrPositioning": "Centered", "exif:DateTimeDigitized": "2011:03:13 16:36:02", "exif:XResolution": 72.0, "exif:YResolution": 72.0, "exif:MeteringMode": "Pattern", })
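# NOTE (editorial sketch): as the expected dict above shows, fields reported by the
# exif extractor are namespaced as 'exif:<TagName>' inside a result's 'metadata'
# mapping. A made-up record illustrating how a single field would be read:
_example_exif_record = {
    'path': '/tmp/ds/exif.jpg',  # hypothetical path
    'metadata': {'exif:Make': 'CASIO COMPUTER CO.,LTD.'},
}
assert _example_exif_record['metadata']['exif:Make'].startswith('CASIO')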
def test_within_ds_file_search(path): try: import nibabel except ImportError: raise SkipTest ds = Dataset(path).create(force=True) ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset') makedirs(opj(path, 'stim')) for src, dst in (('nifti1.nii.gz', opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')), ('nifti1.nii.gz', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))): copy(opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src), opj(path, dst)) ds.save() ds.aggregate_metadata() ok_clean_git(ds.path) # basic sanity check on the metadata structure of the dataset dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata'] for src in ('bids', 'nifti1'): # something for each one assert_in(src, dsmeta) # each src declares its own context assert_in('@context', dsmeta[src]) # we have a unique content metadata summary for each src assert_in(src, dsmeta['datalad_unique_content_properties']) # test default behavior with swallow_outputs() as cmo: ds.search(show_keys='name', mode='textblob') assert_in("""\ id meta parentds path type """, cmo.out) target_out = """\ annex.key bids.BIDSVersion bids.author bids.citation bids.conformsto bids.datatype bids.description """ if external_versions['bids'] >= '0.9': target_out += "bids.extension\n" target_out += """\ bids.fundedby bids.license bids.name bids.subject.age(years) bids.subject.gender bids.subject.handedness bids.subject.hearing_problems_current bids.subject.id bids.subject.language bids.suffix bids.task datalad_core.id datalad_core.refcommit id nifti1.cal_max nifti1.cal_min nifti1.datatype nifti1.description nifti1.dim nifti1.freq_axis nifti1.intent nifti1.magic nifti1.phase_axis nifti1.pixdim nifti1.qform_code nifti1.sform_code nifti1.sizeof_hdr nifti1.slice_axis nifti1.slice_duration nifti1.slice_end nifti1.slice_order nifti1.slice_start nifti1.spatial_resolution(mm) nifti1.t_unit nifti1.temporal_spacing(s) nifti1.toffset nifti1.vox_offset nifti1.xyz_unit parentds path type """ # check generated autofield index keys with swallow_outputs() as cmo: ds.search(mode='autofield', show_keys='name') # it is impossible to assess what is different from that dump # so we will use diff diff = list(unified_diff(target_out.splitlines(), cmo.out.splitlines())) assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff)) assert_result_count(ds.search('blablob#'), 0) # now check that we can discover things from the aggregated metadata for mode, query, hitpath, matched_key, matched_val in ( # random keyword query # multi word query implies AND ('textblob', ['bold', 'female'], opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'), 'meta', 'female'), # report which field matched with auto-field ('autofield', 'female', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'), 'bids.subject.gender', 'female'), # autofield multi-word query is also AND ('autofield', ['bids.suffix:bold', 'bids.subject.id:01'], opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'), 'bids.suffix', 'bold'), # TODO extend with more complex queries to test whoosh # query language configuration ): res = ds.search(query, mode=mode, full_record=True) if mode == 'textblob': # 'textblob' does datasets by default only (be could be configured otherwise assert_result_count(res, 1) else: # the rest has always a file and the dataset, because they carry metadata in # the same structure assert_result_count(res, 2) assert_result_count( res, 1, type='file', path=opj(ds.path, hitpath), # each file must report the ID of the dataset it is from, critical for # 
discovering related content dsid=ds.id) assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id) # test the key and specific value of the match assert_in(matched_key, res[-1]['query_matched']) assert_equal(res[-1]['query_matched'][matched_key], matched_val)
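# NOTE (editorial sketch): the search assertions above rely on each hit carrying
# 'type', 'path', 'dsid', and a 'query_matched' mapping from the matched key to the
# matched value. The record below is a made-up example of that shape (path and
# dataset ID are hypothetical), included only to make the assertions easier to read.
_example_search_hit = {
    'type': 'file',
    'path': '/tmp/ds/sub-01/func/sub-01_task-some_bold.nii.gz',
    'dsid': '00000000-0000-0000-0000-000000000000',
    'query_matched': {'bids.suffix': 'bold'},
}
assert _example_search_hit['query_matched']['bids.suffix'] == 'bold'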
def test_install_into_dataset(source, top_path): ds = create(top_path) ok_clean_git(ds.path) subds = ds.install("sub", source=source, save=False) if isinstance(subds.repo, AnnexRepo) and subds.repo.is_direct_mode(): ok_(exists(opj(subds.path, '.git'))) else: ok_(isdir(opj(subds.path, '.git'))) ok_(subds.is_installed()) assert_in('sub', ds.get_subdatasets()) # sub is clean: ok_clean_git(subds.path, annex=False) # top is not: assert_raises(AssertionError, ok_clean_git, ds.path, annex=False) ds.save('addsub') # now it is: ok_clean_git(ds.path, annex=False) # but we could also save while installing and there should be no side-effect # of saving any other changes if we state to not auto-save changes # Create a dummy change create_tree(ds.path, {'dummy.txt': 'buga'}) ok_clean_git(ds.path, untracked=['dummy.txt']) subds_ = ds.install("sub2", source=source, if_dirty='ignore') eq_(subds_.path, opj(ds.path, "sub2")) # for paranoid yoh ;) ok_clean_git(ds.path, untracked=['dummy.txt']) # and we should achieve the same behavior if we create a dataset # and then decide to "add" it create(_path_(top_path, 'sub3'), if_dirty='ignore') ok_clean_git(ds.path, untracked=['dummy.txt', 'sub3/']) ds.install('sub3', if_dirty='ignore') ok_clean_git(ds.path, untracked=['dummy.txt'])
def test_aggregation(path): with chpwd(path): assert_raises(InsufficientArgumentsError, aggregate_metadata, None) # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(opj(path, 'origin')).create(force=True) # before anything aggregated we would get nothing and only a log warning with swallow_logs(new_level=logging.WARNING) as cml: assert_equal(list(query_aggregated_metadata('all', ds, [])), []) assert_re_in('.*Found no aggregated metadata.*update', cml.out) ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subds = ds.create('sub', force=True) subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subsubds = subds.create('subsub', force=True) subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') ds.add('.', recursive=True) ok_clean_git(ds.path) # aggregate metadata from all subdatasets into any superdataset, including # intermediate ones res = ds.aggregate_metadata(recursive=True, update_mode='all') # we get success report for both subdatasets and the superdataset, # and they get saved assert_result_count(res, 6) assert_result_count(res, 3, status='ok', action='aggregate_metadata') assert_result_count(res, 3, status='ok', action='save') # nice and tidy ok_clean_git(ds.path) # quick test of aggregate report aggs = ds.metadata(get_aggregates=True) # one for each dataset assert_result_count(aggs, 3) # mother also report layout version assert_result_count(aggs, 1, path=ds.path, layout_version=1) # store clean direct result origres = ds.metadata(recursive=True) # basic sanity check assert_result_count(origres, 6) assert_result_count(origres, 3, type='dataset') assert_result_count(origres, 3, type='file') # Now that we have annex.key # three different IDs assert_equal( 3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset']))) # and we know about all three datasets for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true( sum([s['metadata']['frictionless_datapackage']['name'] \ == assure_unicode(name) for s in origres if s['type'] == 'dataset'])) # now clone the beast to simulate a new user installing an empty dataset clone = install(opj(path, 'clone'), source=ds.path, result_xfm='datasets', return_type='item-or-list') # ID mechanism works assert_equal(ds.id, clone.id) # get fresh metadata cloneres = clone.metadata() # basic sanity check assert_result_count(cloneres, 2) assert_result_count(cloneres, 1, type='dataset') assert_result_count(cloneres, 1, type='file') # now loop over the previous results from the direct metadata query of # origin and make sure we get the extact same stuff from the clone _compare_metadata_helper(origres, clone) # now obtain a subdataset in the clone, should make no difference assert_status('ok', clone.install('sub', result_xfm=None, return_type='list')) _compare_metadata_helper(origres, clone) # test search in search tests, not all over the place ## query smoke test assert_result_count(clone.search('mother', mode='egrep'), 1) assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1) child_res = clone.search('child', mode='egrep') assert_result_count(child_res, 2) for r in child_res: if r['type'] == 'dataset': assert_in(r['query_matched']['frictionless_datapackage.name'], r['metadata']['frictionless_datapackage']['name'])
def test_install_dataset_from(url, path): ds = install(path=path, source=url) eq_(ds.path, path) ok_(ds.is_installed()) ok_clean_git(path, annex=False)
def test_bf1886(path): parent = Dataset(path).create() sub = parent.create('sub') ok_clean_git(parent.path) # create a symlink pointing down to the subdataset, and add it os.symlink('sub', opj(parent.path, 'down')) parent.add('down') ok_clean_git(parent.path) # now symlink pointing up os.makedirs(opj(parent.path, 'subdir', 'subsubdir')) os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up')) parent.add(opj('subdir', 'up')) ok_clean_git(parent.path) # now symlink pointing 2xup, as in #1886 os.symlink(opj(pardir, pardir, 'sub'), opj(parent.path, 'subdir', 'subsubdir', 'upup')) parent.add(opj('subdir', 'subsubdir', 'upup')) ok_clean_git(parent.path) # simultaneously add a subds and a symlink pointing to it # create subds, but don't register it sub2 = create(opj(parent.path, 'sub2')) os.symlink( opj(pardir, pardir, 'sub2'), opj(parent.path, 'subdir', 'subsubdir', 'upup2')) parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')]) ok_clean_git(parent.path) # full replication of #1886: the above, but from within a subdir of the symlinked location # with no reference dataset sub3 = create(opj(parent.path, 'sub3')) os.symlink( opj(pardir, pardir, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3')) # need to use absolute paths with chpwd(opj(parent.path, 'subdir', 'subsubdir')): rev_save([opj(parent.path, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3')]) # in contrast to `add`, this only operates on a single top-level dataset # although it is not specified, it gets discovered based on the PWD # the logic behind that feels a bit shaky # consult discussion in https://github.com/datalad/datalad/issues/3230 # if this comes up as an issue at some point ok_clean_git(parent.path)
def test_configs(path): # set up dataset with registered procedure (c&p from test_basics): ds = Dataset(path).create(force=True) # TODO: this procedure would leave a clean dataset, but `run` cannot handle dirty # input yet, so manual for now ds.add('code', to_git=True) ds.run_procedure('setup_yoda_dataset') ok_clean_git(ds.path) # configure dataset to look for procedures in its code folder ds.config.add('datalad.locations.dataset-procedures', 'code', where='dataset') # 1. run procedure based on execution guessing by run_procedure: ds.run_procedure(spec=['datalad_test_proc', 'some_arg']) # look for traces ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'some_arg\n') # 2. now configure specific call format including usage of substitution config # for run: ds.config.add('datalad.procedures.datalad_test_proc.call-format', 'python "{script}" "{ds}" {{mysub}} {args}', where='dataset') ds.config.add('datalad.run.substitutions.mysub', 'dataset-call-config', where='dataset') # TODO: Should we allow for --inputs/--outputs arguments for run_procedure # (to be passed into run)? ds.unlock("fromproc.txt") # run again: ds.run_procedure(spec=['datalad_test_proc', 'some_arg']) # look for traces ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'dataset-call-config\n') # 3. have a conflicting config at user-level, which should override the # config on dataset level: ds.config.add('datalad.procedures.datalad_test_proc.call-format', 'python "{script}" "{ds}" local {args}', where='local') ds.unlock("fromproc.txt") # run again: ds.run_procedure(spec=['datalad_test_proc', 'some_arg']) # look for traces ok_file_has_content(op.join(ds.path, 'fromproc.txt'), 'local\n') # 4. get configured help message: r = ds.run_procedure('datalad_test_proc', help_proc=True, on_failure='ignore') assert_true(len(r) == 1) assert_in_results(r, status="impossible") ds.config.add('datalad.procedures.datalad_test_proc.help', "This is a help message", where='dataset') r = ds.run_procedure('datalad_test_proc', help_proc=True) assert_true(len(r) == 1) assert_in_results(r, message="This is a help message", status='ok')
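# NOTE (editorial sketch): a rough illustration of how the 'call-format' template
# configured in test_configs could get expanded. This is an assumption about the
# mechanism, not DataLad's actual implementation: '{script}', '{ds}' and '{args}'
# are filled in when the procedure is invoked, while the doubled '{{mysub}}'
# survives that first expansion and is later resolved from the
# 'datalad.run.substitutions.mysub' config (which is why fromproc.txt ends up
# containing 'dataset-call-config'). Paths below are hypothetical.
call_format = 'python "{script}" "{ds}" {{mysub}} {args}'
step1 = call_format.format(
    script='/ds/code/datalad_test_proc.py',  # hypothetical procedure script
    ds='/ds',                                # hypothetical dataset path
    args='some_arg')
assert '{mysub}' in step1
step2 = step1.format(mysub='dataset-call-config')
assert 'dataset-call-config' in step2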
def test_subdataset_save(path): parent = Dataset(path).create() sub = parent.create('sub') ok_clean_git(parent.path) create_tree(parent.path, { "untracked": 'ignore', 'sub': { "new": "wanted"}}) sub.add('new') # defined state: one untracked, modified (but clean in itself) subdataset ok_clean_git(sub.path) ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) # `save sub` does not save the parent!! with chpwd(parent.path): assert_status('notneeded', save(path=sub.path)) ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) # `save -d .` saves the state change in the subdataset, but leaves any untracked # content alone with chpwd(parent.path): assert_status('ok', parent._save()) ok_clean_git(parent.path, untracked=['untracked']) # get back to the original modified state and check that -S behaves in # exactly the same way create_tree(parent.path, { 'sub': { "new2": "wanted2"}}) sub.add('new2') ok_clean_git(parent.path, untracked=['untracked'], index_modified=['sub']) with chpwd(parent.path): assert_status( # notneeded to save sub, but need to save parent ['ok', 'notneeded'], # the key condition of this test is that no reference dataset is # given! save(path='sub', super_datasets=True)) # save super must not cause untracked content to be committed! ok_clean_git(parent.path, untracked=['untracked'])
def test_uninstall_recursive(path): ds = Dataset(path).create(force=True) subds = ds.create('deep', force=True) # we add one file, but we get a response for the requested # directory too res = subds.add('.') assert_result_count(res, 1, action='add', status='ok', type='file') assert_result_count(res, 1, action='save', status='ok', type='dataset') # save all -> all clean ds.save(recursive=True) ok_clean_git(subds.path) ok_clean_git(ds.path) # now uninstall in subdataset through superdataset target_fname = opj('deep', 'dir', 'test') # sane starting point ok_(exists(opj(ds.path, target_fname))) # doesn't have the minimum number of copies for a safe drop res = ds.drop(target_fname, recursive=True, on_failure='ignore') assert_status('error', res) assert_result_values_cond( res, 'message', lambda x: "configured minimum number of copies not found" in x or "Could only verify the existence of 0 out of 1 necessary copies" in x) # this should do it ds.drop(target_fname, check=False, recursive=True) # link is dead lname = opj(ds.path, target_fname) ok_(not exists(lname)) # entire hierarchy saved ok_clean_git(subds.path) ok_clean_git(ds.path) # now same with actual handle removal # content is dropped already, so no checks in place anyway ds.remove(target_fname, check=True, recursive=True) ok_(not exists(lname) and not lexists(lname)) ok_clean_git(subds.path) ok_clean_git(ds.path)
def test_recursive_save(path): ds = Dataset(path).create() # nothing to save assert_status('notneeded', ds._save()) subds = ds.create('sub') # subdataset presence already saved ok_clean_git(ds.path) subsubds = subds.create('subsub') assert_equal( ds.subdatasets(recursive=True, fulfilled=True, result_xfm='paths'), [subds.path, subsubds.path]) newfile_name = opj(subsubds.path, 'test') with open(newfile_name, 'w') as f: f.write('some') # saves the status change of the subdataset due to the subsubdataset addition assert_result_values_equal( ds._save(result_filter=is_ok_dataset), 'path', [ds.path]) # make the new file known to its dataset ds.add(newfile_name, save=False) # but remains dirty because of the uncommited file down below assert ds.repo.dirty # auto-add will save nothing deep down without recursive assert_status('notneeded', ds._save()) assert ds.repo.dirty # with recursive pick up the change in subsubds assert_result_values_equal( ds._save(recursive=True, result_filter=is_ok_dataset), 'path', [subsubds.path, subds.path, ds.path]) # at this point the entire tree is clean ok_clean_git(ds.path) states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] # now we save recursively, nothing should happen res = ds._save(recursive=True) # we do not get any report from a subdataset, because we detect at the # very top that the entire tree is clean assert_result_count(res, 1) assert_result_count(res, 1, status='notneeded', action='save', path=ds.path) # now we introduce new files all the way down create_tree(subsubds.path, {"mike1": 'mike1'}) # because we cannot say from the top if there is anything to do down below, # we have to traverse and we will get reports for all dataset, but there is # nothing actually saved res = ds._save(recursive=True) assert_result_count(res, 3) assert_status('notneeded', res) subsubds_indexed = subsubds.repo.get_indexed_files() assert_not_in('mike1', subsubds_indexed) assert_equal(states, [d.repo.get_hexsha() for d in (ds, subds, subsubds)]) unlink(opj(subsubds.path, 'mike1')) ok_clean_git(ds.path) # modify content in subsub and try saving testfname = newfile_name subsubds.unlock(testfname) with open(opj(ds.path, testfname), 'w') as f: f.write('I am in here!') # the following should all do nothing # no auto_add assert_status('notneeded', ds._save()) # no recursive assert_status('notneeded', ds._save()) # an explicit target saves only the corresponding dataset assert_result_values_equal( save(path=[testfname]), 'path', [subsubds.path]) # plain recursive without any files given will save the beast assert_result_values_equal( ds._save(recursive=True, result_filter=is_ok_dataset), 'path', [subds.path, ds.path]) # there is nothing else to save assert_status('notneeded', ds._save(recursive=True)) ok_clean_git(ds.path) # one more time and check that all datasets in the hierarchy are not # contaminated with untracked files states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] testfname = opj('sub', 'subsub', 'saveme2') with open(opj(ds.path, testfname), 'w') as f: f.write('I am in here!') assert_status('notneeded', ds._save(recursive=True)) newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] for old, new in zip(states, newstates): assert_equal(old, new) assert ds.repo.dirty unlink(opj(ds.path, testfname)) ok_clean_git(ds.path) # now let's check saving "upwards" create_tree(subds.path, {"testnew": 'smth', "testadded": "added"}) subds.repo.add("testadded") indexed_files = subds.repo.get_indexed_files() assert subds.repo.dirty assert ds.repo.dirty 
assert not subsubds.repo.dirty create_tree(subsubds.path, {"testnew2": 'smth'}) assert subsubds.repo.dirty # and indexed files didn't change assert_equal(indexed_files, subds.repo.get_indexed_files()) ok_clean_git(subds.repo, untracked=['testnew'], index_modified=['subsub'], head_modified=['testadded']) old_states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] subsubds._save(message="savingtestmessage", super_datasets=True) # this save actually didn't save anything in subsub (or anywhere), # because there were only untracked bits pending for old, new in zip(old_states, [d.repo.get_hexsha() for d in (ds, subds, subsubds)]): assert_equal(old, new) # but now we are saving this untracked bit specifically subsubds._save(message="savingtestmessage", path=['testnew2'], super_datasets=True) ok_clean_git(subsubds.repo) # but its super should have got only the subsub saved # not the file we created ok_clean_git(subds.repo, untracked=['testnew'], head_modified=['testadded']) # check that commits have the correct messages # there are no dedicated superdataset-save commits anymore, because # superdatasets get saved as part of the processed hierarchy and can contain # other parts in the commit (if so instructed) assert_equal(next(subsubds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage') assert_equal(next(subds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage') assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage') # and if we try to save while being within that subsubds path subsubds.unlock('testnew2') create_tree(subsubds.path, {"testnew2": 'smth2'}) # trying to replicate https://github.com/datalad/datalad/issues/1540 subsubds._save(message="saving new changes", all_updated=True) # no super with chpwd(subds.path): # no explicit dataset is provided, but a path is provided save(path=['subsub'], message='saving sub', super_datasets=True) # super should get it saved too assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(), 'saving sub')
def test_recursive_save(path): ds = Dataset(path).create() # nothing to save assert_false(ds.save()) subds = ds.create('sub') # subdataset presence already saved ok_clean_git(ds.path) subsubds = subds.create('subsub') assert_equal( ds.get_subdatasets(recursive=True, absolute=True, fulfilled=True), [subsubds.path, subds.path]) newfile_name = opj(subsubds.path, 'test') with open(newfile_name, 'w') as f: f.write('some') # saves the status change of the subdataset due to the subsubdataset addition assert_equal(ds.save(all_changes=True), [ds]) # make the new file known to its dataset # with #1141 this would be #ds.add(newfile_name, save=False) subsubds.add(newfile_name, save=False) # but remains dirty because of the untracked file down below assert ds.repo.dirty # auto-add will save nothing deep down without recursive assert_equal(ds.save(all_changes=True), []) assert ds.repo.dirty # with recursive pick up the change in subsubds assert_equal(ds.save(all_changes=True, recursive=True), [subsubds, subds, ds]) # modify content in subsub and try saving testfname = newfile_name subsubds.unlock(testfname) with open(opj(ds.path, testfname), 'w') as f: f.write('I am in here!') # the following should all do nothing # no auto_add assert_false(ds.save()) # no recursive assert_false(ds.save(all_changes=True)) # an explicit target saves only the corresponding dataset assert_equal(save(files=[testfname]), [subsubds]) # plain recursive without any files given will save the beast assert_equal(ds.save(recursive=True), [subds, ds]) # there is nothing else to save assert_false(ds.save(all_changes=True, recursive=True)) # one more time and check that all datasets in the hierarchy get updated states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] testfname = opj('sub', 'subsub', 'saveme2') with open(opj(ds.path, testfname), 'w') as f: f.write('I am in here!') assert_true(ds.save(all_changes=True, recursive=True)) newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)] for old, new in zip(states, newstates): assert_not_equal(old, new) # now let's check saving "upwards" assert not subds.repo.dirty create_tree(subds.path, {"testnew": 'smth', "testadded": "added"}) subds.repo.add("testadded") indexed_files = subds.repo.get_indexed_files() assert subds.repo.dirty assert ds.repo.dirty assert not subsubds.repo.dirty create_tree(subsubds.path, {"testnew2": 'smth'}) assert subsubds.repo.dirty # and indexed files didn't change assert_equal(indexed_files, subds.repo.get_indexed_files()) ok_clean_git(subds.repo, untracked=['testnew'], index_modified=['subsub'], head_modified=['testadded']) subsubds.save(message="savingtestmessage", super_datasets=True, all_changes=True) ok_clean_git(subsubds.repo) # but its super should have got only the subsub saved # not the file we created ok_clean_git(subds.repo, untracked=['testnew'], head_modified=['testadded']) # check commits to have correct messages # there are no more dedicated superdataset-save commits anymore, because # superdatasets get saved as part of the processed hierarchy and can contain # other parts in the commit (if so instructed) assert_equal( next(subsubds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage') assert_equal( next(subds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage') assert_equal( next(ds.repo.get_branch_commits('master')).message.rstrip(), 'savingtestmessage')
def test_add_recursive(path): # make simple hierarchy parent = Dataset(path).create() ok_clean_git(parent.path) sub1 = parent.create(opj('down', 'sub1')) ok_clean_git(parent.path) sub2 = parent.create('sub2') # the next one makes the parent dirty subsub = sub2.create('subsub') ok_clean_git(parent.path, index_modified=['sub2']) res = parent.rev_save() ok_clean_git(parent.path) # now add content deep in the hierarchy create_tree(subsub.path, {'new': 'empty'}) ok_clean_git(parent.path, index_modified=['sub2']) # recursive add should not even touch sub1, because # it knows that it is clean res = parent.add('.', recursive=True) # the key action is done assert_result_count(res, 1, path=opj(subsub.path, 'new'), action='add', status='ok') # sub1 is untouched, and not reported assert_result_count(res, 0, path=sub1.path) # saved all the way up assert_result_count(res, 3, action='save', status='ok') ok_clean_git(parent.path)
def __call__( # it is optional, because `rerun` can get a recorded one cmd=None, dataset=None, message=None, rerun=False): if rerun and cmd: lgr.warning('Ignoring provided command in --rerun mode') cmd = None if not dataset: # act on the whole dataset if nothing else was specified dataset = get_dataset_root(curdir) ds = require_dataset(dataset, check_installed=True, purpose='tracking outcomes of a command') # not needed ATM #refds_path = ds.path # delayed imports from datalad.cmd import Runner from datalad.tests.utils import ok_clean_git lgr.debug('tracking command output underneath %s', ds) try: # base assumption is that the animal smells superb ok_clean_git(ds.path) except AssertionError: yield get_status_dict( 'run', ds=ds, status='impossible', message= 'unsaved modifications present, cannot detect changes by command' ) return if not cmd and not rerun: # TODO here we would need to recover a cmd when a rerun is attempted return if rerun: # pull run info out of the last commit message err_info = get_status_dict('run', ds=ds) if not ds.repo.get_hexsha(): yield dict(err_info, status='impossible', message='cannot re-run command, nothing recorded') return last_commit_msg = ds.repo.repo.head.commit.message cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ===\n(.*)\n\^\^\^ Do not change lines above \^\^\^' runinfo = re.match(cmdrun_regex, last_commit_msg, re.MULTILINE | re.DOTALL) if not runinfo: yield dict( err_info, status='impossible', message= 'cannot re-run command, last saved state does not look like a recorded command run' ) return rec_msg, runinfo = runinfo.groups() if message is None: # re-use commit message, if nothing new was given message = rec_msg try: runinfo = json.loads(runinfo) except Exception as e: yield dict( err_info, status='error', message= ('cannot re-run command, command specification is not valid JSON: %s', e.message)) return if 'cmd' not in runinfo: yield dict( err_info, status='error', message= 'cannot re-run command, command specification missing in recorded state' ) return cmd = runinfo['cmd'] rec_exitcode = runinfo.get('exit', 0) rel_pwd = runinfo.get('pwd', None) if rel_pwd: # recording is relative to the dataset pwd = normpath(opj(ds.path, rel_pwd)) else: rel_pwd = None # normalize, just in case pwd = None # now we have to find out what was modified during the last run, and enable re-modification # ideally, we would bring back the entire state of the tree with #1424, but we limit ourself # to file addition/not-in-place-modification for now to_unlock = [] for r in ds.diff(recursive=True, revision='HEAD~1...HEAD', return_type='generator', result_renderer=None): if r.get('type', None) == 'file' and \ r.get('state', None) in ('added', 'modified'): r.pop('status', None) to_unlock.append(r) if to_unlock: for r in ds.unlock(to_unlock, return_type='generator', result_xfm=None): yield r else: # not a rerun, figure out where we are running pwd = ds.path rel_pwd = curdir # anticipate quoted compound shell commands cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd # TODO do our best to guess which files to unlock based on the command string # in many cases this will be impossible (but see --rerun). 
however, # generating new data (common case) will be just fine already # we have a clean dataset, let's run things cmd_exitcode = None runner = Runner(cwd=pwd) try: lgr.info("== Command start (output follows) =====") runner.run( cmd, # immediate output log_online=True, # not yet sure what we should do with the command output # IMHO `run` itself should be very silent and let the command talk log_stdout=False, log_stderr=False, expect_stderr=True, expect_fail=True, # TODO stdin ) except CommandError as e: # strip our own info from the exception. The original command output # went to stdout/err -- we just have to relay the exit code in the same way cmd_exitcode = e.code if not rerun or rec_exitcode != cmd_exitcode: # we failed during a fresh run, or in a different way during a rerun # the latter can easily happen if we try to alter a locked file # # let's fail here, the command could have had a typo or some # other undesirable condition. If we were to `add` nevertheless, # we would need to rerun and aggregate annex content that we # likely don't want # TODO add switch to ignore failure (some commands are stupid) # TODO add the ability to `git reset --hard` the dataset tree on failure # we know that we started clean, so we could easily go back, needs gh-1424 # to be able to do it recursively raise CommandError(code=cmd_exitcode) lgr.info("== Command exit (modification check follows) =====") # amend the commit message with `run` info: # - pwd if inside the dataset # - the command itself # - exit code of the command run_info = { 'cmd': cmd, 'exit': cmd_exitcode if cmd_exitcode is not None else 0, } if rel_pwd is not None: # only when inside the dataset to not leak information run_info['pwd'] = rel_pwd # compose commit message cmd_shorty = (' '.join(cmd) if isinstance(cmd, list) else cmd) cmd_shorty = '{}{}'.format(cmd_shorty[:40], '...' if len(cmd_shorty) > 40 else '') msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n^^^ Do not change lines above ^^^'.format( message if message is not None else cmd_shorty, json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)) for r in ds.add('.', recursive=True, message=msg): yield r
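# NOTE (editorial sketch): a self-contained round trip of the commit-message
# convention used above -- compose a message the way __call__ does, then recover
# the run record with the same regex the --rerun branch uses. The command and
# values below are made up for illustration.
import json
import re

run_info = {'cmd': 'echo hello > hello.txt', 'exit': 0, 'pwd': '.'}
msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n^^^ Do not change lines above ^^^'.format(
    run_info['cmd'][:40],
    json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False))
cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
rec_msg, rec_json = re.match(cmdrun_regex, msg, re.MULTILINE | re.DOTALL).groups()
assert json.loads(rec_json) == run_info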
def test_save(path): ds = Dataset(path) with open(opj(path, "new_file.tst"), "w") as f: f.write("something") ds.repo.add("new_file.tst", git=True) ok_(ds.repo.dirty) ds._save("add a new file") ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) with open(opj(path, "new_file.tst"), "w") as f: f.write("modify") ok_(ds.repo.dirty) ds._save("modified new_file.tst") ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # save works without ds and files given in the PWD with open(opj(path, "new_file.tst"), "w") as f: f.write("rapunzel") with chpwd(path): save("love rapunzel") ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # and also without `-a` when things are staged with open(opj(path, "new_file.tst"), "w") as f: f.write("exotic") ds.repo.add("new_file.tst", git=True) with chpwd(path): save("love marsians") ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) files = ['one.txt', 'two.txt'] for fn in files: with open(opj(path, fn), "w") as f: f.write(fn) ds.add([opj(path, f) for f in files]) # superfluous call to save (add saved it already), should not fail # but report that nothing was saved assert_status('notneeded', ds._save("set of new files")) ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # create subdataset subds = ds.create('subds') ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # modify subds with open(opj(subds.path, "some_file.tst"), "w") as f: f.write("something") subds.add('.') ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo)) # Note/TODO: ok_clean_git is failing in direct mode, due to staged but # uncommitted .datalad (probably caused within create) ok_(ds.repo.dirty) # ensure modified subds is committed ds._save() ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo)) # now introduce a change downstairs subds.create('someotherds') ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo)) ok_(ds.repo.dirty) # and save via subdataset path ds._save('subds') ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))