def test_add_recursive(path=None):
    # make simple hierarchy
    parent = Dataset(path).create()
    assert_repo_status(parent.path)
    sub1 = parent.create(op.join('down', 'sub1'))
    assert_repo_status(parent.path)
    sub2 = parent.create('sub2')
    # next one makes the parent dirty
    subsub = sub2.create('subsub')
    assert_repo_status(parent.path, modified=['sub2'])
    res = parent.save()
    assert_repo_status(parent.path)
    # now add content deep in the hierarchy
    create_tree(subsub.path, {'new': 'empty'})
    assert_repo_status(parent.path, modified=['sub2'])
    # recursive add should not even touch sub1, because
    # it knows that it is clean
    res = parent.save(recursive=True, jobs=5)
    # the key action is done
    assert_result_count(
        res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok')
    # saved all the way up
    assert_result_count(res, 3, action='save', status='ok')
    assert_repo_status(parent.path)
def test_invalid_call(path=None):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.

        # needs a SSH URL
        assert_raises(InsufficientArgumentsError, create_sibling, '')
        assert_raises(ValueError, create_sibling, 'http://ignore.me')
        # needs an actual dataset
        assert_raises(
            ValueError, create_sibling,
            'datalad-test:/tmp/somewhere', dataset='/nothere')
        # pre-configure a bogus remote
        ds = Dataset(path).create()
        ds.repo.add_remote('bogus', 'http://bogus.url.com')
        # fails to reconfigure by default, both with a generated name
        # and when given an existing name
        for res in (ds.create_sibling('bogus:/tmp/somewhere',
                                      on_failure='ignore'),
                    ds.create_sibling('datalad-test:/tmp/somewhere',
                                      name='bogus',
                                      on_failure='ignore')):
            assert_result_count(
                res, 1,
                status='error',
                message=("sibling '%s' already configured (specify alternative "
                         "name, or force reconfiguration via --existing",
                         'bogus'))
        if not have_webui():
            # need an extension package
            assert_raises(RuntimeError, ds.create_sibling, '', ui=True)
def check_addurls_from_key(self, key_arg, expected_backend, fake_dates, path):
    ds = Dataset(path).create(force=True, fake_dates=fake_dates)
    if OLD_EXAMINEKEY and ds.repo.is_managed_branch():
        raise SkipTest("Adjusted branch functionality requires "
                       "more recent `git annex examinekey`")
    ds.addurls(self.json_file, "{url}", "{name}",
               exclude_autometa="*",
               key=key_arg,
               result_renderer='disabled')
    repo = ds.repo
    repo_path = ds.repo.pathobj
    paths = [repo_path / x for x in "ac"]
    annexinfo = repo.get_content_annexinfo(eval_availability=True)
    for path in paths:
        pstat = annexinfo[path]
        eq_(pstat["backend"], expected_backend)
        assert_false(pstat["has_content"])
    get_res = ds.get(paths, result_renderer='disabled', on_failure="ignore")
    assert_result_count(get_res, 2, action="get", status="ok")
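# A minimal, hypothetical usage sketch (not part of the original suite):
# check_addurls_from_key() above is a parametrizable helper, so a concrete
# test would call it with a key specification and the backend it implies.
# The key template and backend name below are illustrative assumptions only.
def test_addurls_from_key_sketch(self, path=None):
    self.check_addurls_from_key(
        key_arg="MD5-s{size}--{md5sum}",
        expected_backend="MD5",
        fake_dates=False,
        path=path)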
def test_get_mixed_hierarchy(src=None, path=None):
    origin = Dataset(src).create(annex=False)
    origin_sub = origin.create('subds')
    with open(opj(origin.path, 'file_in_git.txt'), "w") as f:
        f.write('no idea')
    with open(opj(origin_sub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.save('file_in_git.txt', to_git=True)
    origin_sub.save('file_in_annex.txt')
    origin.save()

    # now, install that thing:
    ds, subds = install(
        path, source=src, recursive=True,
        result_xfm='datasets', return_type='item-or-list',
        result_filter=None)
    ok_(subds.repo.file_has_content("file_in_annex.txt") is False)

    # and get:
    result = ds.get(curdir, recursive=True)
    # git repo and subds
    assert_status(['ok', 'notneeded'], result)
    assert_result_count(
        result, 1, path=opj(subds.path, "file_in_annex.txt"), status='ok')
    ok_(subds.repo.file_has_content("file_in_annex.txt") is True)
def test_no_worktree_impact_false_deletions(path=None):
    ds = Dataset(path).create()
    # create a branch that has no new content
    ds.repo.call_git(['checkout', '-b', 'test'])
    # place two successive commits with file additions into the default branch
    ds.repo.call_git(['checkout', DEFAULT_BRANCH])
    (ds.pathobj / 'identical').write_text('should be')
    ds.save()
    (ds.pathobj / 'new').write_text('yes')
    ds.save()
    # now perform a diff for the last commit, there is one file that remained
    # identical
    ds.repo.call_git(['checkout', 'test'])
    res = ds.diff(fr=DEFAULT_BRANCH + '~1', to=DEFAULT_BRANCH,
                  result_renderer='disabled')
    # under no circumstances can there be any reports on deleted files,
    # because we never deleted anything
    assert_result_count(res, 0, state='deleted')
    # the identical file must be reported clean
    assert_result_count(
        res, 1,
        state='clean',
        path=str(ds.pathobj / 'identical'),
    )
def test_install_recursive_with_data(src=None, path=None):
    _make_dataset_hierarchy(src)
    # now again; with data:
    res = install(path, source=src, recursive=True, get_data=True,
                  result_filter=None, result_xfm=None)
    assert_status('ok', res)
    # installed a dataset and two subdatasets, and one file with content in
    # each
    assert_result_count(res, 5, type='dataset', action='install')
    assert_result_count(res, 2, type='file', action='get')
    # we recurse top down during installation, so toplevel should appear at
    # first position in returned list
    eq_(res[0]['path'], path)
    top_ds = YieldDatasets()(res[0])
    ok_(top_ds.is_installed())

    def all_have_content(repo):
        ainfo = repo.get_content_annexinfo(init=None, eval_availability=True)
        return all(st["has_content"] for st in ainfo.values())

    if isinstance(top_ds.repo, AnnexRepo):
        ok_(all_have_content(top_ds.repo))
    for subds in top_ds.subdatasets(recursive=True, result_xfm='datasets'):
        ok_(subds.is_installed(), "Not installed: %s" % (subds,))
        if isinstance(subds.repo, AnnexRepo):
            ok_(all_have_content(subds.repo))
def test_gh1426(origin_path=None, target_path=None):
    # set up a pair of repos, one the published copy of the other
    origin = Dataset(origin_path).create()
    target = mk_push_target(origin, 'target', target_path,
                            annex=True, bare=False)
    origin.push(to='target')
    assert_repo_status(origin.path)
    assert_repo_status(target.path)
    eq_(origin.repo.get_hexsha(DEFAULT_BRANCH),
        target.get_hexsha(DEFAULT_BRANCH))

    # gist of #1426 is that a newly added subdataset does not cause the
    # superdataset to get published
    origin.create('sub')
    assert_repo_status(origin.path)
    neq_(origin.repo.get_hexsha(DEFAULT_BRANCH),
         target.get_hexsha(DEFAULT_BRANCH))
    # now push
    res = origin.push(to='target')
    assert_result_count(
        res, 1, status='ok', type='dataset', path=origin.path,
        action='publish', target='target', operations=['fast-forward'])
    eq_(origin.repo.get_hexsha(DEFAULT_BRANCH),
        target.get_hexsha(DEFAULT_BRANCH))
def test_add_files(path=None):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [
        op.join('dir', 'testindir'),
        op.join('dir', OBSCURE_FILENAME)
    ]

    for arg in [(test_list_1[0], False),
                (test_list_2[0], True),
                (test_list_3, False),
                (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.get_content_annexinfo(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in ensure_list(arg[0]):
                assert_result_count(result, 1, path=str(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in ensure_list(arg[0]))
        for f, p in status.items():
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
def test_get_subdataset_direct_fetch(path=None):
    path = Path(path)
    origin = Dataset(path / "origin").create()
    for sub in ["s0", "s1"]:
        sds = origin.create(origin.pathobj / sub)
        sds.repo.commit(msg="another commit", options=["--allow-empty"])
    origin.save()
    s0 = Dataset(origin.pathobj / "s0")
    s1 = Dataset(origin.pathobj / "s1")
    # Abandon the recorded commit so that it needs to be brought down by a
    # direct fetch.
    s0.repo.call_git(["reset", "--hard", "HEAD~"])
    s1.repo.call_git(["reset", "--hard", "HEAD~"])

    # Tweak the configuration of s0 to make the direct fetch fail.
    # Disallow direct oid fetch (default).
    s0.repo.config.set("uploadpack.allowAnySHA1InWant", "false",
                       scope="local")
    # Configure the fetcher to avoid v2, which allows fetching unadvertised
    # objects regardless of the value of uploadpack.allowAnySHA1InWant.
    s0.repo.config.set("protocol.version", "0", scope="local")

    # Configure s1 to succeed with direct fetch.
    s1.repo.config.set("uploadpack.allowAnySHA1InWant", "true",
                       scope="local")

    clone = install(
        str(path / "clone"),
        source="ssh://datalad-test:" + origin.repo.pathobj.as_posix())

    res = clone.get(["s0", "s1"], on_failure="ignore")
    assert_result_count(res, 1,
                        action="install", type="dataset", status="error")
    assert_result_count(res, 1,
                        action="install", type="dataset", status="ok")
def test_gh3356(src=None, path=None):
    # create toy version of gh-3356 scenario
    origin = Dataset(src).create()
    origin_sub = origin.create(origin.pathobj / 'subdir' / 'subds')
    for p in (
            (origin_sub.pathobj / 'data' / 'file_in_annex.txt'),
            (origin_sub.pathobj / 'data' / 'file_in_annex2.txt')):
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(p.name)
    origin.save(recursive=True)
    clone = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    targetpaths = [
        opj('subdir', 'subds', 'data', 'file_in_annex.txt'),
        opj('subdir', 'subds', 'data', 'file_in_annex2.txt'),
    ]
    with chpwd(path):
        res = get(targetpaths)
    # get() must report success on two files
    assert_result_count(res, 2, action='get', type='file', status='ok')
    # status must report content for two files
    assert_result_count(
        clone.status(recursive=True, annex='all'),
        2,
        action='status',
        has_content=True)
def test_get_in_unavailable_subdataset(src=None, path=None):
    _make_dataset_hierarchy(src)
    root = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    with chpwd(path):
        res = get(targetabspath)
    assert_result_count(res, 2, status='ok', action='install', type='dataset')
    # dry-fit result filter that only returns the result that matched the
    # requested path
    filtered = [r for r in res if only_matching_paths(r, path=targetabspath)]
    assert_result_count(
        filtered, 1, status='ok', action='install', type='dataset',
        path=targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
def test_install_skip_failed_recursive(src=None, path=None):
    _mk_submodule_annex(src, fname="test-annex.dat", fcontent="whatever")

    # install top level:
    ds = install(path, source=src)
    sub1 = Dataset(opj(path, 'subm 1'))
    sub2 = Dataset(opj(path, '2'))
    # sabotage recursive installation of 'subm 1' by polluting the target:
    with open(opj(path, 'subm 1', 'blocking.txt'), "w") as f:
        f.write("sdfdsf")

    with swallow_logs(new_level=logging.WARNING) as cml:
        result = ds.get(
            os.curdir, recursive=True,
            on_failure='ignore', result_xfm=None)
        # toplevel dataset was in the house already
        assert_result_count(result, 0, path=ds.path, type='dataset')
        # 'subm 1' should fail to install ([1] since it comes after the '2'
        # submodule)
        assert_in_results(
            result, status='error', path=sub1.path, type='dataset',
            message='target path already exists and not empty, refuse to '
                    'clone into target path')
        assert_in_results(result, status='ok', path=sub2.path)
def test_audio(path=None):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', scope='branch')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.save()
    assert_repo_status(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list'
    )['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the unique
    # report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])
def test_download_url_existing_dir_no_slash_exception(path=None):
    with chpwd(path):
        res = download_url('url', path="dir", save=False,
                           on_failure='ignore')
        assert_result_count(res, 1, status='error')
        assert_message(
            "Non-directory path given (no trailing separator) "
            "but a directory with that name (after adding "
            "archive suffix) exists",
            res)
def test_no_store(path=None):
    ds = Dataset(path).create()
    # check that we fail without '--new-store-ok' when there is no store
    assert_result_count(
        ds.create_sibling_ria(
            "'ria+file:///no/where'", "datastore", on_failure='ignore'),
        1,
        status="error")
def test_newthings_coming_down(originpath=None, destpath=None):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in(DEFAULT_REMOTE, ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert(knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True),
                        1, action='update', status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be
    # enhanced to a graceful downgrade
    before_branches = ds.repo.get_branches()
    ok_(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    assert_false(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
def test_update_fetch_all(path=None):
    path = Path(path)
    remote_1 = str(path / "remote_1")
    remote_2 = str(path / "remote_2")

    ds = Dataset(path / "src").create()
    src = ds.repo.path

    ds_rmt1 = clone(source=src, path=remote_1)
    ds_rmt2 = clone(source=src, path=remote_2)

    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    (ds_rmt1.pathobj / "first.txt").write_text("some file load")
    ds_rmt1.save()
    # TODO: Modify an already present file!

    (ds_rmt2.pathobj / "second.txt").write_text("different file load")
    ds_rmt2.save()

    # Let's init some special remote which we couldn't really update/fetch
    if not dl_cfg.get('datalad.tests.dataladremote'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])

    # fetch all remotes
    assert_result_count(ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/" + DEFAULT_BRANCH))
    assert_in("second.txt", ds.repo.get_files("sibling_2/" + DEFAULT_BRANCH))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(
        ds.update(sibling='sibling_1', merge=True),
        1, action='update', status='ok', type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt",
              ds.repo.get_files(ds.repo.get_active_branch()))

    # it's known to annex, but has no content yet:
    annexprops = ds.repo.get_file_annexinfo(
        "first.txt", eval_availability=True)
    annexprops['key']  # blows if unknown
    eq_(False, annexprops['has_content'])
def test_update_how_subds_different(path=None, *, follow, action):
    path = Path(path)
    ds_src = Dataset(path / "source").create()
    ds_src_sub = ds_src.create("sub")
    ds_src.save()

    ds_clone = install(source=ds_src.path, path=path / "clone",
                       recursive=True, result_xfm="datasets")
    (ds_clone.pathobj / "foo").write_text("foo")
    ds_clone.save()
    ds_clone_sub = Dataset(ds_clone.pathobj / "sub")

    (ds_src_sub.pathobj / "bar").write_text("bar")
    ds_src.save(recursive=True)

    # Add unrecorded state to make --follow=sibling/parentds differ.
    (ds_src_sub.pathobj / "baz").write_text("baz")
    ds_src_sub.save()

    ds_clone_repo = ds_clone.repo
    ds_clone_hexsha_pre = ds_clone_repo.get_hexsha()

    ds_clone_sub_repo = ds_clone_sub.repo
    ds_clone_sub_branch_pre = ds_clone_sub_repo.get_active_branch()

    res = ds_clone.update(follow=follow, how="merge", how_subds=action,
                          recursive=True)

    assert_result_count(res, 1, action="merge", status="ok",
                        path=ds_clone.path)
    assert_result_count(res, 1, action=f"update.{action}", status="ok",
                        path=ds_clone_sub.path)

    ds_clone_hexsha_post = ds_clone_repo.get_hexsha()
    neq_(ds_clone_hexsha_pre, ds_clone_hexsha_post)
    neq_(ds_src.repo.get_hexsha(), ds_clone_hexsha_post)
    ok_(ds_clone_repo.is_ancestor(ds_clone_hexsha_pre, ds_clone_hexsha_post))

    eq_(ds_clone_sub.repo.get_hexsha(),
        ds_src_sub.repo.get_hexsha(
            None if follow == "sibling" else "HEAD~"))
    ds_clone_sub_branch_post = ds_clone_sub_repo.get_active_branch()

    if action == "checkout":
        neq_(ds_clone_sub_branch_pre, ds_clone_sub_branch_post)
        assert_false(ds_clone_sub_branch_post)
    else:
        eq_(ds_clone_sub_branch_pre, ds_clone_sub_branch_post)
def test_diff_nonexistent_ref_unicode(path=None):
    ds = Dataset(path).create()
    assert_result_count(
        ds.diff(fr="HEAD", to=u"β", on_failure="ignore",
                result_renderer='disabled'),
        1,
        path=ds.path,
        status="impossible")
def test_file_extraction(path=None):
    skip_if_no_module('libxmp')
    # go into virgin dir to avoid detection of any dataset
    with chpwd(path):
        res = extract_metadata(
            types=['xmp'],
            files=[testpath])
        assert_result_count(
            res, 1,
            type='file', status='ok', action='metadata', path=testpath)
        assert_in('xmp', res[0]['metadata'])
def test_ephemeral(ds_path=None, store_path=None, clone_path=None):
    dspath = Path(ds_path)
    store = Path(store_path)
    file_test = Path('file1.txt')
    file_testsub = Path('sub') / 'other.txt'

    # create the original dataset
    ds = Dataset(dspath)
    ds.create(force=True)
    ds.save()

    # put into store:
    ds.create_sibling_ria("ria+{}".format(store.as_uri()), "riastore",
                          new_store_ok=True)
    ds.push(to="riastore", data="anything")

    # now, get an ephemeral clone from the RIA store:
    eph_clone = clone('ria+{}#{}'.format(store.as_uri(), ds.id), clone_path,
                      reckless="ephemeral")

    # ephemeral clone was properly linked (store has bare repos!):
    clone_annex = (eph_clone.repo.dot_git / 'annex')
    assert_true(clone_annex.is_symlink())
    assert_true(clone_annex.resolve().samefile(
        store / ds.id[:3] / ds.id[3:] / 'annex'))
    if not eph_clone.repo.is_managed_branch():
        # TODO: We can't properly handle adjusted branch yet
        # we don't need to get files in order to access them:
        assert_equal((eph_clone.pathobj / file_test).read_text(), "some")
        assert_equal((eph_clone.pathobj / file_testsub).read_text(), "other")

    # can we unlock those files?
    eph_clone.unlock(file_test)
    # change content
    (eph_clone.pathobj / file_test).write_text("new content")
    eph_clone.save()

    # new content should already be in store
    # (except the store doesn't know yet)
    res = eph_clone.repo.fsck(remote="riastore-storage", fast=True)
    assert_equal(len(res), 2)
    assert_result_count(res, 1, success=True, file=file_test.as_posix())
    assert_result_count(res, 1, success=True, file=file_testsub.as_posix())

    # push back git history
    eph_clone.push(to=DEFAULT_REMOTE, data="nothing")

    # get an update in origin
    ds.update(merge=True, reobtain_data=True)
    assert_equal((ds.pathobj / file_test).read_text(), "new content")
def test_get_relays_command_errors(path=None):
    ds = Dataset(path).create()
    (ds.pathobj / "foo").write_text("foo")
    ds.save()
    ds.drop("foo", reckless='kill')
    assert_result_count(
        ds.get("foo", on_failure="ignore", result_renderer='disabled'),
        1,
        action="get",
        type="file",
        status="error")
def test_update_strategy(path=None):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\n'
            'metadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.aggregate_metadata()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default only updates
    # the top-level dataset with all objects, none of the leaf
    # or intermediate datasets gets touched
    base.aggregate_metadata(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'impossible',
            ds.metadata(get_aggregates=True, on_failure='ignore'))
    # get the full metadata report
    target_meta = base.metadata(return_type='list')

    # now redo full aggregation, this time updating all
    # (intermediate) datasets
    base.aggregate_metadata(recursive=True, update_mode='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status(
            'ok',
            ds.metadata(get_aggregates=True, on_failure='ignore'))

    # all of that has no impact on the reported metadata
    eq_(target_meta, base.metadata(return_type='list'))
def test_download_url_return(toppath=None, topurl=None, outdir=None):
    # Ensure that out directory has trailing slash.
    outdir = opj(outdir, "")
    files = ['file1.txt', 'file2.txt']
    urls = [topurl + f for f in files]
    outfiles = [opj(outdir, f) for f in files]

    out1 = download_url(urls[0], path=outdir, save=False)
    assert_result_count(out1, 1)
    eq_(out1[0]['path'], outfiles[0])

    # can't overwrite
    out2 = download_url(urls, path=outdir, on_failure='ignore', save=False)
    assert_result_count(out2, 1, status='error')
    assert_in('file1.txt already exists', out2[0]['message'])
    assert_result_count(out2, 1, status='ok')  # only 2nd one
    eq_(out2[1]['path'], outfiles[1])

    out3 = download_url(
        urls, path=outdir, overwrite=True, on_failure='ignore', save=False)
    assert_result_count(out3, 2, status='ok')
    eq_([r['path'] for r in out3], outfiles)
def test_ds_extraction(path=None):
    skip_if_no_module('libxmp')

    ds = Dataset(path).create()
    copy(testpath, path)
    ds.save()
    assert_repo_status(ds.path)

    res = extract_metadata(
        types=['xmp'],
        dataset=ds,
        # artificially disable extraction from any file in the dataset
        files=[])
    assert_result_count(
        res, 1,
        type='dataset', status='ok', action='metadata', path=path,
        refds=ds.path)
    assert_in('xmp', res[0]['metadata'])

    # now the more useful case: getting everything for xmp from a dataset
    res = extract_metadata(
        types=['xmp'],
        dataset=ds)
    assert_result_count(res, 2)
    assert_result_count(
        res, 1,
        type='dataset', status='ok', action='metadata', path=path,
        refds=ds.path)
    assert_result_count(
        res, 1,
        type='file', status='ok', action='metadata',
        path=opj(path, 'xmp.pdf'), parentds=ds.path)
    for r in res:
        assert_in('xmp', r['metadata'])
def test_drop_after(self=None, path=None):
    ds = Dataset(path).create(force=True)
    ds.repo.set_gitattributes([('a*', {'annex.largefiles': 'nothing'})])
    # make some files go to git, so we could test that we do not blow
    # while trying to drop what is in git, not annex
    res = ds.addurls(self.json_file, '{url}', '{name}', drop_after=True,
                     result_renderer='disabled')

    assert_result_count(res, 3, action='addurl',
                        status='ok')  # a, b, c even if a goes to git
    assert_result_count(res, 2, action='drop', status='ok')  # b, c
def test_archive(path=None):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert(isabs(res[0]['path']))
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much meta info we generate
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for default
            # MD5E backend), and expect no content for any directory
            assert_equal(nfiles, 4)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now lose some content
    ds.drop('file_up', reckless='kill')
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'),
                      missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))
def _compare_metadata_helper(origres, compds):
    for ores in origres:
        rpath = relpath(ores['path'], ores['refds'])
        cres = compds.metadata(
            rpath,
            reporton='{}s'.format(ores['type']))
        if ores['type'] == 'file':
            # TODO implement file based lookup
            continue
        assert_result_count(cres, 1)
        cres = cres[0]
        assert_dict_equal(ores['metadata'], cres['metadata'])
        if ores['type'] == 'dataset':
            for i in ('dsid',):
                eq_(ores[i], cres[i])
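# Hypothetical usage sketch (an assumption, not from the original suite):
# _compare_metadata_helper() above compares a metadata report obtained from
# one dataset against what a second dataset reports for the same relative
# paths, e.g. an installed copy that carries the same aggregated metadata.
# The dataset layout and aggregation steps below are illustrative only.
def test_compare_metadata_sketch(origin_path=None, clone_path=None):
    origin = Dataset(origin_path).create()
    origin.aggregate_metadata()
    clone = install(clone_path, source=origin_path,
                    result_xfm='datasets', return_type='item-or-list')
    # report from the original, checked against the clone
    origres = origin.metadata(reporton='datasets', return_type='list')
    _compare_metadata_helper(origres, clone)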
def test_push_git_annex_branch_many_paths_same_data(path=None):
    path = Path(path)
    ds = Dataset(path / "ds").create(force=True)
    ds.save()
    mk_push_target(ds, "target", str(path / "target"), annex=True,
                   bare=False)
    nbytes = sum(
        ds.repo.get_content_annexinfo(paths=[f])[f]["bytesize"]
        for f in [
            ds.repo.pathobj / "f0",
            ds.repo.pathobj / "f3",
            ds.repo.pathobj / "f4"
        ])
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = ds.push(to="target")
        assert_in("{} bytes of annex data".format(nbytes), cml.out)
    # 3 files point to content already covered by another file.
    assert_result_count(res, 3, action="copy", type="file",
                        status="notneeded")
def test_update_git_smoke(src_path=None, dst_path=None):
    # Apparently was just failing on git repos for basic lack of coverage,
    # hence this quick test
    ds = Dataset(src_path).create(annex=False)
    target = install(
        dst_path, source=src_path,
        result_xfm='datasets', return_type='item-or-list')
    create_tree(ds.path, {'file.dat': '123'})
    ds.save('file.dat')
    assert_result_count(
        target.update(recursive=True, merge=True),
        1, action='update', status='ok', type='dataset')
    ok_file_has_content(opj(target.path, 'file.dat'), '123')