def test_add_archive_use_archive_dir(repo_path):
    """Test extraction location control of add_archive_content.

    Covers: informative error when the archive is not yet saved,
    extraction into the current directory vs. under the archive's own
    directory, and that no archive caches are left behind.
    """
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        with assert_raises(RuntimeError) as cmr:
            add_archive_content(archive_path)
        # message embeds the platform-native path separator, hence two patterns
        assert_re_in(
            "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first"
            if on_windows else
            "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first",
            str(cmr.exception), match=False)
        with swallow_outputs():
            repo.add(archive_path)
        repo.commit("added 1.tar.gz")

        ok_archives_caches(repo.path, 0)
        add_archive_content(archive_path, strip_leading_dirs=True,
                            use_current_dir=True)
        # extracted into cwd, not under the archive's directory
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_archives_caches(repo.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)
def check_compress_file(ext, annex, path, name):
    """Round-trip a single file through compress_files/decompress_file.

    Optionally first annexes the file (so the input is a symlink to the
    annex key) to verify compression still works in that case.
    """
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive, path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        # decompressor tool not installed on this system
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
def test_proxying_open_testrepobased(repo):
    """AutomagicIO should transparently fetch annexed content on open().

    Exercises explicit activate/deactivate, the context-manager form,
    and opening via a relative path after chpwd.
    """
    TEST_CONTENT = "content to be annex-addurl'd"
    fname = 'test-annex.dat'
    fpath = opj(repo, fname)
    # content not present yet -- plain open must fail
    assert_raises(IOError, open, fpath)

    aio = AutomagicIO(activate=True)
    try:
        with swallow_outputs():
            # now we should be able just to request to open this file
            with open(fpath) as f:
                content = f.read()
                eq_(content, TEST_CONTENT)
    finally:
        aio.deactivate()

    # and now that we have fetched it, nothing should forbid us to open it again
    with open(fpath) as f:
        eq_(f.read(), TEST_CONTENT)

    annex = AnnexRepo(repo, create=False)
    # Let's create another file deeper under the directory with the same content
    # so it would point to the same key, which we would drop and repeat the drill
    fpath2 = opj(repo, 'd1', 'd2', 'test2.dat')
    os.makedirs(dirname(fpath2))
    with open(fpath2, 'w') as f:
        f.write(content)
    annex.add(fpath2)
    annex.drop(fpath2)
    annex.commit("added and dropped")
    assert_raises(IOError, open, fpath2)

    # Let's use context manager form
    with AutomagicIO() as aio:
        ok_(isinstance(aio, AutomagicIO))
        ok_(aio.active)
        # swallowing output would cause trouble while testing with
        # DATALAD_ASSERT_NO_OPEN_FILES mode on.  Reason is not 100% clear
        # on why underlying git-annex process would be dumping to stdout or err
        #with swallow_outputs():
        # now we should be able just to request to open this file
        with open(fpath2) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)

    annex.drop(fpath2)
    assert_raises(IOError, open, fpath2)

    # Let's use relative path
    with chpwd(opj(repo, 'd1')):
        # Let's use context manager form
        with AutomagicIO() as aio, \
                swallow_outputs(), \
                open(opj('d2', 'test2.dat')) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)
def test_direct_cfg(path1, path2):
    """Direct mode is no longer supported; verify the failures it triggers.

    Checks both creating a repo with DATALAD_REPO_DIRECT set and opening
    a pre-existing repo that was switched into direct mode.  The repeated
    `del` + `_unique_instances.clear()` calls defeat the flyweight cache
    so a fresh AnnexRepo instance is actually constructed each time.
    """
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar
    AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create annex repo in direct mode as see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            # we have generic part
            assert_in("no longer supported by DataLad", str(cme.exception))
            # situation specific part
            assert_in("datalad.repo.direct configuration", str(cme.exception))
    # assert not op.exists(path2)  # that we didn't create it - we do!
    # fixing for that would be too cumbersome since we first call GitRepo.__init__
    # with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest(
            "Created repo not v5, cannot test detection of direct mode repos")

    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to disable somehow the flywheel
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)

    # TODO: RM DIRECT decide what should we here -- should we test/blow?
    # ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2
    del ar2sub1
    AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2 = AnnexRepo(path2)
    ar2.get_submodules()

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2
    AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')
def test_add_archive_content_zip(repo_path):
    """Zip archives are extracted and their members annexed under the
    archive-named directory, leaving no archive caches behind."""
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        with swallow_outputs():
            repo.add(["1.zip"])
            repo.commit("add 1.zip")
            add_archive_content("1.zip")
        ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True)
        ok_file_under_git(opj("1", "dir", "bar"), annexed=True)
        ok_archives_caches(repo.path, 0)
def test_direct_cfg(path1, path2):
    """Direct mode is no longer supported; verify the failures it triggers.

    Variant of the same scenario using one-line del/clear statements;
    the `del` + `_unique_instances.clear()` combos defeat the flyweight
    cache so a fresh AnnexRepo instance is constructed each time.
    """
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar; AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create annex repo in direct mode as see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            # we have generic part
            assert_in("no longer supported by DataLad", str(cme.exception))
            # situation specific part
            assert_in("datalad.repo.direct configuration", str(cme.exception))
    # assert not op.exists(path2)  # that we didn't create it - we do!
    # fixing for that would be too cumbersome since we first call GitRepo.__init__
    # with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest("Created repo not v5, cannot test detection of direct mode repos")

    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to disable somehow the flywheel
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)

    # TODO: RM DIRECT decide what should we here -- should we test/blow?
    # ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2; del ar2sub1; AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2 = AnnexRepo(path2)
    ar2.get_submodules()

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2; AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')
def test_add_archive_content_absolute_path(path):
    """add_archive_content accepts an absolute archive path, records only
    relative paths in the commit message, and rejects archives outside
    the repository."""
    repo = AnnexRepo(opj(path, "ds"), create=True)
    repo.add(["1.tar.gz"])
    repo.commit("1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, annex=repo)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)

    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    # archive outside of the repository must be refused
    with assert_raises(FileNotInRepositoryError):
        add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
def test_get_contentlocation(tdir):
    """get_contentlocation resolves an annex key to its object path.

    Checks relative vs. absolute resolution, that lookups are cached in
    ``_contentlocations``, and that a dropped key no longer resolves.
    """
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    key = repo.get_file_key('file.dat')
    cr = AnnexCustomRemote(tdir)

    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    # resolved location is cached per key
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    # content gone -- no location to report
    assert not cr.get_contentlocation(key, absolute=True)
def test_get_contentlocation(tdir=None):
    """get_contentlocation resolves an annex key to its object path.

    Variant using the newer get_file_annexinfo API and
    ArchiveAnnexCustomRemote.  Checks relative vs. absolute resolution,
    caching in ``_contentlocations``, and that a dropped key no longer
    resolves.
    """
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    # TODO contentlocation would come with eval_availability=True
    key = repo.get_file_annexinfo('file.dat')['key']
    cr = ArchiveAnnexCustomRemote(None, path=tdir)

    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    # resolved location is cached per key
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    # content gone -- no location to report
    assert not cr.get_contentlocation(key, absolute=True)
def put_file_under_git(path, filename=None, content=None, annexed=False):
    """Place file under git/annex and return used Repo

    Writes ``content`` (empty string when None) to the prepared location,
    then commits it either to the annex or directly to git, and verifies
    the result with ``ok_file_under_git``.
    """
    annex, file_repo_path, filename, path, repo = \
        _prep_file_under_git(path, filename)
    text = "" if content is None else content
    with open(opj(repo.path, file_repo_path), 'w') as fobj:
        fobj.write(text)

    if annexed:
        # make sure we operate on an annex-capable repo
        if not isinstance(repo, AnnexRepo):
            repo = AnnexRepo(repo.path)
        repo.add(file_repo_path, commit=True, _datalad_msg=True)
    else:
        repo.add(file_repo_path, git=True, _datalad_msg=True)
    ok_file_under_git(repo.path, file_repo_path, annexed)
    return repo
def test_check_dates(path):
    """check_dates finds objects newer/older than a reference date and
    renders a json report; also covers RFC 2822 dates, paths=None and
    annex='none' modes."""
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
def test_get_dataset_root(path):
    """get_dataset_root resolves the repo root for dirs, subdirs and files,
    preserving the relative/absolute form of the input path."""
    eq_(get_dataset_root('/nonexistent'), None)
    with chpwd(path):
        repo = AnnexRepo(os.curdir, create=True)
        subdir = opj('some', 'deep')
        fname = opj(subdir, 'dummy')
        os.makedirs(subdir)
        with open(fname, 'w') as f:
            f.write('some')
        repo.add(fname)
        # we can find this repo
        eq_(get_dataset_root(os.curdir), os.curdir)
        # and we get the type of path that we fed in
        eq_(get_dataset_root(abspath(os.curdir)), abspath(os.curdir))
        # subdirs are no issue
        eq_(get_dataset_root(subdir), os.curdir)
        # non-dir paths are no issue
        eq_(get_dataset_root(fname), os.curdir)
def test_check_dates(path=None):
    """check_dates finds objects newer/older than a reference date and
    renders a json report; also covers RFC 2822 dates, paths=None and
    annex='none' modes.  (Variant with path defaulting to None.)"""
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
def test_update_fetch_all(src, remote_1, remote_2):
    """update(fetch_all=True) fetches all siblings without merging;
    merging a single named sibling then brings only its changes in."""
    rmt1 = AnnexRepo(remote_1, src)
    rmt2 = AnnexRepo(remote_2, src)

    ds = Dataset(src)
    ds.add_sibling(name="sibling_1", url=remote_1)
    ds.add_sibling(name="sibling_2", url=remote_2)

    # modify the remotes:
    with open(opj(remote_1, "first.txt"), "w") as f:
        f.write("some file load")
    rmt1.add("first.txt", commit=True)
    # TODO: Modify an already present file!

    with open(opj(remote_2, "second.txt"), "w") as f:
        f.write("different file load")
    rmt2.add("second.txt", git=True, commit=True, msg="Add file to git.")

    # fetch all remotes
    ds.update(fetch_all=True)

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/master"))
    assert_in("second.txt", ds.repo.get_files("sibling_2/master"))

    # no merge strategy for multiple remotes yet:
    assert_raises(NotImplementedError, ds.update, merge=True, fetch_all=True)

    # merge a certain remote:
    ds.update(name="sibling_1", merge=True)

    # changes from sibling_2 still not present:
    assert_not_in("second.txt",
                  ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt",
              ds.repo.get_files(ds.repo.get_active_branch()))

    # it's known to annex, but has no content yet:
    ds.repo.get_file_key("first.txt")  # raises if unknown
    eq_([False], ds.repo.file_has_content(["first.txt"]))
def test_interactions(tdir):
    """Run the base custom-remote protocol scenarios plus availability,
    cost, unimplemented TRANSFER and CLAIMURL failure exchanges against
    AnnexCustomRemote."""
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    for scenario in BASE_INTERACTION_SCENARIOS + [
        [
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            # base class does not implement retrieval
            ('TRANSFER RETRIEVE somekey somefile',
             re.compile('TRANSFER-FAILURE RETRIEVE somekey NotImplementedError().*')),
        ],
        [
            # by default we do not require any fancy init
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that is just a single option, url, is expected so full
            # one would be passed
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
        ]
    ]:
        check_interaction_scenario(AnnexCustomRemote, tdir, scenario)
def check_compress_file(ext, annex, path, name):
    """Round-trip a single file through compress_files/decompress_file.

    Optionally first annexes the file (so the input is a symlink to the
    annex key) to verify compression still works in that case.

    Fix: removed leftover debug output (``import glob`` plus two
    ``print()`` calls dumping the extracted directory listing) that had
    been left in from debugging.
    """
    archive = name + ext
    compress_files([_filename], archive, path=path)
    assert_true(exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        # decompressor tool not installed on this system
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
def test_ls_json(topdir):
    """_ls_json creates/deletes per-node metadata json files depending on
    json state ('file'/'delete'), all_ and recursive flags, and reports
    consistent sizes in its nodes list."""
    annex = AnnexRepo(topdir, create=True)
    dsj = Dataset(topdir)
    # create some file and commit it
    with open(opj(dsj.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    dsj.add(path='subdsfile.txt')
    dsj.save("Hello!", version_tag=1)

    # add a subdataset
    dsj.install('subds', source=topdir)

    subdirds = dsj.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        # metadata filename is the md5 of the joined relpath ('/' for root)
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        # full path to a node's metadata json within dataset dspath
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        # load a node's metadata json
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir, json=state, all_=all_, recursive=recursive)

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if its updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total'])

                # check size of subdataset
                # NOTE(review): ('subdsfile.txt' or 'subds') always evaluates
                # to 'subdsfile.txt' -- 'subds' is never matched; probably
                # intended as membership in ('subdsfile.txt', 'subds')
                subds = [item for item in dsj['nodes']
                         if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}
                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in dsj['nodes']
                             if item['name'] == ('subdsfile.txt' or 'subds')][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
def test_ls_json(topdir):
    """_ls_json creates/deletes per-node metadata json files depending on
    json state ('file'/'delete'), all_ and recursive flags, and reports
    consistent sizes in its nodes list.

    Fix: ``('.hidden')`` in the ignored-directories list is a plain
    string, not a tuple, so ``get_metahash(*subdir)`` unpacked it into
    single characters and hashed a bogus path -- the assertion was
    vacuously true.  Now a real one-element tuple ``('.hidden',)`` is
    used, matching the other variant of this test.
    """
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        # metadata filename is the md5 of the joined relpath
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir, json=state, all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if its updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of subdataset
                # NOTE(review): ('subdsfile.txt' or 'subds') always evaluates
                # to 'subdsfile.txt'; left as-is to preserve behavior
                subds = [item for item in ds['nodes']
                         if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [item for item in ds['nodes']
                             if item['name'] == ('subdsfile.txt' or 'subds')][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
def test_check_dates(path):
    """check_dates classifies commits, tags and annex blobs as older/newer
    than a reference timestamp; also checks the one-day-back default and
    path-instead-of-repo invocation.  (Variant using call_git_oneline.)"""
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's hexsha,
            # not the tag's, and ar.get_hexsha is limited to commit objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    # the git-annex uuid.log blob predates refdate
    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
def test_check_dates(path):
    """check_dates classifies commits, tags and annex blobs as older/newer
    than a reference timestamp; also checks the one-day-back default and
    path-instead-of-repo invocation.  (Variant using ar.repo.git.rev_parse.)"""
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)
        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        # We can't use ar.get_tags because that returns the commit's hexsha,
        # not the tag's, and ar.get_hexsha is limited to commit objects.
        foo_tag = ar.repo.git.rev_parse("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = ar.repo.git.rev_parse("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    # the git-annex uuid.log blob predates refdate
    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
def _test_proxying_open(generate_load, verify_load, repo):
    """Parametrized check that AutomagicIO fetches content on open().

    ``generate_load(path)`` creates test content (may return a new path);
    ``verify_load(path)`` opens it and asserts on the content.  Covers a
    clone without content, no-get for nonexistent files, re-fetch after
    drop, check_once mode, and a stdout/stderr without a working fileno().
    """
    annex = AnnexRepo(repo, create=True)
    fpath1 = opj(repo, "test")
    fpath2 = opj(repo, 'd1', 'd2', 'test2')

    # generate load
    fpath1 = generate_load(fpath1) or fpath1
    os.makedirs(dirname(fpath2))
    fpath2 = generate_load(fpath2) or fpath2
    annex.add([fpath1, fpath2])
    verify_load(fpath1)
    verify_load(fpath2)
    annex.commit("Added some files")

    # clone to another repo
    repo2 = repo + "_2"
    annex2 = AnnexRepo.clone(repo, repo2)

    # verify that can't access
    fpath1_2 = fpath1.replace(repo, repo2)
    fpath2_2 = fpath2.replace(repo, repo2)

    EXPECTED_EXCEPTIONS = (IOError, OSError)
    assert_raises(EXPECTED_EXCEPTIONS, verify_load, fpath1_2)

    with AutomagicIO():
        # verify that it doesn't even try to get files which do not exist
        with patch('datalad.support.annexrepo.AnnexRepo.get') as gricm:
            # if we request absent file
            assert_raises(EXPECTED_EXCEPTIONS, open, fpath1_2 + "_", 'r')
            # no get should be called
            assert_false(gricm.called)
        verify_load(fpath1_2)
        verify_load(fpath2_2)
        # and even if we drop it -- we still can get it no problem
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_true(os.path.isfile(fpath2_2))

    # In check_once mode, if we drop it, it wouldn't be considered again
    annex2.drop(fpath2_2)
    assert_false(annex2.file_has_content(fpath2_2))
    with AutomagicIO(check_once=True):
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_false(os.path.isfile(fpath2_2))

    # if we override stdout with something not supporting fileno, like tornado
    # does which ruins using get under IPython
    # TODO: we might need to refuse any online logging in other places like that
    annex2.drop(fpath2_2)

    class StringIOfileno(StringIO):
        def fileno(self):
            raise Exception("I have no clue how to do fileno")

    with patch('sys.stdout', new_callable=StringIOfileno), \
            patch('sys.stderr', new_callable=StringIOfileno):
        with AutomagicIO():
            assert_false(annex2.file_has_content(fpath2_2))
            verify_load(fpath2_2)
            assert_true(annex2.file_has_content(fpath2_2))
def test_fs_traverse(topdir):
    """Exercise fs_traverse() in display and json-file modes.

    Verifies node sizes/types, that git/annex repos and hidden directories
    are ignored, and that recursive traversal writes child metadata used by
    subsequent non-recursive runs.
    """
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, \
                swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir),
                             recursive=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'),
                               opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory content
                # FIX: the original `('file2.txt' and 'dir')` evaluated to
                # just 'dir' -- check both names explicitly
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be
            # 0 Bytes(default) as dir type child's size currently has no
            # metadata file for traverser to pick its size from and would
            # require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'],
                         {True: '6 Bytes', False: '0 Bytes'}[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir),
                         recursive=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        # FIX: `('gitdir' or 'annexdir') == item['name']` only ever compared
        # against 'gitdir'; both standalone repos must be excluded
        assert_equal([item for item in fs['nodes']
                      if item['name'] in ('gitdir', 'annexdir')], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by
        # non-recursive mode verifies child's metadata file being used to find
        # its size; running in reverse order (non-recursive followed by
        # recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        if recursive:
            # sub-dictionary should not include git and hidden directory info
            # FIX: `('subgit' or '.fgit')` only ever compared against 'subgit'
            assert_equal([item for item in child['nodes']
                          if item['name'] in ('subgit', '.fgit')], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"]
                        if subitem['name'] == 'subdir'][0]
            # extract info of file1.txt, else fail
            link = [subnode for subnode in subchild["nodes"]
                    if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"]
                          if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type (content was dropped -> broken link)
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
def test_ls_json(topdir, topurl):
    """Exercise _ls_json() across all_/recursive/json-state combinations.

    Builds a dataset hierarchy with subdatasets, a plain git subrepo, a
    broken link and an "external" submodule, then checks which metadata
    json files are created/deleted and what sizes/urls are reported.
    """
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.add('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        # metadata file name is the md5 of the (joined) relative path;
        # '/' denotes the dataset root itself
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        # full path of the metadata json for `path` within dataset `dspath`
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        # load the metadata json for `path` within dataset `dspath`
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if its updated in its nodes sublist too. used by
                # web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the committed file node
                # FIX: the original `('subdsfile.txt' or 'subds')` always
                # evaluated to 'subdsfile.txt' -- spell that out
                subds = [item for item in dsj['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked up
                # from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    # FIX: same dead-`or` expression as above
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

    # the relaxed-URL file has no size annex can report
    assert_equal(
        topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE
    )
def test_ls_json(topdir):
    """Exercise _ls_json() over all_/recursive/json-state combinations.

    Checks which metadata json files get created/deleted per mode, that
    ignored directories never get metadata, and that subdataset sizes are
    picked up from previously written metadata files.
    """
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    # FIX: use a context manager instead of the leaked handle from
    # `open(...).write('123')`
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'), commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'), commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        # metadata file name is the md5 of the (joined) relative path
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    # NOTE: rebinds `ds` from the Dataset to the json dict
                    ds = _ls_json(topdir, json=state, all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                # FIX: `('.hidden')` was a plain string (no comma), so
                # `get_metahash(*subdir)` unpacked it into single characters;
                # a 1-tuple actually tests the '.hidden' directory
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if its updated in its nodes sublist too. used by
                # web-ui json. regression test
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of the committed file node
                # FIX: `('subdsfile.txt' or 'subds')` always evaluated to
                # 'subdsfile.txt' -- spell that out
                subds = [item for item in ds['nodes']
                         if item['name'] == 'subdsfile.txt'][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked up
                # from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    # FIX: same dead-`or` expression as above
                    subds = [item for item in ds['nodes']
                             if item['name'] == 'subdsfile.txt'][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
def test_fs_traverse(topdir):
    """Exercise fs_traverse() (recurse_directories variant) in display and
    json-file modes; node sizes/types and exclusion of git/annex dirs.
    """
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'), commit=True)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(
                new_level=logging.INFO) as log, swallow_outputs() as cmo:
            fs = fs_traverse(topdir, AnnexRepo(topdir),
                             recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory content
                # FIX: the original `('file2.txt' and 'dir')` evaluated to
                # just 'dir' -- check both names explicitly
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be
            # 0 Bytes(default) as dir type child's size currently has no
            # metadata file for traverser to pick its size from and would
            # require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        fs = fs_traverse(topdir, AnnexRepo(topdir),
                         recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        # FIX: `('gitdir' or 'annexdir') == item['name']` only ever compared
        # against 'gitdir'; both standalone repos must be excluded
        assert_equal([
            item for item in fs['nodes']
            if item['name'] in ('gitdir', 'annexdir')
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by
        # non-recursive mode verifies child's metadata file being used to find
        # its size; running in reverse order (non-recursive followed by
        # recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            # FIX: `('subgit' or '.fgit')` only ever compared against 'subgit'
            assert_equal([
                item for item in child['nodes']
                if item['name'] in ('subgit', '.fgit')
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txt, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type (content dropped -> broken link)
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
def _test_AnnexDB(cls, path):
    """Common battery of checks for file-status DB implementations.

    Parameters
    ----------
    cls : type
        DB class under test (e.g. JsonFileStatusesDB); constructed as
        ``cls(annex=...)``.
    path : str
        Directory in which a fresh annex repository is created.
    """
    filepath1 = opj(path, 'file1.txt')
    filep2 = opj('d', 'file2.txt')
    filepath2 = opj(path, filep2)
    annex = AnnexRepo(path, create=True)
    # PhysicalFileStatusesDB relies on information in annex so files
    # must be committed first
    annex.add('file1.txt')
    annex.commit("initial commit")
    db = cls(annex=annex)

    def set_db_status_from_file(fpath):
        """To test JsonFileStatusesDB, we need to keep updating the status stored"""
        if cls is JsonFileStatusesDB:
            # we need first to set the status
            db.set(fpath, db._get_fileattributes_status(fpath))

    set_db_status_from_file('file1.txt')
    status1 = db.get('file1.txt')
    # status carries a non-zero size for a committed file
    assert(status1.size)

    # repeated queries are stable and not flagged as "different"
    status1_ = db.get('file1.txt')
    assert_equal(status1, status1_)
    assert_false(db.is_different('file1.txt', status1))
    assert_false(db.is_different('file1.txt', status1_))
    # even if we add a filename specification
    status1_.filename = 'file1.txt'
    assert_false(db.is_different('file1.txt', status1_))
    # filename is not part of the comparison, so a mismatching one is ignored
    status1_.filename = 'different.txt'
    assert_false(db.is_different('file1.txt', status1_))

    # under annex- - we don't have unlock yet and thus can't inplace augment
    os.unlink(filepath1)
    with open(filepath1, 'a') as f:
        f.write('+')
    # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to
    # dataset singletons, that don't resolve symlinks
    set_db_status_from_file(realpath(filepath1))
    # modified content must now be detected as different
    assert(db.is_different('file1.txt', status1))

    # we should be able to get status of files out and inside of git
    set_db_status_from_file('2git')
    status_git1 = db.get('2git')
    annex.add('2git', git=True)
    annex.commit("added 2git")
    # status must be unchanged by moving the file under git control
    assert_equal(db.get('2git'), status_git1)

    # we should be able to get status of files with relative path to top dir and abs path
    set_db_status_from_file(filep2)
    status2 = db.get(filep2)
    # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to
    # dataset singletons, that don't resolve symlinks
    status2_full = db.get(realpath(filepath2))
    assert_equal(status2, status2_full)
    # TODO? what about relative to curdir??
    #with chpwd(opj(path, 'd')):
    #    status2_dir = db.get('./file2.txt')
    #    assert_equal(status2, status2_dir)

    # since we asked about each file we added to DB/annex -- none should be
    # known as "deleted"
    assert_equal(db.get_obsolete(), [])

    # Possibly save its state for persistent storage
    #import pdb; pdb.set_trace()
    db.save()

    # but, if we create another DB which wasn't queried yet
    db2 = cls(annex=annex)
    # all files should be returned
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
        set(db2.get_obsolete()),
        {opj(realpath(path), p) for p in ['file1.txt', filep2, '2git']})

    # and if we query one, it shouldn't be listed as deleted any more
    status2_ = db2.get(filep2)
    assert_equal(status2, status2_)
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
        set(db2.get_obsolete()),
        {opj(realpath(path), p) for p in ['file1.txt', '2git']})

    # and if we queried with ./ prefix, should still work
    db2.get(curdir + sep + 'file1.txt')
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
        set(db2.get_obsolete()),
        {opj(realpath(path), p) for p in ['2git']})

    # and if we queried with a full path, should still work
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    db2.get(opj(realpath(path), '2git'))
    assert_equal(db2.get_obsolete(), [])
def test_ls_json(topdir, topurl):
    """Exercise _ls_json() (WitlessRunner/ds.save variant) across
    all_/recursive/json-state combinations: metadata json lifecycle,
    reported sizes, and external-submodule urls.
    """
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.save(path='subdsfile.txt', message="Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.save('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    WitlessRunner(cwd=opj(topdir, 'dir', 'subgit')).run(
        ['git', 'update-server-info'])
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        # metadata file name is the md5 of the (joined) relative path;
        # '/' denotes the dataset root itself
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        # full path of the metadata json for `path` within dataset `dspath`
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        # load the metadata json for `path` within dataset `dspath`
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb',
                            topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and
                # deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if its updated in its nodes sublist too. used by
                # web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of the committed file node
                # FIX: the original `('subdsfile.txt' or 'subds')` always
                # evaluated to 'subdsfile.txt' -- spell that out
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata
                # already created to verify sub-dataset metadata being picked up
                # from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    # FIX: same dead-`or` expression as above
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

    # the relaxed-URL file has no size annex can report
    assert_equal(topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE)
def __call__(path=None, force=False, description=None, dataset=None,
             no_annex=False, save=True, annex_version=None,
             annex_backend='MD5E', native_metadata_type=None,
             shared_access=None, git_opts=None, annex_opts=None,
             annex_init_opts=None, text_no_annex=None):
    """Create a new (sub)dataset at `path`, yielding result records.

    Generator: yields result dicts (status/message/path records) rather
    than returning a value.  Validates arguments, resolves the target path
    via AnnotatePaths, guards against subdataset collisions and non-empty
    directories, creates a git or annex repo, records a dataset ID, and
    optionally registers the new dataset as a submodule of `dataset`.
    """
    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD
    if path and dataset:
        # Given a path and a dataset (path) not pointing to installed
        # dataset
        if not dataset.is_installed():
            msg = "No installed dataset at %s found." % dataset.path
            dsroot = get_dataset_root(dataset.path)
            if dsroot:
                msg += " If you meant to add to the %s dataset, use that path " \
                       "instead but remember that if dataset is provided, " \
                       "relative paths are relative to the top of the " \
                       "dataset." % dsroot
            raise ValueError(msg)

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        # the following options only make sense for an annex repo, so they
        # conflict with `no_annex`
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    # catches e.g. `create(True)` where a path was intended
    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a 'path'?"
            % force)

    annotated_paths = AnnotatePaths.__call__(
        # nothing given explicitly, assume create fresh right here
        path=path if path else getpwd() if dataset is None else None,
        dataset=dataset,
        recursive=False,
        action='create',
        # we need to know whether we have to check for potential
        # subdataset collision
        force_parentds_discovery=True,
        # it is absolutely OK to have something that does not exist
        unavailable_path_status='',
        unavailable_path_msg=None,
        # if we have a dataset given that actually exists, we want to
        # fail if the requested path is not in it
        nondataset_path_status='error' \
        if isinstance(dataset, Dataset) and dataset.is_installed() else '',
        on_failure='ignore')
    path = None
    for r in annotated_paths:
        if r['status']:
            # this is dealt with already
            yield r
            continue
        if path is not None:
            raise ValueError("`create` can only handle single target path or dataset")
        path = r

    if len(annotated_paths) and path is None:
        # we got something, we complained already, done
        return

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield; `path` is now a result record dict, not a string
    path.update({'logger': lgr, 'type': 'dataset'})
    # just discard, we have a new story to tell
    path.pop('message', None)
    if 'parentds' in path:
        # refuse to create on top of an already-registered subdataset
        subs = Subdatasets.__call__(
            dataset=path['parentds'],
            # any known
            fulfilled=None,
            recursive=False,
            contains=path['path'],
            result_xfm='relpaths')
        if len(subs):
            path.update({
                'status': 'error',
                'message': ('collision with known subdataset %s/ in dataset %s',
                            subs[0], path['parentds'])})
            yield path
            return

    # TODO here we need a further test that if force=True, we need to look if
    # there is a superdataset (regardless of whether we want to create a
    # subdataset or not), and if that superdataset tracks anything within
    # this directory -- if so, we need to stop right here and whine, because
    # the result of creating a repo here will produce an undesired mess

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
        else Dataset(path['path'])

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        path.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield path
        return

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(tbds.path, url=None, create=True,
                           backend=annex_backend,
                           version=annex_version,
                           description=description,
                           git_opts=git_opts,
                           annex_opts=annex_opts,
                           annex_init_opts=annex_init_opts)

        if text_no_annex:
            # instruct annex to keep text files in git, not in the annex
            git_attributes_file = opj(tbds.path, '.gitattributes')
            with open(git_attributes_file, 'a') as f:
                f.write('* annex.largefiles=(not(mimetype=text/*))\n')
            tbrepo.add([git_attributes_file], git=True)
            tbrepo.commit("Instructed annex to add text files to git",
                          _datalad_msg=True,
                          files=[git_attributes_file])

    if native_metadata_type is not None:
        # accept a single type or a list of types
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        # reuse an existing dataset id, otherwise mint a fresh uuid1
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate meta data
        # comes around
        gitattr.write('# Text files (according to file --mime-type) are added directly to git.\n')
        gitattr.write('# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n')
        gitattr.write('** annex.largefiles=nothing\n')

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.add('.datalad', to_git=True, save=save,
             message='[DATALAD] new dataset')

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(tbds.path, save=True, return_type='generator',
                             result_filter=None, result_xfm=None,
                             on_failure='ignore'):
            yield r

    path.update({'status': 'ok'})
    yield path
def test_interactions(tdir):
    """Drive AnnexCustomRemote through scripted stdin/stdout protocol
    exchanges and verify its responses, including error replies for
    wrong argument counts.
    """
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    class FIFO(object):
        """Minimal in-memory file-like FIFO standing in for stdin/stdout."""

        def __init__(self, content=None, default=None):
            """
            Parameters
            ----------
            content
            default
               If defined, will be the one returned if empty. If not defined
               -- would raise an Exception
            """
            self.content = content or []
            self.default = default

        def _pop(self):
            # return empty line, usually to signal
            if self.content:
                v = self.content.pop(0)
                # allow for debug
                if v.startswith('DEBUG '):
                    # next one
                    return self._pop()
                return v
            else:
                if self.default is not None:
                    return self.default
                else:
                    raise IndexError("we are empty")

        def write(self, l):
            self.content.append(l)

        def read(self):
            return self._pop()

        def readline(self):
            return self._pop().rstrip('\n')

        def flush(self):
            pass  # working hard

    # now we should test interactions
    import re
    # FIX: raw string -- `\d` is an invalid escape in a plain string literal
    # (SyntaxWarning since Python 3.6 and an error in newer versions)
    ERROR_ARGS = re.compile(r'^ERROR .*(missing|takes) .*\d+ .*argument')
    for scenario in [
        [],  # default of doing nothing
        [  # support of EXPORT which by default is not supported
            ('EXPORTSUPPORTED', 'EXPORTSUPPORTED-FAILURE'),
        ],
        [  # some unknown option
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
        ],
        [
            # get the COST etc for , and make sure we do not
            # fail right on unsupported
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('GETCOST roguearg', ERROR_ARGS),
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            # by default we do not require any fancy init
            ('INITREMOTE', 'INITREMOTE-SUCCESS'),
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that is just a single option, url, is expected so full
            # one would be passed
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
            # but if not enough params -- ERROR_ARGS
            ('CLAIMURL', ERROR_ARGS),
        ]
    ]:
        # First one is always version and
        # Final empty command to signal the end of the transactions
        scenario = [(None, 'VERSION 1')] + scenario + [('', None)]
        fin, fout = FIFO(), FIFO(default='')
        # preload all "incoming" protocol lines
        for in_, out_ in scenario:
            if in_ is not None:
                fin.write(in_ + '\n')
        cr = AnnexCustomRemote(tdir, fin=fin, fout=fout)
        cr.main()
        # compare each expected reply; regex entries match error replies
        for in_, out_ in scenario:
            if out_ is not None:
                out_read = fout.readline()
                if isinstance(out_, type(ERROR_ARGS)):
                    assert out_.match(out_read), (out_, out_read)
                else:
                    eq_(out_, out_read)
        out_read = fout.readline()
        eq_(out_read, '')  # nothing left to say