def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    """Crawl the BALSA 'WG33' study into `outd` and inspect the result.

    Verifies which branches exist, how their tips and commit counts relate
    to each other, and that the crawled files are annexed with the expected
    content.
    """
    # Set up the dataset; list() consumes whatever the initiated node yields.
    init_node = initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])
    list(init_node({'dataset_id': 'WG33'}))

    with chpwd(outd):
        crawl_pipeline = ofpipeline('WG33', url=topurl)
        pipeline_output = run_pipeline(crawl_pipeline)
    eq_(len(pipeline_output), 1)

    # Repository handle used by all checks below.
    annex = AnnexRepo(outd, create=False)

    # All expected branches must be present, and nothing else.
    expected_branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(annex.get_branches()), expected_branches)

    # incoming is based on master and master got nothing custom after
    # incoming-processed, so those two tips coincide ...
    master_sha = annex.get_hexsha('master')
    processed_sha = annex.get_hexsha('incoming-processed')
    eq_(master_sha, processed_sha)
    # ... whereas incoming itself sits on a different commit.
    assert_not_equal(annex.get_hexsha('incoming'), processed_sha)

    ncommits = {}
    for branch in expected_branches:
        ncommits[branch] = len(list(annex.get_branch_commits(branch)))

    # init ds + init crawler + 1*(incoming, processed); the exact number on
    # master depends on the create variant used (DataLad's master makes only
    # one commit).
    ncommits_master = ncommits["master"]
    assert_in(ncommits_master, [4, 5])
    # incoming branches off master but lacks one merge commit
    eq_(ncommits['incoming'], ncommits_master - 1)
    # incoming-processed is on master
    eq_(ncommits['incoming-processed'], ncommits_master)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        # materialize the listing while still inside the dataset directory
        found_files = sorted(find_files('.'))

    # Each crawled file must carry its content and be under annex control.
    expected_content = (
        (('file1.nii',), "content of file1.nii"),
        (('dir1', 'file2.nii'), "content of file2.nii"),
    )
    for relparts, content in expected_content:
        fpath = opj(outd, *relparts)
        ok_file_has_content(fpath, content)
        ok_file_under_git(fpath, annexed=True)

    expected_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(found_files), expected_files)
def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    """Crawl the BALSA 'WG33' study into `outd` and inspect the result.

    Verifies the set of branches, that all relevant branch tips differ,
    the exact number of commits per branch, and that the crawled files are
    annexed with the expected content.
    """
    # Set up the dataset; list() consumes whatever the initiated node yields.
    init_node = initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])
    list(init_node({'dataset_id': 'WG33'}))

    with chpwd(outd):
        crawl_pipeline = ofpipeline('WG33', url=topurl)
        pipeline_output = run_pipeline(crawl_pipeline)
    eq_(len(pipeline_output), 1)

    # Repository handle used by all checks below.
    annex = AnnexRepo(outd, create=False)

    # All expected branches must be present, and nothing else.
    expected_branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(annex.get_branches()), expected_branches)

    # master differs from incoming-processed ...
    master_sha = annex.get_hexsha('master')
    processed_sha = annex.get_hexsha('incoming-processed')
    assert_not_equal(master_sha, processed_sha)
    # ... and incoming-processed differs from incoming as well
    assert_not_equal(annex.get_hexsha('incoming'), processed_sha)

    ncommits = {}
    for branch in expected_branches:
        ncommits[branch] = len(list(annex.get_branch_commits(branch)))

    eq_(ncommits['incoming'], 1)
    eq_(ncommits['incoming-processed'], 2)
    # all commits out there --
    # init ds + init crawler + 1*(incoming, processed, merge)
    eq_(ncommits['master'], 6)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        # materialize the listing while still inside the dataset directory
        found_files = sorted(find_files('.'))

    # Each crawled file must carry its content and be under annex control.
    expected_content = (
        (('file1.nii',), "content of file1.nii"),
        (('dir1', 'file2.nii'), "content of file2.nii"),
    )
    for relparts, content in expected_content:
        fpath = opj(outd, *relparts)
        ok_file_has_content(fpath, content)
        ok_file_under_git(fpath, annexed=True)

    expected_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(found_files), expected_files)
def test_publish_recursive(origin, src_path, dst_path, sub1_pub, sub2_pub):
    """Exercise recursive `publish` of a dataset with two subdatasets.

    Covers: failure while subdatasets lack the sibling, a successful
    recursive publish, the shape of the returned (published, skipped)
    tuple, equality of branch histories on both ends, and publishing
    with and without `since`.
    """
    # prepare the source dataset
    source = install(src_path, source=origin, recursive=True)[0]

    # plain git repository as the superdataset's publication target
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # the subdatasets have no such remote yet -> recursive publish must fail
    with assert_raises(ValueError) as raised:
        publish(dataset=source, to="target", recursive=True)
    assert_in("No sibling 'target' found.", str(raised.exception))

    # now provide targets for the submodules too
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively while watching the log output
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update', cml.out,
            "we probably haven't merged git-annex before pushing")

    # inspect the result tuple
    # (Note: Dataset lacks __eq__ for now. Should this be based on path only?)
    assert_is_instance(res, tuple)
    published, skipped = res[0], res[1]
    assert_is_instance(published, list)
    assert_is_instance(skipped, list)
    eq_(skipped, [])  # nothing failed/was skipped
    for entry in published:
        assert_is_instance(entry, Dataset)
    eq_({published[0].path, published[1].path, published[2].path},
        {src_path, sub1.path, sub2.path})

    # histories of both branches must match between each target and its source
    tracked_branches = ("master", "git-annex")
    for branch in tracked_branches:
        eq_(list(target.get_branch_commits(branch)),
            list(source.repo.get_branch_commits(branch)))
    for remote_repo, local_repo in ((sub1_target, sub1), (sub2_target, sub2)):
        for branch in tracked_branches:
            eq_(list(remote_repo.get_branch_commits(branch)),
                list(local_repo.get_branch_commits(branch)))

    # publishing with --since: without changes only the current dataset
    # gets pushed by default
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]}, {src_path})

    # with an explicit since='HEAD^' all of them get pushed
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    eq_({ds.path for ds in res_[0]}, {src_path, sub1.path, sub2.path})

    # modify one submodule: add and commit an empty file
    with open(opj(sub2.path, "file.txt"), 'w') as fobj:
        fobj.write('')
    sub2.add('file.txt')
    sub2.commit("")
    # TODO: source.save("changed sub2", auto_add_changes=True) doesn't work:
    # https://github.com/datalad/datalad/issues/636
    source.repo.commit("", options=['-a'])

    # only the updated datasets are published
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]}, {src_path, sub2.path})
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub):
    """Exercise `publish` with explicit `path` arguments.

    Publishes an annexed file, then '.', then every top-level entry of the
    source dataset, checking the returned (published, skipped) tuple and
    the state of the target repository.
    """
    # prepare the source dataset and obtain the annexed file's content
    source = install(src_path, source=origin, recursive=True)[0]
    source.repo.get('test-annex.dat')

    # annex repository as the superdataset's publication target
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = GitRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP: fetch up front, otherwise GitPython fails after the push because
    # it cannot resolve the SHA of the remote's old commit that git reports
    # back.
    # TODO: Figure out when to fetch things in general; alternatively: is
    # there a push option that prevents GitPython from failing?
    source.repo.fetch("target")

    res = publish(dataset=source, to="target", path=['test-annex.dat'])
    eq_(res, ([source, 'test-annex.dat'], []))

    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))
    # TODO: the last commit in the git-annex branch differs. Probably fine,
    # but figure out when exactly to expect this for proper testing.
    target_annex_commits = list(target.get_branch_commits("git-annex"))
    source_annex_commits = list(source.repo.get_branch_commits("git-annex"))
    eq_(target_annex_commits[1:], source_annex_commits[1:])

    # content has to be compared on target's master
    target.checkout("master")
    eq_(target.file_has_content(['test-annex.dat']), [True])

    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([source, 'test-annex.dat'], []))

    source.repo.fetch("target")
    import glob
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # Note: this ends up publishing recursively, because the expansion of
    # '*' contains the submodules themselves in this setup.
    # Reduce the results to paths: datasets by their path, the rest as is.
    result_paths = {
        entry.path if isinstance(entry, Dataset) else entry
        for entry in res[0]
    }
    eq_({source.path,
         opj(source.path, "subm 1"),
         opj(source.path, "subm 2"),
         'test-annex.dat'},
        result_paths)
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    """Exercise `publish` with explicit `path` arguments.

    Publishes an annexed file, confirms a fresh clone of the target can
    obtain it, and checks that repeated publishing only reports something
    when there is something new (e.g. after dropping the file).
    """
    # prepare the source dataset and obtain the annexed file's content
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the superdataset's publication target
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = GitRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP: fetch up front, otherwise GitPython fails after the push because
    # it cannot resolve the SHA of the remote's old commit that git reports
    # back.
    # TODO: Figure out when to fetch things in general; alternatively: is
    # there a push option that prevents GitPython from failing?
    source.repo.fetch("target")

    res = publish(dataset=source, to="target", path=['test-annex.dat'])
    # data gets published first, the push of the dataset follows
    eq_(res, (['test-annex.dat', source], []))
    # XXX master was not checked out in dst!

    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))
    # TODO: the last commit in the git-annex branch differs. Probably fine,
    # but figure out when exactly to expect this for proper testing.
    # yoh: they differ because the local annex records that the file is now
    # available in that remote, while the remote presumably records it via
    # a hook call -- same information, two different commits. Such
    # automagic git-annex updates are not observed in older clones lacking
    # the post-receive hook on the remote side.
    target_annex_commits = list(target.get_branch_commits("git-annex"))
    source_annex_commits = list(source.repo.get_branch_commits("git-annex"))
    eq_(target_annex_commits[1:], source_annex_commits[1:])

    # content has to be compared on target's master
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))

    # whatever was published must actually be consumable from a clone
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # a second attempt finds nothing left to publish
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([], []))

    source.repo.fetch("target")
    import glob
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # Note: this ends up publishing recursively, because the expansion of
    # '*' contains the submodules themselves in this setup.
    # Reduce the results to paths: datasets by their path, the rest as is.
    result_paths = {
        entry.path if isinstance(entry, Dataset) else entry
        for entry in res[0]
    }
    # Only the subdatasets show up: their targets are plain git repos, so no
    # file content is pushed, and the superdataset's content went out before.
    eq_({sub1.path, sub2.path}, result_paths)

    # publishing once more -- nothing to be published
    eq_(source.publish(to="target"), ([], []))
    # dropping a file updates the git-annex branch, so the dataset is
    # published again ...
    source.drop('test-annex.dat')
    eq_(source.publish(to="target"), ([source], []))
    # ... and a further attempt comes back empty again
    eq_(source.publish(to="target"), ([], []))
def test_publish_recursive(pristine_origin, origin_path, src_path, dst_path,
                           sub1_pub, sub2_pub):
    """Exercise recursive `publish` to an explicit sibling and to origin.

    Works on a clone of the pristine origin so the test repository is not
    altered. Covers failure while subdatasets lack the sibling, the shape of
    the returned (published, skipped) tuple, matching branch histories,
    `since` handling, and publishing with and without file content.
    """
    # we publish back to origin, so clone it first to keep the testrepo intact
    origin = install(origin_path, source=pristine_origin, recursive=True)
    # prepare the source dataset
    source = install(src_path, source=origin_path, recursive=True)

    # plain git repository as the superdataset's publication target
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # the subdatasets have no such remote yet -> recursive publish must fail
    with assert_raises(ValueError) as raised:
        publish(dataset=source, to="target", recursive=True)
    assert_in("Unknown target sibling 'target'", exc_str(raised.exception))

    # now provide targets for the submodules too
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    # file presence is tested in this target later on, so let pushes
    # update its work tree
    sub2_target.config.set(
        'receive.denyCurrentBranch', 'updateInstead', where='local')
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively while watching the log output
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update', cml.out,
            "we probably haven't merged git-annex before pushing")

    # inspect the result tuple
    # (Note: Dataset lacks __eq__ for now. Should this be based on path only?)
    assert_is_instance(res, tuple)
    published, skipped = res[0], res[1]
    assert_is_instance(published, list)
    assert_is_instance(skipped, list)
    eq_(skipped, [])  # nothing failed/was skipped
    for entry in published:
        assert_is_instance(entry, Dataset)
    eq_({published[0].path, published[1].path, published[2].path},
        {src_path, sub1.path, sub2.path})

    # histories of both branches must match between each target and its source
    tracked_branches = ("master", "git-annex")
    for branch in tracked_branches:
        eq_(list(target.get_branch_commits(branch)),
            list(source.repo.get_branch_commits(branch)))
    for remote_repo, local_repo in ((sub1_target, sub1), (sub2_target, sub2)):
        for branch in tracked_branches:
            eq_(list(remote_repo.get_branch_commits(branch)),
                list(local_repo.get_branch_commits(branch)))

    # we track origin, but since we cloned from it, origin is not aware of
    # our git-annex branch state
    neq_(list(origin.repo.get_branch_commits("git-annex")),
         list(source.repo.get_branch_commits("git-annex")))
    # a first recursive publish to origin therefore updates all datasets,
    # because the git-annex branch needs to be pushed everywhere
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]},
        {source.path,
         opj(source.path, 'subm 1'),
         opj(source.path, 'subm 2')})
    # and now origin carries the same git-annex state
    eq_(list(origin.repo.get_branch_commits("git-annex")),
        list(source.repo.get_branch_commits("git-annex")))

    # publishing with --since: without changes nothing is pushed by default
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]}, set())

    # still nothing gets pushed, because origin is up to date
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    eq_({ds.path for ds in res_[0]}, set())

    # and it must not fail when run from within the dataset
    with chpwd(source.path):
        res_ = publish(recursive=True, since='HEAD^')
    eq_({ds.path for ds in res_[0]}, set())

    # modify one submodule: an empty file that goes straight into git ...
    with open(opj(sub2.path, "file.txt"), 'w') as fobj:
        fobj.write('')
    # added to the subdataset only, the superdataset is not altered!
    # MIH: use `to_git` because the original test author used an explicit
    # `GitRepo.add` -- keeping this for now
    Dataset(sub2.path).add('file.txt', to_git=True)

    # ... and a file with content, again without reflecting the change in
    # the superdataset(s)
    create_tree(sub2.path, {'file.dat': 'content'})
    Dataset(sub2.path).add('file.dat')

    # note: this publishes to origin, since that is what is being tracked
    res_published, res_skipped = publish(dataset=source, recursive=True)
    # only the updated subdataset is published (super wasn't altered),
    # and nothing is copied!
    eq_(res_published, [Dataset(sub2.path)])
    eq_(res_skipped, [])

    # it went to origin, so the other destination must not have the file
    nok_(lexists(opj(sub2_target.path, 'file.dat')))
    res_published, res_skipped = publish(
        dataset=source, to='target', recursive=True)
    eq_(res_published, [Dataset(sub2.path)])
    # Note: with updateInstead set only in the sub2 target, and the change
    # not saved in the superdataset, a remote holding the entire hierarchy
    # would have become somewhat inconsistent. The target datasets here are
    # independent, so it is ok.
    # The file now exists on the target, but its content was not transferred.
    ok_(lexists(opj(sub2_target.path, 'file.dat')))
    nok_(sub2_target.file_has_content('file.dat'))

    # redo the recursive publish, this time explicitly asking to consider '.'
    res_published, res_skipped = publish(
        '.', dataset=source, to='target', recursive=True)
    ok_(sub2_target.file_has_content('file.dat'))
    # note that this report makes little sense without path to the repository
    eq_(res_published, ['file.dat'])
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    """Exercise `publish` with explicit `path` arguments (result records).

    Publishes an annexed file, confirms a fresh clone of the target can
    obtain it, and checks the reported result statuses for repeated
    publishing, publishing all top-level entries, and publishing after a
    drop.
    """
    # prepare the source dataset and obtain the annexed file's content
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the superdataset's publication target
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules; they need to be annexes, because we want
    # to be able to copy data to them further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    res = publish(
        dataset=source,
        to="target",
        path=['test-annex.dat'],
        result_xfm='paths')
    # data gets published first, the push follows
    # TODO order is not fixed (yet), hence compare as sets
    eq_(set(res), {opj(source.path, 'test-annex.dat'), source.path})
    # XXX master was not checked out in dst!

    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))
    # TODO: the last commit in the git-annex branch differs. Probably fine,
    # but figure out when exactly to expect this for proper testing.
    # yoh: they differ because the local annex records that the file is now
    # available in that remote, while the remote presumably records it via
    # a hook call -- same information, two different commits. Such
    # automagic git-annex updates are not observed in older clones lacking
    # the post-receive hook on the remote side.
    target_annex_commits = list(target.get_branch_commits("git-annex"))
    source_annex_commits = list(source.repo.get_branch_commits("git-annex"))
    eq_(target_annex_commits[1:], source_annex_commits[1:])

    # content has to be compared on target's master
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))

    # whatever was published must actually be consumable from a clone
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # a second attempt finds nothing left to publish
    res = publish(dataset=source, to="target", path=['.'])
    assert_result_count(res, 1, status='notneeded')

    import glob
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # Note: this ends up publishing recursively, because the expansion of
    # '*' contains the submodules themselves in this setup.
    # Expect three records: both subdatasets published, nothing needed for
    # the superdataset, whose content was already pushed before.
    assert_result_count(res, 3)
    for sub in (sub1, sub2):
        assert_result_count(res, 1, status='ok', path=sub.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # publishing once more -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
    # dropping a file updates the git-annex branch, so the dataset is
    # published again ...
    source.drop('test-annex.dat')
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # ... and a further attempt has nothing to do again
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
def test_publish_recursive(pristine_origin, origin_path, src_path, dst_path,
                           sub1_pub, sub2_pub):
    """Exercise recursive `publish` (result records) to a sibling and origin.

    Works on a clone of the pristine origin so the test repository is not
    altered. Covers error records while subdatasets lack the sibling,
    matching branch histories after publishing, `since` handling, explicit
    data transfer, and deducing the remote from the branch configuration.
    """
    # we publish back to origin, so clone it first to keep the testrepo intact
    origin = install(origin_path, source=pristine_origin, recursive=True)
    # prepare the source dataset
    source = install(src_path, source=origin.path, recursive=True)
    # we will push into this one later on, so it has to permit that
    origin_sub2 = Dataset(opj(origin_path, '2'))
    origin_sub2.config.set(
        'receive.denyCurrentBranch', 'updateInstead', where='local')
    # TODO a manual fixup (moving .git/modules/2 into the subdataset) was
    # needed due to gh-1548 -- needs a proper solution

    # plain git repository as the superdataset's publication target
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # the subdatasets have no such remote yet: the superdataset is published,
    # both subdatasets are reported as errors
    res = publish(
        dataset=source, to="target", recursive=True, on_failure='ignore')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, status='ok', type='dataset', path=source.path)
    assert_result_count(
        res, 2, status='error',
        message=("Unknown target sibling '%s' for publication", 'target'))

    # now provide targets for the submodules too
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    # file presence is tested in this target later on, so let pushes
    # update its work tree
    sub2_target.config.set(
        'receive.denyCurrentBranch', 'updateInstead', where='local')
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively while watching the log output
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update', cml.out,
            "we probably haven't merged git-annex before pushing")

    # inspect the result records; the base dataset was already published
    # above, so 'notneeded' is acceptable for it
    assert_status(('ok', 'notneeded'), res)  # nothing failed
    assert_result_count(res, 3, type='dataset')
    eq_({record['path'] for record in res}, {src_path, sub1.path, sub2.path})

    # histories of both branches must match between each target and its source
    tracked_branches = ("master", "git-annex")
    for branch in tracked_branches:
        eq_(list(target.get_branch_commits(branch)),
            list(source.repo.get_branch_commits(branch)))
    for remote_repo, local_repo in ((sub1_target, sub1), (sub2_target, sub2)):
        for branch in tracked_branches:
            eq_(list(remote_repo.get_branch_commits(branch)),
                list(local_repo.get_branch_commits(branch)))

    # we track origin, but since we cloned from it, origin is not aware of
    # our git-annex branch state
    neq_(list(origin.repo.get_branch_commits("git-annex")),
         list(source.repo.get_branch_commits("git-annex")))
    # a first recursive publish to origin therefore updates all datasets,
    # because the git-annex branch needs to be pushed everywhere
    res_ = publish(dataset=source, recursive=True)
    for ds_path in (source.path, sub1.path, sub2.path):
        assert_result_count(res_, 1, status='ok', path=ds_path)
    # and now origin carries the same git-annex state
    eq_(list(origin.repo.get_branch_commits("git-annex")),
        list(source.repo.get_branch_commits("git-annex")))

    # publishing with --since: without changes nothing is pushed by default
    res_ = publish(dataset=source, recursive=True)
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # still nothing gets pushed, because origin is up to date
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # and it must not fail when run from within the dataset
    with chpwd(source.path):
        res_ = publish(recursive=True, since='HEAD^')
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # modify one submodule: an empty file that goes straight into git ...
    with open(opj(sub2.path, "file.txt"), 'w') as fobj:
        fobj.write('')
    # added to the subdataset only, the superdataset is not altered!
    # MIH: use `to_git` because the original test author used an explicit
    # `GitRepo.add` -- keeping this for now
    Dataset(sub2.path).add('file.txt', to_git=True)

    # ... and a file with content, again without reflecting the change in
    # the superdataset(s)
    create_tree(sub2.path, {'file.dat': 'content'})
    Dataset(sub2.path).add('file.dat')

    # note: this publishes to origin, since that is what is being tracked
    res_ = publish(dataset=source, recursive=True, on_failure='ignore')
    # only the updated subdataset is published (super wasn't altered),
    # and no file is copied
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset')
    assert_result_count(
        res_, 0, path=opj(sub2.path, 'file.dat'), type='file')

    # it went to origin, so the other destination must not have the file
    nok_(lexists(opj(sub2_target.path, 'file.dat')))
    res_ = publish(dataset=source, to='target', recursive=True)
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset')
    assert_result_count(
        res_, 0, path=opj(sub2.path, 'file.dat'), type='file')

    # Note: with updateInstead set only in the sub2 target, and the change
    # not saved in the superdataset, a remote holding the entire hierarchy
    # would have become somewhat inconsistent. The target datasets here are
    # independent, so it is ok.
    # The file now exists on the target, but without its content.
    ok_(lexists(opj(sub2_target.path, 'file.dat')))
    nok_(sub2_target.file_has_content('file.dat'))

    # redo the recursive publish with explicitly requested data transfer
    res_ = publish(
        dataset=source, to='target', recursive=True, transfer_data='all')
    ok_(sub2_target.file_has_content('file.dat'))
    assert_result_count(
        res_, 1, status='ok', path=opj(sub2.path, 'file.dat'))

    # save the present changes and publish while implying
    # "since last merge point"
    source.save(message="Changes in subm2")
    # also test that the remote/branch to push to can be deduced
    source.config.set('branch.master.remote', 'target', where='local')
    with chpwd(source.path):
        res_ = publish(since='', recursive=True)
    # TODO: somehow test that there was not even an attempt to diff within
    # "subm 1": if `--since=''` worked correctly, nothing has changed there
    # and it should not have been touched at all
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(
        res_, 1, status='ok', path=source.path, type='dataset')
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    """Exercise `publish` with explicit `path` arguments (result records).

    Publishes an annexed file, confirms a fresh clone of the target can
    obtain it, and checks the reported result statuses for repeated
    publishing, publishing all top-level entries, and publishing after a
    drop.
    """
    # prepare the source dataset and obtain the annexed file's content
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the superdataset's publication target
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules; they need to be annexes, because we want
    # to be able to copy data to them further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    res = publish(
        dataset=source,
        to="target",
        path=['test-annex.dat'],
        result_xfm='paths')
    # data gets published first, the push follows
    # TODO order is not fixed (yet), hence compare as sets
    eq_(set(res), {opj(source.path, 'test-annex.dat'), source.path})
    # XXX master was not checked out in dst!

    eq_(list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")))
    # TODO: the last commit in the git-annex branch differs. Probably fine,
    # but figure out when exactly to expect this for proper testing.
    # yoh: they differ because the local annex records that the file is now
    # available in that remote, while the remote presumably records it via
    # a hook call -- same information, two different commits. Such
    # automagic git-annex updates are not observed in older clones lacking
    # the post-receive hook on the remote side.
    target_annex_commits = list(target.get_branch_commits("git-annex"))
    source_annex_commits = list(source.repo.get_branch_commits("git-annex"))
    eq_(target_annex_commits[1:], source_annex_commits[1:])

    # content has to be compared on target's master
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))

    # whatever was published must actually be consumable from a clone
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # a second attempt finds nothing left to publish
    res = publish(dataset=source, to="target", path=['.'])
    assert_result_count(res, 1, status='notneeded')

    import glob
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # Note: this ends up publishing recursively, because the expansion of
    # '*' contains the submodules themselves in this setup.
    # Expect three records: both subdatasets published, nothing needed for
    # the superdataset, whose content was already pushed before.
    assert_result_count(res, 3)
    for sub in (sub1, sub2):
        assert_result_count(res, 1, status='ok', path=sub.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # publishing once more -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
    # dropping a file updates the git-annex branch, so the dataset is
    # published again ...
    source.drop('test-annex.dat')
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # ... and a further attempt has nothing to do again
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
def test_publish_recursive(pristine_origin, origin_path, src_path, dst_path, sub1_pub, sub2_pub):
    """Publish a dataset hierarchy recursively, to 'target' and to origin.

    Works on a clone of ``pristine_origin`` so the test repository itself is
    not altered.  Covers: failure while subdatasets lack the sibling,
    successful recursive publication, publication to the tracked remote,
    the ``since`` option, and explicit data transfer via
    ``transfer_data='all'``.
    """
    deny_option = 'receive.denyCurrentBranch'

    def history(repo, branch):
        # materialize the commits of `branch` so two repos can be compared
        return list(repo.get_branch_commits(branch))

    # we will be publishing back to origin, so to not alter testrepo
    # we will first clone it
    origin = install(origin_path, source=pristine_origin, recursive=True)
    # prepare src
    source = install(src_path, source=origin.path, recursive=True)
    # we will be trying to push into this later on, need to give permissions...
    origin_sub2 = Dataset(opj(origin_path, '2'))
    origin_sub2.config.set(deny_option, 'updateInstead', where='local')
    ## TODO this manual fixup is needed due to gh-1548 -- needs proper solution
    #os.remove(opj(origin_sub2.path, '.git'))
    #os.rename(opj(origin_path, '.git', 'modules', '2'), opj(origin_sub2.path, '.git'))

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # subdatasets have no remote yet, so recursive publishing should fail:
    res = publish(
        dataset=source,
        to="target",
        recursive=True,
        on_failure='ignore')
    assert_result_count(res, 3)
    assert_result_count(
        res, 1, status='ok', type='dataset', path=source.path)
    assert_result_count(
        res, 2, status='error',
        message=("Unknown target sibling '%s' for publication", 'target'))

    # now, set up targets for the submodules:
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    # we will be testing presence of the file content, so let's make it progress
    sub2_target.config.set(deny_option, 'updateInstead', where='local')
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update', cml.out,
            "we probably haven't merged git-annex before pushing"
        )

    # testing result list
    # base dataset was already published above, notneeded again
    assert_status(('ok', 'notneeded'), res)  # nothing failed
    assert_result_count(res, 3, type='dataset')
    eq_({r['path'] for r in res}, {src_path, sub1.path, sub2.path})

    # every published repository must carry the same history as its source
    publication_pairs = (
        (target, source.repo),
        (sub1_target, sub1),
        (sub2_target, sub2),
    )
    for published_repo, local_repo in publication_pairs:
        for branch in ("master", "git-annex"):
            eq_(history(published_repo, branch), history(local_repo, branch))

    # we are tracking origin but origin has different git-annex, since we
    # cloned from it, so it is not aware of our git-annex
    neq_(history(origin.repo, "git-annex"), history(source.repo, "git-annex"))
    # So if we first publish to it recursively, we would update
    # all sub-datasets since git-annex branch would need to be pushed
    res_ = publish(dataset=source, recursive=True)
    for ds_path in (source.path, sub1.path, sub2.path):
        assert_result_count(res_, 1, status='ok', path=ds_path)
    # and now should carry the same state for git-annex
    eq_(history(origin.repo, "git-annex"), history(source.repo, "git-annex"))

    # test for publishing with --since.  By default since no changes, nothing pushed
    res_ = publish(dataset=source, recursive=True)
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # still nothing gets pushed, because origin is up to date
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # and we should not fail if we run it from within the dataset
    with chpwd(source.path):
        res_ = publish(recursive=True, since='HEAD^')
    assert_result_count(res_, 3, status='notneeded', type='dataset')

    # Let's now update one subm
    with open(opj(sub2.path, "file.txt"), 'w') as f:
        f.write('')
    # add to subdataset, does not alter super dataset!
    # MIH: use `to_git` because original test author used
    # an explicit `GitRepo.add` -- keeping this for now
    Dataset(sub2.path).add('file.txt', to_git=True)

    # and add one more (annexed) file to the same subm
    create_tree(sub2.path, {'file.dat': 'content'})
    # add to subdataset, without reflecting the change in its super(s)
    Dataset(sub2.path).add('file.dat')

    sub2_dat_in_source = opj(sub2.path, 'file.dat')
    sub2_dat_in_target = opj(sub2_target.path, 'file.dat')

    # note: will publish to origin here since that is what it tracks
    res_ = publish(dataset=source, recursive=True, on_failure='ignore')
    ## only updates published, i.e. just the subdataset, super wasn't altered
    ## nothing copied!
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset')
    assert_result_count(res_, 0, path=sub2_dat_in_source, type='file')

    # since published to origin -- destination should not get that file
    nok_(lexists(sub2_dat_in_target))
    res_ = publish(dataset=source, to='target', recursive=True)
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(res_, 1, status='ok', path=sub2.path, type='dataset')
    assert_result_count(res_, 0, path=sub2_dat_in_source, type='file')

    # Note: with updateInstead only in target2 and not saving change in
    # super-dataset we would have made remote dataset, if we had entire
    # hierarchy, to be somewhat inconsistent.
    # But here, since target datasets are independent -- it is ok

    # and the file itself was transferred (but not yet its content)
    ok_(lexists(sub2_dat_in_target))
    nok_(sub2_target.file_has_content('file.dat'))

    ## but now we can redo publish recursively, with explicitly requested data transfer
    res_ = publish(
        dataset=source,
        to='target',
        recursive=True,
        transfer_data='all'
    )
    ok_(sub2_target.file_has_content('file.dat'))
    assert_result_count(res_, 1, status='ok', path=sub2_dat_in_source)

    # Let's save those present changes and publish while implying "since last
    # merge point"
    source.save(message="Changes in subm2")
    # and test if it could deduce the remote/branch to push to
    source.config.set('branch.master.remote', 'target', where='local')
    with chpwd(source.path):
        res_ = publish(since='', recursive=True)
    # TODO: somehow test that there were no even attempt to diff within "subm 1"
    # since if `--since=''` worked correctly, nothing has changed there and it
    # should have not been even touched
    assert_status(('ok', 'notneeded'), res_)
    assert_result_count(res_, 1, status='ok', path=source.path, type='dataset')
def test_openfmri_pipeline2(ind, topurl, outd):
    """Crawl an openfmri dataset that offers no versioned files.

    Runs the pipeline three times: the initial crawl, an unchanged re-crawl
    (which must not create commits), and a crawl after the 2.0.0 tarball was
    renamed to the unversioned ``ds666.tar.gz`` on the 'server' side.
    """
    # no versioned files -- should still work! ;)
    initiator = initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])
    list(initiator({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)

    def per_branch(**kwargs):
        # collect commits of every expected branch, keyed by branch name
        return {
            b: list(repo.get_branch_commits(b, **kwargs)) for b in branches
        }

    def crawl_again():
        # re-run the already constructed pipeline within the dataset and
        # hand back the stats of its single result
        with chpwd(outd):
            results = run_pipeline(pipeline)
        eq_(len(results), 1)
        return results[0]['datalad_stats']

    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits = per_branch()
    commits_hexsha = per_branch(value='hexsha')
    commits_l = per_branch(limit='left-only')

    # all commits out there:
    #   backend set, dataset init, crawler, init, incoming (shares with master -1),
    #   (2 or 3 commits, depending on create variant)
    #   incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])
    eq_(len(commits['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits['incoming-processed']), ncommits_master - 1)
    eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same in all branches!
    stats = crawl_again()
    eq_(commits_hexsha, per_branch(value='hexsha'))  # i.e. nothing new
    eq_(stats, ActivityStats(files=2, skipped=2, urls=2))
    eq_(stats, stats.get_total())

    # pretend the 2.0.0 release became the unversioned tarball
    dataset_dir = opj(ind, 'ds666')
    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(dataset_dir, 'ds666.tar.gz'))

    stats = crawl_again()
    eq_(stats, ActivityStats())  # was committed
    stats_total = stats.get_total()
    stats_total.downloaded_size = 0
    eq_(
        stats_total,
        ActivityStats(
            files=4,
            overwritten=1,
            skipped=1,
            downloaded=1,
            merges=[['incoming', 'incoming-processed']],
            versions=['1.0.0'],
            renamed=1,
            urls=2,
            add_annex=2))
    # in reality there is also 1.0.0+1 tag since file changed but no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    """Exercise the openfmri pipeline across several states of the index.

    The test initiates a dataset in ``outd`` and then crawls repeatedly
    while ``index.html`` under ``ind`` is modified:

    1. index as provided (old-format file expected in the dataset),
    2. after versioned files were added to the index,
    3. an unchanged re-crawl (no new commits expected),
    4. after a 2.0.0 release was added,
    5. after the 1.0.0 release link was removed.

    Along the way it checks branches, commit counts, tags, merge parents,
    file listings and the reported ``ActivityStats``.  Finally it clones the
    dataset into ``clonedir`` and crawls there with the pipeline patched to
    point at ``topurl``.
    """
    index_html = opj(ind, 'ds666', 'index.html')

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**', {
        'annex.largefiles': '(nothing)'
    })], dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes",
                files=dotdatalad_attributes_file,
                options=['--amend'])

    # --- crawl 1: index as provided
    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    # --- crawl 2: index extended with the versioned files
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(
        not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    # Three views of every branch: commit objects, their hexshas, and the
    # commits obtained with limit='left-only'.
    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha = {
        b: list(repo.get_branch_commits(b, value='hexsha'))
        for b in branches
    }
    commits_l = {
        b: list(repo.get_branch_commits(b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    #   backend set, dataset init, crawler init
    #   (2 or 3 commits, depending on create variant)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    # - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    ncommits_master = len(commits['master'])
    assert_in(ncommits_master, [13, 14])
    assert_in(len(commits_l['master']), [8, 9])

    # the remaining counts are expressed relative to master so that both
    # accepted master totals are covered
    eq_(len(commits['incoming']), ncommits_master - 8)
    eq_(len(commits_l['incoming']), ncommits_master - 8)
    eq_(len(commits['incoming-processed']), ncommits_master - 5)
    eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    eq_(repo_tags[0]['hexsha'],
        commits_l['master'][4].hexsha)  # next to the last one
    eq_(repo_tags[-1]['hexsha'],
        commits_l['master'][0].hexsha)  # the last one

    def hexsha(l):
        # map a sequence of commits to their hexshas, keeping the container
        # type (so a tuple of parents compares against a tuple of hexshas)
        return l.__class__(x.hexsha for x in l)

    # Verify that we have desired tree of merges
    eq_(hexsha(commits_l['incoming-processed'][0].parents),
        (commits_l['incoming-processed'][1].hexsha,
         commits_l['incoming'][0].hexsha))
    eq_(
        hexsha(commits_l['incoming-processed'][2].parents),
        (
            commits_l['incoming-processed'][3].hexsha,  # also in master
            commits_l['incoming'][2].hexsha,
        ))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    eq_(hexsha(commits_l['master'][1].parents),
        (commits_l['master'][2].hexsha,
         commits_l['incoming-processed'][0].hexsha))
    eq_(hexsha(commits_l['master'][3].parents),
        (commits_l['master'][4].hexsha,
         commits_l['incoming-processed'][1].hexsha))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    # Determine the dataset-relative path of the aggregated metadata DB in a
    # way that works with both older and newer datalad.
    # NOTE(review): Dataset('.') resolves against the current directory at
    # this point; only the path relative to ds.path is used afterwards, and
    # `objbase` is never used -- confirm this is the intended dataset.
    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    # files expected in the working tree (metadata objects excluded below)
    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    # files expected in the 'incoming' branch; compared further below, after
    # the 1.0.0 tarball has been removed from this set
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names, which contain
    # some checksum-like component ...
    # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1].hexsha))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0].hexsha))

    # --- crawl 3: nothing changed in the index
    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {
        b: list(repo.get_branch_commits(b, value='hexsha'))
        for b in branches
    }
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # --- crawl 4: a 2.0.0 release appears in the index
    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content=
        '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there is no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names, which contain
    # some checksum-like component ...
    # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files_updated
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # NOTE(review): commits_ and commits_l_ are collected but not used by
    # any later check in this function.
    commits_ = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha_ = {
        b: list(repo.get_branch_commits(b, value='hexsha'))
        for b in branches
    }
    commits_l_ = {
        b: list(repo.get_branch_commits(b, limit='left-only'))
        for b in branches
    }

    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'],
        ActivityStats())  # commit happened so stats were consumed
    # numbers seems to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8,
            skipped=5,
            downloaded=1,
            renamed=1,
            urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # --- crawl 5: the 1.0.0 release link is dropped from the index
    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # incoming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {
        b: list(repo.get_branch_commits(b, value='hexsha'))
        for b in branches
    }
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0])[0].a_path,
            '.datalad/crawl/statuses/incoming.json')
    dincoming = repo.repo.branches['incoming'].commit.diff(
        commits_hexsha_['incoming'][0])
    eq_(len(dincoming),
        2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set([d.a_path for d in dincoming]),
        {'.datalad/crawl/statuses/incoming.json', 'ds666_R1.0.0.tar.gz'})
    # since it seems to diff "from current to the specified", it will be listed as new_file
    assert any(d.new_file for d in dincoming)

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl(
        )  # we should be able to recrawl without doing anything
        # NOTE(review): if ok_ is nose.tools.ok_, it takes (expr, msg) and
        # only asserts that `stats` is truthy; the ActivityStats(...) below
        # is then merely the failure message and is never compared.
        # Presumably eq_ was intended -- confirm the expected numbers before
        # switching.
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def test_publish_recursive(origin, src_path, dst_path, sub1_pub, sub2_pub):
    """Publish a dataset and its two subdatasets recursively to 'target'.

    Checks that publication raises while the subdatasets have no 'target'
    sibling, that the recursive publication returns a
    ``(published, skipped)`` tuple of lists, that the published histories
    match the sources, and how ``since`` limits what gets pushed.
    """

    def history(repo, branch):
        # materialize the commits of `branch` so two repos can be compared
        return list(repo.get_branch_commits(branch))

    def published_paths(result):
        # paths of the datasets listed in the first element of a result
        return set(r.path for r in result[0])

    # prepare src
    source = install(src_path, source=origin, recursive=True)[0]

    # create plain git at target:
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # subdatasets have no remote yet, so recursive publishing should fail:
    with assert_raises(ValueError) as cm:
        publish(dataset=source, to="target", recursive=True)
    assert_in("No sibling 'target' found", exc_str(cm.exception))

    # now, set up targets for the submodules:
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update', cml.out,
            "we probably haven't merged git-annex before pushing"
        )

    # testing result list
    # (Note: Dataset lacks __eq__ for now. Should this be based on path only?)
    assert_is_instance(res, tuple)
    published = res[0]
    assert_is_instance(published, list)
    skipped = res[1]
    assert_is_instance(skipped, list)
    eq_(skipped, [])  # nothing failed/was skipped
    for item in published:
        assert_is_instance(item, Dataset)
    eq_({published[0].path, published[1].path, published[2].path},
        {src_path, sub1.path, sub2.path})

    # every published repository must carry the same history as its source
    publication_pairs = (
        (target, source.repo),
        (sub1_target, sub1),
        (sub2_target, sub2),
    )
    for published_repo, local_repo in publication_pairs:
        for branch in ("master", "git-annex"):
            eq_(history(published_repo, branch), history(local_repo, branch))

    # test for publishing with --since.  By default since no changes, only
    # the current one would get pushed
    res_ = publish(dataset=source, recursive=True)
    eq_(published_paths(res_), {src_path})

    # all get pushed
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    eq_(published_paths(res_), {src_path, sub1.path, sub2.path})

    # Let's now update one subm
    with open(opj(sub2.path, "file.txt"), 'w') as f:
        f.write('')
    sub2.add('file.txt')
    sub2.commit("")
    # TODO: Doesn't work: https://github.com/datalad/datalad/issues/636
    #source.save("changed sub2", all_changes=True)
    source.repo.commit("", options=['-a'])

    # only updated ones were published
    res_ = publish(dataset=source, recursive=True)
    eq_(published_paths(res_), {src_path, sub2.path})
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub):
    """Publish a dataset together with the content of an annexed file.

    Publishes ``test-annex.dat`` to an annex sibling at ``dst_path`` and
    checks the returned ``(published, skipped)`` tuple, the pushed branch
    histories, and the presence of the file content in the target.  A
    final publish with a globbed path list also covers both subdatasets.
    """
    import glob

    annexed_name = 'test-annex.dat'

    def history(repo, branch):
        # materialize the commits of `branch` so two repos can be compared
        return list(repo.get_branch_commits(branch))

    # prepare src and make sure the file content is locally present
    source = install(src_path, source=origin, recursive=True)[0]
    source.repo.get(annexed_name)

    # create an annex at the target, parked on a scratch branch
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # now, set up (plain git) targets for the submodules:
    for pub_path in (sub1_pub, sub2_pub):
        GitRepo(pub_path, create=True).checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP: Insert the fetch to prevent GitPython to fail after the push,
    # because it cannot resolve the SHA of the old commit of the remote,
    # that git reports back after the push.
    # TODO: Figure out, when to fetch things in general; Alternatively:
    # Is there an option for push, that prevents GitPython from failing?
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=[annexed_name])
    eq_(res, ([source, annexed_name], []))

    eq_(history(target, "master"), history(source.repo, "master"))
    # TODO: last commit in git-annex branch differs. Probably fine,
    # but figure out, when exactly to expect this for proper testing:
    eq_(history(target, "git-annex")[1:],
        history(source.repo, "git-annex")[1:])

    # we need compare target/master:
    target.checkout("master")
    eq_(target.file_has_content([annexed_name]), [True])

    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([source, annexed_name], []))

    source.repo.fetch("target")
    # Note: This leads to recursive publishing, since expansion of '*'
    # contains the submodules themselves in this setup
    res = publish(
        dataset=source,
        to="target",
        path=glob.glob1(source.path, '*'))

    # results mix Dataset instances and plain paths; reduce both to paths
    eq_({source.path, opj(source.path, "subm 1"),
         opj(source.path, "subm 2"), annexed_name},
        {item.path if isinstance(item, Dataset) else item
         for item in res[0]})