def test_find_files():
    tests_dir = dirname(__file__)
    proj_dir = normpath(opj(dirname(__file__), pardir))

    ff = find_files('.*', proj_dir)
    ok_generator(ff)
    files = list(ff)
    assert len(files) > 10  # we have more than 10 test files here
    assert_in(opj(tests_dir, 'test_utils.py'), files)
    # and no directories should be mentioned
    assert_not_in(tests_dir, files)

    ff2 = find_files('.*', proj_dir, dirs=True)
    files2 = list(ff2)
    assert_in(opj(tests_dir, 'test_utils.py'), files2)
    assert_in(tests_dir, files2)

    # now actually matching the path
    ff3 = find_files(r'.*\\test_.*\.py$' if on_windows else r'.*/test_.*\.py$',
                     proj_dir, dirs=True)
    files3 = list(ff3)
    assert_in(opj(tests_dir, 'test_utils.py'), files3)
    assert_not_in(tests_dir, files3)
    for f in files3:
        ok_startswith(basename(f), 'test_')

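# For orientation only: a simplified sketch (an assumption, not DataLad's actual
# implementation) of the find_files() behaviour the tests above rely on --
# walk topdir, yield every path whose full path matches the regex, include
# directories only when dirs=True, and skip .git paths unless exclude_vcs=False.
import os
import re


def _find_files_sketch(regex, topdir, dirs=False, exclude_vcs=True):
    expr = re.compile(regex)
    for root, dirnames, filenames in os.walk(topdir):
        names = list(filenames) + (list(dirnames) if dirs else [])
        for name in names:
            path = os.path.join(root, name)
            # skip anything with a .git component when VCS paths are excluded
            if exclude_vcs and '.git' in path.split(os.sep):
                continue
            if expr.search(path):
                yield path
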
def test_add_delete_after_and_drop(self):
    # To test that .tar gets removed
    # but that new stuff was added to annex repo.  We know the key since default
    # backend and content remain the same
    key1 = 'SHA256E-s5--16d3ad1974655987dd7801d70659990b89bfe7e931a0a358964e64e901761cc0.dat'

    # previous state of things:
    prev_files = list(find_files('.*', self.annex.path))
    assert_equal(self.annex.whereis(key1, key=True, output='full'), {})
    commits_prior = list(self.annex.get_branch_commits_('git-annex'))
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True)
    commits_after = list(self.annex.get_branch_commits_('git-annex'))
    # There should be a single commit for all additions +1 to initiate
    # datalad-archives gh-1258
    # If faking dates, there should be another +1 because
    # annex.alwayscommit isn't set to false.
    assert_equal(
        len(commits_after),
        # We expect one more when faking dates because
        # annex.alwayscommit isn't set to false.
        len(commits_prior) + 2 + self.annex.fake_dates_enabled)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

    # Let's now do the same but also drop content
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True,
                        drop_after=True)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 1)  # in archive

    # there should be no .datalad temporary files hanging around
    self.assert_no_trash_left_behind()

def test_find_files_exclude_vcs(repo):
    ff = find_files('.*', repo, dirs=True)
    files = list(ff)
    assert_equal({basename(f) for f in files}, {'d1', 'git'})
    assert_not_in(opj(repo, '.git'), files)

    ff = find_files('.*', repo, dirs=True, exclude_vcs=False)
    files = list(ff)
    assert_equal({basename(f) for f in files}, {'d1', 'git', '.git', '1'})
    assert_in(opj(repo, '.git'), files)

def assert_no_trash_left_behind(self):
    assert_equal(
        list(find_files(r'\.datalad..*', self.annex.path,
                        exclude="config", dirs=True)),
        [])

def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=5,
                      datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ],
        any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log

def get_core_metadata_filenames(self):
    return list(find_files(
        r'meta\.json',
        topdir=opj(self.ds.path, '.datalad', 'meta'),
        exclude=None,
        exclude_vcs=False,
        exclude_datalad=False,
        dirs=False))

def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)

    # since we now base incoming on master -- and there was nothing custom
    # in master after incoming-processed, both branches should be the same
    eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # but that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    get_branch_commits = repo.get_branch_commits_ \
        if hasattr(repo, 'get_branch_commits_') else repo.get_branch_commits
    commits = {b: list(get_branch_commits(b)) for b in branches}
    # all commits out there -- init ds + init crawler + 1*(incoming, processed)
    # The number of commits in master differs based on the create variant used
    # (the one on DataLad's master makes only one commit).
    ncommits_master = len(commits["master"])
    assert_in(ncommits_master, [4, 5])
    # incoming branches from master but lacks one merge commit.
    eq_(len(commits['incoming']), ncommits_master - 1)
    # incoming-processed is on master.
    eq_(len(commits['incoming-processed']), ncommits_master)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_balsa_pipeline1(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        pipeline = ofpipeline('WG33', url=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    eq_(len(commits['incoming']), 1)
    eq_(len(commits['incoming-processed']), 2)
    # all commits out there -- init ds + init crawler + 1*(incoming, processed, merge)
    eq_(len(commits['master']), 6)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './.datalad/config',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def test_add_delete_after_and_drop(self):
    # To test that .tar gets removed
    # but that new stuff was added to annex repo.  We know the key since
    # default backend and content remain the same
    key1 = 'MD5E-s5--db87ebcba59a8c9f34b68e713c08a718.dat'
    repo = self.ds.repo

    # previous state of things:
    prev_files = list(find_files('.*', self.ds.path))
    assert_equal(repo.whereis(key1, key=True, output='full'), {})
    commits_prior = list(repo.get_branch_commits_('git-annex'))
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete_after=True)
    commits_after = list(repo.get_branch_commits_('git-annex'))
    # There should be a single commit for all additions +1 to initiate
    # datalad-archives gh-1258
    # If faking dates, there should be another +1 because
    # annex.alwayscommit isn't set to false.
    assert_equal(
        len(commits_after),
        # We expect one more when faking dates because
        # annex.alwayscommit isn't set to false.
        len(commits_prior) + 2 + repo.fake_dates_enabled)
    assert_equal(prev_files, list(find_files('.*', self.ds.path)))
    w = repo.whereis(key1, key=True, output='full')
    assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

    # Let's now do the same but also drop content
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete_after=True,
                                drop_after=True)
    assert_equal(prev_files, list(find_files('.*', self.ds.path)))
    w = repo.whereis(key1, key=True, output='full')
    assert_equal(len(w), 1)  # in archive

    # there should be no .datalad temporary files hanging around
    self.assert_no_trash_left_behind()

def test_add_archive_dirs(path_orig=None, url=None, repo_path=None):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no
            # leading dirs to strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        eq_(repo.get_description(
                uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
            '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        # posixify paths to make it work on Windows as well
        all_files = [Path(file).as_posix() for file in all_files]
        target_files = {
            'CR24A/behaving1/1 f.txt',
            'CR24C/behaving3/3 f.txt',
            'CR24D/behaving2/2 f.txt',
            '.datalad/config',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was
        # getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))

def test_add_archive_dirs(path_orig, url, repo_path):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        repo = AnnexRepo(repo_path, create=True)
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no leading dirs to
            # strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        if external_versions['cmd:annex'] >= '6.20170208':
            # should have fixed remotes
            eq_(repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
                '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        target_files = {
            './CR24A/behaving1/1 f.txt',
            './CR24C/behaving3/3 f.txt',
            './CR24D/behaving2/2 f.txt',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was
        # getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))

def get_mtimes_and_digests(target_path):
    """Return digests (md5) and mtimes for all the files under target_path"""
    from datalad.utils import find_files
    from datalad.support.digests import Digester
    digester = Digester(['md5'])

    # bother only with existing ones for this test, i.e. skip annexed files
    # without content
    target_files = [
        f for f in find_files('.*', topdir=target_path,
                              exclude_vcs=False, exclude_datalad=False)
        if exists(f)
    ]
    # let's leave only relative paths for easier analysis
    target_files_ = [relpath(f, target_path) for f in target_files]

    digests = {frel: digester(f) for f, frel in zip(target_files, target_files_)}
    mtimes = {frel: os.stat(f).st_mtime for f, frel in zip(target_files, target_files_)}
    return digests, mtimes

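# A minimal usage sketch (an assumption, not part of the original suite):
# snapshot a dataset before and after an operation and report files whose md5
# digest changed; `dataset_path` in the commented example is a hypothetical
# checkout location.
def _changed_files(before, after):
    """Return relative paths whose digests differ between two snapshots

    `before` and `after` are (digests, mtimes) tuples as returned by
    get_mtimes_and_digests()
    """
    digests_before, _ = before
    digests_after, _ = after
    return {f for f in digests_after
            if digests_before.get(f) != digests_after[f]}

# before = get_mtimes_and_digests(dataset_path)
# ... run the operation under test ...
# after = get_mtimes_and_digests(dataset_path)
# assert not _changed_files(before, after), "content unexpectedly changed"
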
def test_balsa_pipeline2(ind, topurl, outd, clonedir):
    list(initiate_dataset(
        template="balsa",
        dataset_name='dataladtest-WG33',
        path=outd,
        data_fields=['dataset_id'])({'dataset_id': 'WG33'}))

    with chpwd(outd):
        with swallow_logs(new_level=logging.WARN) as cml:
            pipeline = ofpipeline('WG33', url=topurl)
            out = run_pipeline(pipeline)
            assert_true(
                'The following files do not exist in the canonical tarball, '
                'but are individually listed files and will not be kept:'
                in cml.out)
            assert_true(
                './file1.nii varies in content from the individually downloaded '
                'file with the same name, it is removed and file from canonical '
                'tarball is kept' in cml.out)
    eq_(len(out), 1)

    with chpwd(outd):
        eq_(set(glob('*')), {'dir1', 'file1.nii'})
        all_files = sorted(find_files('.'))

    fpath = opj(outd, 'file1.nii')
    ok_file_has_content(fpath, "content of file1.nii")
    ok_file_under_git(fpath, annexed=True)
    fpath2 = opj(outd, 'dir1', 'file2.nii')
    ok_file_has_content(fpath2, "content of file2.nii")
    ok_file_under_git(fpath2, annexed=True)

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/meta/balsa.json',
        './file1.nii',
        './dir1/file2.nii',
    }
    eq_(set(all_files), target_files)

def get_obsolete(self):
    """Returns full paths for files which weren't queried, thus must have been deleted

    Note that it doesn't track across branches, etc.
    """
    if not self._track_queried:
        raise RuntimeError(
            "Cannot determine which files were removed since track_queried "
            "was set to False")
    obsolete = []
    # those aren't tracked by annexificator
    datalad_path = opj(self.annex.path, HANDLE_META_DIR)
    for fpath in find_files('.*', topdir=self.annex.path):
        filepath = self._get_filepath(fpath)
        if filepath.startswith(datalad_path):
            continue
        if fpath not in self._queried_filepaths:
            obsolete.append(filepath)
    return obsolete

def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed "
        "correctly" % t1w_fpath_nover)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows
    # incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    # all commits out there:
    #  dataset init, crawler init
    #    (2 commits)
    #  + 3*(incoming, processed, merge)
    #  + 3*aggregate-metadata update
    #  - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #  2*remove of obsolete metadata object files,
    #  see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some
    # checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count
    # the number of created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(files=8, skipped=5, downloaded=1, renamed=1, urls=6,
                      add_annex=2,
                      # add_git=1,  # README
                      versions=['2.0.0'],
                      merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely into
            # incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)

    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b)) for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))