def test_openfmri_superdataset_pipeline1(ind, topurl, outd):
    list(initiate_dataset(
        template="openfmri",
        template_func="superdataset_pipeline",
        template_kwargs={'url': topurl},
        path=outd,
    )())

    with chpwd(outd):
        crawl()
        #pipeline = ofcpipeline(url=topurl)
        #out = run_pipeline(pipeline)
        #eq_(out, [{'datalad_stats': ActivityStats()}])

    # TODO: replace below command with the one listing subdatasets
    subdatasets = ['ds000001', 'ds000002']
    eq_(Dataset(outd).get_subdatasets(fulfilled=True), subdatasets)

    # Check that crawling configuration was created for every one of those
    for sub in subdatasets:
        repo = GitRepo(opj(outd, sub))
        assert not repo.dirty
        assert exists(opj(repo.path, CRAWLER_META_CONFIG_PATH))

    # TODO: check that configuration for the crawler is up to the standard
    # Ideally should also crawl some fake datasets I guess
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(
            template_kwargs,
            save=True,
            template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
def test_obscure_names(path):
    bucket = "datalad-test2-obscurenames-versioned"
    get_test_providers('s3://' + bucket)  # to verify having s3 credentials
    create(path)
    with externals_use_cassette(
            'test_simple_s3_test2_obscurenames_versioned_crawl_ext'), \
            chpwd(path):
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        crawl()
    # fun with unicode was postponed
    ok_clean_git(path, annex=True)
    for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
        ok_file_under_git(path, f, annexed=True)
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init(
            {'url': topurl, 'a_href_match_': '.*'},
            save=True,
            template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())

    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")

    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=5,
                      datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ],
        any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log
def test_crawl(tempd):
    if not _get_github_cred().is_known:
        raise SkipTest("no github credential")
    ds = create(tempd)
    with chpwd(tempd):
        crawl_init(
            template='gh', save=True,
            args={'org': 'datalad-collection-1', 'include': 'kaggle'})
        crawl()
    subdss = ds.subdatasets(fulfilled=True, result_xfm='datasets')
    assert all('kaggle' in d.path for d in subdss)
    assert_greater(len(subdss), 1)
    assert_false(ds.repo.dirty)
def test_crawl_api_chdir(run_pipeline_, load_pipeline_from_config_, chpwd_):
    output, stats = crawl('some_path_not_checked', chdir='somedir')

    assert_equal(stats, ActivityStats(datasets_crawled=1))  # nothing was done but we got it
    assert_equal(output, None)

    chpwd_.assert_called_with('somedir')
    load_pipeline_from_config_.assert_called_with('some_path_not_checked')
    run_pipeline_.assert_called_with(['pipeline'],
                                     stats=ActivityStats(datasets_crawled=1))
def _test_drop(path, drop_immediately):
    s3url = 's3://datalad-test0-nonversioned'
    providers = get_test_providers(s3url)  # to verify having s3 credentials
    # vcr tape is getting bound to the session object, so we need to
    # force re-establishing the session for the bucket.
    # TODO (in datalad): make a dedicated API for that, now too obscure
    _ = providers.get_status(s3url, allow_old_session=False)
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True,  # so test goes faster
                drop_immediately=drop_immediately,
            ),
            save=True)
    if drop_immediately:
        # cannot figure out but taping that interaction results in
        # git annex addurl error.  No time to figure it out,
        # so we just crawl without vcr for now.  TODO: figure out WTF
        with chpwd(path):
            crawl()
    else:
        with externals_use_cassette(
                'test_simple_s3_test0_nonversioned_crawl_ext'
                + ('_immediately' if drop_immediately else '')), \
                chpwd(path):
            crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
def test_drop(path):
    get_test_providers('s3://datalad-test0-nonversioned')  # to verify having s3 credentials
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with externals_use_cassette('test_simple_s3_test0_nonversioned_crawl_ext'), \
            chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True  # so test goes faster
            ),
            save=True)
        crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))
def test_simple1(ind, topurl, outd):
    list(initiate_dataset(
        template="simple_with_archives",
        dataset_name='test1',
        path=outd,
        add_fields={'url': topurl + 'study/show/WG33',
                    'a_href_match_': '.*download.*'}
    )({}))
    with chpwd(outd):
        out, stats = crawl()
    eq_(stats.add_annex, 3)
    ok_file_under_git(outd, 'file1.nii', annexed=True)
    ok_file_has_content(opj(outd, 'file1.nii'), 'content of file1.nii')
    ok_file_under_git(outd, _path_('dir1/file2.nii'), annexed=True)
    ok_file_has_content(opj(outd, 'dir1', 'file2.nii'), 'content of file2.nii')
    eq_(len(out), 1)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content;
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # and 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                 for b in branches}

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #   see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents),
    #    (commits_l['master'][2],
    #     commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents),
    #    (commits_l['master'][4],
    #     commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of
    #       created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of
    #       created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # and 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha = {b: list(repo.get_branch_commits(b, value='hexsha'))
                      for b in branches}
    commits_l = {b: list(repo.get_branch_commits(b, limit='left-only'))
                 for b in branches}
    eq_(len(commits['incoming']), 3)
    eq_(len(commits_l['incoming']), 3)
    eq_(len(commits['incoming-processed']), 6)
    eq_(len(commits_l['incoming-processed']), 4)  # because original merge has only 1 parent - incoming
    eq_(len(commits['master']), 12)  # all commits out there -- dataset init, crawler init + 3*(incoming, processed, meta data aggregation, merge)
    eq_(len(commits_l['master']), 6)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    eq_([x.name for x in repo.repo.tags], ['1.0.0', '1.0.0+1', '1.0.1'])
    eq_(repo.repo.tags[0].commit.hexsha, commits_l['master'][-4].hexsha)  # next to the last one
    eq_(repo.repo.tags[-1].commit.hexsha, commits_l['master'][0].hexsha)  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # Verify that we have desired tree of merges
    eq_(hexsha(commits_l['incoming-processed'][0].parents),
        (commits_l['incoming-processed'][1].hexsha,
         commits_l['incoming'][0].hexsha))
    eq_(hexsha(commits_l['incoming-processed'][2].parents),
        (commits_l['incoming'][2].hexsha,))
    eq_(hexsha(commits_l['master'][0].parents),
        (commits_l['master'][1].hexsha,
         commits_l['incoming-processed'][0].hexsha))
    eq_(hexsha(commits_l['master'][1].parents),
        (commits_l['master'][2].hexsha,
         commits_l['incoming-processed'][1].hexsha))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    target_files = {
        './.datalad/config', './.datalad/meta/meta.json',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv'}
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    eq_(set(all_files), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(repo.get_branch_commits(b, value='hexsha'))
                       for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    eq_(set(all_files_updated), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_ = {b: list(repo.get_branch_commits(b)) for b in branches}
    commits_hexsha_ = {b: list(repo.get_branch_commits(b, value='hexsha'))
                       for b in branches}
    commits_l_ = {b: list(repo.get_branch_commits(b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1,  # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {b: list(repo.get_branch_commits(b, value='hexsha'))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0])[0].a_path,
            '.datalad/crawl/statuses/incoming.json')
    dincoming = repo.repo.branches['incoming'].commit.diff(
        commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set([d.a_path for d in dincoming]),
        {'.datalad/crawl/statuses/incoming.json',
         'ds666_R1.0.0.tar.gz'})
    # since it seems to diff "from current to the specified", it will be listed as new_file
    assert any(d.new_file for d in dincoming)

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def __call__(path=None, is_pipeline=False, is_template=False,
             recursive=False, chdir=None):  # dry_run=False,
    dry_run = False
    from datalad_crawler.pipeline import (
        load_pipeline_from_config, load_pipeline_from_module,
        get_repo_pipeline_config_path, get_repo_pipeline_script_path
    )
    from datalad_crawler.pipeline import run_pipeline
    from datalad.utils import chpwd  # import late so we could mock during tests

    with chpwd(chdir):

        assert not (is_pipeline and is_template), \
            "it is either a pipeline or a template name, can't be both"

        if is_template:
            # generate a config and overload path with its filename
            path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                            commit=True)

        # TODO: centralize via _params_ handling
        if dry_run:
            dryrun_optlabel = 'datalad.crawl.dryrun'
            if dryrun_optlabel in cfg:
                cfg.unset(dryrun_optlabel, where='local', reload=False)
            cfg.add(dryrun_optlabel, "True", where='local')

        if path is None:

            # get config from the current repository/dataset
            if is_pipeline:
                raise ValueError("You must specify the file if --pipeline")

            # Let's see if there is a config or pipeline in this repo
            path = get_repo_pipeline_config_path()
            if not path or not exists(path):
                # Check if there may be the pipeline provided
                path = get_repo_pipeline_script_path()
                if path and exists(path):
                    is_pipeline = True

        stats = ActivityStats()

        if not path:
            raise RuntimeError("Cannot locate crawler config or pipeline file")

        if is_pipeline:
            lgr.info("Loading pipeline definition from %s" % path)
            pipeline = load_pipeline_from_module(path)
        else:
            lgr.info("Loading pipeline specification from %s" % path)
            pipeline = load_pipeline_from_config(path)

        lgr.info("Running pipeline %s" % str(pipeline))
        # TODO: capture the state of all branches so in case of crash
        # we could gracefully reset back
        try:
            output = run_pipeline(pipeline, stats=stats)
        except Exception as exc:
            # TODO: config.crawl.failure = full-reset | last-good-master
            # probably ask via ui which action should be performed unless
            # explicitly specified
            raise
        stats.datasets_crawled += 1

        # TODO: Move gc/clean over here!

        stats_total = stats.get_total()

        if recursive:
            # get all subdatasets, and crawl them too!
            ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
            import os
            from datalad.distribution.dataset import Dataset
            from datalad.api import crawl
            from datalad.utils import swallow_logs
            from datalad.dochelpers import exc_str
            # Note: we could collect all datasets to be crawled here or pass recursive=True
            # into the subdatasets' crawl.  We will collect all of them here so we might later
            # also introduce automatic commits when super-dataset got successfully updated
            subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive,
                                                         result_xfm='relpaths')

            lgr.info("Crawling %d subdatasets", len(subdatasets))
            output = [output]
            # TODO: parallelize
            # TODO: assumes that all sub-datasets are 'crawllable', and if not
            # just adds them to crawl_failed count.  But may be we should make it more
            # explicit, that some sub-datasets might not need to be crawled, so they get
            # skipped explicitly?
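            # Summary of the loop below: each subdataset is crawled via a nested
            # crawl(chdir=...) call with its log output swallowed into a per-dataset
            # logfile; a failure only bumps datasets_crawl_failed and records None in
            # the output, it does not abort the remaining subdatasets.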
            for ds_ in subdatasets:
                ds_logfile = utils.get_logfilename(ds_, 'crawl')
                try:
                    # TODO: might be cool to be able to report a 'heart beat'
                    # from the swallow into pbar or smth
                    with swallow_logs(file_=ds_logfile) as cml:
                        output_, stats_ = crawl(chdir=ds_)
                        stats_total += stats_
                        output.append(output_)
                    lgr.info("Crawled %s: %s (log: %s)",
                             ds_, stats_.as_str(mode='line'), ds_logfile)
                except Exception as exc:
                    stats_total.datasets_crawl_failed += 1
                    stats_total.datasets_crawled += 1
                    output += [None]
                    lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                ds_, ds_logfile, exc_str(exc))  # , cml.out)

        lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

        return output, stats_total
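
# For context, a minimal usage sketch of this entry point as the tests above exercise
# it (the working directory, url, and match pattern below are placeholders, not values
# taken from this codebase):
#
#     from datalad.api import crawl, crawl_init
#     from datalad.utils import chpwd
#
#     with chpwd('/path/to/existing/dataset'):
#         crawl_init({'url': 'http://example.com/data/', 'a_href_match_': '.*'},
#                    save=True, template='simple_with_archives')
#         output, stats = crawl()   # crawl this dataset
#         # crawl(recursive=True) would also crawl all registered subdatasets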