def test_uninstall_git_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education than about "can we get the content back?",
    # and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'), 1,
        status='notneeded',
        message="no annex'ed content")

    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])
def _test_target_ssh_inherit(standardgroup, src_path, target_path):
    ds = Dataset(src_path).create()
    target_url = 'localhost:%s' % target_path
    remote = "magical"
    # for the test of setting a group, will just smoke test while using current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group',
                      group=os.getgid())  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset
    subds = ds.create('sub')  # so now we got a hierarchy!
    create_tree(subds.path, {'sub.dat': 'lots of data'})
    subds.add('sub.dat')
    ok_file_under_git(subds.path, 'sub.dat', annexed=True)

    target_sub = Dataset(opj(target_path, 'sub'))
    # since we do not have yet/thus have not used an option to record to publish
    # to that sibling by default (e.g. --set-upstream), if we run just ds.publish
    # -- should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specific via --to')
    ds.publish(to=remote)  # should be ok, non recursive; BUT it (git or us?) would
    # create an empty sub/ directory
    ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 2)
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, 1,
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))
    ds.publish(to=remote, recursive=True, missing='inherit')
    # we added the remote and set all the
    eq_(subds.repo.get_preferred_content('wanted', remote),
        'standard' if standardgroup else '')
    eq_(subds.repo.get_preferred_content('group', remote),
        standardgroup or '')

    ok_(target_sub.is_installed())  # it is there now
    eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
    # and we have transferred the content
    if standardgroup and standardgroup == 'backup':
        # only then content should be copied
        ok_file_has_content(opj(target_sub.path, 'sub.dat'), 'lots of data')
    else:
        # otherwise nothing is copied by default
        assert_false(target_sub.repo.file_has_content('sub.dat'))
def test_gh1597_simpler(path):
    ds = Dataset(path).create()
    # same goes for .gitattributes
    with open(opj(ds.path, '.gitignore'), 'a') as f:
        f.write('*.swp\n')
    ds.add('.gitignore')
    ok_clean_git(ds.path)
    ok_file_under_git(ds.path, '.gitignore', annexed=False)
    # put .gitattributes in some subdir and add all, should also go into Git
    os.makedirs(op.join(ds.path, 'subdir'))
    attrfile = op.join(ds.path, 'subdir', '.gitattributes')
    with open(attrfile, 'a') as f:
        f.write('# just a comment\n')
    ds.add('.')
    ok_clean_git(ds.path)
    ok_file_under_git(ds.path, op.relpath(attrfile, start=ds.path), annexed=False)
def test_gh1597(path):
    ds = Dataset(path).create()
    sub = ds.create('sub', save=False)
    # only staged at this point, but known, and not annexed
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
    res = ds.subdatasets()
    assert_result_count(res, 1, path=sub.path)
    # now modify .gitmodules with another command
    ds.subdatasets(contains=sub.path, set_property=[('this', 'that')])
    ok_clean_git(ds.path, index_modified=['sub'])
    # now modify low-level
    with open(opj(ds.path, '.gitmodules'), 'a') as f:
        f.write('\n')
    ok_clean_git(ds.path, index_modified=['.gitmodules', 'sub'])
    ds.add('.gitmodules')
    # must not come under annex management
    ok_file_under_git(ds.path, '.gitmodules', annexed=False)
def test_create_text_no_annex(path):
    ds = create(path, text_no_annex=True)
    ok_clean_git(path)
    import re
    ok_file_has_content(
        _path_(path, '.gitattributes'),
        content='\* annex\.largefiles=\(not\(mimetype=text/\*\)\)',
        re_=True,
        match=False,
        flags=re.MULTILINE
    )
    # and check that it is really committing text files to git and binaries
    # to annex
    create_tree(path, {
        't': 'some text',
        'b': ''  # empty file is not considered to be a text file
                 # should we adjust the rule to consider only non empty files?
    })
    ds.add(['t', 'b'])
    ok_file_under_git(path, 't', annexed=False)
    ok_file_under_git(path, 'b', annexed=True)
def test_uninstall_annex_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_file_under_git(ds.repo.path, 'test-annex.dat', annexed=True)
    ds.repo.get('test-annex.dat')
    ok_(ds.repo.file_has_content('test-annex.dat'))

    # remove file's content:
    res = ds.drop(path='test-annex.dat', result_xfm='paths')
    # test it happened:
    ok_(not ds.repo.file_has_content('test-annex.dat'))
    ok_file_under_git(ds.repo.path, 'test-annex.dat', annexed=True)
    # test result:
    eq_(res, [opj(ds.path, 'test-annex.dat')])

    ds.repo.get('test-annex.dat')

    # remove file:
    ds.remove(path='test-annex.dat')
    assert_raises(AssertionError, ok_file_under_git,
                  ds.repo.path, 'test-annex.dat', annexed=True)
    assert_raises(AssertionError, ok_file_under_git,
                  ds.repo.path, 'test-annex.dat', annexed=False)
    ok_(not exists(opj(path, 'test-annex.dat')))
def _test_BasicAnnexTestRepo(repodir):
    trepo = BasicAnnexTestRepo(repodir)
    trepo.create()
    assert_repo_status(trepo.path)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
    ok_file_under_git(trepo.path, 'test-annex.dat', annexed=True)
    ok_(trepo.repo.file_has_content('test-annex.dat') is False)
    with swallow_outputs():
        trepo.repo.get('test-annex.dat')
    ok_(trepo.repo.file_has_content('test-annex.dat'))
def test_crawl_autoaddtext(ind, topurl, outd):
    ds = create(outd, text_no_annex=True)
    with chpwd(outd):  # TODO -- dataset argument
        crawl_init(
            {'url': topurl, 'a_href_match_': '.*'},
            save=True,
            template='simple_with_archives')
        crawl()
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)
def test_add_dir_file(repo_path, p, topurl):
    # test whenever file becomes a directory and then back a file.  Should all work!
    annex = Annexificator(path=repo_path, auto_finalize=False)
    url = "%s/file" % topurl

    path1 = opj(repo_path, 'd')
    data1 = {'filename': 'd', 'url': url}
    out1 = list(annex(data1))

    # becomes a directory which carries a file
    data2 = {'filename': 'f', 'url': url, 'path': 'd'}
    # but since we didn't commit previous file yet -- should puke!
    assert_raises(RuntimeError, list, annex(data2))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)

    # and after that it should proceed normally
    #import pdb; pdb.set_trace()
    out2 = list(annex(data2))
    path2 = opj(repo_path, 'd', 'f')
    ok_(exists(path2))

    # tricky one -- becomes back a file... what if repo was dirty and files under dir were staged? TODO
    assert_raises(RuntimeError, list, annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path2, annexed=True)

    list(annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)

    # with auto_finalize (default) it should go smoother ;)
    annex = Annexificator(path=repo_path)
    list(annex(data2))
    # wouldn't happen without explicit finalize to commit whatever new is staged
    # ok_file_under_git(path2, annexed=True)
    list(annex(data1))
    list(annex.finalize()({}))  # so it gets committed
    ok_file_under_git(path1, annexed=True)
def test_add_mimetypes(path):
    # XXX apparently there is symlinks dereferencing going on while deducing repo
    # type there!!!! so can't use following invocation -- TODO separately
    import os
    path = os.path.realpath(path)  # yoh gives up for now
    ds = Dataset(path).create(force=True)
    ds.repo.add('.gitattributes')
    ds.repo.commit('added attributes to git explicitly')
    # now test that those files will go into git/annex correspondingly
    __not_tested__ = ds.add(['file.txt', 'empty'])
    ok_clean_git(path, untracked=['file2.txt'])
    # Empty one considered to be application/octet-stream i.e. non-text
    ok_file_under_git(path, 'empty', annexed=True)
    ok_file_under_git(path, 'file.txt', annexed=False)
    # But we should be able to force adding file to annex when desired
    ds.add('file2.txt', to_git=False)
    ok_file_under_git(path, 'file2.txt', annexed=True)
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(opj(op.realpath(path), basename), annexed=True)
def test_BasicGitTestRepo(path):
    trepo = BasicGitTestRepo(path)
    trepo.create()
    assert_repo_status(trepo.path, annex=False)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
def _test_target_ssh_inherit(standardgroup, ui, src_path, target_path):
    ds = Dataset(src_path).create()
    target_url = 'localhost:%s' % target_path
    remote = "magical"
    # for the test of setting a group, will just smoke test while using current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group',
                      group=os.getgid(), ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset... a few of the nested ones
    # A known hiccup happened when there
    # is also subsub ds added - we might incorrectly traverse and not prepare
    # sub first for subsub to inherit etc
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec , 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i + 1))))
        for i in range(nlevels)
    ]
    # since we do not have yet/thus have not used an option to record to publish
    # to that sibling by default (e.g. --set-upstream), if we run just ds.publish
    # -- should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specific via --to')
    ds.publish(to=remote)  # should be ok, non recursive; BUT it (git or us?) would
    # create an empty sub/ directory
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, len(subdss),
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote),
                'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote),
                standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'), 'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()

    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow
    # but just issue a warning for the top level dataset which has no super,
    # so cannot inherit anything - use case is to fixup/establish the full
    # hierarchy on the remote site
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(
            None, name=remote, existing="reconfigure", inherit=True,
            ui=ui, recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
def test_demo_repro_analysis(bids_path, ana_path, toolbox_url):
    import glob

    localizer_ds = Dataset(bids_path).create()
    localizer_ds.run_procedure('cfg_bids')

    # TODO: decorator
    # TODO: with config patch for toolbox ? -> overwrite?
    # localizer_ds.install(source="https://github.com/psychoinformatics-de/hirni-demo",
    #                      path="sourcedata",
    #                      recursive=True)
    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        install_demo_dataset(localizer_ds, "sourcedata", recursive=True)

    assert_repo_status(localizer_ds.repo)

    subs = localizer_ds.subdatasets(recursive=True)
    assert_result_count(subs, 4)
    assert_result_count(subs, 1, path=op.join(localizer_ds.path, 'sourcedata'))
    assert_result_count(subs, 1, path=op.join(localizer_ds.path, 'sourcedata', 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(localizer_ds.path, 'sourcedata', 'acq1', 'dicoms'))
    assert_result_count(subs, 1, path=op.join(localizer_ds.path, 'sourcedata', 'acq2', 'dicoms'))

    localizer_ds.hirni_spec2bids(
        [op.join(localizer_ds.path, 'sourcedata', 'studyspec.json')] +
        glob.glob(op.join(localizer_ds.path, 'sourcedata', '*', 'studyspec.json')),
        anonymize=True)

    for f in [
            'sub-001',
            'task-oneback_bold.json',
            'participants.tsv',
            op.join('sub-001', 'sub-001_scans.tsv'),
            op.join('sub-001', 'anat'),
            op.join('sub-001', 'anat', 'sub-001_run-1_T1w.json'),
            op.join('sub-001', 'anat', 'sub-001_run-1_T1w.nii.gz'),
            op.join('sub-001', 'func'),
            op.join('sub-001', 'func', 'sub-001_task-oneback_run-01_bold.json'),
            op.join('sub-001', 'func', 'sub-001_task-oneback_run-01_bold.nii.gz'),
            op.join('sub-001', 'func', 'sub-001_task-oneback_run-01_events.tsv'),
    ]:
        assert_true(op.lexists(op.join(localizer_ds.path, f)))

    analysis_ds = Dataset(ana_path).create()
    analysis_ds.install(source=localizer_ds.path, path=op.join('inputs', 'rawdata'))

    analysis_ds.run_procedure('cfg_yoda')
    # download-url expects the target dir to exist
    (analysis_ds.pathobj / 'code').mkdir(exist_ok=True)
    analysis_ds.download_url(
        path=op.join(analysis_ds.path, 'code') + op.sep,  # TODO: File issue. relative path via python API bound method doesn't work
        urls=[
            'https://raw.githubusercontent.com/myyoda/ohbm2018-training/master/section23/scripts/events2ev3.sh',
            'https://raw.githubusercontent.com/myyoda/ohbm2018-training/master/section23/scripts/ffa_design.fsf'
        ])
    assert_repo_status(analysis_ds.repo)
    ok_file_under_git(op.join(analysis_ds.path, 'code'), 'events2ev3.sh', annexed=False)
    ok_file_under_git(op.join(analysis_ds.path, 'code'), 'ffa_design.fsf', annexed=False)

    analysis_ds.run(
        inputs=[op.join('inputs', 'rawdata', 'sub-001', 'func',
                        'sub-001_task-oneback_run-01_events.tsv')],
        outputs=[op.join('sub-001', 'onsets')],
        cmd='bash code/events2ev3.sh sub-001 {inputs}',
        message="Build FSL EV3 design files")

    raise SkipTest("Solve datalad-containers #115")

    analysis_ds.containers_add('fsl', url="shub://ReproNim/ohbm2018-training:fsln")
    # % datalad containers-list
    analysis_ds.save(version_tag="ready4analysis")
    assert_repo_status(analysis_ds.repo)

    #
    analysis_ds.run(
        outputs=[op.join('sub-001', '1stlvl_design.fsf')],
        cmd="bash -c 'sed -e \"s,##BASEPATH##,{pwd},g\" -e \"s,##SUB##,sub-001,g\" code/ffa_design.fsf > {outputs}'",
        message="FSL FEAT analysis config script")
    assert_repo_status(analysis_ds.repo)
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(op.join(str(Path(path).resolve()), basename), annexed=True)
def _test_target_ssh_inherit(standardgroup, ui, use_ssh, src_path, target_path):
    ds = Dataset(src_path).create()
    if use_ssh:
        target_url = 'datalad-test:%s' % target_path
    else:
        target_url = target_path
    remote = "magical"
    # for the test of setting a group, will just smoke test while using current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group',
                      group=os.getgid(), ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset... a few of the nested ones
    # A known hiccup happened when there
    # is also subsub ds added - we might incorrectly traverse and not prepare
    # sub first for subsub to inherit etc
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec , 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i + 1))))
        for i in range(nlevels)
    ]
    # since we do not have yet/thus have not used an option to record to publish
    # to that sibling by default (e.g. --set-upstream), if we run just ds.publish
    # -- should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specify via --to')
    ds.publish(to=remote)  # should be ok, non recursive; BUT it (git or us?) would
    # create an empty sub/ directory
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, len(subdss),
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote),
                'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote),
                standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'), 'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()

    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow
    # but just issue a warning for the top level dataset which has no super,
    # so cannot inherit anything - use case is to fixup/establish the full
    # hierarchy on the remote site
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(
            None, name=remote, existing="reconfigure", inherit=True,
            ui=ui, recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, in indicates "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file, options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'), repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly" % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'), repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #   see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1],
    #                                                         commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3],  # also in master
    #                                                         commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt', './sub-1/anat/sub-1_T1w.dat', './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz', 'ds666_R1.0.0.tar.gz', 'ds666_R1.0.1.tar.gz', 'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison
    eq_(
        set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz', repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz', repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there is no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and may be at least count the number of created object files in addition to that comparison
    eq_(
        set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seems to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but repreprocessed completely
            # incomming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b)) for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})
    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def test_1(text_dandiset: Dict[str, Any], tmp_path: Path) -> None:
    # TODO: move pre-setup into a fixture, e.g. local_setup1 or make code work without?
    di = DandiDatasetter(
        dandi_client=text_dandiset["client"],
        target_path=tmp_path,
        config=Config(
            # gh_org=None,
            # re_filter=None,
            # backup_remote=None,
            # jobs=jobs,
            # force=force,
            content_url_regex=r".*/blobs/",
            s3bucket="dandi-api-staging-dandisets",
        ),
    )

    with pytest.raises(Exception):
        log.info("test_1: Testing sync of nonexistent Dandiset")
        di.update_from_backup(["999999"])
    assert not (tmp_path / "999999").exists()

    # Since we are using text_dandiset, that immediately creates us a dandiset
    # TODO: may be separate it out, so we could start "clean" and still work ok
    # clean run without dandisets is ok
    # ret = di.update_from_backup()
    # assert ret is None, "nothing is returned ATM, if added -- test should be extended"

    dandiset_id = text_dandiset["dandiset_id"]
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    ds = Dataset(tmp_path / text_dandiset["dandiset_id"])  # but we should get the super-dataset?
    assert_repo_status(ds.path)  # that all is clean etc
    ok_file_under_git(ds.path, "file.txt")

    (text_dandiset["dspath"] / "new.txt").write_text("This is a new file.\n")
    log.info("test_1: Updating test dandiset on server")
    text_dandiset["reupload"]()
    assert_repo_status(ds.path)  # no side-effects somehow
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    assert (ds.pathobj / "new.txt").read_text() == "This is a new file.\n"

    repo = GitRepo(ds.path)

    def check_version_tag(v: Version) -> None:
        vid = v.identifier
        # Assert tag has correct timestamp
        assert repo.get_tag_date(vid) == v.created.isoformat(timespec="seconds")
        # Assert tag has correct committer
        assert repo.get_tag_creator(vid) == "DANDI User <*****@*****.**>"
        # Assert tagged commit has correct timestamp
        assert repo.get_commit_date(vid) == v.created.isoformat(timespec="seconds")
        # Assert that tag was merged into default branch
        assert repo.is_ancestor(vid, DEFAULT_BRANCH)
        # Assert tag branches from default branch
        assert repo.parent_is_ancestor(DEFAULT_BRANCH, vid)
        # Assert dandiset.yaml in tagged commit has doi
        metadata = yaml_load(repo.get_blob(vid, dandiset_metadata_file))
        assert metadata.get("doi")

    log.info("test_1: Waiting for Dandiset to become valid")
    text_dandiset["dandiset"].wait_until_valid(65)
    log.info("test_1: Publishing Dandiset")
    v1 = text_dandiset["dandiset"].publish().version
    version1 = v1.identifier
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    tags = {t["name"]: t["hexsha"] for t in ds.repo.get_tags()}
    assert version1 in tags
    v1_hash = tags[version1]
    check_version_tag(v1)

    (text_dandiset["dspath"] / "new.txt").write_text("This file's contents were changed.\n")
    log.info("test_1: Updating test dandiset on server")
    text_dandiset["reupload"]()
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    assert (ds.pathobj / "new.txt").read_text() == "This file's contents were changed.\n"

    log.info("test_1: Waiting for Dandiset to become valid")
    text_dandiset["dandiset"].wait_until_valid(65)
    log.info("test_1: Publishing Dandiset")
    v2 = text_dandiset["dandiset"].publish().version
    version2 = v2.identifier
    log.info("test_1: Syncing test dandiset")
    di.update_from_backup([dandiset_id])
    assert_repo_status(ds.path)  # that all is clean etc
    tags = {t["name"]: t["hexsha"] for t in ds.repo.get_tags()}
    assert version1 in tags
    assert tags[version1] == v1_hash
    assert version2 in tags
    check_version_tag(v2)

    commit_authors = repo.readcmd("log", "--no-merges", "--format=%an <%ae>").splitlines()
    assert commit_authors == ["DANDI User <*****@*****.**>"] * len(commit_authors)

    for c in repo.get_backup_commits():
        assert repo.get_asset_files(c) == {
            asset["path"] for asset in repo.get_assets_json(c)
        }
def d1_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '1 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
    ok_archives_caches(repo_path, 0)
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content, "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key('1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If ran again, should proceed just fine since the content is the same so no changes would be made really
        add_archive_content('1.tar.gz')

        # But that other one carries updated file, so should fail due to overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat not precise since we have two possible "already exists"
        # -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'), existing='overwrite', use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'), existing='archive-suffix', use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'), add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content(
            '1.tar.gz', exclude=['d'], rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests ran into the situation of inability to obtain on a single run
    # a file from an archive which was coming from a dropped key.  I thought it was
    # tested in custom remote tests, but I guess not sufficiently well enough
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
        assert_equal(e.kwargs['stdout_json'][0]['success'], False)
        assert_result_values_cond(
            e.kwargs['stdout_json'], 'note',
            lambda x: '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
def _test_annex_file(mode, topdir, topurl, outdir):
    annex = Annexificator(path=outdir, mode=mode,
                          statusdb='fileattr',
                          largefiles="exclude=*.txt")

    input = {'url': "%sd1/1.dat" % topurl, 'filename': '1-copy.dat'}
    tfile = opj(outdir, '1-copy.dat')
    # we add full filepath now
    expected_output = [dict(filepath=opj(outdir, input['filename']), **input)]
    output = list(annex(input))
    assert_equal(expected_output, output)

    # addurl is batched, and we haven't forced annex flushing so there should
    # be a batched process
    if not annex.repo.fake_dates_enabled:
        assert_equal(len(annex.repo._batched), 1)
    # if we finalize, it should flush batched annexes and commit
    list(annex.finalize()({}))
    assert (lexists(tfile))

    ok_file_under_git(tfile, annexed=True)
    if mode == 'full':
        ok_file_has_content(tfile, '1.dat load')
    else:
        # in fast or relaxed mode there must not be any content
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load')

    whereis = annex.repo.whereis(tfile)
    assert_in(annex.repo.WEB_UUID, whereis)  # url must have been added
    assert_equal(len(whereis), 1 + int(mode == 'full'))
    # TODO: check the url

    # Neither file should not be attempted to download again, since nothing changed
    # and by default we do use files db
    output = list(annex(input))
    assert_equal(output, [])  # nothing was done, so annex didn't yield data
    annex.yield_non_updated = True

    input_with_stats = input.copy()
    input_with_stats['datalad_stats'] = ActivityStats()
    output = list(annex(input_with_stats))
    assert_equal(output[0]['datalad_stats'],
                 ActivityStats(files=1, urls=1, skipped=1))

    # but if we change that file, it should re-download it now
    with open(opj(topdir, 'd1', '1.dat'), 'a') as f:
        f.write("+")
    output = list(annex(input_with_stats))
    stats = output[0]['datalad_stats']
    stats.downloaded_time = 0
    # 2 since we are reusing the same stats
    download_stats = dict(downloaded=1, downloaded_size=11) if mode == 'full' else {}
    addskip_stats = dict(add_annex=0, skipped=2, overwritten=0) \
        if mode == 'relaxed' else dict(add_annex=1, skipped=1, overwritten=1)
    kwargs = download_stats.copy()
    kwargs.update(addskip_stats)
    assert_equal(stats, ActivityStats(files=2, urls=2, **kwargs))

    # Download into a file which will be added to git
    # TODO: for now added to git only in full mode. in --fast or --relaxed, still goes to annex
    # http://git-annex.branchable.com/bugs/treatment_of_largefiles_is_not_working_for_addurl_--fast___40__or_--relaxed__41__/
    input = {
        'url': "%sd1/1.dat" % topurl,
        'filename': '1.txt',
        'datalad_stats': ActivityStats()
    }
    tfile = opj(outdir, '1.txt')
    output = list(annex(input))
    annexed = mode not in {'full'}
    list(annex.finalize()({}))
    if not annexed:
        ok_file_has_content(tfile, '1.dat load+')
    else:
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load+')
    ok_file_under_git(tfile, annexed=annexed)
    assert_equal(len(output), 1)
    stats = output[0]['datalad_stats']
    # reset varying metric
    stats.downloaded_time = 0
    assert_equal(
        stats,
        ActivityStats(files=1, urls=1,
                      add_git=1 - int(annexed),
                      add_annex=int(annexed),
                      **download_stats))

    # Let's add a file without specifying URL
    sfilepath = opj(outdir, 'sample.txt')
    with open(sfilepath, 'w') as f:
        f.write("sample")
    ok_file_has_content(sfilepath, "sample")
    output = list(annex({
        'filename': 'sample.txt',
        'datalad_stats': ActivityStats()
    }))
    ok_file_under_git(sfilepath, annexed=False)
    assert (output)
    assert_equal(output[0]['datalad_stats'], ActivityStats(files=1, add_git=1))
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add('datalad.search.index-{}-documenttype'.format(m),
                      'all', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')), ):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.add('.')
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    list(ds.repo.set_metadata(opj('stim', 'stim1.mp3'),
                              init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio', ):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep', ':mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, in indicates "ALL FIELDS"
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep', ['mp3', 'type:file'], opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep', 'audio\.+:mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob', 'mp3', opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield', 'mp3', opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield', "'mime audio mp3'", opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from, critical for
            # discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def d2_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '2 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
    ok_archives_caches(repo.path, 0)
def test_demo_raw_ds(path, toolbox_url):
    ds = Dataset(path)
    with patch.dict('os.environ', {'DATALAD_HIRNI_TOOLBOX_URL': toolbox_url}):
        ds.create()
        # TODO: May be move to ds.create(cfg_proc='hirni') in demo
        ds.run_procedure('cfg_hirni')

    # clean repo with an annex:
    assert_repo_status(ds.repo, annex=True)

    # README, dataset_description.json and studyspec.json at toplevel and in git
    for f in ['README', 'studyspec.json', 'dataset_description.json']:
        ok_file_under_git(ds.path, f, annexed=False)

    # toolbox installed under code/hirni-toolbox
    subs = ds.subdatasets()
    assert_result_count(subs, 1)
    assert_result_count(subs, 1, path=op.join(ds.path, 'code', 'hirni-toolbox'))

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-structural/archive/master.tar.gz',
        'acq1',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [op.join(ds.path, 'acq1'),
              op.join(ds.path, 'acq1', 'studyspec.json'),
              op.join(ds.path, 'acq1', 'dicoms')]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 2)
    assert_result_count(subs, 1, path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))

    # TODO: check actual spec? (Prob. sufficient to test for that in dedicated import-dcm/dcm2spec tests
    # TODO: check dicom metadata

    ds.hirni_import_dcm(
        'https://github.com/datalad/example-dicom-functional/archive/master.tar.gz',
        'acq2',
        anon_subject='001')

    # acquisition directory and studyspec created + subdataset 'dicoms' within the acquisition dir
    for f in [op.join(ds.path, 'acq2'),
              op.join(ds.path, 'acq2', 'studyspec.json'),
              op.join(ds.path, 'acq2', 'dicoms')]:
        assert_true(op.exists(f))
    subs = ds.subdatasets()
    assert_result_count(subs, 3)
    assert_result_count(subs, 1, path=op.join(ds.path, 'code', 'hirni-toolbox'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq1', 'dicoms'))
    assert_result_count(subs, 1, path=op.join(ds.path, 'acq2', 'dicoms'))

    # Note from demo: The calls to `git annex addurl` and `datalad save` currently replace a single call to
    # `datalad download-url` due to a bug in that command.
    events_file = op.join('acq2', 'events.tsv')
    ds.repo.add_url_to_file(
        file_=events_file,
        url='https://github.com/datalad/example-dicom-functional/raw/master/events.tsv')
    ds.save(message="Added stimulation protocol for acquisition 2")

    ok_file_under_git(ds.path, events_file, annexed=True)

    ds.hirni_spec4anything(
        events_file,
        properties='{"procedures": {"procedure-name": "copy-converter", "procedure-call": "bash {script} {{location}} '
                   '{ds}/sub-{{bids-subject}}/func/sub-{{bids-subject}}_task-{{bids-task}}_run-{{bids-run}}_events.tsv'
                   '"}, "type": "events_file"}')

    ok_file_under_git(ds.path, op.join('acq2', 'studyspec.json'), annexed=False)
    assert_repo_status(ds.repo, annex=True)