def test_add_archive_content_zip(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        with swallow_outputs():
            ds.save("1.zip", message="add 1.zip")
            add_archive_content("1.zip")
        ok_file_under_git(ds.pathobj / "1" / "foo", annexed=True)
        ok_file_under_git(ds.pathobj / "1" / "dir" / "bar", annexed=True)
        ok_archives_caches(ds.path, 0)
def test_add_archive_leading_dir(self):
    import os
    os.mkdir(self.ds.pathobj / 'sub')
    f123 = Path('sub') / '123.tar'
    Path(self.ds.pathobj / '1.tar').rename(self.ds.pathobj / f123)
    self.annex.remove('1.tar', force=True)
    self.ds.save(message="renamed")
    self.ds.add_archive_content(f123,
                                add_archive_leading_dir=True,
                                strip_leading_dirs=True)
    ok_file_under_git(self.ds.path,
                      str(Path('sub') / '123' / 'file.txt'),
                      annexed=True)
def test_ok_file_under_git_symlinks(path=None):
    # Test that it works correctly under a symlinked path
    orepo = GitRepo(path)
    orepo.add('ingit')
    orepo.commit('msg')
    orepo.add('staged')
    lpath = path + "-symlink"  # will also be removed AFAIK by our tempfile handling
    Path(lpath).symlink_to(Path(path))
    ok_symlink(lpath)
    ok_file_under_git(op.join(path, 'ingit'))
    ok_file_under_git(op.join(lpath, 'ingit'))
    ok_file_under_git(op.join(lpath, 'staged'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'notingit'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'nonexisting'))
def test_add_archive_content_strip_leading(path_orig=None, url=None,
                                           repo_path=None):
    with chpwd(repo_path):
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # Let's add the first archive to the repo so we can test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(ds.path, 0)
def test_override_existing_under_git(self):
    create_tree(self.ds.path, {'1.dat': 'load2'})
    self.ds.save('1.dat', to_git=True, message='added to git')
    self.ds.add_archive_content(
        '1.tar',
        strip_leading_dirs=True,
    )
    # and we did not bother adding it to annex (for now) -- just skipped
    # since we have it and it is the same
    ok_file_under_git(self.ds.path, '1.dat', annexed=False)
    # but if we say 'overwrite' -- we would remove and replace
    self.ds.add_archive_content('1.tar',
                                strip_leading_dirs=True,
                                delete=True,
                                existing='overwrite')
    ok_file_under_git(self.ds.path, '1.dat', annexed=True)
def test_add_archive_content_absolute_path(path=None):
    ds = Dataset(opj(path, "ds")).create(force=True)
    repo = ds.repo
    ds.save("1.tar.gz", message="1.tar.gz")

    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, dataset=ds)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)

    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    res = add_archive_content(opj(path, "notds", "2.tar.gz"),
                              dataset=ds, on_failure='ignore')
    assert_in_results(
        res,
        action='add-archive-content',
        status='impossible',
        message='Can not add archive outside of the dataset',
    )
def test_add_archive_use_archive_dir(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        # Let's add the first archive to the repo with default settings
        archive_path = opj('4u', '1.tar.gz')
        # check that it gives an informative error if the archive was not saved yet
        res = add_archive_content(archive_path, on_failure='ignore')
        message = \
            "Can not add an untracked archive. Run 'datalad save 4u\\1.tar.gz'" \
            if on_windows else \
            "Can not add an untracked archive. Run 'datalad save 4u/1.tar.gz'"
        assert_in_results(res, action='add-archive-content',
                          message=message, status='impossible')

        with swallow_outputs():
            ds.save(archive_path)

        ok_archives_caches(ds.path, 0)
        add_archive_content(archive_path, strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_archives_caches(ds.path, 0)

        # and now let's extract under the archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(ds.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(ds.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)
def _test_BasicAnnexTestRepo(repodir):
    trepo = BasicAnnexTestRepo(repodir)
    trepo.create()
    assert_repo_status(trepo.path)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
    ok_file_under_git(trepo.path, 'test-annex.dat', annexed=True)
    ok_(trepo.repo.file_has_content('test-annex.dat') is False)
    with swallow_outputs():
        trepo.repo.get('test-annex.dat')
    ok_(trepo.repo.file_has_content('test-annex.dat'))
def test_BasicGitTestRepo(path=None):
    trepo = BasicGitTestRepo(path)
    trepo.create()
    assert_repo_status(trepo.path, annex=False)
    ok_file_under_git(trepo.path, 'test.dat')
    ok_file_under_git(trepo.path, 'INFO.txt')
def test_within_ds_file_search(path=None):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            scope='branch')
    ds.config.add('datalad.metadata.nativetype', 'audio', scope='branch')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, below addition of metadata silently does
    # not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    assert_repo_status(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # test default behavior while limiting set of keys reported
    with swallow_outputs() as cmo:
        ds.search([r'\.id', 'artist$'], show_keys='short')
        out_lines = [l for l in cmo.out.split(os.linesep) if l]
        # test that only the ones matching were returned
        assert_equal(
            [l for l in out_lines if not l.startswith(' ')],
            ['audio.music-artist', 'datalad_core.id'])
        # more specific test which would also test formatting
        assert_equal(
            out_lines,
            ['audio.music-artist',
             ' in 1 datasets',
             " has 1 unique values: 'dlartist'",
             'datalad_core.id',
             ' in 1 datasets',
             # we have them sorted
             " has 1 unique values: '%s'" % ds.id
             ])

    with assert_raises(ValueError) as cme:
        ds.search('*wrong')
    assert_re_in(
        r"regular expression '\(\?i\)\*wrong' \(original: '\*wrong'\) is incorrect: ",
        str(cme.value))

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
        ('egrep',
         ':mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, leading : is stripped, in indicates "ALL FIELDS"
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # same as above, but with AND condition
        # get both matches
        ('egrep',
         ['mp3', 'type:file'],
         opj('stim', 'stim1.mp3'),
         {'type': 'file', 'audio.format': 'mp3'}),
        # case insensitive search
        ('egrep',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # field selection by expression
        ('egrep',
         r'audio\.+:mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # random keyword query
        ('textblob',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'meta': 'mp3'}),
        # report which field matched with auto-field
        ('autofield',
         'mp3',
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # XXX next one is not supported by current text field analyser
        # decomposes the mime type in [mime, audio, mp3]
        # ('autofield',
        # "'mime:audio/mp3'",
        # opj('stim', 'stim1.mp3'),
        # 'audio.format', 'mime:audio/mp3'),
        # but this one works
        ('autofield',
         "'mime audio mp3'",
         opj('stim', 'stim1.mp3'),
         {'audio.format': 'mp3'}),
        # TODO extend with more complex queries to test whoosh
        # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def _test_target_ssh_inherit(standardgroup, ui, use_ssh, src_path, target_path):
    ds = Dataset(src_path).create()
    if use_ssh:
        target_url = 'datalad-test:%s' % target_path
    else:
        target_url = target_path
    remote = "magical"
    # for the test of setting a group, will just smoke test while using current
    # user's group
    ds.create_sibling(target_url, name=remote, shared='group',
                      group=os.getgid(), ui=ui)  # not doing recursively
    if standardgroup:
        ds.repo.set_preferred_content('wanted', 'standard', remote)
        ds.repo.set_preferred_content('group', standardgroup, remote)
    ds.publish(to=remote)

    # now a month later we created a new subdataset... a few of the nested ones
    # A known hiccup happened when there is also a subsub ds added - we might
    # incorrectly traverse and not prepare sub first for subsub to inherit etc
    parent_ds = ds
    subdss = []
    nlevels = 2  # gets slow: 1 - 43 sec, 2 - 49 sec, 3 - 69 sec
    for levels in range(nlevels):
        subds = parent_ds.create('sub')
        create_tree(subds.path, {'sub.dat': 'lots of data'})
        parent_ds.save('sub', recursive=True)
        ok_file_under_git(subds.path, 'sub.dat', annexed=True)
        parent_ds = subds
        subdss.append(subds)

    target_subdss = [
        Dataset(opj(*([target_path] + ['sub'] * (i + 1))))
        for i in range(nlevels)
    ]
    # since we do not yet have/thus have not used an option to record publishing
    # to that sibling by default (e.g. --set-upstream), running just ds.publish
    # should fail
    assert_result_count(
        ds.publish(on_failure='ignore'),
        1,
        status='impossible',
        message='No target sibling configured for default publication, please specify via --to')
    ds.publish(to=remote)  # should be ok, non recursive; BUT it (git or us?)
    # would create an empty sub/ directory
    assert_postupdate_hooks(target_path, installed=ui)
    for target_sub in target_subdss:
        ok_(not target_sub.is_installed())  # still not there
    res = ds.publish(to=remote, recursive=True, on_failure='ignore')
    assert_result_count(res, 1 + len(subdss))
    assert_status(('error', 'notneeded'), res)
    assert_result_count(
        res, len(subdss),
        status='error',
        message=("Unknown target sibling '%s' for publication", 'magical'))

    # Finally publishing with inheritance
    ds.publish(to=remote, recursive=True, missing='inherit')
    assert_postupdate_hooks(target_path, installed=ui)

    def check_dss():
        # we added the remote and set all the
        for subds in subdss:
            eq_(subds.repo.get_preferred_content('wanted', remote),
                'standard' if standardgroup else '')
            eq_(subds.repo.get_preferred_content('group', remote),
                standardgroup or '')

        for target_sub in target_subdss:
            ok_(target_sub.is_installed())  # it is there now
            eq_(target_sub.repo.config.get('core.sharedrepository'), '1')
            # and we have transferred the content
            if standardgroup and standardgroup == 'backup':
                # only then content should be copied
                ok_file_has_content(opj(target_sub.path, 'sub.dat'),
                                    'lots of data')
            else:
                # otherwise nothing is copied by default
                assert_false(target_sub.repo.file_has_content('sub.dat'))

    check_dss()

    # and it should be ok to reconfigure the full hierarchy of datasets
    # while "inheriting". No URL must be specified, and we must not blow up,
    # but just issue a warning for the top level dataset which has no super,
    # so cannot inherit anything - use case is to fixup/establish the full
    # hierarchy on the remote site
    ds.save(recursive=True)  # so we have committed hierarchy for create_sibling
    with swallow_logs(logging.WARNING) as cml:
        out = ds.create_sibling(
            None, name=remote, existing="reconfigure", inherit=True,
            ui=ui, recursive=True)
        eq_(len(out), 1 + len(subdss))
        assert_in("Cannot determine super dataset", cml.out)

    check_dss()
def d2_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '2 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
    ok_archives_caches(repo.path, 0)
def d1_basic_checks():
    ok_(exists('1'))
    ok_file_under_git('1', '1 f.txt', annexed=True)
    ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
    ok_archives_caches(repo_path, 0)
def test_add_archive_content(path_orig=None, url=None, repo_path=None):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API

        # no repo yet
        assert_raises(NoDatasetFound, add_archive_content,
                      "nonexisting.tar.gz")
        ds = Dataset(repo_path).create()
        res = ds.add_archive_content("nonexisting.tar.gz", on_failure='ignore')
        assert_in_results(res, action='add-archive-content',
                          status='impossible')
        repo = ds.repo
        # we can't add a file from outside the repo ATM
        res = ds.add_archive_content(Path(path_orig) / '1.tar.gz',
                                     on_failure='ignore')
        assert_in_results(
            res, action='add-archive-content', status='impossible',
            type="dataset",
            message="Can not add archive outside of the dataset")

        # Let's add the first archive to the repo so we can test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
            for s in range(1, 5):
                repo.add_url_to_file('%du/1.tar.gz' % s,
                                     opj(url, '%du/1.tar.gz' % s))
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_annexinfo('1.tar.gz')['key']
        # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')
        d1_basic_checks()

        # If run again, should proceed just fine since the content is the same,
        # so no changes would really be made
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')

        # But that other one carries an updated file, so should fail due to
        # overwrite
        res = add_archive_content(Path('1u') / '1.tar.gz',
                                  use_current_dir=True, on_failure='ignore')
        assert_in_results(
            res,
            action='add-archive-content',
            status='error',
        )
        assert_in('exists, but would be overwritten by new file',
                  res[0]['message'])

        # but should do fine if overrides are allowed
        add_archive_content(Path('1u') / '1.tar.gz',
                            existing='overwrite', use_current_dir=True)
        add_archive_content(Path('2u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(Path('3u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(Path('4u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), dataset=ds.path,
                            use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive
        # prefix while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'), dataset=ds.path,
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        ds.add_archive_content(
            '1.tar.gz',
            exclude=['d'],
            rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add the first archive to the repo so we can test
        # one named the same way but with different content
        with swallow_outputs():
            repo.add_url_to_file('d1/1.tar.gz', opj(url, 'd1', '1.tar.gz'))
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz', dataset=ds.path)
        d2_basic_checks()

    # in manual tests we ran into the situation of being unable to obtain, in a
    # single run, a file from an archive which was coming from a dropped key.
    # I thought it was tested in custom remote tests, but I guess not
    # sufficiently well
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=ds)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    ds.drop(opj('1', '1 f.txt'))  # should be all kosher
    ds.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online
    # archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))

    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    assert_equal(e.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.kwargs['stdout_json'], 'note',
        lambda x: '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
def ok_file_under_git_kludge(path, basename):
    ok_file_under_git(op.join(str(Path(path).resolve()), basename),
                      annexed=True)