def test_creatsubdatasets(topds_path, n=2):
    from datalad.distribution.dataset import Dataset
    from datalad.api import create
    ds = Dataset(topds_path).create()
    paths = [op.join(topds_path, "subds%d" % i) for i in range(n)]
    paths.extend(
        op.join(topds_path, "subds%d" % i, "subsub%d" % k)
        for i in range(n)
        for k in range(2))
    # To allow for parallel execution without hitting the problem of
    # a lock in the super dataset, we create all subdatasets, and then
    # save them all within their superdataset
    create_ = partial(
        create,
        # cfg_proc="yoda",
        result_xfm=None,
        return_type='generator')
    # if we flip the paths so to go from the end, create without --force should fail
    # and we should get the exception (the first one encountered!)
    # Note: reraise_immediately is of "concern" only for the producer, since we
    # typically rely on outside code to do the killing!
    assert_raises(IncompleteResultsError,
                  list,
                  ProducerConsumer(paths[::-1], create_, jobs=5))
    # we are in a dirty state, let's just remove all those for a clean run
    rmtree(topds_path)

    # and this one followed by save should be good IFF we provide our dependency checker
    ds = Dataset(topds_path).create()
    list(ProducerConsumer(paths, create_,
                          safe_to_consume=no_parentds_in_futures,
                          jobs=5))
    ds.save(paths)
    assert_repo_status(ds.repo)

def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive, path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')

def test_ExtractedArchive(path):
    archive = op.join(path, fn_archive_obscure_ext)
    earchive = ExtractedArchive(archive)
    assert_false(op.exists(earchive.path))
    # no longer the case -- just using hash for now
    # assert_in(os.path.basename(archive), earchive.path)

    fpath = op.join(fn_archive_obscure,  # lead directory
                    fn_in_archive_obscure)
    extracted = earchive.get_extracted_filename(fpath)
    eq_(extracted, op.join(earchive.path, fpath))
    assert_false(op.exists(extracted))  # not yet

    extracted_ = earchive.get_extracted_file(fpath)
    eq_(extracted, extracted_)
    assert_true(op.exists(extracted))  # now it should

    extracted_files = earchive.get_extracted_files()
    ok_generator(extracted_files)
    eq_(sorted(extracted_files),
        sorted([
            # ['bbc/3.txt', 'bbc/abc']
            op.join(fn_archive_obscure, fn_in_archive_obscure),
            op.join(fn_archive_obscure, '3.txt')
        ]))

    earchive.clean()
    if not os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
        assert_false(op.exists(earchive.path))

def test_relpath_semantics(path):
    with chpwd(path):
        super = create('super')
        create('subsrc')
        sub = install(
            dataset='super', source='subsrc', path=op.join('super', 'sub'))
        eq_(sub.path, op.join(super.path, 'sub'))

def check_compress_dir(ext, path, name):
    archive = name + ext
    compress_files([os.path.basename(path)], archive,
                   path=os.path.dirname(path))
    assert_true(op.exists(archive))
    name_extracted = name + "_extracted"
    decompress_file(archive, name_extracted, leading_directories='strip')
    assert_true(op.exists(op.join(name_extracted, 'empty')))
    assert_true(op.exists(op.join(name_extracted, 'd1', 'd2', 'f1')))

def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length

def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally

    TODO: RF so could be used in other places as well
    """
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)

def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = "\.gz$"
        crawl_init(template_kwargs, save=True, template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"), u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")

def test_our_metadataset_search(tdir):
    # smoke test for basic search operations on our super-megadataset
    # expensive operation but ok
    ds = install(
        path=tdir,
        source=DATASETS_TOPURL,
        result_xfm='datasets', return_type='item-or-list')
    res_haxby = list(ds.search('haxby'))
    assert_greater(len(res_haxby), 10)
    # default search should be case insensitive
    # but somehow it is not fully -- we get 12 here
    #res_Haxby = list(ds.search('Haxby'))
    #eq_(len(res_haxby), len(res_Haxby))
    assert_result_count(
        ds.search('id:873a6eae-7ae6-11e6-a6c8-002590f97d84', mode='textblob'),
        1, type='dataset', path=op.join(ds.path, 'crcns', 'pfc-2'))

    # there is a problem with argparse not decoding into utf8 in PY2
    from datalad.cmdline.tests.test_main import run_main  # TODO: make it into an independent lean test
    from datalad.cmd import Runner
    out, err = Runner(cwd=ds.path)('datalad search Buzsáki')
    assert_in('crcns/pfc-2 ', out)  # has it in description
    # and then another aspect: this entry is among multiple authors, need to
    # check if aggregating them into a searchable entity was done correctly
    assert_in('crcns/hc-1 ', out)

def _flyweight_postproc_path(cls, path):
    # we want an absolute path, but no resolved symlinks
    if not op.isabs(path):
        path = op.join(op.getpwd(), path)
    # use canonical paths only:
    return op.normpath(path)

def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length

def test_ok_file_under_git_symlinks(path):
    # Test that works correctly under symlinked path
    orepo = GitRepo(path)
    orepo.add('ingit')
    orepo.commit('msg')
    orepo.add('staged')
    lpath = path + "-symlink"  # will also be removed AFAIK by our tempfile handling
    os.symlink(path, lpath)
    ok_symlink(lpath)
    ok_file_under_git(op.join(path, 'ingit'))
    ok_file_under_git(op.join(lpath, 'ingit'))
    ok_file_under_git(op.join(lpath, 'staged'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'notingit'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'nonexisting'))

def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally and skip if there is no vcr

    TODO: RF local aspect so could be used in other places as well
    """
    kwargs.setdefault('skip_if_no_vcr', True)
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)

def test__version__():
    # in released stage, version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)'
                       )
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # The first section header we hit, must be our changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return
    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename
    )

def check_decompress_file(leading_directories, path):
    outdir = op.join(path, 'simple-extracted')
    with swallow_outputs() as cmo:
        decompress_file(op.join(path, fn_archive_obscure_ext), outdir,
                        leading_directories=leading_directories)
        eq_(cmo.out, "")
        eq_(cmo.err, "")

    path_archive_obscure = op.join(outdir, fn_archive_obscure)
    if leading_directories == 'strip':
        assert_false(op.exists(path_archive_obscure))
        testpath = outdir
    elif leading_directories is None:
        assert_true(op.exists(path_archive_obscure))
        testpath = path_archive_obscure
    else:
        raise NotImplementedError("Dunno about this strategy: %s"
                                  % leading_directories)

    assert_true(op.exists(op.join(testpath, '3.txt')))
    assert_true(op.exists(op.join(testpath, fn_in_archive_obscure)))
    with open(op.join(testpath, '3.txt')) as f:
        eq_(f.read(), '3 load')

def test_ArchivesCache():
    # we don't actually need to test archives handling itself
    path1 = "/zuba/duba"
    path2 = "/zuba/duba2"
    # should not be able to create a persistent cache without topdir
    assert_raises(ValueError, ArchivesCache, persistent=True)
    cache = ArchivesCache()  # by default -- non persistent

    archive1_path = op.join(path1, fn_archive_obscure_ext)
    archive2_path = op.join(path2, fn_archive_obscure_ext)
    cached_archive1_path = cache[archive1_path].path
    assert_false(cache[archive1_path].path == cache[archive2_path].path)
    assert_true(cache[archive1_path] is cache[archive1_path])
    cache.clean()
    assert_false(op.exists(cached_archive1_path))
    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))

def check_datasets_datalad_org(suffix, tdir):
    # Test that git annex / datalad install, get work correctly on our datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(
        tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded". For the purpose of this test
    # it doesn't make a difference.
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())

def check_datasets_datalad_org(suffix, tdir):
    # Test that git annex / datalad install, get work correctly on our datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(
        tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded". For the purpose of this test
    # it doesn't make a difference.

    # git-annex version is not "real" - but that is about when fix was introduced
    from datalad import cfg
    if on_windows \
            and cfg.obtain("datalad.repo.version") < 6 \
            and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")

    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())

def path_under_rev_dataset(ds, path):
    ds_path = ds.pathobj
    try:
        rpath = str(ut.Path(path).relative_to(ds_path))
        if not rpath.startswith(op.pardir):
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play safe
        pass

    root = get_dataset_root(str(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump to the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(str(path), root)

def test_no_blows(cookiesdir):
    cookies = CookiesDB(op.join(cookiesdir, 'mycookies'))
    # set the cookie
    cookies['best'] = 'mine'
    assert_equal(cookies['best'], 'mine')
    """
    Somehow this manages to trigger on conda but not on debian for me
      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/shelve.py", line 125, in __setitem__
        self.dict[key.encode(self.keyencoding)] = f.getvalue()
      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 216, in __setitem__
        self._index[key] = self._setval(pos, val)
      File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 178, in _setval
        with _io.open(self._datfile, 'rb+') as f:
    FileNotFoundError: [Errno 2] No such file or directory: '/home/yoh/.tmp/datalad_temp_test_no_blowsalnsw_wk/mycookies.dat'

    on Debian (python 3.7.3~rc1-1) I just get a warning:
    BDB3028 /home/yoh/.tmp/datalad_temp_test_no_blows58tdg67s/mycookies.db: unable to flush: No such file or directory
    """
    try:
        rmtree(cookiesdir)
    except OSError:
        # on NFS directory might still be open, so .nfs* lock file would prevent
        # removal, but it shouldn't matter and .close should succeed
        pass
    cookies.close()

def test_get_leading_directory():
    ea = ExtractedArchive('/some/bogus', '/some/bogus')
    yield _test_get_leading_directory, ea, [], None
    yield _test_get_leading_directory, ea, ['file.txt'], None
    yield _test_get_leading_directory, ea, ['file.txt', op.join('d', 'f')], None
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], 'd'
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], 'd', {'consider': 'd'}
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], None, {'consider': 'dd'}
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d2', 'f2')], None
    yield _test_get_leading_directory, ea, [op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], op.join('d', 'd2')
    yield _test_get_leading_directory, ea, [op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], 'd', {'depth': 1}
    # with some parasitic files
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('._d')], 'd', {'exclude': ['\._.*']}
    yield _test_get_leading_directory, ea, [op.join('d', 'd1', 'f'), op.join('d', '._d'), '._x'], op.join('d', 'd1'), {'exclude': ['\._.*']}

def test_direct_cfg(path1, path2):
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar
    AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create annex repo in direct mode and see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            assert_in("no longer supported by DataLad", str(cme.exception))  # we have generic part
            assert_in("datalad.repo.direct configuration", str(cme.exception))  # situation specific part
    # assert not op.exists(path2)  # that we didn't create it - we do!
    # fixing for that would be too cumbersome since we first call GitRepo.__init__
    # with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if not ar.check_direct_mode_support():
        raise SkipTest(
            "Rest of test requires direct mode support in git-annex")

    # TODO: Remove the rest of this test once GIT_ANNEX_MIN_VERSION is
    # at least 7.20190912 (which dropped direct mode support).

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest(
            "Created repo not v5, cannot test detection of direct mode repos")

    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to disable somehow the flywheel
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)

    # TODO: RM DIRECT decide what should we here -- should we test/blow?
    # ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2
    del ar2sub1
    AnnexRepo._unique_instances.clear()  # fight flyweight

    ar2 = AnnexRepo(path2)
    list(ar2.get_submodules_())

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2
    AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')

    '.bidsignore',
    'code/**',
    '*.tsv',
    '*.json',
    '*.txt',
]

# just to be sure + _scans.tsv could contain dates
force_in_annex = [
    '*.nii.gz',
    '*.tgz',
    '*_scans.tsv',
]

# make an attempt to discover the prospective change in .gitattributes
# to decide what needs to be done, and make this procedure idempotent
# (for simple cases)
attr_fpath = op.join(ds.path, '.gitattributes')
if op.lexists(attr_fpath):
    with open(attr_fpath, 'rb') as f:
        attrs = f.read().decode()
else:
    attrs = ''

for paths, largefile in [
        (force_in_annex, 'anything'),
        (force_in_git, 'nothing'),
]:
    # amend gitattributes, if needed
    ds.repo.set_gitattributes([
        (path, {'annex.largefiles': largefile})
        for path in paths

def __call__(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    import os
    import tarfile
    import zipfile
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    import os.path as op

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.local.export_archive')

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export archive')

    repo = dataset.repo
    committed_date = repo.get_commit_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format('.' if compression else '', compression)
        if archivetype == 'tar' else '')

    default_filename = "datalad_{.id}".format(dataset)
    if filename is None:
        filename = default_filename  # in current directory
    elif path.exists(filename) and path.isdir(filename):
        filename = path.join(filename, default_filename)  # under given directory
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(repo_files, allow_quick=True, batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning if missing_content == 'continue' else lgr.debug)(
                            'File %s has no content available, skipped', fpath)
                        continue
                    else:
                        raise IOError('File %s has no content available' % fpath)

                # resolve to possible link target
                if op.islink(fpath):
                    link_target = os.readlink(fpath)
                    if not isabs(link_target):
                        link_target = normpath(
                            opj(dirname(fpath), link_target))
                    fpath = link_target

            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(fpath, arcname=aname,
                       **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)
    yield dict(status='ok', path=filename, type='file',
               action='export_archive', logger=lgr)

)
from datalad.support.exceptions import (
    MissingExternalDependency,
)

try:
    import github as gh
except ImportError:
    # make sure that the command complains too
    assert_raises(MissingExternalDependency, create_sibling_github, 'some')
    raise SkipTest

# Keep fixtures local to this test file
from datalad.support import path as op
FIXTURES_PATH = op.join(op.dirname(__file__), 'vcr_cassettes')


def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally

    TODO: RF so could be used in other places as well
    """
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)


@with_tempfile
def test_invalid_call(path):
    # no dataset
    assert_raises(ValueError, create_sibling_github, 'bogus', dataset=path)
    ds = Dataset(path).create()

)
from datalad.support.exceptions import (
    MissingExternalDependency,
)

try:
    import github as gh
except ImportError:
    # make sure that the command complains too
    assert_raises(MissingExternalDependency, create_sibling_github, 'some')
    raise SkipTest

# Keep fixtures local to this test file
from datalad.support import path as op
FIXTURES_PATH = op.join(op.dirname(__file__), 'vcr_cassettes')


def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally and skip if there is no vcr

    TODO: RF local aspect so could be used in other places as well
    """
    kwargs.setdefault('skip_if_no_vcr', True)
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)


@with_tempfile
def test_invalid_call(path):
    # no dataset
    assert_raises(ValueError, create_sibling_github, 'bogus', dataset=path)

def resolve_path(path, ds=None, ds_resolved=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot,
    as resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specific path specification. Multiple path
      specifications can be given as a list
    ds : Dataset or PathLike or None
      Dataset instance to resolve relative paths against.
    ds_resolved : Dataset or None
      A dataset instance that was created from `ds` outside can be provided
      to avoid multiple instantiation on repeated calls.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = ds_resolved or require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    pwd_parts = None  # get it upon first use but only once
    for p in ensure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given datasets instance
        elif not Path(p).is_absolute():
            # we have a dataset and no abspath nor an explicit relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            if not pwd_parts:
                pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(*(
                    pwd_parts[:-leading_parents if leading_parents else None]
                    + path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (str, PurePath)) else out

def __call__(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    import os
    import tarfile
    import zipfile
    from mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    import os.path as op

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo
    from datalad.dochelpers import exc_str

    import logging
    lgr = logging.getLogger('datalad.plugin.export_archive')

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export archive')

    repo = dataset.repo
    committed_date = repo.get_commit_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format(
            '.' if compression else '',
            compression) if archivetype == 'tar' else '')

    default_filename = "datalad_{.id}".format(dataset)
    if filename is None:
        filename = default_filename  # in current directory
    elif path.exists(filename) and path.isdir(filename):
        filename = path.join(filename, default_filename)  # under given directory
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning if missing_content == 'continue' else lgr.debug)(
                            'File %s has no content available, skipped', fpath)
                        continue
                    else:
                        raise IOError('File %s has no content available' % fpath)

                # resolve to possible link target
                if op.islink(fpath):
                    link_target = os.readlink(fpath)
                    if not isabs(link_target):
                        link_target = normpath(opj(dirname(fpath), link_target))
                    fpath = link_target

            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(
                fpath,
                arcname=aname,
                **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)
    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='export_archive',
        logger=lgr)

# will get its own .gitattributes entry to keep it out of the annex
# give relative path to dataset root (use platform notation)
force_in_git = [
    'README',
    'CHANGES',
    'dataset_description.json',
    '.bidsignore',
    'code/**',
    # to not put participants or scan info into Git, might contain sensitive
    # information
    #'*.tsv',
]

# make an attempt to discover the prospective change in .gitattributes
# to decide what needs to be done, and make this procedure idempotent
# (for simple cases)
attr_fpath = op.join(ds.path, '.gitattributes')
if op.lexists(attr_fpath):
    with open(attr_fpath, 'rb') as f:
        attrs = f.read().decode()
else:
    attrs = ''

# amend gitattributes, if needed
ds.repo.set_gitattributes([
    (path, {'annex.largefiles': 'nothing'})
    for path in force_in_git
    if '{} annex.largefiles=nothing'.format(path) not in attrs
])

# leave clean
ds.save(

    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))


@pytest.mark.parametrize(
    "return_value,target_value,kwargs",
    [
        ([], None, {}),
        (['file.txt'], None, {}),
        (['file.txt', op.join('d', 'f')], None, {}),
        ([op.join('d', 'f'), op.join('d', 'f2')], 'd', {}),
        ([op.join('d', 'f'), op.join('d', 'f2')], 'd', {'consider': 'd'}),
        ([op.join('d', 'f'), op.join('d', 'f2')], None, {'consider': 'dd'}),
        ([op.join('d', 'f'), op.join('d2', 'f2')], None, {}),
        ([op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], op.join('d', 'd2'), {}),
        ([op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], 'd', {'depth': 1}),
        # with some parasitic files