def decompress_file(archive, dir_, leading_directories='strip'):
    """Extract `archive` into the directory `dir_`.

    Parameters
    ----------
    archive: str
      Path of the archive to decompress.
    dir_: str
      Target directory; created if it does not exist yet.
    leading_directories: {'strip', None}
      With 'strip', if all archive content lives under a single top-level
      directory, that content is moved one level up and the now-empty
      directory is removed.  With None the layout is left untouched.
    """
    if not exists(dir_):
        lgr.debug("Creating directory %s to extract archive into", dir_)
        os.makedirs(dir_)
    _decompress_file(archive, dir_)

    if leading_directories is None:
        return  # nothing more to do
    if leading_directories != 'strip':
        raise NotImplementedError("Not supported %s" % leading_directories)

    _, top_dirs, top_files = next(os.walk(dir_))
    if top_files or len(top_dirs) != 1:
        return  # no single leading directory -- nothing to strip
    lead_dir = opj(dir_, top_dirs[0])
    lgr.debug("Moving content within %s upstairs", lead_dir)
    parent, sub_dirs, sub_files = next(os.walk(opj(dir_, top_dirs[0])))
    for entry in sub_dirs + sub_files:
        os.rename(opj(parent, entry), opj(dir_, entry))
    # NFS might hold it victim so use rmtree so it tries a few times
    rmtree(lead_dir)
def test_sibling_inherit_no_super_remote(basedir):
    # Adding a sibling with inherit=True must not crash when the
    # superdataset has no remote of that `name` to inherit from.
    src = Dataset(opj(basedir, "source")).create()
    superds = Dataset(opj(basedir, "super")).create()
    clone = superds.clone(source=src.path, path="clone")
    clone.siblings(action="add", name="donotexist", inherit=True,
                   url=src.path, result_renderer=None)
def __init__(self, toppath=None, persistent=False):
    """Set up a (possibly temporary) cache directory for extracted archives.

    Parameters
    ----------
    toppath : str, optional
      Top directory under which the cache directory (ARCHIVES_TEMP_DIR)
      is placed; non-persistent caches get a randomized suffix.  If not
      given, a fresh temporary path is used instead.
    persistent : bool, optional
      If True, keep the cache path stable (no random suffix) so it can
      outlive this object.  Requires `toppath`.

    Raises
    ------
    ValueError
      If `persistent` is requested without a `toppath`.
    """
    self._toppath = toppath
    if toppath:
        path = opj(toppath, ARCHIVES_TEMP_DIR)
        if not persistent:
            tempsuffix = "-" + _get_random_id()
            lgr.debug("For non-persistent archives using %s suffix for path %s",
                      tempsuffix, path)
            path += tempsuffix
    else:
        if persistent:
            raise ValueError("%s cannot be persistent since no toppath was provided"
                             % self)
        # NOTE(review): mktemp only produces a name, it does not create the
        # directory; the path is created below if absent
        path = tempfile.mktemp(**get_tempfile_kwargs())
    self._path = path
    self.persistent = persistent
    # TODO? ensure that it is absent or we should allow for it to persist a bit?
    #if exists(path):
    #    self._clean_cache()
    # maps normalized archive path -> ExtractedArchive, populated lazily
    self._archives = {}

    # TODO: begging for a race condition (check-then-create is not atomic)
    if not exists(path):
        lgr.debug("Initiating clean cache for the archives under %s" % self.path)
        try:
            # NOTE(review): _made_path is set True before makedirs succeeds,
            # so it can be True even when creation failed -- confirm intended
            self._made_path = True
            os.makedirs(path)
            lgr.debug("Cache initialized")
        except Exception as e:
            lgr.error("Failed to initialize cached under %s" % path)
            raise
    else:
        # cache directory already exists -- reuse it and remember that we
        # did not create it (so cleanup logic can leave it alone)
        lgr.debug("Not initiating existing cache for the archives under %s" % self.path)
        self._made_path = False
def get_extracted_files(self):
    """Yield paths (relative to the extraction root) of all extracted files."""
    base = self.assure_extracted()
    # length of the prefix to cut off, including a trailing separator
    offset = len(base)
    if not base.endswith(os.sep):
        offset += len(os.sep)
    for root, _, files in os.walk(base):  # TEMP
        for fname in files:
            yield ensure_unicode(opj(root, fname)[offset:])
def test_sibling_inherit(basedir):
    # A sibling added in a subdataset with inherit=True should pick up the
    # configuration (here: the git-annex group) of the same-named remote
    # set up in the superdataset.
    src = Dataset(opj(basedir, "source")).create()
    superds = Dataset(opj(basedir, "super")).create()
    superds.siblings(action="add", name="source", url=src.path,
                     annex_group="grp", result_renderer=None)
    clone = superds.clone(source=src.path, path="clone")
    clone.siblings(action="add", name="source", url=src.path,
                   inherit=True, result_renderer=None)
    res = clone.siblings(action="query", name="source", result_renderer=None)
    eq_(res[0]["annex-group"], "grp")
def get_archive(self, archive):
    """Return the ExtractedArchive for `archive`, creating it on first use."""
    key = self._get_normalized_archive_path(archive)
    cached = self._archives.get(key)
    if cached is None:
        cached = ExtractedArchive(
            key,
            opj(self.path, _get_cached_filename(key)),
            persistent=self.persistent)
        self._archives[key] = cached
    return cached
def test_sibling_path_is_posix(basedir=None, otherpath=None):
    # path URL should come out POSIX as if `git clone` had configured it
    # for origin -- https://github.com/datalad/datalad/issues/3972
    src = Dataset(opj(basedir, "source")).create()
    # register a sibling using a system-native path as the url
    src.siblings(action="add", name="donotexist", url=otherpath,
                 result_renderer='disabled')
    res = src.siblings(action="query", name="donotexist",
                       result_renderer='disabled',
                       return_type='item-or-list')
    eq_(res['url'], Path(otherpath).as_posix())
def get_leading_directory(self, depth=None, consider=None, exclude=None):
    """Return the directory under which all content of the archive resides.

    Parameters
    ----------
    depth: int or None, optional
      Maximal depth of leading directories to consider.  If None - no upper
      limit.
    consider : list of str, optional
      Regular expressions for file/directory names to be considered (before
      exclude).  Applied to the entire relative path to the file as in the
      archive.
    exclude: list of str, optional
      Regular expressions for file/directory names to be excluded from
      consideration.  Applied to the entire relative path to the file as in
      the archive.

    Returns
    -------
    str or None
      None if there is no single leading directory.
    """
    leading = None
    # get_extracted_files yields only files, so no is-a-dir check is needed
    for fpath in self.get_extracted_files():
        if consider and not any_re_search(consider, fpath):
            continue
        if exclude and any_re_search(exclude, fpath):
            continue
        dpath = fpath.split(opsep)[:-1]  # directory components only
        if leading is None:
            # first match seeds the candidate, truncated to requested depth
            leading = dpath if depth is None else dpath[:depth]
        elif dpath[:len(leading)] != leading:
            # shrink the candidate to the longest shared prefix
            common = []
            for ours, theirs in zip(leading, dpath):
                if ours != theirs:
                    break
                common.append(ours)
            if not common:
                # nothing in common -- no single leading directory exists
                return None
            leading = common
    return None if leading is None else opj(*leading)
def _get_normalized_archive_path(self, archive): """Return full path to archive So we have consistent operation from different subdirs, while referencing archives from the topdir TODO: why do we need it??? """ if not isabs(archive) and self._toppath: out = normpath(opj(self._toppath, archive)) if relpath(out, self._toppath).startswith(pardir): raise RuntimeError("%s points outside of the topdir %s" % (archive, self._toppath)) if isdir(out): raise RuntimeError("got a directory here... bleh") return out return archive
def get_extracted_filename(self, afile):
    """Return the full path `afile` would have inside the extracted archive.

    Pure path computation -- no archive is actually extracted.
    """
    return opj(self.path, afile)
def test_siblings(origin, repo_path, local_clone_path):
    """End-to-end checks of the `siblings` command.

    Covers add/configure/remove/query, url vs pushurl handling, publication
    dependencies, and recursive operation with and without %NAME templates.
    """
    sshurl = "ssh://push-remote.example.com"
    httpurl1 = "http://remote1.example.com/location"
    httpurl2 = "http://remote2.example.com/location"

    # insufficient arguments
    # we need a dataset to work at
    with chpwd(repo_path):  # not yet there
        assert_raises(InsufficientArgumentsError,
                      siblings, 'add', url=httpurl1)

    # prepare src
    source = install(repo_path, source=origin, recursive=True)
    # pollute config
    depvar = 'remote.test-remote.datalad-publish-depends'
    source.config.add(depvar, 'stupid', where='local')

    # cannot configure unknown remotes as dependencies
    res = siblings(
        'configure',
        dataset=source,
        name="test-remote",
        url=httpurl1,
        publish_depends=['r1', 'r2'],
        on_failure='ignore',
        result_renderer=None)
    assert_status('error', res)
    eq_(res[0]['message'],
        ('unknown sibling(s) specified as publication dependency: %s',
         set(('r1', 'r2'))))
    # prior config was not changed by the failed call above
    eq_(source.config.get(depvar, None), 'stupid')

    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1,
                   result_xfm='paths',
                   result_renderer=None)
    eq_(res, [source.path])
    assert_in("test-remote", source.repo.get_remotes())
    eq_(httpurl1,
        source.repo.get_remote_url("test-remote"))

    # reconfiguring doesn't change anything
    siblings('configure', dataset=source, name="test-remote",
             url=httpurl1,
             result_renderer=None)
    assert_in("test-remote", source.repo.get_remotes())
    eq_(httpurl1,
        source.repo.get_remote_url("test-remote"))
    # re-adding doesn't work
    res = siblings('add', dataset=source, name="test-remote",
                   url=httpurl1, on_failure='ignore',
                   result_renderer=None)
    assert_status('error', res)
    # only after removal
    res = siblings('remove', dataset=source, name="test-remote",
                   result_renderer=None)
    assert_status('ok', res)
    assert_not_in("test-remote", source.repo.get_remotes())
    res = siblings('add', dataset=source, name="test-remote",
                   url=httpurl1, on_failure='ignore',
                   result_renderer=None)
    assert_status('ok', res)

    # add to another remote automagically taking it from the url
    # and being in the dataset directory
    with chpwd(source.path):
        res = siblings('add', url=httpurl2,
                       result_renderer=None)
        assert_result_count(
            res, 1,
            name="remote2.example.com", type='sibling')
        assert_in("remote2.example.com", source.repo.get_remotes())

    # don't fail with conflicting url, when using force:
    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1 + "/elsewhere",
                   result_renderer=None)
    assert_status('ok', res)
    eq_(httpurl1 + "/elsewhere",
        source.repo.get_remote_url("test-remote"))

    # no longer a use case, I would need additional convincing that
    # this is anyhow useful other than triple-checking other people's
    # errors. for an actual check use 'query'
    # maybe it could be turned into a set of warnings when `configure`
    # alters an existing setting, but then why call configure, if you
    # want to keep the old values
    #with assert_raises(RuntimeError) as cm:
    #    add_sibling(dataset=source, name="test-remote",
    #                url=httpurl1 + "/elsewhere")
    #assert_in("""'test-remote' already exists with conflicting settings""",
    #          str(cm.exception))

    ## add a push url without force fails, since in a way the fetch url is the
    ## configured push url, too, in that case:
    #with assert_raises(RuntimeError) as cm:
    #    add_sibling(dataset=source, name="test-remote",
    #                url=httpurl1 + "/elsewhere",
    #                pushurl=sshurl, force=False)
    #assert_in("""'test-remote' already exists with conflicting settings""",
    #          str(cm.exception))

    # add push url (force):
    res = siblings('configure',
                   dataset=source, name="test-remote",
                   url=httpurl1 + "/elsewhere",
                   pushurl=sshurl,
                   result_renderer=None)
    assert_status('ok', res)
    eq_(httpurl1 + "/elsewhere",
        source.repo.get_remote_url("test-remote"))
    eq_(sshurl,
        source.repo.get_remote_url("test-remote", push=True))

    # recursively:
    for r in siblings(
            'configure',
            dataset=source, name="test-remote",
            url=httpurl1 + "/%NAME",
            pushurl=sshurl + "/%NAME",
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote", repo.get_remotes())
        url = repo.get_remote_url("test-remote")
        pushurl = repo.get_remote_url("test-remote", push=True)
        # %NAME template expands into source-name/subds-name path pieces
        ok_(url.startswith(httpurl1 + '/' + basename(source.path)))
        ok_(url.endswith(basename(repo.path)))
        ok_(pushurl.startswith(sshurl + '/' + basename(source.path)))
        ok_(pushurl.endswith(basename(repo.path)))
        eq_(url, r['url'])
        eq_(pushurl, r['pushurl'])

    # recursively without template:
    for r in siblings(
            'configure',
            dataset=source, name="test-remote-2",
            url=httpurl1, pushurl=sshurl,
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False,
            result_renderer=None):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote-2", repo.get_remotes())
        url = repo.get_remote_url("test-remote-2")
        pushurl = repo.get_remote_url("test-remote-2", push=True)
        ok_(url.startswith(httpurl1))
        ok_(pushurl.startswith(sshurl))
        # FIXME: next condition used to compare the *Repo objects instead of
        # their paths. Due to missing annex-init in
        # datalad/tests/utils.py:clone_url this might not be the same, since
        # `source` actually is an annex, but after flavor 'clone' in
        # `with_testrepos` and then `install` any trace of an annex might be
        # gone in v5 (branch 'master' only), while in direct mode it still is
        # considered an annex. `repo` is forced to be a `GitRepo`, so we might
        # compare two objects of different classes while they actually are
        # pointing to the same repository.
        # See github issue #1854
        if repo.path != source.repo.path:
            ok_(url.endswith('/' + basename(repo.path)))
            ok_(pushurl.endswith(basename(repo.path)))
        eq_(url, r['url'])
        eq_(pushurl, r['pushurl'])

    # recursively without template and pushurl but full "hierarchy"
    # to a local clone
    for r in siblings(
            'configure',
            dataset=source, name="test-remote-3",
            url=local_clone_path,
            recursive=True,
            # we need to disable annex queries, as it will try to access
            # the fake URL configured above
            get_annex_info=False,
            result_renderer=None):
        repo = GitRepo(r['path'], create=False)
        assert_in("test-remote-3", repo.get_remotes())
        url = repo.get_remote_url("test-remote-3")
        pushurl = repo.get_remote_url("test-remote-3", push=True)
        # sibling url mirrors the dataset's location relative to `source`
        # under the local clone's path
        eq_(normpath(url),
            normpath(opj(local_clone_path,
                         relpath(str(r['path']), source.path))))
        # https://github.com/datalad/datalad/issues/3951
        ok_(not pushurl)  # no pushurl should be defined