def decompress_file(archive, dir_, leading_directories='strip'):
    """Decompress `archive` into a directory `dir_`

    Parameters
    ----------
    archive: str
    dir_: str
    leading_directories: {'strip', None}
      If `strip`, and archive contains a single leading directory under which
      all content is stored, all the content will be moved one directory up
      and that leading directory will be removed.
    """
    if not exists(dir_):
        lgr.debug("Creating directory %s to extract archive into" % dir_)
        os.makedirs(dir_)
    _decompress_file(archive, dir_)

    if leading_directories == 'strip':
        _, dirs, files = next(os.walk(dir_))
        if not len(files) and len(dirs) == 1:
            # move all the content under dirs[0] up one level
            widow_dir = opj(dir_, dirs[0])
            lgr.debug("Moving content within %s upstairs" % widow_dir)
            subdir, subdirs_, files_ = next(os.walk(opj(dir_, dirs[0])))
            for f in subdirs_ + files_:
                os.rename(opj(subdir, f), opj(dir_, f))
            rmdir(widow_dir)
    elif leading_directories is None:
        pass  # really do nothing
    else:
        raise NotImplementedError("Not supported %s" % leading_directories)
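# A hedged usage sketch for decompress_file above; the archive path and
# output directory are hypothetical, and 'opj' is the os.path.join alias
# the function itself relies on.
def _example_decompress_file(tmpdir):
    # hypothetical archive whose content sits under a single leading directory
    archive = opj(tmpdir, 'sample.tar.gz')
    outdir = opj(tmpdir, 'sample-extracted')
    decompress_file(archive, outdir, leading_directories='strip')
    # outdir now holds the archive content directly; the wrapping top-level
    # directory was removed by the 'strip' strategy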
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive, path=path)
    assert_true(op.exists(archive))
    if annex:
        # it should work even when the file is annexed and is a symlink
        # to the key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
def __init__(self, archive, path=None, persistent=False):
    self._archive = archive
    # TODO: bad location for extracted archive -- use tempfile
    if not path:
        path = tempfile.mktemp(**get_tempfile_kwargs(
            prefix=_get_cached_filename(archive)))
    if exists(path) and not persistent:
        raise RuntimeError(
            "Directory %s already exists, but it was not meant to persist"
            % path)
    self._persistent = persistent
    self._path = path
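# A minimal sketch of the persistence contract above; 'ExtractedArchive' is
# an assumed name for the class owning this __init__, and the archive path
# is hypothetical.
def _example_persistent_extraction(tmpdir):
    archive = opj(tmpdir, 'some.tar.gz')
    # persistent=True tolerates (and reuses) a pre-existing extraction
    # directory; the default would raise RuntimeError on a second run
    ea = ExtractedArchive(archive, path=opj(tmpdir, 'some-extracted'),
                          persistent=True)
    return ea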
def test__version__():
    # in a released state, the version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)')
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # the first section header we hit must be our changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            # (note: rstrip removes a trailing run of characters from the
            # set '.devdirty', not that literal suffix)
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template -- we can only assert that its
                # version should be higher than the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # it was not tagged yet and the changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return
    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename
    )
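# Illustrative note on the sanitization in the test above: str.rstrip()
# removes a trailing run of characters drawn from a set, not a literal
# suffix, which is why '.devdirty' handles a trailing '.dirty' but leaves
# a '.devN' suffix (ending in a digit) untouched.
def _example_rstrip_sanitization():
    assert '0.14.5.dirty'.rstrip('.devdirty') == '0.14.5'
    assert '0.14.5.dev1'.rstrip('.devdirty') == '0.14.5.dev1'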
def get_extracted_file(self, afile):
    lgr.debug(u"Requested file {afile} from archive {self._archive}"
              .format(**locals()))
    # TODO: this could be a good place to provide a "compatibility" layer if
    # filenames within the archive are too obscure for the local file system.
    # We could somehow adjust them while extracting and here channel back
    # the "fixed up" names, since they only point to the load
    self.assure_extracted()
    path = self.get_extracted_filename(afile)
    # TODO: make robust
    lgr.log(2, "Verifying that %s exists", abspath(path))
    assert exists(path), "%s must exist" % path
    return path
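# A hedged usage sketch for get_extracted_file above; 'handle' stands for an
# instance of the owning class, and '1.txt' is a hypothetical archive member.
def _example_get_extracted_file(handle):
    path = handle.get_extracted_file('1.txt')
    # extraction happened on demand via assure_extracted(); 'path' points at
    # the extracted copy inside the cache directory
    with open(path) as f:
        return f.read()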
def test_ArchivesCache():
    # we don't actually need to test archives handling itself
    path1 = "/zuba/duba"
    path2 = "/zuba/duba2"
    # should not be able to create a persistent cache without topdir
    assert_raises(ValueError, ArchivesCache, persistent=True)
    cache = ArchivesCache()  # by default -- non persistent

    archive1_path = op.join(path1, fn_archive_obscure_ext)
    archive2_path = op.join(path2, fn_archive_obscure_ext)
    cached_archive1_path = cache[archive1_path].path
    assert_false(cache[archive1_path].path == cache[archive2_path].path)
    assert_true(cache[archive1_path] is cache[archive1_path])
    cache.clean()
    assert_false(op.exists(cached_archive1_path))
    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))
def check_decompress_file(leading_directories, path):
    outdir = op.join(path, 'simple-extracted')
    with swallow_outputs() as cmo:
        decompress_file(op.join(path, fn_archive_obscure_ext),
                        outdir,
                        leading_directories=leading_directories)
        eq_(cmo.out, "")
        eq_(cmo.err, "")

    path_archive_obscure = op.join(outdir, fn_archive_obscure)
    if leading_directories == 'strip':
        assert_false(op.exists(path_archive_obscure))
        testpath = outdir
    elif leading_directories is None:
        assert_true(op.exists(path_archive_obscure))
        testpath = path_archive_obscure
    else:
        raise NotImplementedError("Dunno about this strategy: %s"
                                  % leading_directories)

    assert_true(op.exists(op.join(testpath, '3.txt')))
    assert_true(op.exists(op.join(testpath, fn_in_archive_obscure)))
    with open(op.join(testpath, '3.txt')) as f:
        eq_(f.read(), '3 load')
def clean(self, force=False):
    # would interfere with tests
    # if os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
    #     lgr.info("As instructed, not cleaning up the cache under %s"
    #              % self._path)
    #     return

    for path, name in [(self._path, 'cache'),
                       (self.stamp_path, 'stamp file')]:
        if exists(path):
            if (not self._persistent) or force:
                lgr.debug("Cleaning up the %s for %s under %s",
                          name, self._archive, path)
                # TODO: we must be careful here not to modify permissions of
                # files -- only of directories
                (rmtree if isdir(path) else unlink)(path)
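# A small sketch of the persistence semantics of clean() above: a persistent
# cache survives a plain clean(), while force=True removes it regardless;
# 'cache' stands for an instance of the owning class.
def _example_cache_clean(cache):
    cache.clean()            # no-op if the cache was created persistent
    cache.clean(force=True)  # removes the cache dir and stamp file anyway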
def __init__(self, toppath=None, persistent=False):
    self._toppath = toppath
    if toppath:
        path = opj(toppath, ARCHIVES_TEMP_DIR)
        if not persistent:
            tempsuffix = "-" + _get_random_id()
            lgr.debug(
                "For non-persistent archives using %s suffix for path %s",
                tempsuffix, path)
            path += tempsuffix
    else:
        if persistent:
            raise ValueError(
                "%s cannot be persistent since no toppath was provided"
                % self)
        path = tempfile.mktemp(**get_tempfile_kwargs())
    self._path = path
    self.persistent = persistent
    # TODO? ensure that it is absent, or should we allow it to persist a bit?
    #if exists(path):
    #    self._clean_cache()
    self._archives = {}

    # TODO: begging for a race condition
    if not exists(path):
        lgr.debug("Initiating clean cache for the archives under %s"
                  % self.path)
        try:
            os.makedirs(path)
            # record that we created the path only after makedirs succeeded
            self._made_path = True
            lgr.debug("Cache initialized")
        except Exception:
            lgr.error("Failed to initialize cache under %s" % path)
            raise
    else:
        lgr.debug(
            "Not initiating existing cache for the archives under %s"
            % self.path)
        self._made_path = False
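# A hedged sketch of the path logic above: with a toppath, a non-persistent
# cache gets a random suffix so concurrent caches do not collide, while a
# persistent one uses the stable ARCHIVES_TEMP_DIR path; '/tmp/ds' is a
# hypothetical dataset location.
def _example_archives_cache_paths():
    transient = ArchivesCache('/tmp/ds')                  # suffixed path
    durable = ArchivesCache('/tmp/ds', persistent=True)   # stable path
    assert transient.path != durable.path
    # and persistent=True without a toppath is rejected with ValueError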
def is_extracted(self):
    return exists(self.path) \
        and exists(self.stamp_path) \
        and os.stat(self.stamp_path).st_mtime >= os.stat(self.path).st_mtime
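# A sketch of the freshness check above: the stamp file must be at least as
# new as the extraction directory, so bumping the directory mtime past the
# stamp invalidates the "extracted" state; 'extracted' stands for an
# instance of the owning class.
def _example_stamp_freshness(extracted):
    import os
    import time
    if extracted.is_extracted():
        future = time.time() + 10
        os.utime(extracted.path, (future, future))
        assert not extracted.is_extracted()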
def __call__(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    import os
    import tarfile
    import zipfile
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    import os.path as op

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.local.export_archive')

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export archive')

    repo = dataset.repo
    committed_date = repo.get_commit_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format('.' if compression else '', compression)
        if archivetype == 'tar' else '')

    default_filename = "datalad_{.id}".format(dataset)
    if filename is None:
        filename = default_filename  # in current directory
    elif op.exists(filename) and op.isdir(filename):
        filename = op.join(filename, default_filename)  # under given directory
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning
                         if missing_content == 'continue'
                         else lgr.debug)(
                            'File %s has no content available, skipped',
                            fpath)
                        continue
                    else:
                        raise IOError(
                            'File %s has no content available' % fpath)
                # resolve to possible link target
                if op.islink(fpath):
                    link_target = os.readlink(fpath)
                    if not isabs(link_target):
                        link_target = normpath(
                            opj(dirname(fpath), link_target))
                    fpath = link_target
            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(
                fpath,
                arcname=aname,
                **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)
    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='export_archive',
        logger=lgr)
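# A hedged usage sketch of the exporter above, calling the unbound __call__
# directly; 'ds' stands for an installed datalad Dataset.  With the defaults
# this would write datalad_<dataset id>.tar.gz into the current directory
# and yield a single 'ok' result record.
def _example_export_archive(ds):
    for res in __call__(ds, archivetype='tar', compression='gz'):
        assert res['status'] == 'ok'
        print('archive written to', res['path'])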
def crawl_github_org(data):
    assert list(data) == ['datalad_stats'], data
    # TODO: actually populate the datalad_stats with # of datasets and
    # possibly amount of data downloaded in get below
    # Needs DataLad >= 0.13.6~7^2~3 where password was removed
    entity, cred = next(_gen_github_entity(None, org))
    all_repos = list(entity.get_repos(repo_type))

    for repo in all_repos:
        name = repo.name
        if include and not re.search(include, name):
            lgr.debug(
                "Skipping %s since include regex search returns nothing",
                name)
            continue
        if exclude and re.search(exclude, name):
            lgr.debug(
                "Skipping %s since exclude regex search returns a match",
                name)
            continue
        # Let's just do everything here
        dspath = name
        if op.exists(dspath):
            lgr.info("Skipping %s since it already exists", name)
            # although we could just do install, which would at least
            # verify that url is the same... to not try to aggregate
            # etc, we will just skip for now
            continue

        # TODO: all the recursive etc options
        try:
            ds = superds.install(
                dspath, source=repo.clone_url, get_data=get_data,
                on_failure='continue')
        except Exception as exc:
            if all(f.get('action', '') == 'add_submodule'
                   and f.get('status', '') == 'error'
                   for f in exc.failed):
                # since we do not like nice exceptions and want to parse
                # arbitrary text in the return records... let's resist that
                # urge and redo the check, since if there is no commit --
                # the likely reason is an empty repo
                if GitRepo(dspath).get_hexsha() is None:
                    lgr.warning(
                        "Cloned an empty repository. Removing and "
                        "proceeding without error")
                    rmtree(dspath)
                    continue
            if all(f.get('action', '') == 'get' for f in exc.failed):
                lgr.warning(
                    "We failed to obtain %d files, extracted metadata etc "
                    "might be incomplete", len(exc.failed))
                ds = Dataset(exc.failed[0]['refds'])
            else:
                raise

        if metadata_nativetypes:
            lgr.info("Setting native metadata types to include %s",
                     ", ".join(metadata_nativetypes))
            nativetypes = ds.config.obtain(
                'datalad.metadata.nativetype', default=[])
            for nativetype in metadata_nativetypes:
                if nativetype not in nativetypes:
                    lgr.debug("Adding %s nativetype", nativetype)
                    ds.config.add(
                        'datalad.metadata.nativetype', nativetype,
                        where='local')
                else:
                    lgr.debug(
                        "Not adding %s nativetype since already defined",
                        nativetype)
        # if anyone down the line needs it
        aggregate_later.append(dspath)
        yield {
            'dataset': ds,
            'superdataset': superds,
            'dataset_path': dspath,
            'dataset_name': name,
            'dataset_url': repo.clone_url,
        }
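# An illustrative demo of the include/exclude filtering above: a repository
# name is kept only if it matches 'include' (when given) and does not match
# 'exclude' (when given); re.search matches anywhere in the name.  The
# patterns and names here are made up.
def _example_include_exclude_filtering():
    import re
    include, exclude = r'^ds\d+$', r'test'
    names = ['ds000001', 'ds-test', 'scratch', 'ds000002']
    kept = [n for n in names
            if (not include or re.search(include, n))
            and not (exclude and re.search(exclude, n))]
    assert kept == ['ds000001', 'ds000002']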