Example #1
def decompress_file(archive, dir_, leading_directories='strip'):
    """Decompress `archive` into a directory `dir_`

    Parameters
    ----------
    archive: str
    dir_: str
    leading_directories: {'strip', None}
      If `strip` and the archive contains a single leading directory under
      which all of the content is stored, that content is moved one directory
      up and the then-empty leading directory is removed.
    """
    if not exists(dir_):
        lgr.debug("Creating directory %s to extract archive into" % dir_)
        os.makedirs(dir_)

    _decompress_file(archive, dir_)

    if leading_directories == 'strip':
        _, dirs, files = next(os.walk(dir_))
        if not len(files) and len(dirs) == 1:
            # move all the content under dirs[0] up 1 level
            widow_dir = opj(dir_, dirs[0])
            lgr.debug("Moving content within %s upstairs" % widow_dir)
            subdir, subdirs_, files_ = next(os.walk(widow_dir))
            for f in subdirs_ + files_:
                os.rename(opj(subdir, f), opj(dir_, f))
            rmdir(widow_dir)
    elif leading_directories is None:
        pass  # really do nothing
    else:
        raise NotImplementedError("Unsupported leading_directories value: %s"
                                  % leading_directories)
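
A minimal, self-contained sketch of the 'strip' step described in the docstring above, using only the standard library; the helper name strip_leading_directory and the temporary layout are made up for illustration and are not part of the API shown.

import os
import shutil
import tempfile

def strip_leading_directory(dir_):
    # if dir_ holds nothing but a single subdirectory, move that
    # subdirectory's content one level up and remove the empty shell
    _, dirs, files = next(os.walk(dir_))
    if not files and len(dirs) == 1:
        widow_dir = os.path.join(dir_, dirs[0])
        for entry in os.listdir(widow_dir):
            os.rename(os.path.join(widow_dir, entry), os.path.join(dir_, entry))
        os.rmdir(widow_dir)

# tiny self-check: <tmp>/payload/{a.txt, sub/} becomes <tmp>/{a.txt, sub/}
tmp = tempfile.mkdtemp()
os.makedirs(os.path.join(tmp, "payload", "sub"))
with open(os.path.join(tmp, "payload", "a.txt"), "w") as f:
    f.write("content")
strip_leading_directory(tmp)
assert sorted(os.listdir(tmp)) == ["a.txt", "sub"]
shutil.rmtree(tmp)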
Example #2
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive,
                   path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
Example #3
    def __init__(self, archive, path=None, persistent=False):
        self._archive = archive
        # TODO: bad location for extracted archive -- use tempfile
        if not path:
            path = tempfile.mktemp(**get_tempfile_kwargs(prefix=_get_cached_filename(archive)))

        if exists(path) and not persistent:
            raise RuntimeError("Directory %s already exists although it is "
                               "not supposed to persist" % path)
        self._persistent = persistent
        self._path = path
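
For reference, a hedged sketch of a safer variant of the temporary-path choice above: tempfile.mkdtemp creates the directory atomically instead of only generating a name, avoiding the race inherent in mktemp (the prefix below is illustrative, not what _get_cached_filename would produce).

import tempfile

# mktemp() only returns a name, so another process could grab the path first;
# mkdtemp() creates the directory atomically and returns its path
path = tempfile.mkdtemp(prefix="archive_cache-")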
Example #4
def test__version__():
    # for a released version, the version in the most recent CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)'
                       )
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # skip until the first section header -- it must be the most
                # recent changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return

    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename
    )
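
To make the capture groups above concrete, here is a small standalone sketch running the same regex against two made-up header lines, one in the released form and one in the '???' template form the test distinguishes.

import re

regex = re.compile(r'^## '
                   r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                   r'\((?P<date>.*)\)'
                   r'\s+--\s+'
                   r'(?P<codename>.+)')

# sample lines, not taken from any real CHANGELOG
for line in ('## 1.2.3 (2020-01-01) -- some codename',
             '## 1.2.4 (???) -- will be better than ever'):
    print(regex.match(line).groupdict())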
Example #5
    def get_extracted_file(self, afile):
        lgr.debug(u"Requested file {afile} from archive {self._archive}".format(**locals()))
        # TODO: this could be a good place to provide a "compatibility" layer
        # in case filenames within the archive are too obscure for the local
        # file system.  We could adjust them while extracting and translate
        # the "fixed up" names back here, since they only point to the payload.
        self.assure_extracted()
        path = self.get_extracted_filename(afile)
        # TODO: make robust
        lgr.log(2, "Verifying that %s exists", abspath(path))
        assert exists(path), "%s must exist" % path
        return path
Example #6
def test_ArchivesCache():
    # we don't actually need to test archives handling itself
    path1 = "/zuba/duba"
    path2 = "/zuba/duba2"
    # should not be able to create a persistent cache without topdir
    assert_raises(ValueError, ArchivesCache, persistent=True)
    cache = ArchivesCache()  # by default -- non persistent

    archive1_path = op.join(path1, fn_archive_obscure_ext)
    archive2_path = op.join(path2, fn_archive_obscure_ext)
    cached_archive1_path = cache[archive1_path].path
    assert_false(cache[archive1_path].path == cache[archive2_path].path)
    assert_true(cache[archive1_path] is cache[archive1_path])
    cache.clean()
    assert_false(op.exists(cached_archive1_path))
    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))
Example #7
def check_decompress_file(leading_directories, path):
    outdir = op.join(path, 'simple-extracted')

    with swallow_outputs() as cmo:
        decompress_file(op.join(path, fn_archive_obscure_ext), outdir,
                        leading_directories=leading_directories)
        eq_(cmo.out, "")
        eq_(cmo.err, "")

    path_archive_obscure = op.join(outdir, fn_archive_obscure)
    if leading_directories == 'strip':
        assert_false(op.exists(path_archive_obscure))
        testpath = outdir
    elif leading_directories is None:
        assert_true(op.exists(path_archive_obscure))
        testpath = path_archive_obscure
    else:
        raise NotImplementedError("Unknown leading_directories strategy: %s"
                                  % leading_directories)

    assert_true(op.exists(op.join(testpath, '3.txt')))
    assert_true(op.exists(op.join(testpath, fn_in_archive_obscure)))
    with open(op.join(testpath, '3.txt')) as f:
        eq_(f.read(), '3 load')
Example #8
    def clean(self, force=False):
        # would interfere with tests
        # if os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
        #     lgr.info("As instructed, not cleaning up the cache under %s"
        #              % self._path)
        #     return

        for path, name in [(self._path, 'cache'),
                           (self.stamp_path, 'stamp file')]:
            if exists(path):
                if (not self._persistent) or force:
                    lgr.debug("Cleaning up the %s for %s under %s", name,
                              self._archive, path)
                    # TODO: we must be careful here not to modify permissions
                    #       of files, only of directories
                    (rmtree if isdir(path) else unlink)(path)
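
The last line above packs the "pick the right removal call" choice into one expression; a throw-away sketch of the same idiom (the temporary file and directory exist only for the demonstration).

import os
import tempfile
from os import unlink
from os.path import isdir
from shutil import rmtree

dpath = tempfile.mkdtemp()
fd, fpath = tempfile.mkstemp()
os.close(fd)
for p in (dpath, fpath):
    (rmtree if isdir(p) else unlink)(p)   # rmtree for dirs, unlink for files
    assert not os.path.exists(p)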
Example #9
    def __init__(self, toppath=None, persistent=False):

        self._toppath = toppath
        if toppath:
            path = opj(toppath, ARCHIVES_TEMP_DIR)
            if not persistent:
                tempsuffix = "-" + _get_random_id()
                lgr.debug(
                    "For non-persistent archives using %s suffix for path %s",
                    tempsuffix, path)
                path += tempsuffix
        else:
            if persistent:
                raise ValueError(
                    "%s cannot be persistent since no toppath was provided" %
                    self)
            path = tempfile.mktemp(**get_tempfile_kwargs())
        self._path = path
        self.persistent = persistent
        # TODO?  ensure that it is absent or we should allow for it to persist a bit?
        #if exists(path):
        #    self._clean_cache()
        self._archives = {}

        # TODO: begging for a race condition
        if not exists(path):
            lgr.debug("Initiating clean cache for the archives under %s" %
                      self.path)
            try:
                os.makedirs(path)
                self._made_path = True
                lgr.debug("Cache initialized")
            except Exception:
                self._made_path = False
                lgr.error("Failed to initialize cache under %s" % path)
                raise
        else:
            lgr.debug(
                "Not initiating existing cache for the archives under %s" %
                self.path)
            self._made_path = False
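
The "begging for a race condition" TODO refers to the gap between the exists() check and os.makedirs(); below is a sketch of the usual stdlib remedy, offered as an alternative rather than what the class above does.

import os

def ensure_cache_dir(path):
    # tolerate another process creating the directory between check and create
    existed_before = os.path.isdir(path)
    os.makedirs(path, exist_ok=True)
    return not existed_before   # best-effort "did this call create it?"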
Example #10
    def is_extracted(self):
        return exists(self.path) and exists(self.stamp_path) \
            and os.stat(self.stamp_path).st_mtime >= os.stat(self.path).st_mtime
Example #11
    def __call__(dataset,
                 filename=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo

        import logging
        lgr = logging.getLogger('datalad.local.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename,
                               default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(repo_files,
                                              allow_quick=True,
                                              batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(repo_files,
                                                    allow_quick=True,
                                                    batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue' else lgr.debug)(
                                 'File %s has no content available, skipped',
                                 fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' %
                                          fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath,
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(status='ok',
                   path=filename,
                   type='file',
                   action='export_archive',
                   logger=lgr)
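
A self-contained sketch of the tarinfo-filter technique used above, with a made-up COMMIT_DATE; note that the patch('time.time', ...) in the code above is still needed because the gzip header records its own timestamp, which the filter does not reach.

import os
import tarfile
import tempfile

COMMIT_DATE = 1500000000  # illustrative epoch seconds, not a real commit date

def _filter_tarinfo(ti):
    # pin member mtimes to the commit date so the archive is reproducible
    # regardless of filesystem timestamps
    ti.mtime = COMMIT_DATE
    return ti

tmpdir = tempfile.mkdtemp()
src = os.path.join(tmpdir, 'payload.txt')
with open(src, 'w') as f:
    f.write('content')

archive_path = os.path.join(tmpdir, 'demo.tar.gz')
with tarfile.open(archive_path, 'w:gz') as tar:
    tar.add(src, arcname='demo/payload.txt',
            recursive=False, filter=_filter_tarinfo)

with tarfile.open(archive_path, 'r:gz') as tar:
    assert all(m.mtime == COMMIT_DATE for m in tar.getmembers())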
Example #12
    def __call__(dataset, filename=None, archivetype='tar', compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype,
            '{}{}'.format(
                '.' if compression else '',
                compression) if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename, default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning if missing_content == 'continue' else lgr.debug)(
                                'File %s has no content available, skipped', fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' % fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(
                    fpath,
                    arcname=aname,
                    **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='export_archive',
            logger=lgr)
Example #13
    def crawl_github_org(data):
        assert list(data) == ['datalad_stats'], data
        # TODO: actually populate the datalad_stats with # of datasets and
        # possibly amount of data downloaded in get below
        # Needs DataLad >= 0.13.6~7^2~3 where password was removed
        entity, cred = next(_gen_github_entity(None, org))
        all_repos = list(entity.get_repos(repo_type))

        for repo in all_repos:
            name = repo.name
            if include and not re.search(include, name):
                lgr.debug(
                    "Skipping %s since include regex search returns nothing",
                    name)
                continue
            if exclude and re.search(exclude, name):
                lgr.debug(
                    "Skipping %s since exclude regex search returns a match",
                    name)
                continue
            # Let's just do everything here
            dspath = name
            if op.exists(dspath):
                lgr.info("Skipping %s since already exists", name)
                # although we could just do install, which would at least
                # verify that url is the same... to not try to aggregate
                # etc, we will just skip for now
                continue

            # TODO: all the recursive etc options
            try:
                ds = superds.install(dspath,
                                     source=repo.clone_url,
                                     get_data=get_data,
                                     on_failure='continue')
            except Exception as exc:
                if all(
                        f.get('action', '') == 'add_submodule'
                        and f.get('status', '') == 'error'
                        for f in exc.failed):
                    # rather than parse arbitrary text out of the return
                    # records, redo the check here: if there is no commit,
                    # the likely reason is an empty repo
                    if GitRepo(dspath).get_hexsha() is None:
                        lgr.warning(
                            "Cloned an empty repository.  Removing and proceeding without error"
                        )
                        rmtree(dspath)
                        continue
                if all(f.get('action', '') == 'get' for f in exc.failed):
                    lgr.warning(
                        "We failed to obtain %d files, extracted metadata etc might be incomplete",
                        len(exc.failed))
                    ds = Dataset(exc.failed[0]['refds'])
                else:
                    raise

            if metadata_nativetypes:
                lgr.info("Setting native metadata types to include %s",
                         ", ".join(metadata_nativetypes))
                nativetypes = ds.config.obtain('datalad.metadata.nativetype',
                                               default=[])
                for nativetype in metadata_nativetypes:
                    if nativetype not in nativetypes:
                        lgr.debug("Adding %s nativetype", nativetype)
                        ds.config.add('datalad.metadata.nativetype',
                                      nativetype,
                                      where='local')
                    else:
                        lgr.debug(
                            "Not adding %s nativetype since already defined",
                            nativetype)
            # if anyone down the line needs it
            aggregate_later.append(dspath)
            yield {
                'dataset': ds,
                'superdataset': superds,
                'dataset_path': dspath,
                'dataset_name': name,
                'dataset_url': repo.clone_url,
            }
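
The include/exclude handling at the top of the loop reduces to a small predicate; here is a sketch with hypothetical repository names, mirroring the same semantics (include must match if given, exclude must not).

import re

def want(name, include=None, exclude=None):
    if include and not re.search(include, name):
        return False
    if exclude and re.search(exclude, name):
        return False
    return True

assert want('ds000001', include=r'^ds')
assert not want('ds000001', exclude=r'0001$')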