Example 1
    def test_add_delete_after_and_drop(self):
        # Test that the .tar gets removed but that the new content was added
        # to the annex repo.  We know the key since the default backend and
        # content remain the same.
        key1 = 'SHA256E-s5--16d3ad1974655987dd7801d70659990b89bfe7e931a0a358964e64e901761cc0.dat'

        # previous state of things:
        prev_files = list(find_files('.*', self.annex.path))
        assert_equal(self.annex.whereis(key1, key=True, output='full'), {})

        commits_prior = list(self.annex.get_branch_commits_('git-annex'))
        add_archive_content('1.tar', annex=self.annex, strip_leading_dirs=True, delete_after=True)
        commits_after = list(self.annex.get_branch_commits_('git-annex'))
        # There should be a single commit for all additions, +1 to initiate
        # datalad-archives (gh-1258).  If faking dates, we expect one more
        # commit because annex.alwayscommit isn't set to false.
        assert_equal(len(commits_after),
                     len(commits_prior) + 2 + self.annex.fake_dates_enabled)
        assert_equal(prev_files, list(find_files('.*', self.annex.path)))
        w = self.annex.whereis(key1, key=True, output='full')
        assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

        # Let's now do the same but also drop content
        add_archive_content('1.tar', annex=self.annex, strip_leading_dirs=True, delete_after=True,
                            drop_after=True)
        assert_equal(prev_files, list(find_files('.*', self.annex.path)))
        w = self.annex.whereis(key1, key=True, output='full')
        assert_equal(len(w), 1)  # in archive

        # there should be no .datalad temporary files hanging around
        self.assert_no_trash_left_behind()
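
For quick reference, a sketch (not from the test suite; hypothetical repository path) contrasting the cleanup flags exercised above and in the next example:

from datalad.api import add_archive_content
from datalad.support.annexrepo import AnnexRepo

# Hypothetical setup: an annex repository that already has 1.tar committed.
annex = AnnexRepo('/path/to/repo')

# Default: extract, annex the content, and keep both the archive and the
# extracted files in the worktree.
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True)

# delete=True: additionally remove the archive itself from the repository.
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True, delete=True)

# delete_after=True: remove the extracted files from the worktree once they are
# registered; drop_after=True also drops their content from the local annex,
# leaving the archive copy as the only one.
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True,
                    delete_after=True, drop_after=True)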
Example 2
    def test_add_delete(self):
        # To test that .tar gets removed
        add_archive_content('1.tar',
                            annex=self.annex,
                            strip_leading_dirs=True,
                            delete=True)
        assert_false(lexists(opj(self.annex.path, '1.tar')))
Example 3
def test_add_archive_content_zip(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        with swallow_outputs():
            ds.save("1.zip", message="add 1.zip")
        add_archive_content("1.zip")
        ok_file_under_git(ds.pathobj / "1" / "foo", annexed=True)
        ok_file_under_git(ds.pathobj / "1" / "dir" / "bar", annexed=True)
        ok_archives_caches(ds.path, 0)
Example 4
def test_add_archive_content_zip(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        with swallow_outputs():
            repo.add(["1.zip"])
        repo.commit("add 1.zip")
        add_archive_content("1.zip")
        ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True)
        ok_file_under_git(opj("1", "dir", "bar"), annexed=True)
        ok_archives_caches(repo.path, 0)
Example 5
def test_annex_get_from_subdir(topdir):
    from datalad.api import add_archive_content
    annex = AnnexRepo(topdir, init=True)
    annex.add('a.tar.gz', commit=True)
    add_archive_content('a.tar.gz', annex=annex, delete=True)
    fpath = opj(topdir, 'a', 'd', fn_inarchive_obscure)

    with chpwd(opj(topdir, 'a', 'd')):
        runner = Runner()
        runner(['git', 'annex', 'drop', fn_inarchive_obscure])  # run git annex drop
        assert_false(annex.file_has_content(fpath))             # and verify that the file content was dropped
        runner(['git', 'annex', 'get', fn_inarchive_obscure])   # run git annex get
        assert_true(annex.file_has_content(fpath))              # and verify that the file content is back
Example 6
def test_add_archive_content_absolute_path(path):
    repo = AnnexRepo(opj(path, "ds"), create=True)
    repo.add(["1.tar.gz"])
    repo.commit("1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, annex=repo)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)
    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    with assert_raises(FileNotInRepositoryError):
        add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
Example 7
def test_add_archive_use_archive_dir(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        with assert_raises(RuntimeError) as cmr:
            add_archive_content(archive_path)
        assert_re_in(
            "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first"
            if on_windows else
            "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first",
            str(cmr.exception),
            match=False)
        with swallow_outputs():
            repo.add(archive_path)
        repo.commit("added 1.tar.gz")

        ok_archives_caches(repo.path, 0)
        add_archive_content(archive_path,
                            strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_archives_caches(repo.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)
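
A condensed sketch (hypothetical, assuming the same archive layout) of the extraction-location behavior this test checks: by default content is extracted next to the archive, while use_current_dir=True extracts into the current working directory:

from os.path import join as opj
from datalad.api import add_archive_content

# Hypothetical: the 4u/1.tar.gz archive is already saved and the current
# working directory is the repository root.  With use_current_dir=True the
# extracted file lands at the repo root ('1 f.txt'); by default it is
# extracted next to the archive ('4u/1 f.txt').
add_archive_content(opj('4u', '1.tar.gz'), strip_leading_dirs=True,
                    use_current_dir=True)
add_archive_content(opj('4u', '1.tar.gz'), strip_leading_dirs=True)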
Example 8
def test_add_archive_content_strip_leading(path_orig, url, repo_path):
    with chpwd(repo_path):
        repo = AnnexRepo(repo_path, create=True)

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(repo.path, 0)
Example 9
def test_add_archive_dirs(path_orig=None, url=None, repo_path=None):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Leading dirs are inconsistent and often absent,
            # so keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        eq_(
            repo.get_description(
                uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
            '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        # posixify paths to make it work on Windows as well
        all_files = [Path(file).as_posix() for file in all_files]
        target_files = {
            'CR24A/behaving1/1 f.txt',
            'CR24C/behaving3/3 f.txt',
            'CR24D/behaving2/2 f.txt',
            '.datalad/config',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was
        # getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))
Example 10
    def test_add_archive_leading_dir(self):
        import os
        os.mkdir(opj(self.annex.path, 'sub'))
        f123 = opj('sub', '123.tar')
        os.rename(opj(self.annex.path, '1.tar'), opj(self.annex.path, f123))
        self.annex.remove('1.tar', force=True)
        self.annex.add(f123)
        self.annex.commit(msg="renamed")
        add_archive_content(f123,
                            annex=self.annex,
                            add_archive_leading_dir=True,
                            strip_leading_dirs=True)
        ok_file_under_git(self.annex.path,
                          opj('sub', '123', 'file.txt'),
                          annexed=True)
Example 11
def test_add_archive_dirs(path_orig, url, repo_path):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        repo = AnnexRepo(repo_path, create=True)

        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Leading dirs are inconsistent and often absent, so keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        if external_versions['cmd:annex'] >= '6.20170208':
            # should have fixed remotes
            eq_(
                repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]
                ), '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        target_files = {
            './CR24A/behaving1/1 f.txt',
            './CR24C/behaving3/3 f.txt',
            './CR24D/behaving2/2 f.txt',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was getting stripped by leading_dir_len
        assert_false(exists(
            '__MACOSX'))  # if stripping and exclude didn't work this fails
        assert_false(
            exists('c-1_data')
        )  # if exclude doesn't work then name of subdir gets stripped by leading_dir_len
        assert_false(
            exists('CR24B')
        )  # if exclude doesn't work but everything else works this fails
Example 12
def test_add_archive_content_strip_leading(path_orig=None,
                                           url=None,
                                           repo_path=None):
    with chpwd(repo_path):
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(ds.path, 0)
Example 13
    def test_override_existing_under_git(self):
        create_tree(self.annex.path, {'1.dat': 'load2'})
        self.annex.add('1.dat', git=True)
        self.annex.commit('added to git')
        add_archive_content(
            '1.tar', annex=self.annex, strip_leading_dirs=True,
        )
        # and we did not bother adding it to annex (for now) -- just skipped
        # since we have it and it is the same
        ok_file_under_git(self.annex.path, '1.dat', annexed=False)

        # but if we say 'overwrite' -- we would remove and replace
        add_archive_content(
            '1.tar', annex=self.annex, strip_leading_dirs=True, delete=True
            , existing='overwrite'
        )
        ok_file_under_git(self.annex.path, '1.dat', annexed=True)
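
The existing= parameter used above controls what happens when an extracted file already exists in the repository. A hedged summary sketch (hypothetical repository path; behavior as asserted by this test and the larger tests near the end of this collection):

from datalad.api import add_archive_content
from datalad.support.annexrepo import AnnexRepo

# Hypothetical setup: the repository already tracks 1.dat and has 1.tar saved.
annex = AnnexRepo('/path/to/repo')

# Default: identical content is simply skipped; a file whose content changed
# is refused ("... exists, but would be overwritten by new file").
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True)

# existing='overwrite': remove and replace the existing file.
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True,
                    existing='overwrite')

# existing='archive-suffix': keep both, suffixing the newly extracted copy
# (e.g. '1 f-1.1.txt' as in the tests near the end of this collection).
add_archive_content('1.tar', annex=annex, strip_leading_dirs=True,
                    existing='archive-suffix')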
Example 14
def test_add_archive_use_archive_dir(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        res = add_archive_content(archive_path, on_failure='ignore')
        message = (
            "Can not add an untracked archive. Run 'datalad save 4u\\1.tar.gz'"
            if on_windows else
            "Can not add an untracked archive. Run 'datalad save 4u/1.tar.gz'")
        assert_in_results(res,
                          action='add-archive-content',
                          message=message,
                          status='impossible')

        with swallow_outputs():
            ds.save(archive_path)

        ok_archives_caches(ds.path, 0)
        add_archive_content(archive_path,
                            strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_archives_caches(ds.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(ds.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(ds.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)
Example 15
def test_annex_get_from_subdir(topdir):
    from datalad.api import add_archive_content
    annex = AnnexRepo(topdir, backend='MD5E', init=True)
    annex.add('a.tar.gz')
    annex.commit()
    add_archive_content('a.tar.gz', annex=annex, delete=True)
    fpath = op.join(topdir, 'a', 'd', fn_in_archive_obscure)

    with chpwd(op.join(topdir, 'a', 'd')):
        runner = WitlessRunner()
        runner.run(['git', 'annex', 'drop', '--', fn_in_archive_obscure],
                   protocol=KillOutput)  # run git annex drop
        assert_false(annex.file_has_content(
            fpath))  # and verify that the file content was dropped
        runner.run(['git', 'annex', 'get', '--', fn_in_archive_obscure],
                   protocol=KillOutput)  # run git annex get
        assert_true(annex.file_has_content(
            fpath))  # and verify that the file content is back
Example 16
    def test_add_delete_after_and_drop_subdir(self):
        os.mkdir(opj(self.annex.path, 'subdir'))
        mv_out = self.annex.call_git(
            ['mv', '1.tar', 'subdir']
        )
        self.annex.commit("moved into subdir")
        with chpwd(self.annex.path):
            # was failing since it deleted without considering whether the
            # tarball was extracted into that tarball's directory
            commits_prior_master = list(self.annex.get_branch_commits_())
            commits_prior = list(self.annex.get_branch_commits_('git-annex'))
            add_out = add_archive_content(
                opj('subdir', '1.tar'),
                delete_after=True,
                drop_after=True)
            assert_repo_status(self.annex.path)
            commits_after_master = list(self.annex.get_branch_commits_())
            commits_after = list(self.annex.get_branch_commits_('git-annex'))
            # There should be a single commit for all additions +1 to
            # initiate datalad-archives gh-1258.  If faking dates,
            # there should be another +1 because annex.alwayscommit
            # isn't set to false.
            assert_equal(len(commits_after),
                         len(commits_prior) + 2 + self.annex.fake_dates_enabled)
            assert_equal(len(commits_after_master), len(commits_prior_master))
            assert(add_out is self.annex)
            # there should be no .datalad temporary files hanging around
            self.assert_no_trash_left_behind()

            # and if we add some untracked file, redo, there should be no changes
            # to master and file should remain not committed
            create_tree(self.annex.path, {'dummy.txt': '123'})
            assert_true(self.annex.dirty)  # untracked file
            add_out = add_archive_content(
                opj('subdir', '1.tar'),
                delete_after=True,
                drop_after=True,
                allow_dirty=True)
            assert_repo_status(self.annex.path, untracked=['dummy.txt'])
            assert_equal(len(list(self.annex.get_branch_commits_())),
                         len(commits_prior_master))

            # there should be no .datalad temporary files hanging around
            self.assert_no_trash_left_behind()
Example 17
def check_observe_tqdm(topdir, topurl, outdir):
    # just a helper to enable/use when want quickly to get some
    # repository with archives and observe tqdm
    from datalad.api import create, add_archive_content
    ds = create(outdir)
    for f in '1.tar.gz', '2.tar.gz':
        with chpwd(outdir):
            ds.repo.add_url_to_file(f, topurl + f)
            ds.add(f)
            add_archive_content(f, delete=True, drop_after=True)
    files = glob.glob(op.join(outdir, '*'))
    ds.drop(files) # will not drop tarballs
    ds.repo.drop([], options=['--all', '--fast'])
    ds.get(files)
    ds.repo.drop([], options=['--all', '--fast'])
    # now loop so we could play with it outside
    print(outdir)
    # import pdb; pdb.set_trace()
    while True:
        sleep(0.1)
Example 18
def test_add_archive_content_absolute_path(path=None):
    ds = Dataset(opj(path, "ds")).create(force=True)
    repo = ds.repo
    ds.save("1.tar.gz", message="1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, dataset=ds)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)
    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)
    res = add_archive_content(opj(path, "notds", "2.tar.gz"),
                              dataset=ds,
                              on_failure='ignore')

    assert_in_results(
        res,
        action='add-archive-content',
        status='impossible',
        message='Can not add archive outside of the dataset',
    )
Example 19
def download_file(bucket, d, dataset_dir):
    link = bucket["links"]["self"]
    annex = Repo(dataset_dir).git.annex
    if "access_token" not in link:
        if bucket["type"] == "zip":
            d.download_url(link, archive=True)  # already in the "zip" branch
        else:
            try:  # Try to addurl twice as rarely it might not work on the first try
                annex("addurl", link, "--fast", "--file", link.split("/")[-1])
            except GitCommandError:
                annex("addurl", link, "--fast", "--file", link.split("/")[-1])
    else:  # Have to remove token from annex URL
        if bucket["type"] == "zip":
            file_path = d.download_url(link)[0]["path"]
            annex("rmurl", file_path, link)
            try:  # Try to addurl twice as rarely it might not work on the first try
                annex("addurl",
                      link.split("?")[0], "--file", file_path, "--relaxed")
            except GitCommandError:
                annex("addurl",
                      link.split("?")[0], "--file", file_path, "--relaxed")
            api.add_archive_content(file_path,
                                    annex=AnnexRepo(dataset_dir),
                                    delete=True)
        else:
            file_name = json.loads(
                annex("addurl", link, "--fast", "--json"))["file"]
            annex("rmurl", file_name, link)
            try:  # Try to addurl twice as rarely it might not work on the first try
                annex("addurl",
                      link.split("?")[0], "--file", file_name, "--relaxed")
            except GitCommandError:
                annex("addurl",
                      link.split("?")[0], "--file", file_name, "--relaxed")
    d.save()
Example 20
    def __call__(urls,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.providers import Providers

        pwd, rel_pwd = get_dataset_pwds(dataset)

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='downloading urls')
            except NoDatasetArgumentFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        urls = assure_list_from_str(urls)

        if len(urls) > 1 and path and not op.isdir(path):
            yield get_status_dict(
                status="error",
                message=(
                    "When specifying multiple urls, --path should point to "
                    "an existing directory. Got %r", path),
                type="file",
                path=path,
                **common_report)
            return

        if dataset:  # A dataset was explicitly given.
            path = op.normpath(op.join(ds.path, path or op.curdir))
        elif save and ds:
            path = op.normpath(op.join(ds.path, rel_pwd, path or op.curdir))
        elif not path:
            path = op.curdir

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        for url in urls:
            # somewhat "ugly"
            # providers.get_provider(url).get_downloader(url).download(url, path=path)
            # for now -- via sugaring
            try:
                downloaded_path = providers.download(url,
                                                     path=path,
                                                     overwrite=overwrite)
            except Exception as e:
                yield get_status_dict(status="error",
                                      message=exc_str(e),
                                      type="file",
                                      path=path,
                                      **common_report)
            else:
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in ds.rev_save(downloaded_paths, message=msg):
                yield r

            if isinstance(ds.repo, AnnexRepo):
                annex_paths = [
                    p for p, annexed in zip(
                        downloaded_paths,
                        ds.repo.is_under_annex(downloaded_paths)) if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds.repo.add_url_to_file(path,
                                                    path_urls[path],
                                                    batch=True)
                        except AnnexBatchCommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, path_urls[path], exc_str(exc))

                    if archive:
                        from datalad.api import add_archive_content
                        for path in annex_paths:
                            add_archive_content(path,
                                                annex=ds.repo,
                                                delete=True)
Example 21
    def __call__(dataset, filename=None, missing_content='error', no_annex=False,
                 # TODO: support working with projects and articles within them
                 # project_id=None,
                 article_id=None):
        import os
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            raise ValueError(
                "%s is not an annex repo, so annexification cannot be done"
                % dataset
            )

        if dataset.repo.is_dirty():
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty repository"
            )
        if filename is None:
            filename = dataset.path
        lgr.info(
            "Exporting current tree as an archive under %s since figshare "
            "does not support directories",
            filename
        )
        archive_out = next(
            export_archive(
                dataset,
                filename=filename,
                archivetype='zip',
                missing_content=missing_content,
                return_type="generator"
            )
        )
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                    "Would you like to create a new article to upload to?  "
                    "If not - we will list existing articles",
                    title="Article"
                ):
                    article = figshare.create_article(
                        title=os.path.basename(dataset.path)
                    )
                    lgr.info(
                        "Created a new (private) article %(id)s at %(url_private_html)s. "
                        "Please visit it, enter additional meta-data and make public",
                        article
                    )
                    article_id = article['id']
                else:
                    article_id = int(ui.question(
                        "Which of the articles should we upload to.",
                        choices=list(map(str, figshare.get_article_ids()))
                    ))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname,
            files_url='account/articles/%s/files' % article_id
        )

        if no_annex:
            lgr.info("Removing generated tarball")
            unlink(fname)
        else:
            # I will leave all the complaining etc to the dataset add if path
            # is outside etc
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            repo._annex_custom_command([],
                [
                    "git", "annex", "registerurl", '-c', 'annex.alwayscommit=false',
                    key, file_info['download_url']
                ]
            )

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have a tarball
                commit=False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated and now registered in annex archive")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the tarball

            # if annex in {'delete'}:
            #     dataset.repo.remove(fname)
            # else:
            #     # kinda makes little sense I guess.
            #     # Made more sense if export_archive could export an arbitrary treeish
            #     # so we could create a branch where to dump and export to figshare
            #     # (kinda closer to my idea)
            #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could download-url
        # those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only ID is known
            message="Published archive {}".format(
                file_info['download_url']),
            file_info=file_info,
            path=dataset,
            action='export_to_figshare',
            logger=lgr
        )
Example 22
    def __call__(urls,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='downloading urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or path.endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=
                    ("When specifying multiple urls, --path should point to "
                     "a directory target (with a trailing separator). Got %r",
                     path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        for url in urls:
            # somewhat "ugly"
            # providers.get_provider(url).get_downloader(url).download(url, path=path)
            # for now -- via sugaring
            try:
                downloaded_path = providers.download(url,
                                                     path=path,
                                                     overwrite=overwrite)
            except Exception as e:
                yield get_status_dict(status="error",
                                      message=exc_str(e),
                                      type="file",
                                      path=path,
                                      **common_report)
            else:
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            if isinstance(ds.repo, AnnexRepo):
                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds.repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds.repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, exc_str(exc))

                    if archive:
                        from datalad.api import add_archive_content
                        for path in annex_paths:
                            add_archive_content(path,
                                                annex=ds.repo,
                                                delete=True)
Example 23
def test_add_archive_content(path_orig=None, url=None, repo_path=None):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API

        # no repo yet
        assert_raises(NoDatasetFound, add_archive_content,
                      "nonexisting.tar.gz")
        ds = Dataset(repo_path).create()
        res = ds.add_archive_content("nonexisting.tar.gz", on_failure='ignore')
        assert_in_results(res,
                          action='add-archive-content',
                          status='impossible')
        repo = ds.repo

        # we can't add a file from outside the repo ATM
        res = ds.add_archive_content(Path(path_orig) / '1.tar.gz',
                                     on_failure='ignore')
        assert_in_results(res,
                          action='add-archive-content',
                          status='impossible',
                          type="dataset",
                          message="Can not add archive outside of the dataset")

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
            for s in range(1, 5):
                repo.add_url_to_file('%du/1.tar.gz' % s,
                                     opj(url, '%du/1.tar.gz' % s))
            repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_annexinfo('1.tar.gz')[
            'key']  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the
        # same, so no changes would really be made
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')

        # But that other one carries updated file, so should fail due to
        # overwrite
        res = add_archive_content(Path('1u') / '1.tar.gz',
                                  use_current_dir=True,
                                  on_failure='ignore')
        assert_in_results(
            res,
            action='add-archive-content',
            status='error',
        )
        assert_in('exists, but would be overwritten by new file',
                  res[0]['message'])
        # but should do fine if overrides are allowed
        add_archive_content(Path('1u') / '1.tar.gz',
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(Path('2u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(Path('3u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(Path('4u') / '1.tar.gz',
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'),
                            dataset=ds.path,
                            use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            dataset=ds.path,
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        ds.add_archive_content(
            '1.tar.gz',
            exclude=['d'],
            rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_url_to_file('d1/1.tar.gz', opj(url, 'd1', '1.tar.gz'))
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz', dataset=ds.path)
        d2_basic_checks()

    # in manual tests we ran into the situation of being unable to obtain, on a
    # single run, a file from an archive whose key had been dropped.  I thought
    # it was covered by the custom remote tests, but apparently not sufficiently
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=ds)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    ds.drop(opj('1', '1 f.txt'))  # should be all kosher
    ds.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
        assert_equal(e.kwargs['stdout_json'][0]['success'], False)
        assert_result_values_cond(
            e.kwargs['stdout_json'], 'note', lambda x:
            '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
Example 24
    def __call__(urls, dataset=None, path=None, overwrite=False,
                 archive=False, save=True, message=None):
        from ..downloaders.providers import Providers

        pwd, rel_pwd = get_dataset_pwds(dataset)

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(
                    dataset, check_installed=True,
                    purpose='downloading urls')
            except NoDatasetArgumentFound:
                pass

        common_report = {"action": "download_url",
                         "ds": ds}

        urls = assure_list_from_str(urls)

        if len(urls) > 1 and path and not op.isdir(path):
            yield get_status_dict(
                status="error",
                message=(
                    "When specifying multiple urls, --path should point to "
                    "an existing directory. Got %r", path),
                type="file",
                path=path,
                **common_report)
            return

        if dataset:  # A dataset was explicitly given.
            path = op.normpath(op.join(ds.path, path or op.curdir))
        elif save and ds:
            path = op.normpath(op.join(ds.path, rel_pwd, path or op.curdir))
        elif not path:
            path = op.curdir

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        for url in urls:
            # somewhat "ugly"
            # providers.get_provider(url).get_downloader(url).download(url, path=path)
            # for now -- via sugaring
            try:
                downloaded_path = providers.download(url, path=path, overwrite=overwrite)
            except Exception as e:
                yield get_status_dict(
                    status="error",
                    message=exc_str(e),
                    type="file",
                    path=path,
                    **common_report)
            else:
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(
                    status="ok",
                    type="file",
                    path=downloaded_path,
                    **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in ds.add(downloaded_paths, message=msg):
                yield r

            if isinstance(ds.repo, AnnexRepo):
                annex_paths = [p for p, annexed in
                               zip(downloaded_paths,
                                   ds.repo.is_under_annex(downloaded_paths))
                               if annexed]
                if annex_paths:
                    for path in annex_paths:
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds.repo.add_url_to_file(path, path_urls[path],
                                                    batch=True)
                        except AnnexBatchCommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, path_urls[path], exc_str(exc))

                    if archive:
                        from datalad.api import add_archive_content
                        for path in annex_paths:
                            add_archive_content(path, annex=ds.repo, delete=True)
Example 25
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key(
            '1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the same, so no changes would really be made
        add_archive_content('1.tar.gz')

        # But that other one carries updated file, so should fail due to overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat not precise since we have two possible "already exists"
        # -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'),
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content('1.tar.gz',
                            exclude=['d'],
                            rename=['/ /_', '/^1/2'],
                            annex_options="-c annex.largefiles=exclude=*.txt",
                            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests we ran into the situation of being unable to obtain, on a
    # single run, a file from an archive whose key had been dropped.  I thought
    # it was covered by the custom remote tests, but apparently not sufficiently
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
        assert_equal(e.kwargs['stdout_json'][0]['success'], False)
        assert_result_values_cond(
            e.kwargs['stdout_json'], 'note', lambda x:
            '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))