Example #1
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive,
                   path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
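
The helper above round-trips a file through compress_files/decompress_file and only optionally involves the annex. A minimal sketch of that round trip on its own, assuming the two helpers live in datalad.support.archives; the scratch directory and file name are hypothetical:

import os.path as op

# assumed import location of the helpers used above
from datalad.support.archives import compress_files, decompress_file

work = '/tmp/compress-demo'                 # hypothetical directory already containing payload.txt
archive = op.join(work, 'payload.tar.gz')
compress_files(['payload.txt'], archive, path=work)     # pack the file into the archive
decompress_file(archive, op.join(work, 'extracted'))    # unpack into a fresh directory
print(op.exists(op.join(work, 'extracted', 'payload.txt')))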
Example #2
def test_add_archive_use_archive_dir(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        # Let's add the first archive to the repo with default settings
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        with assert_raises(RuntimeError) as cmr:
            add_archive_content(archive_path)
        assert_re_in(
            "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first"
            if on_windows else
            "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first",
            str(cmr.exception),
            match=False)
        with swallow_outputs():
            repo.add(archive_path)
        repo.commit("added 1.tar.gz")

        ok_archives_caches(repo.path, 0)
        add_archive_content(archive_path,
                            strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_archives_caches(repo.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)
Example #3
def test_proxying_open_testrepobased(repo):
    TEST_CONTENT = "content to be annex-addurl'd"
    fname = 'test-annex.dat'
    fpath = opj(repo, fname)
    assert_raises(IOError, open, fpath)

    aio = AutomagicIO(activate=True)
    try:
        with swallow_outputs():
            # now we should be able just to request to open this file
            with open(fpath) as f:
                content = f.read()
                eq_(content, TEST_CONTENT)
    finally:
        aio.deactivate()

    # and now that we have fetched it, nothing should forbid us to open it again
    with open(fpath) as f:
        eq_(f.read(), TEST_CONTENT)

    annex = AnnexRepo(repo, create=False)
    # Let's create another file deeper under the directory with the same content
    # so it would point to the same key, which we would drop and repeat the drill
    fpath2 = opj(repo, 'd1', 'd2', 'test2.dat')
    os.makedirs(dirname(fpath2))
    with open(fpath2, 'w') as f:
        f.write(content)
    annex.add(fpath2)
    annex.drop(fpath2)
    annex.commit("added and dropped")
    assert_raises(IOError, open, fpath2)

    # Let's use context manager form
    with AutomagicIO() as aio:
        ok_(isinstance(aio, AutomagicIO))
        ok_(aio.active)
        # swallowing output would cause trouble while testing with
        # DATALAD_ASSERT_NO_OPEN_FILES mode on.  It is not 100% clear why
        # the underlying git-annex process would be dumping to stdout or err
        #with swallow_outputs():

        # now we should be able just to request to open this file
        with open(fpath2) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)

    annex.drop(fpath2)
    assert_raises(IOError, open, fpath2)

    # Let's use relative path
    with chpwd(opj(repo, 'd1')):
        # Let's use context manager form
        with AutomagicIO() as aio, \
                swallow_outputs(), \
                open(opj('d2', 'test2.dat')) as f:
            content = f.read()
            eq_(content, TEST_CONTENT)
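
The test above relies on AutomagicIO to transparently fetch annexed content whenever a plain open() touches a file whose content is not yet present locally. A minimal sketch of the two usage styles it exercises, with the import path and the file name as assumptions:

# assumed import location; 'some-annexed-file.dat' is a hypothetical annexed file
from datalad.auto import AutomagicIO

aio = AutomagicIO(activate=True)          # explicit activation, as in the try/finally above
try:
    with open('some-annexed-file.dat') as f:
        data = f.read()                   # content is fetched on demand before reading
finally:
    aio.deactivate()

with AutomagicIO():                       # or scoped activation via the context manager form
    with open('some-annexed-file.dat') as f:
        data = f.read()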
Example #4
def test_add_archive_content_zip(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        with swallow_outputs():
            repo.add(["1.zip"])
        repo.commit("add 1.zip")
        add_archive_content("1.zip")
        ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True)
        ok_file_under_git(opj("1", "dir", "bar"), annexed=True)
        ok_archives_caches(repo.path, 0)
Example #5
def test_add_archive_content_absolute_path(path):
    repo = AnnexRepo(opj(path, "ds"), create=True)
    repo.add(["1.tar.gz"])
    repo.commit("1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, annex=repo)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)
    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    with assert_raises(FileNotInRepositoryError):
        add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
Example #6
def test_add_archive_content_strip_leading(path_orig, url, repo_path):
    with chpwd(repo_path):
        repo = AnnexRepo(repo_path, create=True)

        # Let's add the first archive to the repo so we can test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')], options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(repo.path, 0)
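
All of the add_archive_content examples follow the same pattern: the archive is first added and committed to the annex, and only then extracted. A minimal sketch of that workflow with the options exercised above, assuming add_archive_content is importable from datalad.api; the repository path and archive name are hypothetical:

from datalad.api import add_archive_content
from datalad.support.annexrepo import AnnexRepo
from datalad.utils import chpwd

repo = AnnexRepo('/tmp/demo-repo', create=True)    # hypothetical location
with chpwd(repo.path):
    repo.add('data.tar.gz')                        # the archive must be annexed first
    repo.commit("add data.tar.gz")
    add_archive_content('data.tar.gz',
                        strip_leading_dirs=True,    # drop a common leading directory
                        existing='archive-suffix',  # disambiguate name clashes with a suffix
                        delete=True)                # remove the archive file afterwards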
Example #7
File: test_base.py Project: hanke/datalad
def test_get_contentlocation(tdir):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    key = repo.get_file_key('file.dat')
    cr = AnnexCustomRemote(tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #9
def test_get_contentlocation(tdir=None):
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    # TODO contentlocation would come with eval_availability=True
    key = repo.get_file_annexinfo('file.dat')['key']
    cr = ArchiveAnnexCustomRemote(None, path=tdir)
    key_path = cr.get_contentlocation(key, absolute=False)
    assert not isabs(key_path)
    key_path_abs = cr.get_contentlocation(key, absolute=True)
    assert isabs(key_path_abs)
    assert cr._contentlocations == {key: key_path}
    repo.drop('file.dat', options=['--force'])
    assert not cr.get_contentlocation(key, absolute=True)
Example #10
def test_add_archive_dirs(path_orig, url, repo_path):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        repo = AnnexRepo(repo_path, create=True)

        # add the archive to the repo so we can test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # leading dirs are inconsistent and in many cases absent, so strip them where present
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        if external_versions['cmd:annex'] >= '6.20170208':
            # should have fixed remotes
            eq_(
                repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]
                ), '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        target_files = {
            './CR24A/behaving1/1 f.txt',
            './CR24C/behaving3/3 f.txt',
            './CR24D/behaving2/2 f.txt',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name was getting stripped by leading_dir_len
        assert_false(exists(
            '__MACOSX'))  # if stripping and exclude didn't work this fails
        assert_false(
            exists('c-1_data')
        )  # if exclude doesn't work then name of subdir gets stripped by leading_dir_len
        assert_false(
            exists('CR24B')
        )  # if exclude doesn't work but everything else works this fails
Example #11
def test_check_dates(path):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo],
                        reference_date=refdate,
                        return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
Example #12
def test_check_dates(path=None):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
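
Both variants of test_check_dates probe the same interface: pass one or more repository paths plus a reference date and inspect the returned report of objects newer (or older) than that date. A minimal direct call, assuming check_dates is exposed via datalad.api and that each result record carries the 'report' mapping seen in the test output; the repository path is hypothetical:

from datalad.api import check_dates

results = check_dates(['/tmp/some-repo'],
                      reference_date='@1218182889',   # epoch-seconds form used above
                      return_type='list')
for res in results:
    # assumes each record exposes the 'report' mapping checked in the tests
    print(res['report']['objects'])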
Example #13
File: utils.py Project: shots47s/datalad
def put_file_under_git(path, filename=None, content=None, annexed=False):
    """Place file under git/annex and return used Repo
    """
    annex, file_repo_path, filename, path, repo = _prep_file_under_git(
        path, filename)
    if content is None:
        content = ""
    with open(opj(repo.path, file_repo_path), 'w') as f_:
        f_.write(content)

    if annexed:
        if not isinstance(repo, AnnexRepo):
            repo = AnnexRepo(repo.path)
        repo.add(file_repo_path)
    else:
        repo.add(file_repo_path, git=True)
    repo.commit(_datalad_msg=True)
    ok_file_under_git(repo.path, file_repo_path, annexed)
    return repo
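
put_file_under_git is a small test utility: it writes the given content, adds the file to git or to the annex depending on the annexed flag, commits, and verifies the result with ok_file_under_git. A usage sketch with a hypothetical path, file name, and content:

# create an annexed file with known content and get back the repository it lives in
repo = put_file_under_git('/tmp/demo-ds', filename='notes.txt',
                          content='some text', annexed=True)
print(repo.path)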
Example #14
def test_interactions(tdir):
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')
    for scenario in BASE_INTERACTION_SCENARIOS + [
        [
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('TRANSFER RETRIEVE somekey somefile',
             re.compile('TRANSFER-FAILURE RETRIEVE somekey NotImplementedError().*')),
        ],
        [
            # by default we do not require any fancy init
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that just a single option, the url, is expected, so the
            # full one would be passed
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
        ]
    ]:
        check_interaction_scenario(AnnexCustomRemote, tdir, scenario)
Example #15
def check_compress_file(ext, annex, path, name):
    archive = name + ext
    compress_files([_filename], archive, path=path)
    assert_true(exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    import glob
    print(dir_extracted)
    print(glob.glob(dir_extracted + '/*'))
    ok_file_has_content(_filepath, 'content')
Example #16
def check_compress_file(ext, annex, path, name):
    archive = name + ext
    compress_files([_filename], archive,
                   path=path)
    assert_true(exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    import glob
    print(dir_extracted)
    print(glob.glob(dir_extracted + '/*'))
    ok_file_has_content(_filepath, 'content')
Example #17
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass a path into add_archive_content
        # We could mock it, but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add the first archive to the repo so we can test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key(
            '1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If run again, it should proceed just fine since the content is the same, so no changes would really be made
        add_archive_content('1.tar.gz')

        # But that other one carries an updated file, so it should fail due to the overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat imprecise since we have two possible "already exists"
        # -- in the caching and in the overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'),
                            existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'),
                            existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive prefix
        # while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content('1.tar.gz',
                            exclude=['d'],
                            rename=['/ /_', '/^1/2'],
                            annex_options="-c annex.largefiles=exclude=*.txt",
                            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add a first archive to the repo so we can test one
        # named the same way but with different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests we ran into a situation where a file from an archive
    # backed by a dropped key could not be obtained in a single run.  I thought
    # this was covered by the custom remote tests, but apparently not sufficiently
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar,
              key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if the archive key was dropped and the online archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    # the assertions must run after the context manager exits; inside the
    # block they would never execute because drop() raises
    assert_equal(e.exception.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.exception.kwargs['stdout_json'], 'note', lambda x:
        '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
Example #18
def _test_AnnexDB(cls, path):
    filepath1 = opj(path, 'file1.txt')
    filep2 = opj('d', 'file2.txt')
    filepath2 = opj(path, filep2)

    annex = AnnexRepo(path, create=True)
    # PhysicalFileStatusesDB relies on information in annex so files
    # must be committed first
    annex.add('file1.txt')
    annex.commit("initial commit")
    db = cls(annex=annex)

    def set_db_status_from_file(fpath):
        """To test JsonFileStatusesDB, we need to keep updating the status stored"""
        if cls is JsonFileStatusesDB:
            # we need first to set the status
            db.set(fpath, db._get_fileattributes_status(fpath))

    set_db_status_from_file('file1.txt')
    status1 = db.get('file1.txt')
    assert(status1.size)

    status1_ = db.get('file1.txt')
    assert_equal(status1, status1_)
    assert_false(db.is_different('file1.txt', status1))
    assert_false(db.is_different('file1.txt', status1_))
    # even if we add a filename specification
    status1_.filename = 'file1.txt'
    assert_false(db.is_different('file1.txt', status1_))
    status1_.filename = 'different.txt'
    assert_false(db.is_different('file1.txt', status1_))

    os.unlink(filepath1)  # under annex -- we don't have unlock yet and thus can't augment in place
    with open(filepath1, 'a') as f:
        f.write('+')
    # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to
    # dataset singletons, that don't resolve symlinks
    set_db_status_from_file(realpath(filepath1))
    assert(db.is_different('file1.txt', status1))

    # we should be able to get status of files out and inside of git
    set_db_status_from_file('2git')
    status_git1 = db.get('2git')
    annex.add('2git', git=True)
    annex.commit("added 2git")
    assert_equal(db.get('2git'), status_git1)

    # we should be able to get status of files with relative path to top dir and abs path
    set_db_status_from_file(filep2)
    status2 = db.get(filep2)
    # Note/TODO: fixed (realpath) path should go. Inner logic has to adapt to
    # dataset singletons, that don't resolve symlinks
    status2_full = db.get(realpath(filepath2))
    assert_equal(status2, status2_full)
    # TODO? what about relative to curdir??
    #with chpwd(opj(path, 'd')):
    #    status2_dir = db.get('./file2.txt')
    #    assert_equal(status2, status2_dir)

    # since we asked about each file we added to DB/annex -- none should be
    # known as "deleted"
    assert_equal(db.get_obsolete(), [])

    # Possibly save its state for persistent storage
    #import pdb; pdb.set_trace()
    db.save()

    # but, if we create another DB which wasn't queried yet
    db2 = cls(annex=annex)
    # all files should be returned
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['file1.txt', filep2, '2git']})
    # and if we query one, it shouldn't be listed as deleted any more
    status2_ = db2.get(filep2)
    assert_equal(status2, status2_)
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['file1.txt', '2git']})

    # and if we queried with ./ prefix, should still work
    db2.get(curdir + sep + 'file1.txt')
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    assert_equal(
            set(db2.get_obsolete()),
            {opj(realpath(path), p) for p in ['2git']})

    # and if we queried with a full path, should still work
    # TODO: fixed by using realpath, but there should be a cleaner
    # adaption to dataset singletons, that are NOT resolving symlinks, while the
    # underlying repos do!
    db2.get(opj(realpath(path), '2git'))
    assert_equal(db2.get_obsolete(), [])
Example #19
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)                    # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))                           # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))                                     # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))                                               # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')              # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))                        # add the non-dataset git repo to annex
    ds.add('dir')                                  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check if it's updated in its nodes sublist too. used by web-ui json. regression test
                assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total'])

                # check size of subdataset
                subds = [item for item in dsj['nodes'] if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(
                    topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE
                )
Example #20
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'))
    annex.commit()
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo:
            repo = AnnexRepo(topdir)
            fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive])
            repo.precommit()  # to possibly stop batch process occupying the stdout

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        repo = AnnexRepo(topdir)
        fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([item for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name']], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([item for item in child['nodes'] if ('subgit' or '.fgit') == item['name']], [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0]
            # extract info of file1.txts, else fail
            link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #21
def test_fs_traverse(topdir):
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'))
    annex.commit()
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(
                new_level=logging.INFO) as log, swallow_outputs() as cmo:
            repo = AnnexRepo(topdir)
            fs = fs_traverse(topdir,
                             repo,
                             recurse_directories=recursive,
                             json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [
                        opj(topdir, 'dir'),
                        opj(topdir, 'dir', 'subdir')
                ]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains subdirectory
                assert_in(('file2.txt' and 'dir'), cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {
                True: '6 Bytes',
                False: '0 Bytes'
            }[recursive])
            repo.precommit()  # to possibly stop batch process occupying the stdout

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        repo = AnnexRepo(topdir)
        fs = fs_traverse(topdir,
                         repo,
                         recurse_directories=recursive,
                         json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        assert_equal([
            item
            for item in fs['nodes'] if ('gitdir' or 'annexdir') == item['name']
        ], [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            assert_equal([
                item for item in child['nodes']
                if ('subgit' or '.fgit') == item['name']
            ], [])
            # extract subdirectory dictionary, else fail
            subchild = [
                subitem for subitem in child["nodes"]
                if subitem['name'] == 'subdir'
            ][0]
            # extract info of file1.txts, else fail
            link = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file1.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [
                subnode for subnode in subchild["nodes"]
                if subnode['name'] == 'file2.txt'
            ][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
Example #22
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**', {
        'annex.largefiles': '(nothing)'
    })], dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes",
                files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(
        not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect: the
    #   2 removals of obsolete metadata object files are now absent,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1],
    #                                                         commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3],  # also in master
    #                                                         commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content=
        '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files_updated
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'],
        ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8,
            skipped=5,
            downloaded=1,
            renamed=1,
            urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely into
            # incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b))
        for b in branches
    }
    # our 'statuses' database should have recorded the change and thus produced a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming),
        2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()), {
            repo.pathobj / '.datalad/crawl/statuses/incoming.json',
            repo.pathobj / 'ds666_R1.0.0.tar.gz'
        })

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
Example #23
def test_ls_json(topdir, topurl):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)

    # add a subdataset
    ds.install('subds', source=topdir)

    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.add('file')

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit',
                'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir',
                  'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link
    annex.commit()

    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.add(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.add('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link

    # register "external" submodule  by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
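    # update-server-info generates info/refs etc. required for fetching over
    # git's dumb HTTP transport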
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')
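    # _ls_json drops one JSON file per node under .git/datalad/metadata,
    # named by the md5 hexdigest of the node's relative path (see helpers below)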

    def get_metahash(*path):
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file('fromweb',
                            topurl + '/noteventhere',
                            options=['--relaxed'])

    for all_ in [True, False]:  # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # the subdataset's json should exist afterwards only when
                # json='file' and recursive=True (see the assertion below)
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)

                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(topdir,
                               json=state,
                               all_=all_,
                               recursive=recursive)
                ok_startswith(dsj['tags'], '1-')

                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata jsons created and deleted only when all_=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden', ), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))

                # check that it is also updated in the 'nodes' sublist (used by the web-ui JSON; regression test)
                assert_equal(dsj['nodes'][0]['size']['total'],
                             dsj['size']['total'])

                # check size of subdataset
                subds = [
                    item for item in dsj['nodes']
                    if item['name'] == 'subdsfile.txt'
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}

                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )

                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                # and not in topds
                assert_not_in('subds2', topds_nodes)

                # run a non-recursive dataset traversal after the subdataset metadata was already created,
                # to verify that sub-dataset metadata is picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == 'subdsfile.txt'
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')

                assert_equal(topds_nodes['fromweb']['size']['total'],
                             UNKNOWN_SIZE)
예제 #24
0
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)
        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        # We can't use ar.get_tags because that returns the commit's hexsha,
        # not the tag's, and ar.get_hexsha is limited to commit objects.
        foo_tag = ar.repo.git.rev_parse("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = ar.repo.git.rev_parse("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(x.get("filename") == "uuid.log"
            for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]
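    # with annex="tree", only blobs referenced by the current tree of the
    # git-annex branch are considered (judging by the comparison below)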

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"],
                     newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(check_dates(path, refdate, annex="tree")["objects"],
                 newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
예제 #25
0
def test_save_amend(dspath):

    dspath = Path(dspath)
    file_in_super = dspath / 'somefile'
    file_in_sub = dspath / 'subds' / 'file_in_sub'

    # test on a hierarchy including a plain git repo:
    ds = Dataset(dspath).create(force=True, no_annex=True)
    subds = ds.create('subds', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.repo)

    # recursive and amend are mutually exclusive:
    for d in (ds, subds):
        assert_raises(ValueError, d.save, recursive=True, amend=True)

    # in an annex repo the branch we are interested in might not be the active
    # branch (adjusted):
    sub_branch = subds.repo.get_corresponding_branch()

    # amend in subdataset w/ new message; otherwise empty amendment:
    last_sha = subds.repo.get_hexsha(sub_branch)
    subds.save(message="new message in sub", amend=True)
    # we did in fact commit something:
    neq_(last_sha, subds.repo.get_hexsha(sub_branch))
    # repo is clean:
    assert_repo_status(subds.repo)
    # message is correct:
    eq_(
        subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")
    # actually replaced the previous commit:
    assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch))

    # amend modifications in subdataset w/o new message
    if not subds.repo.is_managed_branch():
        subds.unlock('file_in_sub')
    file_in_sub.write_text("modified again")
    last_sha = subds.repo.get_hexsha(sub_branch)
    subds.save(amend=True)
    neq_(last_sha, subds.repo.get_hexsha(sub_branch))
    assert_repo_status(subds.repo)
    # message unchanged:
    eq_(
        subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")
    # actually replaced the previous commit:
    assert_not_in(last_sha, subds.repo.get_branch_commits_(sub_branch))

    # save --amend with nothing to amend with:
    res = subds.save(amend=True)
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save')

    # amend in superdataset w/ new message; otherwise empty amendment:
    last_sha = ds.repo.get_hexsha()
    ds.save(message="new message in super", amend=True)
    neq_(last_sha, ds.repo.get_hexsha())
    assert_repo_status(subds.repo)
    eq_(ds.repo.format_commit("%B").strip(), "new message in super")
    assert_not_in(last_sha, ds.repo.get_branch_commits_())

    # amend modifications in superdataset w/o new message
    file_in_super.write_text("changed content")
    if not subds.repo.is_managed_branch():
        subds.unlock('file_in_sub')
    file_in_sub.write_text("modified once again")
    last_sha = ds.repo.get_hexsha()
    last_sha_sub = subds.repo.get_hexsha(sub_branch)
    ds.save(amend=True)
    neq_(last_sha, ds.repo.get_hexsha())
    eq_(ds.repo.format_commit("%B").strip(), "new message in super")
    assert_not_in(last_sha, ds.repo.get_branch_commits_())
    # we didn't mess with the subds:
    assert_repo_status(ds.repo, modified=["subds"])
    eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch))
    eq_(
        subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")

    # save --amend with nothing to amend with:
    last_sha = ds.repo.get_hexsha()
    res = ds.save(amend=True)
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save')
    eq_(last_sha, ds.repo.get_hexsha())
    # we didn't mess with the subds:
    assert_repo_status(ds.repo, modified=["subds"])
    eq_(last_sha_sub, subds.repo.get_hexsha(sub_branch))
    eq_(
        subds.repo.format_commit("%B", sub_branch).strip(),
        "new message in sub")

    # amend with different identity:
    orig_author = ds.repo.format_commit("%an")
    orig_email = ds.repo.format_commit("%ae")
    orig_date = ds.repo.format_commit("%ad")
    orig_committer = ds.repo.format_commit("%cn")
    orig_committer_mail = ds.repo.format_commit("%ce")
    eq_(orig_author, orig_committer)
    eq_(orig_email, orig_committer_mail)
    with patch.dict(
            'os.environ', {
                'GIT_COMMITTER_NAME': 'Hopefully Different',
                'GIT_COMMITTER_EMAIL': '*****@*****.**'
            }):
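        # `git commit --amend` keeps the original author and date, while the
        # committer identity is taken from the GIT_COMMITTER_* overrides above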

        ds.config.reload(force=True)
        ds.save(amend=True, message="amend with hope")
    # author was kept:
    eq_(orig_author, ds.repo.format_commit("%an"))
    eq_(orig_email, ds.repo.format_commit("%ae"))
    eq_(orig_date, ds.repo.format_commit("%ad"))
    # committer changed:
    eq_(ds.repo.format_commit("%cn"), "Hopefully Different")
    eq_(ds.repo.format_commit("%ce"), "*****@*****.**")

    # corner case: amend empty commit with no parent:
    rmtree(str(dspath))
    # When adjusted branch is enforced by git-annex detecting a crippled FS,
    # git-annex produces an empty commit before switching to adjusted branch:
    # "commit before entering adjusted branch"
    # The commit by `create` would be the second one already.
    # Therefore go with plain annex repo and create an (empty) commit only when
    # not on adjusted branch:
    repo = AnnexRepo(dspath, create=True)
    if not repo.is_managed_branch():
        repo.commit(msg="initial", options=['--allow-empty'])
    ds = Dataset(dspath)
    branch = ds.repo.get_corresponding_branch() or ds.repo.get_active_branch()
    # test pointless if we start with more than one commit
    eq_(len(list(ds.repo.get_branch_commits_(branch))),
        1,
        msg="More than on commit '{}': {}".format(
            branch, ds.repo.call_git(['log', branch])))
    last_sha = ds.repo.get_hexsha(branch)

    ds.save(message="new initial commit", amend=True)
    assert_repo_status(ds.repo)
    eq_(len(list(ds.repo.get_branch_commits_(branch))),
        1,
        msg="More than on commit '{}': {}".format(
            branch, ds.repo.call_git(['log', branch])))
    assert_not_in(last_sha, ds.repo.get_branch_commits_(branch))
    eq_(ds.repo.format_commit("%B", branch).strip(), "new initial commit")
예제 #26
0
def test_check_dates(path):
    refdate = 1218182889

    with set_date(refdate - 1):
        ar = AnnexRepo(path, create=True)

        def tag_object(tag):
            """Return object for tag.  Do not dereference it.
            """
            # We can't use ar.get_tags because that returns the commit's hexsha,
            # not the tag's, and ar.get_hexsha is limited to commit objects.
            return ar.call_git_oneline(
                ["rev-parse", "refs/tags/{}".format(tag)], read_only=True)

        ar.add("foo")
        ar.commit("add foo")
        foo_commit = ar.get_hexsha()
        ar.commit("add foo")
        ar.tag("foo-tag", "tag before refdate")
        foo_tag = tag_object("foo-tag")
        # Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
        ar.tag("light")
    with set_date(refdate + 1):
        ar.add("bar")
        ar.commit("add bar")
        bar_commit = ar.get_hexsha()
        ar.tag("bar-tag", "tag after refdate")
        bar_tag = tag_object("bar-tag")
    with set_date(refdate + 2):
        # Drop an annexed file so that we have more blobs in the git-annex
        # branch than its current tree.
        ar.drop("bar", options=["--force"])

    results = {}
    for which in ["older", "newer"]:
        result = check_dates(ar, refdate, which=which)["objects"]
        ok_(result)
        if which == "newer":
            assert_in(bar_commit, result)
            assert_not_in(foo_commit, result)
            assert_in(bar_tag, result)
        elif which == "older":
            assert_in(foo_commit, result)
            assert_not_in(bar_commit, result)
            assert_in(foo_tag, result)
        results[which] = result

    ok_(any(
        x.get("filename") == "uuid.log" for x in results["older"].values()))

    newer_tree = check_dates(ar, refdate, annex="tree")["objects"]

    def is_annex_log_blob(entry):
        return (entry["type"] == "annex-blob"
                and entry["filename"].endswith(".log"))

    def num_logs(entries):
        return sum(map(is_annex_log_blob, entries.values()))

    # Because we dropped bar above, we should have one more blob in the
    # git-annex branch than in the current tree of the git-annex branch.
    eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)

    # Act like today is one day from the reference timestamp to check that we
    # get the same results with the one-day-back default.
    seconds_in_day = 60 * 60 * 24
    with patch('time.time', return_value=refdate + seconds_in_day):
        assert_equal(check_dates(ar, annex="tree")["objects"], newer_tree)

    # We can give a path (str) instead of a GitRepo object.
    assert_equal(
        check_dates(path, refdate, annex="tree")["objects"], newer_tree)

    with assert_raises(ValueError):
        check_dates(ar, refdate, which="unrecognized")
예제 #27
0
def test_interactions(tdir):
    # Just a placeholder since constructor expects a repo
    repo = AnnexRepo(tdir, create=True, init=True)
    repo.add('file.dat')
    repo.commit('added file.dat')

    class FIFO(object):
        def __init__(self, content=None, default=None):
            """

            Parameters
            ----------
            content
            default
              If defined, will be the one returned if empty.
              If not defined -- would raise an Exception
            """
            self.content = content or []
            self.default = default

        def _pop(self):
            # pop the next line if any; otherwise return the default
            # (an empty line usually signals the end of input)
            if self.content:
                v = self.content.pop(0)
                # allow for debug
                if v.startswith('DEBUG '):
                    # next one
                    return self._pop()
                return v
            else:
                if self.default is not None:
                    return self.default
                else:
                    raise IndexError("we are empty")

        def write(self, l):
            self.content.append(l)

        def read(self):
            return self._pop()

        def readline(self):
            return self._pop().rstrip('\n')

        def flush(self):
            pass  # working hard

    # now we should test interactions
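    # (the FIFO above stands in for the custom remote's stdin/stdout: `fin` is
    # pre-loaded with scripted requests, `fout` collects the remote's replies)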
    import re
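    # matches the ERROR reply the custom remote sends back when a protocol
    # command is called with the wrong number of arguments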
    ERROR_ARGS = re.compile(r'^ERROR .*(missing|takes) .*\d+ .*argument')
    for scenario in [
        [],  # default of doing nothing
        [  # support of EXPORT which by default is not supported
            ('EXPORTSUPPORTED', 'EXPORTSUPPORTED-FAILURE'),
        ],
        [  # some unknown option
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
        ],
        [
            # get the COST etc., and make sure we do not
            # fail right away on an unsupported request
            ('FANCYNEWOPTION', 'UNSUPPORTED-REQUEST'),
            ('GETCOST', 'COST %d' % DEFAULT_COST),
            ('GETCOST roguearg', ERROR_ARGS),
            ('GETAVAILABILITY', 'AVAILABILITY %s' % DEFAULT_AVAILABILITY),
            ('INITREMOTE', 'INITREMOTE-SUCCESS'
             ),  # by default we do not require any fancy init
            # no urls supported by default
            ('CLAIMURL http://example.com', 'CLAIMURL-FAILURE'),
            # we know that just a single option, the url, is expected, so the
            # full string would be passed
            ('CLAIMURL http://example.com roguearg', 'CLAIMURL-FAILURE'),
            # but if not enough params -- ERROR_ARGS
            ('CLAIMURL', ERROR_ARGS)
        ]
    ]:
        # The first exchange is always the VERSION announcement, and a final
        # empty command signals the end of the transaction
        scenario = [(None, 'VERSION 1')] + scenario + [('', None)]
        fin, fout = FIFO(), FIFO(default='')
        for in_, out_ in scenario:
            if in_ is not None:
                fin.write(in_ + '\n')
        cr = AnnexCustomRemote(tdir, fin=fin, fout=fout)
        cr.main()
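        # main() has consumed the scripted requests from fin and queued its
        # replies in fout; now compare them against the expected scenario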
        for in_, out_ in scenario:
            if out_ is not None:
                out_read = fout.readline()
                if isinstance(out_, type(ERROR_ARGS)):
                    assert out_.match(out_read), (out_, out_read)
                else:
                    eq_(out_, out_read)
        out_read = fout.readline()
        eq_(out_read, '')  # nothing left to say
예제 #28
0
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 text_no_annex=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) that does not point to an
            # installed dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status':
                    'error',
                    'message':
                    ('collision with known subdataset %s/ in dataset %s',
                     subs[0], path['parentds'])
                })
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access
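            # presumably ends up as `git init --shared=<value>` when the
            # repository is created below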

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield path
            return

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(tbds.path,
                               url=None,
                               create=True,
                               backend=annex_backend,
                               version=annex_version,
                               description=description,
                               git_opts=git_opts,
                               annex_opts=annex_opts,
                               annex_init_opts=annex_init_opts)

            if text_no_annex:
                git_attributes_file = opj(tbds.path, '.gitattributes')
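                # the largefiles expression below sends anything that is not
                # mimetype text/* to the annex, so plain text files stay in git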
                with open(git_attributes_file, 'a') as f:
                    f.write('* annex.largefiles=(not(mimetype=text/*))\n')
                tbrepo.add([git_attributes_file], git=True)
                tbrepo.commit("Instructed annex to add text files to git",
                              _datalad_msg=True,
                              files=[git_attributes_file])

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else
                        uuid.uuid1().urn.split(':')[-1],
                        where='dataset')

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'),
                  'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate meta data
            # comes around
            gitattr.write(
                '# Text files (according to file --mime-type) are added directly to git.\n'
            )
            gitattr.write(
                '# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n'
            )
            gitattr.write('** annex.largefiles=nothing\n')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add('.datalad',
                 to_git=True,
                 save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(tbds.path,
                                 save=True,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path
예제 #29
0
def _test_proxying_open(generate_load, verify_load, repo):
    annex = AnnexRepo(repo, create=True)
    fpath1 = opj(repo, "test")
    fpath2 = opj(repo, 'd1', 'd2', 'test2')
    # generate load
    fpath1 = generate_load(fpath1) or fpath1
    os.makedirs(dirname(fpath2))
    fpath2 = generate_load(fpath2) or fpath2
    annex.add([fpath1, fpath2])
    verify_load(fpath1)
    verify_load(fpath2)
    annex.commit("Added some files")

    # clone to another repo
    repo2 = repo + "_2"
    annex2 = AnnexRepo.clone(repo, repo2)
    # verify that the annexed content cannot be accessed in the clone yet
    fpath1_2 = fpath1.replace(repo, repo2)
    fpath2_2 = fpath2.replace(repo, repo2)

    EXPECTED_EXCEPTIONS = (IOError, OSError)
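    # in the fresh clone the annexed content is not present, so attempts to
    # read the files raise IOError/OSError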
    assert_raises(EXPECTED_EXCEPTIONS, verify_load, fpath1_2)

    with AutomagicIO():
        # verify that it doesn't even try to get files which do not exist
        with patch('datalad.support.annexrepo.AnnexRepo.get') as gricm:
            # if we request absent file
            assert_raises(EXPECTED_EXCEPTIONS, open, fpath1_2 + "_", 'r')
            # no get should be called
            assert_false(gricm.called)
        verify_load(fpath1_2)
        verify_load(fpath2_2)
        # and even if we drop it -- we still can get it no problem
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_true(os.path.isfile(fpath2_2))

    # In check_once mode, if we drop it, it wouldn't be considered again
    annex2.drop(fpath2_2)
    assert_false(annex2.file_has_content(fpath2_2))
    with AutomagicIO(check_once=True):
        verify_load(fpath2_2)
        assert_true(annex2.file_has_content(fpath2_2))
        annex2.drop(fpath2_2)
        assert_false(annex2.file_has_content(fpath2_2))
        assert_false(os.path.isfile(fpath2_2))

    # if we override stdout with something not supporting fileno, as tornado
    # does, which breaks using `get` under IPython
    # TODO: we might need to refuse any online logging in other places like that
    annex2.drop(fpath2_2)

    class StringIOfileno(StringIO):
        def fileno(self):
            raise Exception("I have no clue how to do fileno")

    with patch('sys.stdout', new_callable=StringIOfileno), \
         patch('sys.stderr', new_callable=StringIOfileno):
        with AutomagicIO():
            assert_false(annex2.file_has_content(fpath2_2))
            verify_load(fpath2_2)
            assert_true(annex2.file_has_content(fpath2_2))