Exemplo n.º 1
0
def test_walk_dir(tmp_dir, dvc):
    """Check that walking an added directory lists each entry exactly once."""
    layout = {
        "dir": {
            "subdir1": {"foo1": "foo1", "bar1": "bar1"},
            "subdir2": {"foo2": "foo2"},
            "foo": "foo",
            "bar": "bar",
        }
    }
    tmp_dir.gen(layout)
    dvc.add("dir")

    fs = DataFileSystem(repo=dvc)

    expected = [
        "dir/subdir1",
        "dir/subdir2",
        "dir/subdir1/foo1",
        "dir/subdir1/bar1",
        "dir/subdir2/foo2",
        "dir/foo",
        "dir/bar",
    ]

    # Flatten every (root, dirs, files) triple into full posix paths.
    actual = [
        posixpath.join(root, name)
        for root, dirs, files in fs.walk("dir")
        for name in dirs + files
    ]

    # Order is not asserted; the length check catches duplicated entries
    # that the set comparison alone would hide.
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)
Exemplo n.º 2
0
def test_exists(tmp_dir, dvc):
    """An added file still exists in the fs after its workspace copy is gone."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")

    # Delete the workspace copy so only the DVC-side record remains.
    workspace_copy = tmp_dir / "foo"
    workspace_copy.unlink()

    fs = DataFileSystem(repo=dvc)
    assert fs.exists("foo")
Exemplo n.º 3
0
def test_open(tmp_dir, dvc):
    """An added file can be read through the fs after the workspace copy is removed."""
    tmp_dir.gen("foo", "foo")
    dvc.add("foo")

    workspace_copy = tmp_dir / "foo"
    workspace_copy.unlink()

    fs = DataFileSystem(repo=dvc)
    with fs.open("foo", "r") as fobj:
        content = fobj.read()

    assert content == "foo"
Exemplo n.º 4
0
def test_isdir_mixed(tmp_dir, dvc):
    """A directory with only one of its files added is still seen as a dir."""
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})

    # Only "dir/foo" is added; "dir/bar" is left alone.
    added_file = tmp_dir / "dir" / "foo"
    dvc.add(str(added_file))

    fs = DataFileSystem(repo=dvc)
    assert fs.isdir("dir")
    assert not fs.isfile("dir")
Exemplo n.º 5
0
def test_get_hash_dir(tmp_dir, dvc, mocker):
    """info() on an added directory returns its md5 without calling hash_file."""
    contents = {
        "dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}
    }
    tmp_dir.dvc_gen(contents)

    fs = DataFileSystem(repo=dvc)

    # Spy installed after setup so only the info() call below is observed.
    hash_file_spy = mocker.spy(dvc_data.hashfile.hash, "hash_file")

    dir_info = fs.info("dir")
    assert dir_info["md5"] == "8761c4e9acad696bee718615e23e22db.dir"
    assert not hash_file_spy.called
Exemplo n.º 6
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """info() and a dry-run stage() agree on the dir hash with an extra workspace file."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})

    # Write a file into the directory after it has been added.
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    fs = DataFileSystem(repo=dvc)
    expected = "5ea40360f5b4ec688df672a4db9c17d1.dir"

    dir_info = fs.info("dir")
    assert dir_info.get("md5") == expected

    _odb, _meta, staged = stage(
        dvc.odb.local, "dir", fs, "md5", dry_run=True
    )
    assert staged.hash_info == HashInfo("md5", expected)
Exemplo n.º 7
0
def test_open_dirty_hash(tmp_dir, dvc):
    """Reading through the fs returns the added content, not the edited file."""
    tmp_dir.dvc_gen("file", "file")

    # Overwrite the workspace copy after it has been added.
    workspace_copy = tmp_dir / "file"
    workspace_copy.write_text("something")

    fs = DataFileSystem(repo=dvc)
    with fs.open("file", "r") as fobj:
        content = fobj.read()

    # NOTE: Unlike DvcFileSystem, DataFileSystem should not
    # be affected by a dirty workspace.
    assert content == "file"
Exemplo n.º 8
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """info() and a dry-run stage() agree on the file hash after a workspace edit."""
    tmp_dir.dvc_gen("file", "file")

    # Overwrite the workspace copy after it has been added.
    workspace_copy = tmp_dir / "file"
    workspace_copy.write_text("something")

    fs = DataFileSystem(repo=dvc)
    expected = "8c7dd922ad47494fc02c388e12c00eac"

    file_info = fs.info("file")
    assert file_info.get("md5") == expected

    _odb, _meta, staged = stage(
        dvc.odb.local, "file", fs, "md5", dry_run=True
    )
    assert staged.hash_info == HashInfo("md5", expected)
Exemplo n.º 9
0
def get(url, path, out=None, rev=None, jobs=None):
    """Fetch ``path`` from the repository at ``url`` and write it to ``out``.

    Args:
        url: Repository location, handed to ``external_repo``.
        path: What to fetch. An absolute path is read through a
            ``DataFileSystem`` created with ``workspace="local"``; any
            other path is read through ``repo.dvcfs``.
        out: Destination; combined with ``path`` by ``resolve_output``.
        rev: Revision handed to ``external_repo``.
        jobs: Forwarded to ``fs.get`` as ``batch_size``.

    Raises:
        GetDVCFileError: If the resolved output name passes
            ``is_valid_filename``.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    out = resolve_output(path, out)
    if is_valid_filename(out):
        raise GetDVCFileError()

    # The temporary cache lives next to the output so both sit on the same
    # filesystem and reflink/hardlink can be used. tempfile's
    # TemporaryDirectory is deliberately avoided: per the original author it
    # ends up on tmpfs via a symlink, which defeats that purpose.
    parent_dir = os.path.dirname(os.path.abspath(out))
    tmp_cache_dir = os.path.join(parent_dir, f".{shortuuid.uuid()}")

    # Prefer any link type that avoids duplicating data.
    #
    # Symlinks are excluded: the cache is removed once we are done, so the
    # data would have to be copied out first anyway.
    #
    # A hypothetical "move" link type is excluded as well, since one cache
    # file may be needed several times within a directory.
    link_types = ["reflink", "hardlink", "copy"]
    try:
        with external_repo(
            url=url,
            rev=rev,
            cache_dir=tmp_cache_dir,
            cache_types=link_types,
        ) as repo:
            if not os.path.isabs(path):
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)
            else:
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path

            description = f"Downloading {fs.path.name(path)}"
            with Callback.as_tqdm_callback(
                desc=description, unit="files"
            ) as progress:
                fs.get(
                    fs_path,
                    os.path.abspath(out),
                    batch_size=jobs,
                    callback=progress,
                )
    finally:
        # Always drop the temporary cache, whether or not the fetch worked.
        remove(tmp_cache_dir)
Exemplo n.º 10
0
def test_open_no_remote(tmp_dir, dvc):
    """Opening fails with FileNotFoundError caused by NoRemoteError once the cache is gone."""
    tmp_dir.dvc_gen("file", "file")

    # Remove both the workspace copy and the local cache directory.
    workspace_copy = tmp_dir / "file"
    workspace_copy.unlink()
    remove(dvc.odb.local.cache_dir)

    fs = DataFileSystem(repo=dvc)
    with pytest.raises(FileNotFoundError) as exc_info, fs.open("file", "r"):
        pass

    cause = exc_info.value.__cause__
    assert isinstance(cause, NoRemoteError)
Exemplo n.º 11
0
def test_open_dirty_no_hash(tmp_dir, dvc):
    """Opening a file whose .dvc entry carries no hash raises FileNotFoundError."""
    tmp_dir.gen("file", "file")

    # Hand-written .dvc file that lists the output without any hash.
    dvcfile = tmp_dir / "file.dvc"
    dvcfile.write_text("outs:\n- path: file\n")

    fs = DataFileSystem(repo=dvc)

    # NOTE: Unlike DvcFileSystem, DataFileSystem should not
    # be affected by a dirty workspace.
    with pytest.raises(FileNotFoundError), fs.open("file", "r"):
        pass
Exemplo n.º 12
0
def test_get_hash_granular(tmp_dir, dvc):
    """Check hashes for a nested subdirectory and for a file inside it."""
    contents = {
        "dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}
    }
    tmp_dir.dvc_gen(contents)
    fs = DataFileSystem(repo=dvc)

    # The nested directory: info() carries no md5, but a dry-run stage()
    # through the fs produces a .dir hash.
    subdir_path = "dir/subdir"
    subdir_md5 = "af314506f1622d107e0ed3f14ec1a3b5.dir"
    assert fs.info(subdir_path).get("md5") is None
    _odb, _meta, staged = stage(
        dvc.odb.local, subdir_path, fs, "md5", dry_run=True
    )
    assert staged.hash_info == HashInfo("md5", subdir_md5)

    # The file inside it: info() and stage() report the same md5.
    data_path = posixpath.join(subdir_path, "data")
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"
    assert fs.info(data_path)["md5"] == data_md5
    _odb, _meta, staged = stage(
        dvc.odb.local, data_path, fs, "md5", dry_run=True
    )
    assert staged.hash_info == HashInfo("md5", data_md5)
Exemplo n.º 13
0
def test_isdir_isfile(tmp_dir, dvc):
    """isdir/isfile are False before adding and reflect the entry type after."""
    tmp_dir.gen({"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}})

    fs = DataFileSystem(repo=dvc)

    # Before anything is added, neither path is a dir or a file in the fs.
    for name in ("datadir", "datafile"):
        assert not fs.isdir(name)
        assert not fs.isfile(name)

    dvc.add(["datadir", "datafile"])

    # Drop the workspace copies; the checks below use the same fs instance.
    shutil.rmtree(tmp_dir / "datadir")
    (tmp_dir / "datafile").unlink()

    assert fs.isdir("datadir")
    assert not fs.isfile("datadir")
    assert not fs.isdir("datafile")
    assert fs.isfile("datafile")
Exemplo n.º 14
0
def test_open_in_history(tmp_dir, scm, dvc):
    """Read "foo" through the fs while iterating the HEAD~1 revision."""
    # Two commits: first with content "foo", then "foofoo". The content
    # string is also used as the commit message.
    for content in ("foo", "foofoo"):
        tmp_dir.gen("foo", content)
        dvc.add("foo")
        dvc.scm.add(["foo.dvc", ".gitignore"])
        dvc.scm.commit(content)

    for rev in dvc.brancher(revs=["HEAD~1"]):
        # Skip the "workspace" entry; only actual revisions are checked.
        if rev != "workspace":
            fs = DataFileSystem(repo=dvc)
            with fs.open("foo", "r") as fobj:
                assert fobj.read() == "foo"
Exemplo n.º 15
0
    def open_by_relpath(self, path, remote=None, mode="r", encoding=None):
        """Yield an open file object for ``path``.

        An absolute ``path`` is opened through a ``DataFileSystem`` created
        with ``workspace="local"``. Any other path is converted with
        ``from_os_path`` and opened through a ``DvcFileSystem`` created
        with ``subrepos=True``.

        Raises:
            FileMissingError: When ``FileNotFoundError`` is raised inside
                the ``try`` (which also encloses the ``yield``).
            DvcIsADirectoryError: When ``IsADirectoryError`` is raised in
                the same span.
        """
        from dvc.fs.data import DataFileSystem
        from dvc.fs.dvc import DvcFileSystem

        if not os.path.isabs(path):
            fs = DvcFileSystem(repo=self, subrepos=True)
            fs_path = fs.from_os_path(path)
        else:
            fs = DataFileSystem(repo=self, workspace="local")
            fs_path = path

        open_kwargs = {"mode": mode, "encoding": encoding, "remote": remote}
        try:
            with fs.open(fs_path, **open_kwargs) as fobj:
                yield fobj
        except FileNotFoundError as exc:
            raise FileMissingError(path) from exc
        except IsADirectoryError as exc:
            raise DvcIsADirectoryError(f"'{path}' is a directory") from exc
Exemplo n.º 16
0
    def datafs(self):
        """Return a new ``DataFileSystem`` constructed with ``repo=self``."""
        # Imported here rather than at module level, as in the original.
        from dvc.fs.data import DataFileSystem

        data_fs = DataFileSystem(repo=self)
        return data_fs
Exemplo n.º 17
0
def test_walk_missing(tmp_dir, dvc):
    """Walking a path that was never created must complete without raising."""
    fs = DataFileSystem(repo=dvc)

    # No assertions: the walk is simply exhausted and must not raise.
    list(fs.walk("dir"))
Exemplo n.º 18
0
def test_walk_not_a_dir(tmp_dir, dvc):
    """Walking a path that is a file must complete without raising."""
    tmp_dir.dvc_gen("foo", "foo")
    fs = DataFileSystem(repo=dvc)

    # No assertions: the walk is simply exhausted and must not raise.
    list(fs.walk("foo"))
Exemplo n.º 19
0
def test_isdvc(tmp_dir, dvc):
    """isdvc() is truthy for the added file and falsy for the other one."""
    tmp_dir.gen({"foo": "foo", "bar": "bar"})
    dvc.add("foo")  # "bar" is intentionally not added

    fs = DataFileSystem(repo=dvc)
    for name, should_be_dvc in (("foo", True), ("bar", False)):
        assert bool(fs.isdvc(name)) is should_be_dvc
Exemplo n.º 20
0
def test_get_hash_file(tmp_dir, dvc):
    """info() on an added file exposes the expected md5."""
    tmp_dir.dvc_gen({"foo": "foo"})

    fs = DataFileSystem(repo=dvc)
    foo_info = fs.info("foo")

    assert foo_info["md5"] == "acbd18db4cc2f85cedef654fccc4a4d8"
Exemplo n.º 21
0
def test_get_key(tmp_dir, dvc, path, key):
    """The wrapped fs maps ``path`` to ``key`` via ``_get_key``.

    NOTE(review): ``path`` and ``key`` are presumably supplied by a
    parametrize decorator outside this view; confirm.
    """
    fs = DataFileSystem(repo=dvc)
    inner_fs = fs.fs
    assert inner_fs._get_key(path) == key