示例#1
0
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    Args:
        url: Repository location, passed to ``external_repo``.
        path: Path inside the repository, passed to ``repo.pull_to``.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(path, out)`` returns.
        rev: Revision, passed to ``external_repo``.

    Raises:
        GetDVCFileError: If the resolved destination is a valid DVC file
            name according to ``is_valid_filename``.
    """
    from dvc.external_repo import external_repo
    from dvc.dvcfile import is_valid_filename

    target = resolve_output(path, out)
    if is_valid_filename(target):
        raise GetDVCFileError()

    # The scratch cache is a hidden, uniquely named directory that sits in
    # the same directory as the output. Keeping both on one filesystem lets
    # reflink/hardlink work; tempfile.TemporaryDirectory is avoided because
    # it would create a symlink to tmpfs, which rules those link types out.
    target_parent = os.path.dirname(os.path.abspath(target))
    scratch_cache = os.path.join(target_parent, "." + str(shortuuid.uuid()))

    try:
        with external_repo(url=url, rev=rev) as repo:
            if hasattr(repo, "cache"):
                repo.cache.local.cache_dir = scratch_cache

                # Prefer links over copies to avoid duplicating data.
                #
                # Symlinks are left out: the cache is removed once we are
                # done, so the data would have to be copied over first
                # anyway -- copying directly is just as good.
                #
                # A hypothetical "move" link type is not usable either,
                # since one cache file may be referenced several times
                # within a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            repo.pull_to(path, PathInfo(target))
    finally:
        # The scratch cache is removed whether or not the pull succeeded.
        remove(scratch_cache)
示例#2
0
文件: imp_url.py 项目: shizacat/dvc
def imp_url(self, url, out=None, fname=None, erepo=None, locked=True):
    """Create, run and dump a stage that has ``url`` as its only dependency.

    Args:
        url: Source to import; used as the single dependency of the stage.
        out: Requested output; resolved through ``resolve_output`` and
            ``resolve_paths``.
        fname: Stage file name; when falsy, the path returned by
            ``resolve_paths`` is used instead.
        erepo: Forwarded to ``Stage.create`` unchanged.
        locked: Value assigned to ``stage.locked`` after the stage has run.

    Returns:
        The stage, or ``None`` when ``Stage.create`` returns ``None``.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage

    path, wdir, out = resolve_paths(self, resolve_output(url, out))

    # NOTE: a source that lives inside the user's own repository is
    # recorded relative to the stage's working directory.
    source_exists = os.path.exists(url)
    if source_exists and path_isin(os.path.abspath(url), self.root_dir):
        url = relpath(url, wdir)

    stage_options = {
        "wdir": wdir,
        "deps": [url],
        "outs": [out],
        "erepo": erepo,
    }
    stage = Stage.create(self, fname or path, **stage_options)
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.overwrite_with_prompt(force=True)

    self.check_modified_graph([stage])
    stage.run()

    # The lock flag is set only after the run, right before dumping.
    stage.locked = locked
    dvcfile.dump(stage)
    return stage
示例#3
0
文件: imp_url.py 项目: zeta1999/dvc
def imp_url(self, url, out=None, fname=None, erepo=None, locked=True):
    """Create, run and dump a command-less stage importing ``url``.

    Args:
        url: Source to import; used as the single dependency of the stage.
        out: Requested output; the effective output is whatever
            ``resolve_output(url, out)`` returns.
        fname: Forwarded to ``Stage.create`` unchanged.
        erepo: Forwarded to ``Stage.create`` unchanged.
        locked: Value assigned to ``stage.locked`` after the stage has run.

    Returns:
        The stage, or ``None`` when ``Stage.create`` returns ``None``.
    """
    from dvc.stage import Stage

    stage_options = {
        "cmd": None,
        "deps": [url],
        "outs": [resolve_output(url, out)],
        "fname": fname,
        "erepo": erepo,
        "accompany_outs": True,
    }
    stage = Stage.create(self, **stage_options)
    if stage is None:
        return None

    self.check_modified_graph([stage])
    stage.run()

    # The lock flag is set only after the run, right before dumping.
    stage.locked = locked
    stage.dump()
    return stage
示例#4
0
def imp_url(
    self,
    url,
    out=None,
    fname=None,
    erepo=None,
    frozen=True,
    no_exec=False,
    desc=None,
    jobs=None,
):
    """Create and dump a stage that has ``url`` as its only dependency.

    Args:
        url: Source to import; used as the single dependency of the stage.
        out: Requested output; resolved through ``resolve_output`` and
            ``resolve_paths``.
        fname: Stage file name; when falsy, the path returned by
            ``resolve_paths`` is used instead.
        erepo: Forwarded to ``create_stage``. The "source lives in this
            repository" rewrite of ``url`` only happens when it is ``None``.
        frozen: Value assigned to ``stage.frozen`` before dumping.
        no_exec: When true, the stage is not run; ``stage.ignore_outs()``
            is called instead.
        desc: When truthy, stored as ``desc`` on the first output.
        jobs: Forwarded to ``stage.run``.

    Returns:
        The stage, or ``None`` when ``stage.can_be_skipped`` is true.

    Raises:
        OutputDuplicationError: Re-raised from ``check_modified_graph`` with
            this stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when user is importing something from within their own repository
    if (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    ):
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )
    restore_meta(stage)
    if stage.can_be_skipped:
        return None

    if desc:
        stage.outs[0].desc = desc

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Report only the *other* stages; chain explicitly so the original
        # error is kept as the cause rather than as an incidental context.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    if no_exec:
        stage.ignore_outs()
    else:
        stage.run(jobs=jobs)

    stage.frozen = frozen

    dvcfile.dump(stage)

    return stage
示例#5
0
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    Args:
        url: Repository location, passed to ``external_repo``.
        path: Path inside the repository to fetch.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(path, out)`` returns.
        rev: Revision, passed to ``external_repo``.

    Raises:
        GetDVCFileError: If the resolved destination is a valid stage file
            name according to ``Stage.is_valid_filename``.
        OutputNotFoundError: If no output matches ``path`` and ``path`` is
            absolute (so it is not handed to ``_copy_git_file``).
        UrlNotDvcRepoError: If ``NotDvcRepoError`` is raised while working
            with the external repository.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace state, because in case of getting DVC
            # dependency on CIFS or NFS filesystems, sqlite-based state
            # will be unable to obtain lock
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            output = None
            output_error = None

            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError as ex:
                # Remember the failure; whether it is fatal is decided below.
                output_error = ex

            is_git_file = output_error and not os.path.isabs(path)
            is_not_cached = output and not output.use_cache

            if is_git_file or is_not_cached:
                _copy_git_file(repo, path, out, url)
                return

            if output_error:
                # Keep the original lookup failure attached as the cause.
                raise OutputNotFoundError(path) from output_error

            with repo.state:
                repo.cloud.pull(output.get_used_cache())
            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()

    except NotDvcRepoError as exc:
        raise UrlNotDvcRepoError(url) from exc
    finally:
        # The temporary cache is removed on every exit path, including the
        # early return above.
        remove(tmp_dir)
示例#6
0
def get(url, path, out=None, rev=None, jobs=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    Args:
        url: Repository location, passed to ``external_repo``.
        path: Path to fetch. An absolute path is read through a
            ``DataFileSystem``; anything else goes through ``repo.dvcfs``.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(path, out)`` returns.
        rev: Revision, passed to ``external_repo``.
        jobs: Forwarded to ``fs.get`` as ``batch_size``.

    Raises:
        GetDVCFileError: If the resolved destination is a valid DVC file
            name according to ``is_valid_filename``.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    target = resolve_output(path, out)
    if is_valid_filename(target):
        raise GetDVCFileError()

    # The scratch cache is a hidden, uniquely named directory that sits in
    # the same directory as the output. Keeping both on one filesystem lets
    # reflink/hardlink work; tempfile.TemporaryDirectory is avoided because
    # it would create a symlink to tmpfs, which rules those link types out.
    scratch_cache = os.path.join(
        os.path.dirname(os.path.abspath(target)),
        "." + str(shortuuid.uuid()),
    )

    # Prefer links over copies to avoid duplicating data.
    #
    # Symlinks are left out: the cache is removed once we are done, so the
    # data would have to be copied over first anyway -- copying directly is
    # just as good.
    #
    # A hypothetical "move" link type is not usable either, since one cache
    # file may be referenced several times within a directory.
    link_types = ["reflink", "hardlink", "copy"]

    try:
        with external_repo(
            url=url, rev=rev, cache_dir=scratch_cache, cache_types=link_types
        ) as repo:
            if not os.path.isabs(path):
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)
            else:
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path

            progress = Callback.as_tqdm_callback(
                desc=f"Downloading {fs.path.name(path)}",
                unit="files",
            )
            with progress as cb:
                fs.get(
                    fs_path,
                    os.path.abspath(target),
                    batch_size=jobs,
                    callback=cb,
                )
    finally:
        # The scratch cache is removed whether or not the download succeeded.
        remove(scratch_cache)
示例#7
0
def get_url(url, out=None):
    """Download ``url`` to ``out`` without using a cache.

    Args:
        url: Source location. If it exists as a local path it is converted
            to an absolute path before the dependency is loaded.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(url, out)`` returns, made absolute.
    """
    out = resolve_output(url, out)

    if os.path.exists(url):
        url = os.path.abspath(url)

    # Always make the destination absolute, not only when the source is a
    # local path, so the output location is handled the same way for every
    # kind of source.
    out = os.path.abspath(out)

    (dep,) = dependency.loads_from(None, [url])
    (out,) = output.loads_from(None, [out], use_cache=False)
    dep.download(out)
示例#8
0
def get_url(url, out=None):
    """Download ``url`` to ``out`` without using a cache.

    Args:
        url: Source location. If it exists as a local path it is converted
            to an absolute path before the dependency is loaded.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(url, out)`` returns, made absolute.

    Raises:
        dep.DoesNotExistError: If the loaded dependency reports that it
            does not exist.
    """
    target = resolve_output(url, out)

    source = os.path.abspath(url) if os.path.exists(url) else url
    target = os.path.abspath(target)

    [dep] = dependency.loads_from(None, [source])
    [destination] = output.loads_from(None, [target], use_cache=False)

    # Fail early instead of attempting to download a missing source.
    if not dep.exists:
        raise dep.DoesNotExistError(dep)
    dep.download(destination)
示例#9
0
def get_url(url, out=None, jobs=None):
    """Download ``url`` to ``out`` without using a cache.

    Args:
        url: Source location. If it exists as a local path it is converted
            to an absolute path before the dependency is loaded.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(url, out)`` returns, made absolute.
        jobs: Forwarded to ``dep.download``.

    Raises:
        dep.DoesNotExistError: If the loaded dependency reports that it
            does not exist.
    """
    import dvc.dependency as dependency
    import dvc.output as output
    from dvc.utils import resolve_output

    target = resolve_output(url, out)

    source = os.path.abspath(url) if os.path.exists(url) else url
    target = os.path.abspath(target)

    [dep] = dependency.loads_from(None, [source])
    [destination] = output.loads_from(None, [target], use_cache=False)

    # Fail early instead of attempting to download a missing source.
    if not dep.exists:
        raise dep.DoesNotExistError(dep)
    dep.download(destination, jobs=jobs)
示例#10
0
文件: get.py 项目: ptrcklv/dvc
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    A cached output is fetched with ``_get_cached``; anything else with a
    relative ``path`` is copied from the repository root with ``_copy``.

    Args:
        url: Repository location, passed to ``external_repo``.
        path: Path inside the repository to fetch.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(path, out)`` returns.
        rev: Revision, passed to ``external_repo``.

    Raises:
        GetDVCFileError: If the resolved destination is a valid stage file
            name according to ``Stage.is_valid_filename``.
        PathMissingError: If ``OutputNotFoundError`` or ``FileNotFoundError``
            is raised while fetching, including the case of an absolute
            ``path`` that is not a cached output.
        UrlNotDvcRepoError: If ``NotDvcRepoError`` is raised while working
            with the external repository.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError:
                output = None

            if output and output.use_cache:
                _get_cached(repo, output, out)
            else:
                # Either an uncached out with absolute path or a user error
                if os.path.isabs(path):
                    raise FileNotFoundError

                _copy(os.path.join(repo.root_dir, path), out)

    # Chain explicitly so the low-level error is kept as the cause of the
    # user-facing one.
    except (OutputNotFoundError, FileNotFoundError) as exc:
        raise PathMissingError(path, url) from exc
    except NotDvcRepoError as exc:
        raise UrlNotDvcRepoError(url) from exc
    finally:
        remove(tmp_dir)
示例#11
0
文件: get.py 项目: woodshop/dvc
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` into ``out``.

    A cached output is fetched with ``_get_cached``. In every other
    non-error case the file is copied out of a clone obtained from
    ``cached_clone``.

    Args:
        url: Repository location, passed to ``external_repo`` and
            ``cached_clone``.
        path: Path inside the repository to fetch.
        out: Requested destination; the effective destination is whatever
            ``resolve_output(path, out)`` returns.
        rev: Revision, passed to ``external_repo`` and ``cached_clone``.

    Raises:
        GetDVCFileError: If the resolved destination is a valid stage file
            name according to ``Stage.is_valid_filename``.
        PathMissingError: If ``OutputNotFoundError`` or ``FileNotFoundError``
            is raised while fetching, including the case of an absolute
            ``path`` that was not fetched from the cache.
    """
    out = resolve_output(path, out)

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        try:
            with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
                # Try any links possible to avoid data duplication.
                #
                # Not using symlink, because we need to remove cache after we
                # are done, and to make that work we would have to copy data
                # over anyway before removing the cache, so we might just copy
                # it right away.
                #
                # Also, we can't use theoretical "move" link type here, because
                # the same cache file might be used a few times in a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]
                output = repo.find_out_by_relpath(path)
                if output.use_cache:
                    _get_cached(repo, output, out)
                    return
                # Non-cached output, fall through and try to copy from git.
        except (NotDvcRepoError, NoOutputInExternalRepoError):
            # Not a DVC repository or, possibly, path is not tracked by DVC.
            # Fall through and try to copy from git.
            pass

        if os.path.isabs(path):
            raise FileNotFoundError

        repo_dir = cached_clone(url, rev=rev)

        fs_copy(os.path.join(repo_dir, path), out)
    except (OutputNotFoundError, FileNotFoundError) as exc:
        # Chain explicitly so the low-level error is kept as the cause of
        # the user-facing one.
        raise PathMissingError(path, url) from exc
    finally:
        # The temporary cache is removed on every exit path, including the
        # early return above.
        remove(tmp_dir)
示例#12
0
def imp_url(self, url, out=None, fname=None, erepo=None, frozen=True):
    """Create, run and dump a stage that has ``url`` as its only dependency.

    Args:
        url: Source to import; used as the single dependency of the stage.
        out: Requested output; resolved through ``resolve_output`` and
            ``resolve_paths``.
        fname: Stage file name; when falsy, the path returned by
            ``resolve_paths`` is used instead.
        erepo: Forwarded to ``create_stage``. The "source lives in this
            repository" rewrite of ``url`` only happens when it is ``None``.
        frozen: Value assigned to ``stage.frozen`` after the stage has run.

    Returns:
        The stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        OutputDuplicationError: Re-raised from ``check_modified_graph`` with
            this stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when user is importing something from within their own repository
    if (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    ):
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )

    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove_with_prompt(force=True)

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Report only the *other* stages; chain explicitly so the original
        # error is kept as the cause rather than as an incidental context.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    stage.run()

    stage.frozen = frozen

    dvcfile.dump(stage)

    return stage
示例#13
0
def imp_url(
    self,
    url,
    out=None,
    fname=None,
    erepo=None,
    frozen=True,
    no_exec=False,
    remote=None,
    to_remote=False,
    desc=None,
    jobs=None,
):
    """Create and dump a stage that has ``url`` as its only dependency.

    Depending on the flags the stage is run normally, left unexecuted
    (``no_exec``), or its first output is transferred to a remote object
    database (``to_remote``).

    Args:
        url: Source to import; used as the single dependency of the stage.
        out: Requested output; resolved through ``resolve_output`` and
            ``resolve_paths``.
        fname: Stage file name; when falsy, the path returned by
            ``resolve_paths`` is used instead.
        erepo: Forwarded to ``create_stage``. The "source lives in this
            repository" rewrite of ``url`` only happens when it is ``None``.
        frozen: Value assigned to ``stage.frozen`` before dumping.
        no_exec: When true, the stage is not run; ``stage.ignore_outs()``
            is called instead.
        remote: Remote name handed to ``self.cloud.get_remote_odb``; only
            valid together with ``to_remote``.
        to_remote: When true, transfer the first output to the remote
            object database instead of running the stage.
        desc: When truthy, stored as ``desc`` on the first output.
        jobs: Forwarded to ``transfer`` or ``stage.run``.

    Returns:
        The dumped stage.

    Raises:
        InvalidArgumentError: If ``to_remote`` is combined with ``no_exec``,
            or ``remote`` is given without ``to_remote``.
        OutputDuplicationError: Re-raised from the graph check with this
            stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    out = resolve_output(url, out)
    # NOTE(review): `out` has already been replaced by the result of
    # resolve_output() here, so `not out` tests the resolved value rather
    # than the caller's argument -- confirm this is intended.
    path, wdir, out = resolve_paths(self,
                                    out,
                                    always_local=to_remote and not out)

    if to_remote and no_exec:
        raise InvalidArgumentError(
            "--no-exec can't be combined with --to-remote")

    if not to_remote and remote:
        raise InvalidArgumentError(
            "--remote can't be used without --to-remote")

    # NOTE: when user is importing something from within their own repository
    if (erepo is None and os.path.exists(url)
            and path_isin(os.path.abspath(url), self.root_dir)):
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )
    restore_meta(stage)

    if desc:
        stage.outs[0].desc = desc

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        new_index = self.index.add(stage)
        new_index.check_graph()
    except OutputDuplicationError as exc:
        # Report only the *other* stages; chain explicitly so the original
        # error is kept as the cause rather than as an incidental context.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}) from exc

    if no_exec:
        stage.ignore_outs()
    elif to_remote:
        remote_odb = self.cloud.get_remote_odb(remote, "import-url")
        stage.outs[0].transfer(url, odb=remote_odb, jobs=jobs)
        # The stage was not run, so record the dependency state and the
        # stage checksum by hand before dumping.
        stage.save_deps()
        stage.md5 = stage.compute_md5()
    else:
        stage.run(jobs=jobs)

    stage.frozen = frozen

    dvcfile.dump(stage)

    return stage
示例#14
0
def test_resolve_output(inp, out, is_dir, expected, mocker):
    """Check ``resolve_output(inp, out)`` with ``os.path.isdir`` patched."""
    # Force the directory check to return the supplied answer so the test
    # does not depend on what exists on disk.
    mocker.patch("os.path.isdir", return_value=is_dir)
    assert resolve_output(inp, out) == expected