Exemplo n.º 1
0
def get(url, path, out=None, rev=None):
    """Check out one output of an external repository at a local path.

    The repository is installed into a hidden, uniquely named directory
    created next to the destination, the output located at ``path`` inside
    it is fetched and checked out at ``out``, and the repository is
    uninstalled again whether or not those steps succeeded.

    Args:
        url: passed to ``ExternalRepo`` as the repository location.
        path: location of the wanted output inside the repository; only
            the path component of the parsed value is used.
        out: destination path. Defaults to the basename of ``path``.
        rev: passed to ``ExternalRepo`` as the revision.
    """
    inner_path = urlparse(path).path
    out = out or os.path.basename(inner_path)

    # The scratch directory lives in the destination's own directory so
    # that both sit on the same filesystem and reflink/hardlink can work.
    # tempfile.TemporaryDirectory is avoided on purpose: it would end up
    # on tmpfs, where reflink/hardlink to the destination is impossible.
    parent_dir = os.path.dirname(os.path.abspath(out))
    hidden_name = "." + str(shortuuid.uuid())
    erepo = ExternalRepo(
        os.path.join(parent_dir, hidden_name), url=url, rev=rev
    )
    try:
        erepo.install()

        # Prefer link types that avoid duplicating data.
        #
        # Symlinks are left out: the cache is removed once we are done, so
        # the data would have to be copied before the removal anyway.
        #
        # A "move" link type is not an option either, since a single cache
        # file may be used several times within one directory.
        link_types = "reflink,hardlink,copy"
        erepo.repo.config.set(
            Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, link_types
        )

        src = os.path.join(erepo.path, inner_path.lstrip("/"))
        # Exactly one output is expected; anything else fails to unpack.
        [found] = erepo.repo.find_outs_by_path(src)
        erepo.repo.fetch(found.stage.path)

        # Redirect the output to the requested destination before checkout.
        found.path_info = PathInfo(os.path.abspath(out))
        with found.repo.state:
            found.checkout()
    finally:
        erepo.uninstall()
Exemplo n.º 2
0
    def __init__(self, project, config):
        """Configure the Azure remote from its config section.

        The container name and the connection string are each taken from
        the first source that yields a value: the current URL / config
        option, then the legacy URL form matched by ``self.REGEX``, then
        an environment variable.

        Args:
            project: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.

        Raises:
            ValueError: if the container name or the connection string
                cannot be determined.
        """
        super(RemoteAzure, self).__init__(project, config)

        self.url = config.get(Config.SECTION_REMOTE_URL, 'azure://')

        # Matching against REGEX is kept for backward compatibility.
        legacy = re.match(self.REGEX, self.url)

        # The URL is only parsed when the regex captured a path.
        if legacy.group('path'):
            parsed = urlparse(self.url)
            netloc = parsed.netloc
            prefix = parsed.path.lstrip('/')
        else:
            netloc = ''
            prefix = ''

        self.bucket = (
            netloc
            or legacy.group('container_name')  # backward compatibility
            or os.getenv('AZURE_STORAGE_CONTAINER_NAME'))
        self.prefix = prefix

        self.connection_string = (
            config.get(Config.SECTION_AZURE_CONNECTION_STRING)
            or legacy.group('connection_string')  # backward compatibility
            or os.getenv('AZURE_STORAGE_CONNECTION_STRING'))

        if not self.bucket:
            raise ValueError('azure storage container name missing')
        if not self.connection_string:
            raise ValueError('azure storage connection string missing')

        self.__blob_service = None

        self.path_info = {'scheme': self.scheme, 'bucket': self.bucket}
Exemplo n.º 3
0
Arquivo: ssh.py Projeto: zjj2wry/dvc
    def __init__(self,
                 stage,
                 path,
                 info=None,
                 remote=None,
                 cache=True,
                 metric=False):
        """Build the SSH ``path_info`` dict for this output.

        When ``remote`` is given, host, port and user come from it and the
        path component of ``path`` is joined onto ``remote.prefix``.
        Otherwise everything is read from ``path`` parsed as a URL, with
        ``RemoteSSH.DEFAULT_PORT`` and the current user as fallbacks.

        All arguments are forwarded unchanged to the parent initializer.
        """
        super(OutputSSH, self).__init__(
            stage, path, info=info, remote=remote, cache=cache, metric=metric
        )

        parsed = urlparse(path)
        if remote:
            host = remote.host
            port = remote.port
            user = remote.user
            location = posixpath.join(remote.prefix, parsed.path.lstrip('/'))
        else:
            host = parsed.hostname
            port = parsed.port or RemoteSSH.DEFAULT_PORT
            user = parsed.username or getpass.getuser()
            location = parsed.path

        self.path_info = {
            'scheme': 'ssh',
            'host': host,
            'port': port,
            'user': user,
            'path': location,
        }
Exemplo n.º 4
0
Arquivo: azure.py Projeto: cakey28/dvc
    def __init__(self, repo, config):
        """Configure the Azure remote from its config section.

        Container name and connection string are resolved independently,
        each from the first source that yields a value: the current URL /
        config option, the legacy URL form matched by ``self.REGEX``, and
        finally an environment variable.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.

        Raises:
            ValueError: if the container name or the connection string
                cannot be determined.
        """
        super(RemoteAzure, self).__init__(repo, config)

        self.url = config.get(Config.SECTION_REMOTE_URL, "azure://")

        # REGEX matching is retained for backward compatibility.
        legacy = re.match(self.REGEX, self.url)
        has_path = legacy.group("path")

        # Without a captured path the URL contributes neither a container
        # name nor a prefix.
        netloc = urlparse(self.url).netloc if has_path else ""
        self.bucket = (
            netloc
            or legacy.group("container_name")  # backward compatibility
            or os.getenv("AZURE_STORAGE_CONTAINER_NAME"))

        if has_path:
            self.prefix = urlparse(self.url).path.lstrip("/")
        else:
            self.prefix = ""

        self.connection_string = (
            config.get(Config.SECTION_AZURE_CONNECTION_STRING)
            or legacy.group("connection_string")  # backward compatibility
            or os.getenv("AZURE_STORAGE_CONNECTION_STRING"))

        if not self.bucket:
            raise ValueError("azure storage container name missing")
        if not self.connection_string:
            raise ValueError("azure storage connection string missing")

        self.__blob_service = None

        self.path_info = {"scheme": self.scheme, "bucket": self.bucket}
Exemplo n.º 5
0
 def __init__(
     self,
     stage,
     path,
     info=None,
     remote=None,
     cache=True,
     metric=False,
     persist=False,
     tags=None,
 ):
     """Build the S3 ``path_info`` dict for this output.

     With a ``remote`` the bucket comes from it and the key is the path
     component of ``path`` joined onto ``remote.prefix``. Without one,
     both bucket and key are read from ``path`` parsed as a URL.

     All arguments are forwarded unchanged to the parent initializer.
     """
     super(OutputS3, self).__init__(
         stage,
         path,
         info=info,
         remote=remote,
         cache=cache,
         metric=metric,
         persist=persist,
         tags=tags,
     )
     parsed = urlparse(path)
     key = parsed.path.lstrip("/")
     if remote:
         bucket = remote.bucket
         key = posixpath.join(remote.prefix, key)
     else:
         bucket = parsed.netloc

     self.path_info = {"scheme": self.scheme, "bucket": bucket, "path": key}
Exemplo n.º 6
0
    def __init__(self,
                 stage,
                 path,
                 info=None,
                 remote=None,
                 cache=True,
                 metric=False):
        """Build the SSH ``path_info`` dict for this output.

        Connection details (host, port, user) are taken from ``remote``
        when one is supplied; otherwise they are parsed out of ``path``,
        falling back to ``RemoteSSH.DEFAULT_PORT`` and the current user.
        With a remote, the resulting path is ``remote.prefix`` joined with
        the path component of ``path``.

        All arguments are forwarded unchanged to the parent initializer.
        """
        super(OutputSSH, self).__init__(
            stage, path, info=info, remote=remote, cache=cache, metric=metric
        )

        parsed = urlparse(path)
        if not remote:
            host = parsed.hostname
            port = parsed.port or RemoteSSH.DEFAULT_PORT
            user = parsed.username or getpass.getuser()
            target = parsed.path
        else:
            host, port, user = remote.host, remote.port, remote.user
            target = posixpath.join(remote.prefix, parsed.path.lstrip("/"))

        self.path_info = {
            "scheme": "ssh",
            "host": host,
            "port": port,
            "user": user,
            "path": target,
        }
Exemplo n.º 7
0
Arquivo: http.py Projeto: vyloy/dvc
    def __init__(self, stage, path, info=None, remote=None):
        """Build the HTTP ``path_info`` dict for this dependency.

        A ``path`` that starts with ``"remote"`` is resolved against
        ``self.remote.cache_dir`` using only its path component; any other
        value is used as given. The scheme is read from the final URL.
        """
        super(DependencyHTTP, self).__init__(
            stage, path, info=info, remote=remote
        )

        url = path
        if url.startswith("remote"):
            url = urljoin(self.remote.cache_dir, urlparse(url).path)

        scheme = urlparse(url).scheme
        self.path_info = {"scheme": scheme, "path": url}
Exemplo n.º 8
0
    def __init__(self, stage, path, info=None, remote=None):
        """Build the HTTP ``path_info`` dict for this dependency.

        If ``path`` starts with ``'remote'`` its path component is joined
        onto ``self.remote.cache_dir``; otherwise ``path`` is kept as is.
        The stored scheme is the one parsed from the resulting URL.
        """
        super(DependencyHTTP, self).__init__(
            stage, path, info=info, remote=remote
        )

        if path.startswith('remote'):
            resolved = urljoin(self.remote.cache_dir, urlparse(path).path)
        else:
            resolved = path

        self.path_info = {
            'scheme': urlparse(resolved).scheme,
            'path': resolved,
        }
Exemplo n.º 9
0
Arquivo: hdfs.py Projeto: ptrcklv/dvc
    def list_cache_paths(self):
        """Yield the path of every file found below ``self.path_info``.

        Nothing is yielded when ``self.path_info`` does not exist.
        Directories are walked iteratively: each listed directory is pushed
        onto a stack and the most recently pushed one is listed next. Only
        the path component of each listed entry name is used.
        """
        if not self.exists(self.path_info):
            return

        pending = deque([self.path_info.path])

        with self.hdfs(self.path_info) as client:
            while pending:
                current = pending.pop()
                for item in client.ls(current, detail=True):
                    kind = item["kind"]
                    if kind == "directory":
                        pending.append(urlparse(item["name"]).path)
                    elif kind == "file":
                        yield urlparse(item["name"]).path
Exemplo n.º 10
0
    def __init__(
        self,
        stage,
        path,
        info=None,
        remote=None,
        cache=True,
        metric=False,
        persist=False,
        tags=None,
    ):
        """Resolve this output to an absolute local path.

        With a ``remote`` the starting point is ``remote.prefix`` joined
        with the path component of ``self.url``; otherwise it is ``path``.
        A relative result is converted with ``self.remote.to_ospath`` and
        anchored at ``stage.wdir``. The final value is normalized, made
        absolute and stored in ``self.path_info`` as a ``PathLOCAL``.

        All arguments are forwarded to the parent initializer.
        """
        super(OutputLOCAL, self).__init__(
            stage,
            path,
            info,
            remote=remote,
            cache=cache,
            metric=metric,
            persist=persist,
            tags=tags,
        )
        if remote:
            relative = urlparse(self.url).path.lstrip("/")
            local = os.path.join(remote.prefix, relative)
        else:
            local = path

        if not os.path.isabs(local):
            native = self.remote.to_ospath(local)
            local = os.path.join(stage.wdir, native)

        normalized = os.path.normpath(local)
        self.path_info = PathLOCAL(
            url=self.url, path=os.path.abspath(normalized)
        )
Exemplo n.º 11
0
    def __init__(self, repo, config):
        """Configure the SSH remote from config, URL and user ssh config.

        User and port are each taken from the first source that yields a
        value: the remote config section, the URL, the user's ssh config
        for the host, and finally a default (current user /
        ``self.DEFAULT_PORT``). The host name may be replaced by the
        ``hostname`` entry of the user's ssh config.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteSSH, self).__init__(repo, config)
        self.url = config.get(Config.SECTION_REMOTE_URL, "ssh://")

        parsed = urlparse(self.url)
        self.host = parsed.hostname

        # Look the ssh config up under the URL's host, then let it
        # substitute the real host name.
        ssh_conf = self._load_user_ssh_config(self.host)
        self.host = ssh_conf.get("hostname", self.host)

        user = config.get(Config.SECTION_REMOTE_USER) or parsed.username
        self.user = user or ssh_conf.get("user") or getpass.getuser()

        self.prefix = parsed.path or "/"

        port = config.get(Config.SECTION_REMOTE_PORT) or parsed.port
        if not port:
            port = (self._try_get_ssh_config_port(ssh_conf)
                    or self.DEFAULT_PORT)
        self.port = port

        keyfile = config.get(Config.SECTION_REMOTE_KEY_FILE)
        self.keyfile = keyfile or self._try_get_ssh_config_keyfile(ssh_conf)

        self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
        self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
        self.ask_password = config.get(
            Config.SECTION_REMOTE_ASK_PASSWORD, False
        )

        self.path_info = PathSSH(
            host=self.host, user=self.user, port=self.port
        )
Exemplo n.º 12
0
    def __init__(self, repo, config):
        """Configure the SSH remote from its config section and URL.

        User and port prefer the config section, then the URL, then a
        default (the current user and ``self.DEFAULT_PORT``).

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteSSH, self).__init__(repo, config)
        self.url = config.get(Config.SECTION_REMOTE_URL, "ssh://")

        parsed = urlparse(self.url)
        self.host = parsed.hostname

        user = config.get(Config.SECTION_REMOTE_USER) or parsed.username
        self.user = user or getpass.getuser()

        self.prefix = parsed.path or "/"

        port = config.get(Config.SECTION_REMOTE_PORT) or parsed.port
        self.port = port or self.DEFAULT_PORT

        self.keyfile = config.get(Config.SECTION_REMOTE_KEY_FILE, None)
        self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
        self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
        self.ask_password = config.get(
            Config.SECTION_REMOTE_ASK_PASSWORD, False
        )

        self.path_info = {
            "scheme": "ssh",
            "host": self.host,
            "user": self.user,
            "port": self.port,
        }
Exemplo n.º 13
0
def imp_url(
    self, url, out=None, resume=False, fname=None, erepo=None, locked=False
):
    """Create, run and dump a stage whose dependency is ``url``.

    Args:
        url: the single dependency of the new stage.
        out: the single output; defaults to the basename of the URL path.
        resume: forwarded to ``stage.run``.
        fname: forwarded to ``Stage.create``.
        erepo: forwarded to ``Stage.create``.
        locked: value stored on ``stage.locked`` before dumping.

    Returns:
        The stage, or ``None`` when ``Stage.create`` returned ``None``.
    """
    from dvc.stage import Stage

    url_basename = os.path.basename(urlparse(url).path)
    if not out:
        out = url_basename

    create_kwargs = dict(
        repo=self, cmd=None, deps=[url], outs=[out], fname=fname, erepo=erepo
    )
    with self.state:
        stage = Stage.create(**create_kwargs)

    if stage is None:
        return None

    # Validate the graph with the new stage included before running it.
    self.check_dag(self.stages() + [stage])

    with self.state:
        stage.run(resume=resume)

    stage.locked = locked
    stage.dump()
    return stage
Exemplo n.º 14
0
    def __init__(self, repo, config):
        """Configure the OSS remote from its config section.

        Bucket and prefix come from the remote URL. Endpoint, key id and
        key secret prefer the config section over the matching ``OSS_*``
        environment variable; the two key values additionally fall back to
        the placeholder strings ``"defaultId"`` and ``"defaultSecret"``.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteOSS, self).__init__(repo, config)

        self.url = config.get(Config.SECTION_REMOTE_URL)
        location = urlparse(self.url)
        self.bucket = location.netloc
        self.prefix = location.path.lstrip("/")

        endpoint = config.get(Config.SECTION_OSS_ENDPOINT)
        self.endpoint = endpoint or os.getenv("OSS_ENDPOINT")

        key_id = config.get(Config.SECTION_OSS_ACCESS_KEY_ID)
        self.key_id = key_id or os.getenv("OSS_ACCESS_KEY_ID") or "defaultId"

        key_secret = config.get(Config.SECTION_OSS_ACCESS_KEY_SECRET)
        self.key_secret = (
            key_secret or os.getenv("OSS_ACCESS_KEY_SECRET") or "defaultSecret"
        )

        self._bucket = None
        self.path_info = PathOSS(bucket=self.bucket)
Exemplo n.º 15
0
    def __init__(self, project, config):
        """Configure the S3 remote from its config section and environment.

        The URL defaults to ``s3://`` followed by the configured storage
        path. Region and profile prefer the ``AWS_DEFAULT_REGION`` /
        ``AWS_PROFILE`` environment variables over the config section. A
        configured credentials path is exported as
        ``AWS_SHARED_CREDENTIALS_FILE`` unless that variable is already set.

        Args:
            project: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteS3, self).__init__(project, config)

        legacy_path = config.get(Config.SECTION_AWS_STORAGEPATH, '')
        default_url = 's3://{}'.format(legacy_path.lstrip('/'))
        self.url = config.get(Config.SECTION_REMOTE_URL, default_url)

        env = os.environ
        self.region = (env.get('AWS_DEFAULT_REGION')
                       or config.get(Config.SECTION_AWS_REGION))
        self.profile = (env.get('AWS_PROFILE')
                        or config.get(Config.SECTION_AWS_PROFILE))

        self.endpoint_url = config.get(Config.SECTION_AWS_ENDPOINT_URL)
        self.use_ssl = config.get(Config.SECTION_AWS_USE_SSL, True)

        credentials_file = config.get(Config.SECTION_AWS_CREDENTIALPATH)
        if credentials_file:
            env.setdefault('AWS_SHARED_CREDENTIALS_FILE', credentials_file)

        location = urlparse(self.url)
        self.bucket = location.netloc
        self.prefix = location.path.lstrip('/')

        self.path_info = {'scheme': self.scheme, 'bucket': self.bucket}
Exemplo n.º 16
0
Arquivo: hdfs.py Projeto: khamutov/dvc
 def __init__(
     self,
     stage,
     path,
     info=None,
     remote=None,
     cache=True,
     metric=False,
     persist=False,
     tags=None,
 ):
     """Fill in the ``user`` and ``path`` entries of ``self.path_info``.

     With a ``remote`` the path is ``remote.url`` joined with the path
     component of ``path`` and the user is ``remote.user``. Without one,
     ``path`` is stored as given and the user is ``self.group("user")``.

     All arguments are forwarded unchanged to the parent initializer.
     """
     super(OutputHDFS, self).__init__(
         stage,
         path,
         info=info,
         remote=remote,
         cache=cache,
         metric=metric,
         persist=persist,
         tags=tags,
     )
     if remote:
         relative = urlparse(path).path.lstrip("/")
         path = posixpath.join(remote.url, relative)
         user = remote.user
     else:
         user = self.group("user")

     self.path_info["user"] = user
     self.path_info["path"] = path
Exemplo n.º 17
0
Arquivo: pkg.py Projeto: vasinkd/dvc
    def get(cls, url, src, out=None, version=None):
        """Check out one output of a package at a local path.

        The package is installed into a temporary hidden directory created
        next to the destination, the output located at ``src`` inside it
        is fetched, and then checked out at ``out``.

        Args:
            url: passed to ``Pkg`` as the package location.
            src: location of the wanted output inside the package.
            out: destination path. Defaults to the basename of ``src``.
            version: passed to ``Pkg`` as the version.
        """
        out = out or os.path.basename(src)

        # Keep the scratch directory in the destination's own directory so
        # both are on the same filesystem and reflink/hardlink can be used.
        parent_dir = os.path.dirname(os.path.abspath(out))
        with TemporaryDirectory(dir=parent_dir, prefix=".") as scratch:
            pkg = Pkg(scratch, url=url, version=version)
            pkg.install()

            # Prefer link types that avoid duplicating data.
            #
            # Symlinks are left out: the cache is removed once we are done,
            # so the data would have to be copied before that anyway.
            #
            # A "move" link type is not usable either, because one cache
            # file may be used several times within a directory.
            link_types = "reflink,hardlink,copy"
            pkg.repo.config.set(
                Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, link_types
            )

            inner = os.path.join(pkg.path, urlparse(src).path.lstrip("/"))
            # Exactly one output is expected; anything else fails to unpack.
            [found] = pkg.repo.find_outs_by_path(inner)
            pkg.repo.fetch(found.stage.path)

            # Redirect the output to the destination before checking out.
            found.path_info = PathInfo(os.path.abspath(out))
            with found.repo.state:
                found.checkout()
Exemplo n.º 18
0
    def __init__(self, repo, config):
        """Configure the SSH remote from config, URL and user ssh config.

        When no URL is configured ``self.path_info`` is ``None`` and no
        user ssh config is loaded. Otherwise host, user and port are
        resolved (config section first, then the URL, then the user's ssh
        config, then a default) and combined with the URL path into
        ``self.path_info``.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteSSH, self).__init__(repo, config)

        url = config.get(Config.SECTION_REMOTE_URL)
        if not url:
            self.path_info = None
            ssh_conf = {}
        else:
            parsed = urlparse(url)
            ssh_conf = self._load_user_ssh_config(parsed.hostname)

            host = ssh_conf.get("hostname", parsed.hostname)

            user = config.get(Config.SECTION_REMOTE_USER) or parsed.username
            user = user or ssh_conf.get("user") or getpass.getuser()

            port = config.get(Config.SECTION_REMOTE_PORT) or parsed.port
            if not port:
                port = (self._try_get_ssh_config_port(ssh_conf)
                        or self.DEFAULT_PORT)

            self.path_info = self.path_cls.from_parts(
                scheme=self.scheme,
                host=host,
                user=user,
                port=port,
                path=parsed.path,
            )

        keyfile = config.get(Config.SECTION_REMOTE_KEY_FILE)
        self.keyfile = keyfile or self._try_get_ssh_config_keyfile(ssh_conf)

        self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
        self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
        self.ask_password = config.get(
            Config.SECTION_REMOTE_ASK_PASSWORD, False
        )
Exemplo n.º 19
0
    def __init__(self,
                 stage,
                 path,
                 info=None,
                 remote=None,
                 cache=True,
                 metric=False):
        """Resolve this output to an absolute local path.

        With a ``remote`` the starting point is ``remote.prefix`` joined
        with the path component of ``self.url``; otherwise it is ``path``.
        A relative result is converted with ``self.remote.to_ospath`` and
        anchored at ``stage.cwd``. The normalized absolute path is stored
        in the ``path_info`` dict under the ``"local"`` scheme.

        All arguments are forwarded to the parent initializer.
        """
        super(OutputLOCAL, self).__init__(
            stage, path, info, remote=remote, cache=cache, metric=metric
        )

        if remote:
            relative = urlparse(self.url).path.lstrip("/")
            local = os.path.join(remote.prefix, relative)
        else:
            local = path

        if not os.path.isabs(local):
            native = self.remote.to_ospath(local)
            local = os.path.join(stage.cwd, native)

        absolute = os.path.abspath(os.path.normpath(local))
        self.path_info = {"scheme": "local", "path": absolute}
Exemplo n.º 20
0
    def __init__(self, repo, config):
        """Configure the Azure remote from its config section.

        Container name and connection string are each taken from the first
        source that yields a value: the current URL / config option, the
        legacy URL form matched by ``self.REGEX``, then the environment.
        ``self.path_info`` is built from the URL itself when the regex
        captured a path, and from scheme plus container name otherwise.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.

        Raises:
            ValueError: if the container name or the connection string
                cannot be determined.
        """
        super(RemoteAZURE, self).__init__(repo, config)

        url = config.get(Config.SECTION_REMOTE_URL, "azure://")

        # REGEX matching is retained for backward compatibility.
        legacy = re.match(self.REGEX, url)
        has_path = legacy.group("path")

        netloc = urlparse(url).netloc if has_path else ""
        bucket = (
            netloc
            or legacy.group("container_name")  # backward compatibility
            or os.getenv("AZURE_STORAGE_CONTAINER_NAME"))

        self.connection_string = (
            config.get(Config.SECTION_AZURE_CONNECTION_STRING)
            or legacy.group("connection_string")  # backward compatibility
            or os.getenv("AZURE_STORAGE_CONNECTION_STRING"))

        if not bucket:
            raise ValueError("azure storage container name missing")
        if not self.connection_string:
            raise ValueError("azure storage connection string missing")

        if has_path:
            self.path_info = self.path_cls(url)
        else:
            self.path_info = self.path_cls.from_parts(
                scheme=self.scheme, netloc=bucket
            )
Exemplo n.º 21
0
def _get(stage, p, info, cache, metric, persist):
    """Instantiate the output class that handles path ``p``.

    Resolution order:

    1. ``remote://<name>/...`` paths: the named remote section is looked
       up in the repo config and the output class registered for that
       remote's scheme is used.
    2. The first class in ``OUTS`` whose ``supported(p)`` is true.
    3. ``OutputLOCAL`` as the fallback.

    Args:
        stage: stage that owns the output.
        p: output path or URL.
        info: forwarded to the output class.
        cache: forwarded to the output class.
        metric: forwarded to the output class.
        persist: forwarded to the output class.

    Returns:
        The constructed output object.

    Raises:
        KeyError: if ``p`` names a remote whose section is not present in
            the repo config.
    """
    parsed = urlparse(p)
    if parsed.scheme == "remote":
        name = Config.SECTION_REMOTE_FMT.format(parsed.netloc)
        sect = stage.repo.config.config[name]
        remote = Remote(stage.repo, sect)
        return OUTS_MAP[remote.scheme](
            stage,
            p,
            info,
            cache=cache,
            remote=remote,
            metric=metric,
            persist=persist,
        )

    for o in OUTS:
        if o.supported(p):
            # ``persist`` has to be forwarded here as well; previously it
            # was silently dropped for outputs matched by scheme, unlike
            # the remote and local branches.
            return o(
                stage,
                p,
                info,
                cache=cache,
                remote=None,
                metric=metric,
                persist=persist,
            )
    return OutputLOCAL(
        stage,
        p,
        info,
        cache=cache,
        remote=None,
        metric=metric,
        persist=persist,
    )
Exemplo n.º 22
0
    def __init__(self, repo, config):
        """Configure the S3 remote from its config section and environment.

        The URL defaults to ``s3://`` followed by the configured storage
        path. Region and profile prefer the ``AWS_DEFAULT_REGION`` /
        ``AWS_PROFILE`` environment variables over the config section. A
        configured credentials path is exported as
        ``AWS_SHARED_CREDENTIALS_FILE`` unless that variable is already set.

        Args:
            repo: forwarded to the parent initializer.
            config: remote config section, read through ``.get()``.
        """
        super(RemoteS3, self).__init__(repo, config)

        legacy_path = config.get(Config.SECTION_AWS_STORAGEPATH, "")
        default_url = "s3://{}".format(legacy_path.lstrip("/"))
        self.url = config.get(Config.SECTION_REMOTE_URL, default_url)

        env = os.environ
        self.region = (env.get("AWS_DEFAULT_REGION")
                       or config.get(Config.SECTION_AWS_REGION))
        self.profile = (env.get("AWS_PROFILE")
                        or config.get(Config.SECTION_AWS_PROFILE))

        self.endpoint_url = config.get(Config.SECTION_AWS_ENDPOINT_URL)
        self.list_objects = config.get(Config.SECTION_AWS_LIST_OBJECTS)
        self.use_ssl = config.get(Config.SECTION_AWS_USE_SSL, True)

        credentials_file = config.get(Config.SECTION_AWS_CREDENTIALPATH)
        if credentials_file:
            env.setdefault("AWS_SHARED_CREDENTIALS_FILE", credentials_file)

        location = urlparse(self.url)
        self.bucket = location.netloc
        self.prefix = location.path.lstrip("/")

        self.path_info = PathS3(bucket=self.bucket)
Exemplo n.º 23
0
def _make_repo(repo_url, rev=None):
    """Yield a repo object for ``repo_url``.

    An empty ``repo_url`` or one without a URL scheme is opened directly
    with ``Repo``; ``rev`` must be ``None`` in that case. Anything else is
    opened through ``external_repo`` and yielded from inside its context.

    NOTE(review): this is a single-yield generator, presumably wrapped by
    a context-manager decorator outside this snippet - confirm.
    """
    has_scheme = bool(repo_url) and urlparse(repo_url).scheme != ""
    if has_scheme:
        with external_repo(url=repo_url, rev=rev) as repo:
            yield repo
        return

    assert rev is None, "Custom revision is not supported for local repo"
    yield Repo(repo_url)
Exemplo n.º 24
0
    def _parse_path(self, remote, path):
        """Map ``path`` inside the package repo to that output's cache path.

        The single output found at ``path`` (relative to the package repo
        root) is looked up; copies of its ``info`` and of its stage path
        are stored on ``self.info`` and ``self._pkg_stage``.

        Args:
            remote: not used by this method.
            path: location of the output; only its path component is used.

        Returns:
            ``self.REMOTE.path_cls`` built from the output's cache path.
        """
        root = self.pkg.repo.root_dir
        target = os.path.join(root, urlparse(path).path.lstrip("/"))

        # Exactly one output is expected; anything else fails to unpack.
        [found] = self.pkg.repo.find_outs_by_path(target)

        self.info = copy.copy(found.info)
        self._pkg_stage = copy.copy(found.stage.path)
        return self.REMOTE.path_cls(found.cache_path)
Exemplo n.º 25
0
def resolve_output(inp, out):
    """Pick the output path for an input URL or path.

    Args:
        inp: input URL or path; the basename of its path component is the
            default file name.
        out: requested output path, possibly empty or ``None``.

    Returns:
        The default file name when ``out`` is empty, that name joined onto
        ``out`` when ``out`` is an existing directory, and ``out`` itself
        otherwise.
    """
    from dvc.utils.compat import urlparse

    name = os.path.basename(urlparse(inp).path)
    if out and os.path.isdir(out):
        return os.path.join(out, name)
    return out or name
Exemplo n.º 26
0
    def supported(cls, config):
        """Tell whether ``config`` points at a URL with this class's scheme.

        Args:
            config: either the URL itself as a string, or a mapping holding
                it under ``Config.SECTION_REMOTE_URL``.

        Returns:
            ``True`` when the scheme of the URL equals ``cls.scheme``.
        """
        if not isinstance(config, basestring):
            url = config[Config.SECTION_REMOTE_URL]
        else:
            url = config

        # NOTE: a non-matching remote is skipped silently here; the calling
        # code is expected to handle that case.
        return urlparse(url).scheme == cls.scheme
Exemplo n.º 27
0
Arquivo: get_url.py Projeto: yk/dvc
def get_url(url, out=None):
    """Download ``url`` to ``out`` through the dependency/output loaders.

    Args:
        url: source location. If it exists as a local path, both it and
            the destination are converted to absolute paths first.
        out: destination path; defaults to the basename of the URL path.
    """
    target = out or os.path.basename(urlparse(url).path)

    if os.path.exists(url):
        url = os.path.abspath(url)
        target = os.path.abspath(target)

    # Each loader is given one path, so exactly one object is expected.
    [dep] = dependency.loads_from(None, [url])
    [destination] = output.loads_from(None, [target], use_cache=False)
    dep.download(destination)
Exemplo n.º 28
0
    def get_settings(self, name):
        """Return the settings of the remote called ``name``.

        The lookup uses the lower-cased name. A remote whose URL has the
        ``remote`` scheme refers to another remote; it is resolved
        recursively and the two sets of settings are merged.

        Args:
            name (str): The name of the remote that we want to retrieve

        Returns:
            dict: The content beneath the given remote name.

        Raises:
            ConfigError: if there is no section for the given remote.

        Example:
            >>> config = {'remote "server"': {'url': 'ssh://localhost/'}}
            >>> get_settings("server")
            {'url': 'ssh://localhost/'}
        """
        section = Config.SECTION_REMOTE_FMT.format(name.lower())
        settings = self.config.config.get(section)

        if settings is None:
            raise ConfigError(
                "unable to find remote section '{}'".format(name)
            )

        parsed = urlparse(settings["url"])
        if parsed.scheme != "remote":
            return settings

        # Cross-referenced remote: start from the referenced remote's
        # settings and let the referencing remote override them.
        # For example, after:
        #
        #       dvc remote add server ssh://localhost
        #       dvc remote modify server user root
        #       dvc remote modify server ask_password true
        #
        #       dvc remote add images remote://server/tmp/pictures
        #       dvc remote modify images user alice
        #       dvc remote modify images ask_password false
        #
        # the settings of "images" carry its own "user" and "ask_password"
        # values, and the "url" becomes "ssh://localhost/tmp/pictures".
        reference = self.get_settings(parsed.netloc)
        resolved_url = posixpath.join(
            reference["url"], parsed.path.lstrip("/")
        )

        merged = reference.copy()
        merged.update(settings)
        merged["url"] = resolved_url
        return merged
Exemplo n.º 29
0
def _make_repo(repo_url):
    """Yield a repo object for ``repo_url``.

    An empty ``repo_url`` or one without a URL scheme is opened directly
    with ``Repo``. Anything else is installed as an ``ExternalRepo`` into
    a fresh temporary directory, which is removed once the generator is
    resumed or closed, and also if the installation fails.

    NOTE(review): this is a single-yield generator, presumably wrapped by
    a context-manager decorator outside this snippet - confirm.
    """
    is_local = not repo_url or urlparse(repo_url).scheme == ""
    if is_local:
        yield Repo(repo_url)
        return

    # The first positional argument of mkdtemp is the *suffix* of the
    # directory name; it is spelled out here to make that visible.
    scratch = tempfile.mkdtemp(suffix="dvc-repo")
    try:
        external = ExternalRepo(scratch, url=repo_url)
        external.install()
        yield external.repo
    finally:
        remove(scratch)
Exemplo n.º 30
0
def _get(stage, p, info):
    """Instantiate the dependency class that handles path ``p``.

    ``remote://<name>/...`` paths are resolved through the named remote
    section of the project config and the class registered for that
    remote's scheme. Otherwise the first class in ``DEPS`` whose
    ``supported(p)`` is true is used, with ``DependencyLOCAL`` as the
    fallback.

    Args:
        stage: stage that owns the dependency.
        p: dependency path or URL.
        info: forwarded to the dependency class.

    Returns:
        The constructed dependency object.
    """
    url = urlparse(p)
    if url.scheme == 'remote':
        section_name = Config.SECTION_REMOTE_FMT.format(url.netloc)
        section = stage.project.config.config[section_name]
        remote = Remote(stage.project, section)
        return DEP_MAP[remote.scheme](stage, p, info, remote=remote)

    # The generator stops at the first class that supports the path.
    dep_cls = next((d for d in DEPS if d.supported(p)), DependencyLOCAL)
    return dep_cls(stage, p, info)