def get(url, path, out=None, rev=None): out = out or os.path.basename(urlparse(path).path) # Creating a directory right beside the output to make sure that they # are on the same filesystem, so we could take the advantage of # reflink and/or hardlink. Not using tempfile.TemporaryDirectory # because it will create a symlink to tmpfs, which defeats the purpose # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) erepo = ExternalRepo(tmp_dir, url=url, rev=rev) try: erepo.install() # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are # done, and to make that work we would have to copy data over # anyway before removing the cache, so we might just copy it # right away. # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. erepo.repo.config.set( Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "reflink,hardlink,copy", ) src = os.path.join(erepo.path, urlparse(path).path.lstrip("/")) o, = erepo.repo.find_outs_by_path(src) erepo.repo.fetch(o.stage.path) o.path_info = PathInfo(os.path.abspath(out)) with o.repo.state: o.checkout() finally: erepo.uninstall()
def __init__(self, project, config):
    """Initialize the Azure remote from its config section.

    Resolves the container name and connection string from the remote
    URL, legacy regex-matched URL fields, or environment variables,
    in that priority order.

    Raises:
        ValueError: if no container name or connection string is found.
    """
    super(RemoteAzure, self).__init__(project, config)

    self.url = config.get(Config.SECTION_REMOTE_URL, 'azure://')

    match = re.match(self.REGEX, self.url)  # backward compatibility
    path = match.group('path')

    self.bucket = (
        urlparse(self.url if path else '').netloc
        or match.group('container_name')  # backward compatibility
        or os.getenv('AZURE_STORAGE_CONTAINER_NAME'))

    self.prefix = urlparse(self.url).path.lstrip('/') if path else ''

    self.connection_string = (
        config.get(Config.SECTION_AZURE_CONNECTION_STRING)
        or match.group('connection_string')  # backward compatibility
        or os.getenv('AZURE_STORAGE_CONNECTION_STRING'))

    if not self.bucket:
        raise ValueError('azure storage container name missing')

    if not self.connection_string:
        raise ValueError('azure storage connection string missing')

    # Lazily-created blob service client (presumably built on first use
    # by a property elsewhere — not visible in this chunk).
    self.__blob_service = None

    self.path_info = {'scheme': self.scheme, 'bucket': self.bucket}
def __init__(self, stage, path, info=None, remote=None, cache=True,
             metric=False):
    """Initialize an SSH output.

    Host/port/user come from `remote` when one is given, otherwise they
    are parsed out of the `ssh://` URL in `path` (with defaults from
    RemoteSSH.DEFAULT_PORT and the current login user).
    """
    super(OutputSSH, self).__init__(stage, path, info=info, remote=remote,
                                    cache=cache, metric=metric)
    parsed = urlparse(path)
    host = remote.host if remote else parsed.hostname
    port = remote.port if remote else (parsed.port or RemoteSSH.DEFAULT_PORT)
    user = remote.user if remote else (parsed.username or getpass.getuser())

    if remote:
        # Paths on a remote are relative to its configured prefix.
        path = posixpath.join(remote.prefix,
                              urlparse(path).path.lstrip('/'))
    else:
        path = parsed.path

    self.path_info = {'scheme': 'ssh',
                      'host': host,
                      'port': port,
                      'user': user,
                      'path': path}
def __init__(self, repo, config):
    """Initialize the Azure remote from its config section.

    Resolves the container name and connection string from the remote
    URL, legacy regex-matched URL fields, or environment variables,
    in that priority order.

    Raises:
        ValueError: if no container name or connection string is found.
    """
    super(RemoteAzure, self).__init__(repo, config)

    self.url = config.get(Config.SECTION_REMOTE_URL, "azure://")

    match = re.match(self.REGEX, self.url)  # backward compatibility
    path = match.group("path")

    self.bucket = (
        urlparse(self.url if path else "").netloc
        or match.group("container_name")  # backward compatibility
        or os.getenv("AZURE_STORAGE_CONTAINER_NAME"))

    self.prefix = urlparse(self.url).path.lstrip("/") if path else ""

    self.connection_string = (
        config.get(Config.SECTION_AZURE_CONNECTION_STRING)
        or match.group("connection_string")  # backward compatibility
        or os.getenv("AZURE_STORAGE_CONNECTION_STRING"))

    if not self.bucket:
        raise ValueError("azure storage container name missing")

    if not self.connection_string:
        raise ValueError("azure storage connection string missing")

    # Lazily-created blob service client (presumably built on first use
    # by a property elsewhere — not visible in this chunk).
    self.__blob_service = None

    self.path_info = {"scheme": self.scheme, "bucket": self.bucket}
def __init__(
    self,
    stage,
    path,
    info=None,
    remote=None,
    cache=True,
    metric=False,
    persist=False,
    tags=None,
):
    """Initialize an S3 output.

    The bucket comes from `remote` when one is given, otherwise from the
    netloc of the `s3://` URL in `path`; the key is the URL path, joined
    under the remote's prefix when a remote is used.
    """
    super(OutputS3, self).__init__(
        stage,
        path,
        info=info,
        remote=remote,
        cache=cache,
        metric=metric,
        persist=persist,
        tags=tags,
    )
    bucket = remote.bucket if remote else urlparse(path).netloc
    path = urlparse(path).path.lstrip("/")
    if remote:
        path = posixpath.join(remote.prefix, path)
    self.path_info = {
        "scheme": self.scheme,
        "bucket": bucket,
        "path": path,
    }
def __init__(self, stage, path, info=None, remote=None, cache=True,
             metric=False):
    """Initialize an SSH output.

    Host/port/user come from `remote` when one is given, otherwise they
    are parsed out of the `ssh://` URL in `path` (with defaults from
    RemoteSSH.DEFAULT_PORT and the current login user).
    """
    super(OutputSSH, self).__init__(stage, path, info=info, remote=remote,
                                    cache=cache, metric=metric)
    parsed = urlparse(path)
    host = remote.host if remote else parsed.hostname
    port = (remote.port
            if remote
            else (parsed.port or RemoteSSH.DEFAULT_PORT))
    user = (remote.user
            if remote
            else (parsed.username or getpass.getuser()))

    if remote:
        # Paths on a remote are relative to its configured prefix.
        path = posixpath.join(remote.prefix,
                              urlparse(path).path.lstrip("/"))
    else:
        path = parsed.path

    self.path_info = {
        "scheme": "ssh",
        "host": host,
        "port": port,
        "user": user,
        "path": path,
    }
def __init__(self, stage, path, info=None, remote=None):
    """Initialize an HTTP dependency.

    A path starting with "remote" is resolved against the remote's
    cache directory before the scheme is recorded.
    """
    super(DependencyHTTP, self).__init__(
        stage, path, info=info, remote=remote
    )

    # `remote://...` style references live under the remote's cache dir.
    if path.startswith("remote"):
        path = urljoin(self.remote.cache_dir, urlparse(path).path)

    scheme = urlparse(path).scheme
    self.path_info = {"scheme": scheme, "path": path}
def __init__(self, stage, path, info=None, remote=None):
    """Initialize an HTTP dependency.

    A path starting with "remote" is resolved against the remote's
    cache directory before the scheme is recorded.
    """
    super(DependencyHTTP, self).__init__(
        stage, path, info=info, remote=remote
    )

    # `remote://...` style references live under the remote's cache dir.
    if path.startswith('remote'):
        path = urljoin(self.remote.cache_dir, urlparse(path).path)

    scheme = urlparse(path).scheme
    self.path_info = {
        'scheme': scheme,
        'path': path,
    }
def list_cache_paths(self):
    """Yield the path of every file under the remote's cache root.

    Walks the directory tree iteratively (no recursion) using the HDFS
    client's `ls`; directories are queued for later expansion.
    """
    if not self.exists(self.path_info):
        return

    pending = deque([self.path_info.path])
    with self.hdfs(self.path_info) as hdfs:
        while pending:
            current = pending.pop()
            for entry in hdfs.ls(current, detail=True):
                kind = entry["kind"]
                entry_path = urlparse(entry["name"]).path
                if kind == "file":
                    yield entry_path
                elif kind == "directory":
                    pending.append(entry_path)
def __init__(
    self,
    stage,
    path,
    info=None,
    remote=None,
    cache=True,
    metric=False,
    persist=False,
    tags=None,
):
    """Initialize a local output.

    Resolves the output path to an absolute, normalized OS path:
    relative paths are converted to the OS path convention and anchored
    at the stage's working directory.
    """
    super(OutputLOCAL, self).__init__(
        stage,
        path,
        info,
        remote=remote,
        cache=cache,
        metric=metric,
        persist=persist,
        tags=tags,
    )
    if remote:
        # Remote-relative outputs live under the remote's prefix.
        p = os.path.join(remote.prefix, urlparse(self.url).path.lstrip("/"))
    else:
        p = path

    if not os.path.isabs(p):
        p = self.remote.to_ospath(p)
        p = os.path.join(stage.wdir, p)

    p = os.path.abspath(os.path.normpath(p))

    self.path_info = PathLOCAL(url=self.url, path=p)
def __init__(self, repo, config):
    """Initialize the SSH remote from its config section.

    Host, user, port and keyfile are resolved with the following
    priority: explicit DVC remote config, values parsed from the URL,
    the user's ssh_config entry for the host, then built-in defaults.
    """
    super(RemoteSSH, self).__init__(repo, config)
    self.url = config.get(Config.SECTION_REMOTE_URL, "ssh://")

    parsed = urlparse(self.url)
    self.host = parsed.hostname

    user_ssh_config = self._load_user_ssh_config(self.host)

    # ssh_config may alias the hostname (HostName directive).
    self.host = user_ssh_config.get("hostname", self.host)
    self.user = (config.get(Config.SECTION_REMOTE_USER)
                 or parsed.username
                 or user_ssh_config.get("user")
                 or getpass.getuser())
    self.prefix = parsed.path or "/"
    self.port = (config.get(Config.SECTION_REMOTE_PORT)
                 or parsed.port
                 or self._try_get_ssh_config_port(user_ssh_config)
                 or self.DEFAULT_PORT)
    self.keyfile = config.get(
        Config.SECTION_REMOTE_KEY_FILE
    ) or self._try_get_ssh_config_keyfile(user_ssh_config)
    self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
    self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
    self.ask_password = config.get(Config.SECTION_REMOTE_ASK_PASSWORD, False)

    self.path_info = PathSSH(host=self.host, user=self.user, port=self.port)
def __init__(self, repo, config):
    """Initialize the SSH remote from its config section.

    Host, user and port are resolved with the following priority:
    explicit DVC remote config, values parsed from the URL, then
    built-in defaults (current login user, DEFAULT_PORT).
    """
    super(RemoteSSH, self).__init__(repo, config)
    self.url = config.get(Config.SECTION_REMOTE_URL, "ssh://")

    parsed = urlparse(self.url)
    self.host = parsed.hostname
    self.user = (
        config.get(Config.SECTION_REMOTE_USER)
        or parsed.username
        or getpass.getuser()
    )
    self.prefix = parsed.path or "/"
    self.port = (
        config.get(Config.SECTION_REMOTE_PORT)
        or parsed.port
        or self.DEFAULT_PORT
    )
    self.keyfile = config.get(Config.SECTION_REMOTE_KEY_FILE, None)
    self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
    self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
    self.ask_password = config.get(
        Config.SECTION_REMOTE_ASK_PASSWORD, False
    )

    self.path_info = {
        "scheme": "ssh",
        "host": self.host,
        "user": self.user,
        "port": self.port,
    }
def imp_url(
    self, url, out=None, resume=False, fname=None, erepo=None, locked=False
):
    """Create and run a stage that imports `url` as output `out`.

    Args:
        url: source location; becomes the stage's sole dependency.
        out: output name; defaults to the basename of the URL path.
        resume: passed to `stage.run` to resume a partial download.
        fname: optional stage-file name.
        erepo: optional external-repo spec for the dependency.
        locked: whether to mark the created stage as locked.

    Returns:
        The created Stage, or None if `Stage.create` produced none.
    """
    from dvc.stage import Stage

    default_out = os.path.basename(urlparse(url).path)
    out = out or default_out

    with self.state:
        stage = Stage.create(
            repo=self,
            cmd=None,
            deps=[url],
            outs=[out],
            fname=fname,
            erepo=erepo,
        )

    if stage is None:
        return None

    # Validate that adding this stage keeps the pipeline a DAG.
    self.check_dag(self.stages() + [stage])

    with self.state:
        stage.run(resume=resume)

    stage.locked = locked
    stage.dump()

    return stage
def __init__(self, repo, config):
    """Initialize the Aliyun OSS remote from its config section.

    Endpoint and credentials fall back to the OSS_* environment
    variables, and finally to placeholder defaults for the keys.
    """
    super(RemoteOSS, self).__init__(repo, config)
    self.url = config.get(Config.SECTION_REMOTE_URL)

    parsed = urlparse(self.url)
    self.bucket = parsed.netloc
    self.prefix = parsed.path.lstrip("/")

    self.endpoint = config.get(Config.SECTION_OSS_ENDPOINT) or os.getenv(
        "OSS_ENDPOINT"
    )
    self.key_id = (
        config.get(Config.SECTION_OSS_ACCESS_KEY_ID)
        or os.getenv("OSS_ACCESS_KEY_ID")
        or "defaultId"
    )
    self.key_secret = (
        config.get(Config.SECTION_OSS_ACCESS_KEY_SECRET)
        or os.getenv("OSS_ACCESS_KEY_SECRET")
        or "defaultSecret"
    )
    # Lazily-created bucket client (presumably built on first use by a
    # property elsewhere — not visible in this chunk).
    self._bucket = None
    self.path_info = PathOSS(bucket=self.bucket)
def __init__(self, project, config):
    """Initialize the S3 remote from its config section.

    Region/profile prefer the AWS_* environment variables over the DVC
    config; a shared-credentials path, when configured, is exported via
    AWS_SHARED_CREDENTIALS_FILE (only if not already set).
    """
    super(RemoteS3, self).__init__(project, config)

    storagepath = 's3://{}'.format(
        config.get(Config.SECTION_AWS_STORAGEPATH, '').lstrip('/'))
    self.url = config.get(Config.SECTION_REMOTE_URL, storagepath)
    self.region = (os.environ.get('AWS_DEFAULT_REGION')
                   or config.get(Config.SECTION_AWS_REGION))
    self.profile = (os.environ.get('AWS_PROFILE')
                    or config.get(Config.SECTION_AWS_PROFILE))
    self.endpoint_url = config.get(Config.SECTION_AWS_ENDPOINT_URL)
    self.use_ssl = config.get(Config.SECTION_AWS_USE_SSL, True)

    shared_creds = config.get(Config.SECTION_AWS_CREDENTIALPATH)
    if shared_creds:
        os.environ.setdefault('AWS_SHARED_CREDENTIALS_FILE', shared_creds)

    parsed = urlparse(self.url)
    self.bucket = parsed.netloc
    self.prefix = parsed.path.lstrip('/')
    self.path_info = {'scheme': self.scheme, 'bucket': self.bucket}
def __init__(
    self,
    stage,
    path,
    info=None,
    remote=None,
    cache=True,
    metric=False,
    persist=False,
    tags=None,
):
    """Initialize an HDFS output.

    When a remote is given, the path is joined under the remote's URL
    and the remote's user is used; otherwise the user is taken from the
    URL match groups (`self.group` — defined on the base class, not
    visible in this chunk).
    """
    super(OutputHDFS, self).__init__(
        stage,
        path,
        info=info,
        remote=remote,
        cache=cache,
        metric=metric,
        persist=persist,
        tags=tags,
    )
    if remote:
        path = posixpath.join(remote.url, urlparse(path).path.lstrip("/"))
    user = remote.user if remote else self.group("user")
    self.path_info["user"] = user
    self.path_info["path"] = path
def get(cls, url, src, out=None, version=None):
    """Download the output `src` from the package repo at `url` to `out`.

    Args:
        url: URL of the package repository.
        src: path of the target output inside that repository.
        out: local destination; defaults to the basename of `src`.
        version: optional package version to install.
    """
    if not out:
        out = os.path.basename(src)

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    with TemporaryDirectory(dir=dpath, prefix=".") as tmp_dir:
        pkg = Pkg(tmp_dir, url=url, version=version)
        pkg.install()
        # Try any links possible to avoid data duplication.
        #
        # Not using symlink, because we need to remove cache after we are
        # done, and to make that work we would have to copy data over
        # anyway before removing the cache, so we might just copy it
        # right away.
        #
        # Also, we can't use theoretical "move" link type here, because
        # the same cache file might be used a few times in a directory.
        pkg.repo.config.set(
            Config.SECTION_CACHE,
            Config.SECTION_CACHE_TYPE,
            "reflink,hardlink,copy",
        )
        src = os.path.join(pkg.path, urlparse(src).path.lstrip("/"))
        # Exactly one output is expected at `src`; tuple-unpack enforces it.
        output, = pkg.repo.find_outs_by_path(src)
        pkg.repo.fetch(output.stage.path)
        # Redirect the output to the requested local destination and
        # materialize it from the cache.
        output.path_info = PathInfo(os.path.abspath(out))
        with output.repo.state:
            output.checkout()
def __init__(self, repo, config):
    """Initialize the SSH remote from its config section.

    When a URL is configured, host/user/port are resolved with the
    priority: explicit DVC remote config, values parsed from the URL,
    the user's ssh_config entry, then built-in defaults. Without a URL,
    `path_info` is left as None.
    """
    super(RemoteSSH, self).__init__(repo, config)
    url = config.get(Config.SECTION_REMOTE_URL)
    if url:
        parsed = urlparse(url)
        user_ssh_config = self._load_user_ssh_config(parsed.hostname)

        # ssh_config may alias the hostname (HostName directive).
        host = user_ssh_config.get("hostname", parsed.hostname)
        user = (config.get(Config.SECTION_REMOTE_USER)
                or parsed.username
                or user_ssh_config.get("user")
                or getpass.getuser())
        port = (config.get(Config.SECTION_REMOTE_PORT)
                or parsed.port
                or self._try_get_ssh_config_port(user_ssh_config)
                or self.DEFAULT_PORT)
        self.path_info = self.path_cls.from_parts(
            scheme=self.scheme,
            host=host,
            user=user,
            port=port,
            path=parsed.path,
        )
    else:
        self.path_info = None
        user_ssh_config = {}

    self.keyfile = config.get(
        Config.SECTION_REMOTE_KEY_FILE
    ) or self._try_get_ssh_config_keyfile(user_ssh_config)
    self.timeout = config.get(Config.SECTION_REMOTE_TIMEOUT, self.TIMEOUT)
    self.password = config.get(Config.SECTION_REMOTE_PASSWORD, None)
    self.ask_password = config.get(Config.SECTION_REMOTE_ASK_PASSWORD, False)
def __init__(self, stage, path, info=None, remote=None, cache=True,
             metric=False):
    """Initialize a local output.

    Resolves the output path to an absolute, normalized OS path:
    relative paths are converted to the OS path convention and anchored
    at the stage's current working directory.
    """
    super(OutputLOCAL, self).__init__(stage, path, info,
                                      remote=remote,
                                      cache=cache,
                                      metric=metric)
    if remote:
        # Remote-relative outputs live under the remote's prefix.
        p = os.path.join(remote.prefix,
                         urlparse(self.url).path.lstrip("/"))
    else:
        p = path

    if not os.path.isabs(p):
        p = self.remote.to_ospath(p)
        p = os.path.join(stage.cwd, p)

    p = os.path.abspath(os.path.normpath(p))

    self.path_info = {"scheme": "local", "path": p}
def __init__(self, repo, config):
    """Initialize the Azure remote from its config section.

    Resolves the container name and connection string from the remote
    URL, legacy regex-matched URL fields, or environment variables,
    in that priority order.

    Raises:
        ValueError: if no container name or connection string is found.
    """
    super(RemoteAZURE, self).__init__(repo, config)

    url = config.get(Config.SECTION_REMOTE_URL, "azure://")

    match = re.match(self.REGEX, url)  # backward compatibility
    path = match.group("path")

    bucket = (
        urlparse(url if path else "").netloc
        or match.group("container_name")  # backward compatibility
        or os.getenv("AZURE_STORAGE_CONTAINER_NAME"))

    self.connection_string = (
        config.get(Config.SECTION_AZURE_CONNECTION_STRING)
        or match.group("connection_string")  # backward compatibility
        or os.getenv("AZURE_STORAGE_CONNECTION_STRING"))

    if not bucket:
        raise ValueError("azure storage container name missing")

    if not self.connection_string:
        raise ValueError("azure storage connection string missing")

    # Full URL with a path parses directly; otherwise build the info
    # from the resolved container name.
    self.path_info = (self.path_cls(url)
                      if path
                      else self.path_cls.from_parts(scheme=self.scheme,
                                                    netloc=bucket))
def _get(stage, p, info, cache, metric, persist):
    """Instantiate the output class matching path `p`.

    A `remote://name/...` path is dispatched through the named remote's
    scheme via OUTS_MAP; otherwise the first class in OUTS that supports
    `p` is used, falling back to OutputLOCAL.

    Returns:
        An output instance for `p`.
    """
    parsed = urlparse(p)

    if parsed.scheme == "remote":
        name = Config.SECTION_REMOTE_FMT.format(parsed.netloc)
        sect = stage.repo.config.config[name]
        remote = Remote(stage.repo, sect)
        return OUTS_MAP[remote.scheme](
            stage,
            p,
            info,
            cache=cache,
            remote=remote,
            metric=metric,
            persist=persist,
        )

    for o in OUTS:
        if o.supported(p):
            # BUG FIX: `persist` was previously dropped on this branch,
            # silently losing the flag for non-local outputs, while both
            # the remote-scheme branch and the LOCAL fallback forward it.
            return o(
                stage,
                p,
                info,
                cache=cache,
                remote=None,
                metric=metric,
                persist=persist,
            )
    return OutputLOCAL(
        stage,
        p,
        info,
        cache=cache,
        remote=None,
        metric=metric,
        persist=persist,
    )
def __init__(self, repo, config):
    """Initialize the S3 remote from its config section.

    Region/profile prefer the AWS_* environment variables over the DVC
    config; a shared-credentials path, when configured, is exported via
    AWS_SHARED_CREDENTIALS_FILE (only if not already set).
    """
    super(RemoteS3, self).__init__(repo, config)

    storagepath = "s3://{}".format(
        config.get(Config.SECTION_AWS_STORAGEPATH, "").lstrip("/"))
    self.url = config.get(Config.SECTION_REMOTE_URL, storagepath)
    self.region = os.environ.get("AWS_DEFAULT_REGION") or config.get(
        Config.SECTION_AWS_REGION)
    self.profile = os.environ.get("AWS_PROFILE") or config.get(
        Config.SECTION_AWS_PROFILE)
    self.endpoint_url = config.get(Config.SECTION_AWS_ENDPOINT_URL)
    self.list_objects = config.get(Config.SECTION_AWS_LIST_OBJECTS)
    self.use_ssl = config.get(Config.SECTION_AWS_USE_SSL, True)

    shared_creds = config.get(Config.SECTION_AWS_CREDENTIALPATH)
    if shared_creds:
        os.environ.setdefault("AWS_SHARED_CREDENTIALS_FILE", shared_creds)

    parsed = urlparse(self.url)
    self.bucket = parsed.netloc
    self.prefix = parsed.path.lstrip("/")
    self.path_info = PathS3(bucket=self.bucket)
def _make_repo(repo_url, rev=None):
    """Yield a Repo for `repo_url`.

    A falsy URL or one with no scheme is treated as a local repo (for
    which a custom `rev` is not supported); anything else is opened as
    an external repo for the duration of the generator.
    """
    is_local = not repo_url or urlparse(repo_url).scheme == ""
    if is_local:
        assert rev is None, "Custom revision is not supported for local repo"
        yield Repo(repo_url)
        return

    with external_repo(url=repo_url, rev=rev) as repo:
        yield repo
def _parse_path(self, remote, path):
    """Resolve `path` to an output in the package repo and return its
    cache path wrapped in the remote's path class.

    Side effects: records copies of the output's info and its stage path
    on `self`.
    """
    out_path = os.path.join(self.pkg.repo.root_dir,
                            urlparse(path).path.lstrip("/"))

    # Exactly one output is expected at `out_path`; tuple-unpack
    # enforces it.
    out, = self.pkg.repo.find_outs_by_path(out_path)

    self.info = copy.copy(out.info)
    self._pkg_stage = copy.copy(out.stage.path)

    return self.REMOTE.path_cls(out.cache_path)
def resolve_output(inp, out):
    """Pick the local destination for downloading `inp`.

    Returns `out` as-is when it is a non-directory path, joins the
    input's basename under it when it is an existing directory, and
    falls back to the input's basename when `out` is falsy.
    """
    from dvc.utils.compat import urlparse

    basename = os.path.basename(urlparse(inp).path)
    if not out:
        return basename
    return os.path.join(out, basename) if os.path.isdir(out) else out
def supported(cls, config):
    """Return True when `config` targets this remote's scheme.

    `config` is either a URL string or a remote config dict containing
    a URL under Config.SECTION_REMOTE_URL.
    """
    if isinstance(config, basestring):
        url = config
    else:
        url = config[Config.SECTION_REMOTE_URL]

    # NOTE: silently skipping remote, calling code should handle that
    return urlparse(url).scheme == cls.scheme
def get_url(url, out=None):
    """Download `url` to `out` (defaults to the URL path's basename)."""
    if not out:
        out = os.path.basename(urlparse(url).path)

    # A source that exists on disk is local: normalize both ends to
    # absolute paths before building the dependency/output pair.
    if os.path.exists(url):
        url = os.path.abspath(url)
        out = os.path.abspath(out)

    (dep,) = dependency.loads_from(None, [url])
    (out,) = output.loads_from(None, [out], use_cache=False)
    dep.download(out)
def get_settings(self, name):
    """
    Args:
        name (str): The name of the remote that we want to retrieve

    Returns:
        dict: The content beneath the given remote name.

    Raises:
        ConfigError: if no section exists for the given remote name.

    Example:
        >>> config = {'remote "server"': {'url': 'ssh://localhost/'}}
        >>> get_settings("server")
        {'url': 'ssh://localhost/'}
    """
    settings = self.config.config.get(
        Config.SECTION_REMOTE_FMT.format(name.lower())
    )

    if settings is None:
        raise ConfigError(
            "unable to find remote section '{}'".format(name)
        )

    parsed = urlparse(settings["url"])

    # Support for cross referenced remotes.
    # This will merge the settings, giving priority to the outer reference.
    # For example, having:
    #
    #       dvc remote add server ssh://localhost
    #       dvc remote modify server user root
    #       dvc remote modify server ask_password true
    #
    #       dvc remote add images remote://server/tmp/pictures
    #       dvc remote modify images user alice
    #       dvc remote modify images ask_password false
    #       dvc remote modify images password asdf1234
    #
    # Results on a config dictionary like:
    #
    #       {
    #           "url": "ssh://localhost/tmp/pictures",
    #           "user": "******",
    #           "password": "******",
    #           "ask_password": False,
    #       }
    #
    if parsed.scheme == "remote":
        # Recursively resolve the referenced remote, then overlay this
        # section's settings on top of it.
        reference = self.get_settings(parsed.netloc)
        url = posixpath.join(reference["url"], parsed.path.lstrip("/"))
        merged = reference.copy()
        merged.update(settings)
        merged["url"] = url
        return merged

    return settings
def _make_repo(repo_url):
    """Yield a Repo for `repo_url`.

    A falsy URL or one with no scheme is treated as a local repo;
    anything else is installed into a temporary directory as an
    external repo, which is removed when the generator finishes.
    """
    if not repo_url or urlparse(repo_url).scheme == "":
        yield Repo(repo_url)
    else:
        # NOTE(review): "dvc-repo" is passed as mkdtemp's *suffix*
        # (first positional arg) — possibly intended as a prefix; confirm.
        tmp_dir = tempfile.mkdtemp("dvc-repo")
        try:
            ext_repo = ExternalRepo(tmp_dir, url=repo_url)
            ext_repo.install()
            yield ext_repo.repo
        finally:
            remove(tmp_dir)
def _get(stage, p, info):
    """Instantiate the dependency class matching path `p`.

    A `remote://name/...` path is dispatched through the named remote's
    scheme via DEP_MAP; otherwise the first class in DEPS that supports
    `p` is used, falling back to DependencyLOCAL.
    """
    parsed = urlparse(p)
    if parsed.scheme == 'remote':
        name = Config.SECTION_REMOTE_FMT.format(parsed.netloc)
        sect = stage.project.config.config[name]
        remote = Remote(stage.project, sect)
        return DEP_MAP[remote.scheme](stage, p, info, remote=remote)

    for d in DEPS:
        if d.supported(p):
            return d(stage, p, info)
    return DependencyLOCAL(stage, p, info)