def _add_from_url(self, dataset, url, destination, extract, progress=None):
    """Process an add from url and return the location on disk."""
    if destination.exists() and destination.is_dir():
        u = parse.urlparse(url)
        destination = destination / Path(u.path).name

    try:
        paths = _download(
            url=url,
            download_to=destination,
            extract=extract,
            progress_class=progress
        )
    except error.HTTPError as e:  # pragma nocover
        raise errors.OperationError(
            'Cannot download from {}'.format(url)
        ) from e

    # make the added files read-only
    for path in paths:
        mode = path.stat().st_mode & 0o777
        path.chmod(mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))

    return [{
        'path': path.relative_to(self.path),
        'url': remove_credentials(url),
        'creator': dataset.creator,
        'parent': self
    } for path in paths]

def _add_from_url(self, dataset, url, destination, extract, filename=None, progress=None):
    """Process adding from url and return the location on disk."""
    url = self._provider_check(url)

    try:
        start = time.time() * 1e3

        tmp_root, paths = self._download(
            url=url, filename=filename, extract=extract, progress_class=progress
        )

        exec_time = (time.time() * 1e3 - start) // 1e3
        # If the download took less than a second, block the thread
        # a bit to avoid being rate limited.
        if exec_time == 0:
            time.sleep(min(os.cpu_count() - 1, 4) or 1)
    except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
        raise errors.OperationError("Cannot download from {}".format(url)) from e

    paths = [
        (src, destination / src.relative_to(tmp_root))
        for src in paths
        if not src.is_dir()
    ]

    return [
        {
            "operation": (src, dst, "move"),
            "path": dst.relative_to(self.path),
            "source": remove_credentials(url),
            "parent": self,
        }
        for src, dst in paths
    ]

def _add_from_url(self, dataset, url, destination, extract, progress=None):
    """Process adding from url and return the location on disk."""
    if destination.exists() and destination.is_dir():
        u = parse.urlparse(url)
        destination = destination / Path(u.path).name
    else:
        destination.parent.mkdir(parents=True, exist_ok=True)

    url = self.provider_check(url)

    try:
        start = time.time() * 1e+3

        paths = _download(
            url=url,
            download_to=destination,
            extract=extract,
            progress_class=progress
        )

        exec_time = (time.time() * 1e+3 - start) // 1e+3
        # If the download took less than a second, block the thread
        # a bit to avoid being rate limited.
        if exec_time == 0:
            time.sleep(min(os.cpu_count() - 1, 4) or 1)
    except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
        raise errors.OperationError(
            'Cannot download from {}'.format(url)
        ) from e

    # make the added files read-only
    for path in paths:
        mode = path.stat().st_mode & 0o777
        path.chmod(mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))

    return [{
        'path': path.relative_to(self.path),
        'url': remove_credentials(url),
        'creator': dataset.creator,
        'parent': self
    } for path in paths]

def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)

    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(
            path, repo_path, sources, destination, used_sources
        )
        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError(
            'No such file or directory', param_hint=unused_sources
        )

    if destination.exists() and not destination.is_dir():
        if len(files) > 1:
            raise errors.ParameterError(
                'Cannot copy multiple files or directories to a file'
            )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(
                    remote_client, path=path, url=url, creator=creators
                )

            path_in_dst_repo = dst.relative_to(self.path)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, 'symlink')
            else:
                operation = (src, dst, 'copy')

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on,
                'operation': operation
            })

    return results

def _migrate_submodule_based_datasets(client):
    from renku.core.management import LocalClient
    from renku.core.management.migrate import is_project_unsupported

    submodules = client.repo.submodules

    if not submodules:
        return

    for s in submodules:
        try:
            s.update()
        except GitError:
            pass

    submodules_urls = {s.path: s.url for s in submodules}

    repo_paths = []
    symlinks = []

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            path = client.path / file_.path
            if not path.is_symlink():
                continue

            target = path.resolve()

            if '/.renku/vendors/' not in str(target):
                continue

            repo = Repo(target.parent, search_parent_directories=True)
            repo_path = repo.working_dir
            if repo_path not in repo_paths:
                repo_paths.append(repo_path)

            symlinks.append((file_.path, target, repo_path))

    if not symlinks:
        return

    remote_clients = {p: LocalClient(p) for p in repo_paths}

    for remote_client in remote_clients.values():
        if not is_project_unsupported(remote_client):
            migrate(remote_client)

    metadata = {}

    for path, target, repo_path in symlinks:
        remote_client = remote_clients[repo_path]
        path_within_repo = target.relative_to(repo_path)

        repo_is_remote = '.renku/vendors/local' not in repo_path
        based_on = None
        submodule_path = Path(repo_path).relative_to(client.path)

        url = submodules_urls.get(str(submodule_path), '')

        if repo_is_remote:
            based_on = _fetch_file_metadata(remote_client, path_within_repo)
            if based_on:
                based_on.url = url
                based_on.based_on = None
            else:
                based_on = DatasetFile.from_revision(
                    remote_client, path=path_within_repo, url=url
                )
        else:
            if url:
                full_path = Path(url) / path_within_repo
                rel_path = os.path.relpath(full_path, client.path)
                url = f'file://{rel_path}'

        metadata[path] = (based_on, url)

        path = client.path / path
        path.unlink()

        try:
            shutil.move(target, path)
        except FileNotFoundError:
            raise errors.InvalidFileOperation(f'File was not found: {target}')

    for s in submodules:
        if s.path.startswith('.renku/vendors/'):
            try:
                s.remove(force=True)
            except ValueError:
                pass

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            if file_.path in metadata:
                based_on, url = metadata[file_.path]
                file_.based_on = based_on
                file_.url = remove_credentials(url)

        dataset.to_yaml()

def commit(self, commit_only=None, commit_empty=True, raise_if_empty=False, commit_message=None):
    """Automatic commit."""
    from git import Actor
    from renku.version import __version__, version_url

    diff_before = set()

    if commit_only == COMMIT_DIFF_STRATEGY:
        staged = {item.a_path for item in self.repo.index.diff(None)}
        modified = {item.a_path for item in self.repo.index.diff('HEAD')}

        if staged or modified:
            self.repo.git.reset()

        # Exclude files created by pipes.
        diff_before = {
            file_
            for file_ in self.repo.untracked_files
            if STARTED_AT - int(Path(file_).stat().st_ctime * 1e3) >= 1e3
        }

    if isinstance(commit_only, list):
        for path_ in commit_only:
            self.ensure_untracked(str(path_))
            self.ensure_unstaged(str(path_))

    yield

    committer = Actor('renku {0}'.format(__version__), version_url)

    change_types = {}

    if commit_only == COMMIT_DIFF_STRATEGY:
        # Get diff generated in command.
        change_types = {
            item.a_path: item.change_type
            for item in self.repo.index.diff(None)
        }
        staged_after = set(change_types.keys())

        modified_after_change_types = {
            item.a_path: item.change_type
            for item in self.repo.index.diff('HEAD')
        }
        modified_after = set(modified_after_change_types.keys())

        change_types.update(modified_after_change_types)

        diff_after = set(self.repo.untracked_files) \
            .union(staged_after) \
            .union(modified_after)

        # Remove files not touched in command.
        commit_only = list(diff_after - diff_before)

    if isinstance(commit_only, list):
        for path_ in commit_only:
            p = Path(path_)
            if p.exists() or change_types.get(path_) == 'D':
                self.repo.git.add(path_)

    if not commit_only:
        self.repo.git.add('--all')

    if not commit_empty and not self.repo.index.diff('HEAD'):
        if raise_if_empty:
            raise errors.NothingToCommit()
        return

    if commit_message and not isinstance(commit_message, str):
        raise errors.CommitMessageEmpty()
    elif not commit_message:
        argv = [os.path.basename(sys.argv[0])] + [
            remove_credentials(arg) for arg in sys.argv[1:]
        ]
        commit_message = ' '.join(argv)

    # Ignore pre-commit hooks since we have already done everything.
    self.repo.index.commit(
        commit_message,
        committer=committer,
        skip_hooks=True,
    )

def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

        for file_ in files:
            if file_.size_in_mb is not None:
                total_size += file_.size_in_mb

        total_size *= 2**20
    except KeyError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        )
    except LookupError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Reason: {1}'.format(uri, str(e))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin('https://doi.org', dataset.identifier)
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )

def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self._prepare_git_repo(url, ref)

    copied_sources = set()
    files = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination)
        if result:
            files.add(result)
            source = result[3]
            copied_sources.add(source)

    uncopied_sources = sources - copied_sources
    if uncopied_sources:
        uncopied_sources = {str(s) for s in uncopied_sources}
        raise errors.ParameterError(
            'No such file or directory', param_hint=uncopied_sources
        )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _, __ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            path = str(src.resolve().relative_to(repo_path))
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst, _ in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(
                    remote_client, path=path, url=url
                )

            path_in_dst_repo = dst.relative_to(self.path)

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on
            })

            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(src), str(dst))

    return results

def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                )
            )

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (
                    WARNING
                    + "Newer version found at {}\n".format(record.links.get("latest_html"))
                    + text_prompt
                )

            click.confirm(text_prompt, abort=True)

        for file_ in files:
            if file_.size_in_mb is not None:
                total_size += file_.size_in_mb

        total_size *= 2**20
    except KeyError as e:
        raise ParameterError(
            "Could not process {0}.\n"
            "Unable to fetch metadata due to {1}".format(uri, e)
        )
    except LookupError as e:
        raise ParameterError(
            "Could not process {0}.\n"
            "Reason: {1}".format(uri, str(e))
        )

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin("https://doi.org", dataset.identifier)
            )

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub("[^a-zA-Z0-9.-_]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version)
            )
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}"
            )

        sources = [f"{dataset.data_dir}/**"]

        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:
                # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )

def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset = record.as_dataset(client)
        files = dataset.files

        if with_prompt:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    ))
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)
    except KeyError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        )
    except LookupError:
        raise ParameterError(
            'Could not process {0}.\n'
            'URI not found.'.format(uri)
        )

    if files:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        dataset.url = remove_credentials(dataset.url)

        add_to_dataset(
            client,
            urls=[f.url for f in files],
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            progress=progress,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )

def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)

    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(
            path, repo_path, sources, destination, used_sources
        )
        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError(
            "No such file or directory", param_hint=unused_sources
        )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    new_files = []
    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                based_on.source = url
            else:
                based_on = DatasetFile.from_revision(
                    remote_client, path=src, url=url, source=url
                )

            path_in_dst_repo = dst.relative_to(self.path)

            if path_in_dst_repo in new_files:
                # A path with the same destination is already copied
                continue

            new_files.append(path_in_dst_repo)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, "symlink")
            else:
                operation = (src, dst, "copy")

            results.append(
                {
                    "path": path_in_dst_repo,
                    "source": remove_credentials(url),
                    "parent": self,
                    "based_on": based_on,
                    "operation": operation,
                }
            )

    return results