def _construct_creators(creators, ignore_email=False):
    """Build ``Person`` objects from creator strings/dicts.

    Returns a tuple of (people, creators that lacked an email). Raises
    ``errors.ParameterError`` on invalid input types or invalid names/emails.
    """
    from collections.abc import Iterable

    creators = creators or ()
    if isinstance(creators, str) or not isinstance(creators, Iterable):
        raise errors.ParameterError("Invalid type")

    message = 'A valid format is "Name <email> [affiliation]"'
    people = []
    no_email_warnings = []
    for entry in creators:
        if isinstance(entry, str):
            person = Person.from_string(entry)
        elif isinstance(entry, dict):
            person = Person.from_dict(entry)
        else:
            raise errors.ParameterError("Invalid type")

        if not person.name:  # pragma: no cover
            raise errors.ParameterError(f'Name is invalid: "{entry}".\n{message}')
        if not person.email:
            if ignore_email:
                # Caller opted to tolerate missing emails; report them instead.
                no_email_warnings.append(entry)
            else:  # pragma: no cover
                raise errors.ParameterError(f'Email is invalid: "{entry}".\n{message}')
        people.append(person)
    return people, no_email_warnings
def find_record(self, uri, client=None):
    """Retrieves a dataset from Renku.

    Resolves the dataset's knowledge-graph entries, clones the first
    project that can be reached (SSH preferred over HTTPS), migrates it,
    and locates the dataset inside it.

    :raises: ``LookupError``
    :param uri: URL
    :param client: client used to clone candidate projects
    :return: ``_RenkuRecordSerializer`` wrapping the found dataset
    """
    from renku.core.management import LocalClient

    same_as, kg_urls = self._get_dataset_info(uri)
    project_url = None
    failed_urls = []
    for kg_url in kg_urls:
        kg_datasets_url, ssh_url, https_url = self._get_project_urls(kg_url)
        # Check if the project contains the dataset
        if same_as is None:  # Dataset is in the project
            dataset_id = self._extract_dataset_id(uri)
        else:  # Dataset is sameAs one of the datasets in the project
            datasets = self._query_knowledge_graph(kg_datasets_url)
            ids = [ds["identifier"] for ds in datasets if ds["sameAs"] == same_as]
            if not ids:
                # This project does not contain the dataset; try the next one.
                continue
            dataset_id = ids[0]
        # Check if we can clone the project
        for url in (ssh_url, https_url):
            try:
                repo, repo_path = client.prepare_git_repo(url)
            except errors.GitError:
                failed_urls.append(url)
            else:
                project_url = url
                break
        if project_url is not None:
            break
    if project_url is None:
        if failed_urls:
            message = "Cannot clone remote projects:\n\t" + "\n\t".join(failed_urls)
        else:
            message = "Cannot find any project for the dataset."
        raise errors.ParameterError(message, param_hint=uri)

    # Migrate the cloned project before reading its datasets so the
    # metadata is in the current schema.
    remote_client = LocalClient(repo_path)
    self._migrate_project(remote_client)
    # Compare against the URL-quoted uid since dataset_id came from a URL.
    datasets = [d for d in remote_client.datasets.values() if urllib.parse.quote(d.uid, safe="") == dataset_id]
    if len(datasets) == 0:
        raise errors.ParameterError(
            'Cannot find dataset with id "{}" in project "{}"'.format(dataset_id, project_url)
        )
    if len(datasets) > 1:
        raise errors.ParameterError('Found multiple datasets with id "{}"'.format(dataset_id))
    return _RenkuRecordSerializer(datasets[0], project_url, remote_client)
def create_dataset(
    self, short_name=None, title=None, description=None, creators=None, keywords=None,
):
    """Create a dataset.

    Validates the short name, creates the metadata file under a fresh
    UUID, and registers a link reference for the dataset.
    """
    if not short_name:
        raise errors.ParameterError('Dataset short_name must be provided.')
    if not is_dataset_short_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset short_name "{}" is not valid.'.format(short_name))
    if self.load_dataset(short_name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    # The displayed title defaults to the short name.
    title = title or short_name

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))
    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        # Default to the current git author.
        creators = [Person.from_git(self.repo)]

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            short_name=short_name,
            name=title,
            description=description,
            creator=creators,
            keywords=keywords or (),
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.path = Path(dataset.path).relative_to(self.path)
    dataset.to_yaml()
    return dataset, path, dataset_ref
def _add_from_local(self, dataset, path, external, destination):
    """Add a file or directory from a local filesystem.

    Returns a list of file dicts describing what to add; directories are
    handled by recursing over their entries.
    """
    src = Path(os.path.abspath(path))
    if not src.exists():
        raise errors.ParameterError(f"Cannot find file/directory: {path}")

    dst = destination / src.name

    if src.is_dir():
        # Directory: validate and recurse into every entry.
        if dst.exists() and not dst.is_dir():
            raise errors.ParameterError(f'Cannot copy directory to a file: "{dst}"')
        if src == (self.path / dataset.data_dir).resolve():
            raise errors.ParameterError(f"Cannot add dataset's data directory recursively: {path}")
        if self._check_protected_path(src):
            raise errors.ProtectedFiles([src])
        collected = []
        for entry in src.iterdir():
            collected.extend(
                self._add_from_local(dataset=dataset, path=os.path.abspath(entry), external=external, destination=dst)
            )
        return collected

    # Single file: check whether it is already inside the project.
    path_in_repo = None
    if self._is_external_file(src):
        path_in_repo = path
    else:
        try:
            path_in_repo = src.relative_to(self.path)
        except ValueError:
            pass
        else:
            if self._check_protected_path(src):
                raise errors.ProtectedFiles([src])

    if path_in_repo:
        # File already lives in the repository; no copy/symlink needed.
        return [{"path": path_in_repo, "source": path_in_repo, "parent": self}]

    return [
        {
            "path": dst.relative_to(self.path),
            "source": os.path.relpath(str(src), str(self.path)),
            "parent": self,
            "operation": (src, dst, "symlink" if external else "copy"),
        }
    ]
def create_dataset(self, name, short_name=None, description='', creators=None):
    """Create a dataset.

    Derives a short name from ``name`` when none is given, validates it,
    and writes the dataset metadata under a fresh UUID.
    """
    if not name:
        raise errors.ParameterError('Dataset name must be provided.')

    # Derive a short name from the full name if the caller omitted one.
    short_name = short_name or generate_default_short_name(name, None)

    if not is_dataset_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset name "{}" is not valid.'.format(short_name))
    if self.load_dataset(name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))
    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        # Default to the current git author.
        creators = [Person.from_git(self.repo)]

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            name=name,
            short_name=short_name,
            description=description,
            creator=creators,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.to_yaml()
    return dataset, path, dataset_ref
def list_unpushed_lfs_paths(self, client=None):
    """List paths tracked in lfs for a client.

    :param client: client whose repository to inspect; defaults to ``self``.
    :return: list of absolute paths of LFS objects not yet pushed.
    :raises errors.ConfigurationError: if no remote/tracking branch is configured.
    :raises errors.ParameterError: if ``git lfs`` cannot be executed.
    """
    client = client or self
    if (len(client.repo.remotes) < 1
            or not client.repo.active_branch.tracking_branch()):
        # BUG FIX: the original message joined two sentences without a
        # separating space ("...branch master.Cleaning the storage...").
        raise errors.ConfigurationError(
            'No git remote is configured for {} branch {}. '.format(
                client.path, client.repo.active_branch.name) +
            'Cleaning the storage cache would lead to a loss of data as ' +
            'it is not on a server. Please see ' +
            'https://www.atlassian.com/git/tutorials/syncing for ' +
            'information on how to sync with a remote.')
    try:
        status = check_output(self._CMD_STORAGE_STATUS, cwd=client.path, encoding='UTF-8')
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError(
            'Couldn\'t run \'git lfs\':\n{0}'.format(e))
    # Keep only the section before "Objects to be committed:", drop the two
    # header lines, then strip the trailing "(size)" annotation per entry.
    files = status.split('Objects to be committed:')[0].splitlines()[2:]
    return [
        client.path / f.rsplit('(', 1)[0].strip()
        for f in files
        if f.strip()
    ]
def from_string(cls, string):
    """Create an instance from a 'Name <email>' string."""
    # Name is everything before '<'; the email group enforces a single '@'
    # and a dotted domain. Angle brackets are optional.
    pattern = r'([^<]*)<{0,1}([^@<>]+@[^@<>]+\.[^@<>]+)*>{0,1}'
    name, email = re.search(pattern, string).groups()
    name = name.rstrip()

    # Check the git configuration.
    if not name:  # pragma: no cover
        raise errors.ParameterError(
            'Name is invalid: A valid format is "Name <email>"')
    if not email:  # pragma: no cover
        raise errors.ParameterError(
            'Email is invalid: A valid format is "Name <email>"')

    return cls(name=name, email=email)
def short_name_validator(self, attribute, value):
    """Validate short_name."""
    # short_name might have been scaped and have '%' in it
    if not value:
        return
    if not is_dataset_short_name_valid(value):
        raise errors.ParameterError('Invalid "short_name": {}'.format(value))
def track_paths_in_storage(self, *paths):
    """Track paths in the external storage."""
    # Collect only paths not already routed through LFS by .gitattributes.
    attrs = self.find_attr(*paths)
    lfs_candidates = []
    for raw in paths:
        # Do not add files with filter=lfs in .gitattributes
        if attrs.get(raw, {}).get('filter') == 'lfs':
            continue
        candidate = Path(raw)
        if candidate.is_dir():
            # Track the whole subtree.
            lfs_candidates.append(str(candidate / '**'))
        elif candidate.suffix != '.ipynb':
            # TODO create configurable filter and follow .gitattributes
            lfs_candidates.append(str(candidate))

    if not lfs_candidates:
        return

    try:
        call(
            self._CMD_STORAGE_TRACK + lfs_candidates,
            stdout=PIPE,
            stderr=STDOUT,
            cwd=str(self.path),
        )
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError(
            'Couldn\'t run \'git lfs\':\n{0}'.format(e)
        )
def checkout(repo, ref):
    """Check out ``ref`` in the given git repository."""
    try:
        repo.git.checkout(ref)
    except GitCommandError:
        # NOTE(review): `url` is not defined in this function — presumably it
        # is captured from an enclosing scope; confirm, otherwise this
        # handler raises NameError instead of ParameterError.
        raise errors.ParameterError(
            'Cannot find reference "{}" in Git repository: {}'.format(ref, url))
def init_external_storage(self, force=False):
    """Initialize the external storage for data."""
    command = list(self._CMD_STORAGE_INSTALL)
    if force:
        command.append("--force")
    try:
        call(command, stdout=PIPE, stderr=STDOUT, cwd=self.path)
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
def untrack_paths_from_storage(self, *paths):
    """Untrack paths from the external storage."""
    try:
        call(
            [*self._CMD_STORAGE_UNTRACK, *paths],
            stdout=PIPE,
            stderr=STDOUT,
            cwd=self.path,
        )
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
def _resolve_path(self, root_path, path): """Check if a path is within a root path and resolve it.""" try: root_path = Path(root_path).resolve() return (root_path / path).resolve().relative_to(root_path) except ValueError: raise errors.ParameterError('File {} is not within path {}'.format( path, root_path))
def raise_template_error(value):
    """Raise template error with short explanation."""
    explanation = '\n'.join([
        '{0}'.format(value),
        'Tip: a dictionary is expected',
        ('Example: --template-variables '
         '\'{ "variable_1": "string", "variable_2": 2 }\''),
    ])
    raise errors.ParameterError(explanation, '"--template-variables"')
def list_tracked_paths(self, client=None):
    """List paths tracked in lfs for a client."""
    client = client or self
    try:
        output = check_output(self._CMD_STORAGE_LIST, cwd=client.path, encoding="UTF-8")
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
    # One tracked path per output line, made absolute against the client root.
    return [client.path / line for line in output.splitlines()]
def fmt_path(path):
    """Format path as relative to the client path."""
    absolute = os.path.abspath(client.path / path)
    try:
        return str(Path(absolute).relative_to(client.path))
    except ValueError:
        raise errors.ParameterError(f'File {absolute} is not within the project.')
def resolve_data_directory(data_dir, path):
    """Check data directory is within the project path.

    Returns the data directory relative to ``path``, or ``None`` when no
    data directory was given.
    """
    if not data_dir:
        return

    absolute = (Path(path) / data_dir).resolve()
    try:
        relative = absolute.relative_to(path)
    except ValueError:
        raise errors.ParameterError(
            f"Data directory {data_dir} is not within project {path}")

    # Reject reserved locations (e.g. the metadata directory).
    if str(relative).rstrip(os.path.sep) in INVALID_DATA_DIRS:
        raise errors.ParameterError(f"Cannot use {relative} as data directory.")
    return relative
def remove_dataset_tags(self, dataset, tags):
    """Removes tags from a dataset.

    Raises ``errors.ParameterError`` when any requested tag does not exist.
    """
    existing = {tag.name for tag in dataset.tags}
    missing = set(tags) - existing
    if missing:
        raise errors.ParameterError("Tags {} not found".format(", ".join(missing)))
    dataset.tags = [tag for tag in dataset.tags if tag.name not in tags]
    return dataset
def update_config(client, key, *, value=None, remove=False, global_only=False, commit_message=None):
    """Add, update, or remove configuration values."""
    section, section_key = _split_section_and_key(key)
    if not remove:
        client.set_value(section, section_key, value, global_only=global_only)
        return value
    # Removal returns the previous value; None means the key did not exist.
    value = client.remove_value(section, section_key, global_only=global_only)
    if value is None:
        raise errors.ParameterError('Key "{}" not found.'.format(key))
    return value
def read_config(client, key, local_only, global_only):
    """Read configuration.

    With a ``key``, return its single value; otherwise return the whole
    configuration.
    """
    if not key:
        return client.get_config(local_only=local_only, global_only=global_only)
    section, section_key = _split_section_and_key(key)
    value = client.get_value(section, section_key, local_only=local_only, global_only=global_only)
    if value is None:
        raise errors.ParameterError('Key "{}" not found.'.format(key))
    return value
def _make_headers(columns, columns_mapping): headers = OrderedDict() for column in columns: if column not in columns_mapping: raise errors.ParameterError( 'Invalid column name: "{}".\nPossible values: {}'.format( column, ', '.join(columns_mapping))) name, display_name = columns_mapping.get(column) headers[name] = display_name return headers
def parse_parameters(ctx, param, value):
    """Parse parameters to dictionary."""
    parameters = {}
    for item in value:
        # Split only on the first '='; the value may itself contain '='.
        key, sep, val = item.partition("=")
        if not sep or not key:
            raise errors.ParameterError(
                'Parameter format must be --parameter "param1"="value". ',
                f'--parameter "{item}"')
        parameters[key] = val
    return parameters
def _check_config_is_not_readonly(self, section, key):
    """Raise if ``section``/``key`` is a read-only configuration entry."""
    from renku.core import errors

    readonly_configs = {'renku': [self.DATA_DIR_CONFIG_KEY]}

    # Only locally-set values are protected; unset keys may be written.
    if not self.get_value(section, key, local_only=True):
        return

    if key in readonly_configs.get(section, []):
        raise errors.ParameterError(f'Configuration {key} cannot be modified.')
def init_external_storage(self, force=False):
    """Initialize the external storage for data."""
    command = self._CMD_STORAGE_INSTALL + (['--force'] if force else [])
    try:
        call(
            command,
            stdout=PIPE,
            stderr=STDOUT,
            cwd=str(self.path.absolute()),
        )
    except (KeyboardInterrupt, OSError) as e:
        raise errors.ParameterError(
            'Couldn\'t run \'git lfs\':\n{0}'.format(e)
        )
def set_parameters(self, client, *, dataverse_server_url, dataverse_name, **kwargs):
    """Set and validate required parameters for a provider."""
    CONFIG_BASE_URL = 'server_url'

    if dataverse_server_url:
        # Persist the URL so subsequent invocations may omit it.
        client.set_value('dataverse', CONFIG_BASE_URL, dataverse_server_url, global_only=True)
    else:
        dataverse_server_url = client.get_value('dataverse', CONFIG_BASE_URL)

    if not dataverse_server_url:
        raise errors.ParameterError('Dataverse server URL is required.')
    if not dataverse_name:
        raise errors.ParameterError('Dataverse name is required.')

    self._server_url = dataverse_server_url
    self._dataverse_name = dataverse_name
def _download(self, url, filename, extract, progress_class=None, chunk_size=16384):
    """Stream ``url`` into the client's cache and optionally extract it.

    :param url: remote URL to fetch.
    :param filename: target file name; derived from the URL path when falsy.
    :param extract: when true, attempt to unpack the downloaded archive.
    :param progress_class: progress-callback class; defaults to
        ``DownloadProgressCallback``.
    :param chunk_size: streaming chunk size in bytes.
    :return: tuple of (directory, list of downloaded or extracted paths).
    :raises errors.ParameterError: if no file name can be derived from ``url``.
    """
    def extract_dataset(filepath):
        """Extract downloaded file."""
        try:
            tmp = tempfile.mkdtemp()
            patoolib.extract_archive(str(filepath), outdir=tmp, verbosity=-1)
        except patoolib.util.PatoolError:
            # Not a recognized archive format: keep the file as-is.
            return filepath.parent, [filepath]
        else:
            # Extraction succeeded; the original archive is no longer needed.
            filepath.unlink()
            return Path(tmp), [p for p in Path(tmp).rglob('*')]

    tmp_root = self.renku_path / self.CACHE
    tmp_root.mkdir(parents=True, exist_ok=True)
    tmp = tempfile.mkdtemp(dir=tmp_root)
    with requests.get(url, stream=True) as request:
        request.raise_for_status()
        if not filename:
            # Fall back to the last path component of the URL.
            u = parse.urlparse(url)
            filename = Path(u.path).name
            if not filename:
                raise errors.ParameterError(f'URL Cannot find a file to download from {url}')
        download_to = Path(tmp) / filename
        with open(str(download_to), 'wb') as file_:
            # content-length may be absent; 0 disables a meaningful total.
            total_size = int(request.headers.get('content-length', 0))
            progress_class = progress_class or DownloadProgressCallback
            progress = progress_class(description=filename, total_size=total_size)
            try:
                for chunk in request.iter_content(chunk_size=chunk_size):
                    if chunk:  # ignore keep-alive chunks
                        file_.write(chunk)
                        progress.update(size=len(chunk))
            finally:
                # Always close the progress display, even on failure.
                progress.finalize()
    if extract:
        return extract_dataset(download_to)
    return download_to.parent, [download_to]
def _get_src_and_dst(self, path, repo_path, sources, dst_root): if not sources: source = Path('.') else: source = None for s in sources: try: Path(path).relative_to(s) except ValueError: pass else: source = s break if not source: return src = repo_path / path source_name = Path(source).name relative_path = Path(path).relative_to(source) if not dst_root.exists(): if len(sources) == 1: dst = dst_root / relative_path else: # Treat destination as a directory dst = dst_root / source_name / relative_path elif dst_root.is_dir(): dst = dst_root / source_name / relative_path else: # Destination is an existing file if len(sources) == 1 and not src.is_dir(): dst = dst_root elif not sources: raise errors.ParameterError('Cannot copy repo to file') else: raise errors.ParameterError( 'Cannot copy multiple files or directories to a file') return (path, src, dst, source)
def add_dataset_tag(self, dataset, tag, description='', force=False):
    """Adds a new tag to a dataset.

    Validates if the tag already exists and that the tag follows
    the same rules as docker tags.
    See https://docs.docker.com/engine/reference/commandline/tag/
    for a documentation of docker tag syntax.

    :raises: errors.ParameterError
    """
    if len(tag) > 128:
        raise errors.ParameterError(
            'Tags can be at most 128 characters long.')
    if not re.match('^(?![.-])[a-zA-Z0-9_.-]{1,128}$', tag):
        raise errors.ParameterError(
            ('Tag {} is invalid. \n'
             'Only characters a-z, A-Z, 0-9, ., - and _ '
             'are allowed. \nTag can\'t start with a . or -').format(tag))

    duplicates = [t for t in dataset.tags if t.name == tag]
    if duplicates:
        if not force:
            raise errors.ParameterError('Tag {} already exists'.format(tag))
        # remove duplicate tag
        dataset.tags = [t for t in dataset.tags if t.name != tag]

    # Tag points at the dataset's most recent commit.
    latest_commit = list(self.dataset_commits(dataset, max_results=1))[0]
    tag = DatasetTag(
        name=tag,
        description=description,
        commit=latest_commit.hexsha,
        dataset=dataset.name,
    )
    dataset.tags.append(tag)
    return dataset
def tabulate(collection, columns, columns_mapping):
    """Format collection with a tabular output."""
    if not columns:
        raise errors.ParameterError('Columns cannot be empty.')

    requested = [c.lower().strip() for c in columns.split(',') if c]
    headers = _make_headers(requested, columns_mapping)

    # Sort based on the first requested field
    sort_attr = next(iter(headers))
    rows = sorted(collection, key=lambda item: getattr(item, sort_attr))

    return tabulate_(rows, headers=headers, disable_numparse=True)
def _get_src_and_dst(self, path, repo_path, sources, dst_root, used_sources):
    """Map a repository ``path`` to its copy source and destination.

    :param path: file path (relative to the repository root) being added.
    :param repo_path: root of the source repository.
    :param sources: mapping of user-requested sources; keys may be plain
        prefixes or glob patterns (matched via wcmatch ``glob.globmatch``).
        Mutated: a wildcard-matched directory gets ``sources[source] = None``.
    :param used_sources: set collecting the sources that matched some path.
    :return: tuple ``(path, src, dst)`` or ``None`` when no source matches.
    """
    is_wildcard = False
    if not sources:
        # No explicit sources: everything in the repository matches.
        source = Path('.')
    else:
        source = None
        for s in sources.keys():
            try:
                Path(path).relative_to(s)
            except ValueError:
                # Not a plain prefix; try matching it as a glob pattern.
                if glob.globmatch(path, str(s), flags=glob.GLOBSTAR):
                    is_wildcard = True
                    source = path
                    used_sources.add(s)
                    break
            else:
                source = s
                used_sources.add(source)
                break
    if not source:
        return

    src = repo_path / path
    source_name = Path(source).name
    relative_path = Path(path).relative_to(source)

    if src.is_dir() and is_wildcard:
        # Directory matched by a wildcard: record it for later expansion.
        sources[source] = None
        used_sources.add(source)

    if not dst_root.exists():
        # Destination will be a file or directory
        if len(sources) == 1 and not is_wildcard:
            dst = dst_root / relative_path
        else:
            # Treat destination as a directory
            dst = dst_root / source_name / relative_path
    elif dst_root.is_dir():
        dst = dst_root / source_name / relative_path
    else:
        # Destination is an existing file
        if src.is_dir():
            raise errors.ParameterError(
                'Cannot copy multiple files or directories to a file')
        # Later we need to check if we are copying multiple files
        dst = dst_root

    return (path, src, dst)