def _construct_creators(creators, ignore_email=False):
    """Build ``Person`` objects from strings/dicts; collect entries lacking an email."""
    from collections.abc import Iterable

    creators = creators or ()

    # A single string is iterable but not a valid collection of creators.
    if not isinstance(creators, Iterable) or isinstance(creators, str):
        raise errors.ParameterError("Invalid type")

    hint = 'A valid format is "Name <email> [affiliation]"'
    people = []
    missing_email = []

    for entry in creators:
        if isinstance(entry, str):
            person = Person.from_string(entry)
        elif isinstance(entry, dict):
            person = Person.from_dict(entry)
        else:
            raise errors.ParameterError("Invalid type")

        if not person.name:  # pragma: no cover
            raise errors.ParameterError(
                f'Name is invalid: "{entry}".\n{hint}')
        if not person.email:
            if not ignore_email:  # pragma: no cover
                raise errors.ParameterError(
                    f'Email is invalid: "{entry}".\n{hint}')
            missing_email.append(entry)

        people.append(person)

    return people, missing_email
def test_data_add(scheme, path, overwrite, error, client, data_file, directory_tree, dataset_responses):
    """Test data import."""
    with raises(error):
        if path == "temp":
            path = str(data_file)
        elif path == "tempdir":
            path = str(directory_tree)

        with client.with_dataset("dataset", create=True) as d:
            d.creators = [Person(name="me", email="*****@*****.**", id="me_id")]
            client.add_data_to_dataset(d, ["{}{}".format(scheme, path)], overwrite=overwrite)

        target_path = os.path.join(DATA_DIR, "dataset", "file")

        with open(target_path) as f:
            assert f.read() == "1234"

        assert d.find_file(target_path)

        # check that the imported file is read-only
        assert not os.access(target_path, stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)

        # check the linking
        if scheme in ("", "file://"):
            shutil.rmtree("./data/dataset")
            with client.with_dataset("dataset") as d:
                d.creators = [Person(name="me", email="*****@*****.**", id="me_id")]
                client.add_data_to_dataset(d, ["{}{}".format(scheme, path)], overwrite=True)
            assert os.path.exists(target_path)
def create_dataset(
    client,
    name,
    short_name=None,
    description=None,
    creators=None,
    commit_message=None,
):
    """Create an empty dataset in the current repo.

    :param client: Current repository client.
    :param name: Dataset name.
    :param short_name: Optional short name for the dataset.
    :param description: Optional dataset description.
    :param creators: Optional iterable of creators (strings, dicts, or
        already-constructed ``Person`` objects — may be mixed).
    :param commit_message: Unused here; kept for interface compatibility.
    :raises: ``renku.core.errors.ParameterError``
    """
    if not creators:
        # Fall back to the git author of the current repository.
        creators = [Person.from_git(client.repo)]
    elif hasattr(creators, '__iter__'):
        # Convert each entry on its own so mixed str/dict/Person lists work;
        # previously only the first element's type was inspected, which broke
        # heterogeneous input.
        creators = [
            Person.from_string(c) if isinstance(c, str)
            else Person.from_dict(c) if isinstance(c, dict)
            else c
            for c in creators
        ]

    dataset, _, __ = client.create_dataset(
        name=name, short_name=short_name, description=description, creators=creators)

    return dataset
def _convert_creators(value):
    """Convert creators."""
    if isinstance(value, dict):  # compatibility with previous versions
        return [Person.from_jsonld(value)]

    if isinstance(value, list):
        return list(map(Person.from_jsonld, value))
def test_creators_with_same_email(tmp_path):
    """Test creators with different names and same email address."""
    creators = [
        Person(name="me", email="*****@*****.**"),
        Person(name="me2", email="*****@*****.**"),
    ]
    dataset = Dataset(name="dataset", creators=creators)

    path = tmp_path / "dataset.yml"
    dataset.__reference__ = path
    dataset.to_yaml()

    # Round-trip through YAML: creators sharing an email collapse to one.
    dataset = Dataset.from_yaml(path)
    assert len(dataset.creators) == 1
    assert dataset.creators[0].name in ["me", "me2"]
def __attrs_post_init__(self):
    """Initialize computed attributes."""
    if not self.creator and self.client:
        metadata_path = self.client.renku_metadata_path
        if metadata_path.exists():
            # Derive the creator from the first commit that touched the metadata.
            self.creator = Person.from_commit(
                self.client.find_previous_commit(metadata_path, return_first=True),
            )
        else:
            # this assumes the project is being newly created
            self.creator = Person.from_git(self.client.repo)

    self._id = self.project_id
def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    dataset = Dataset(name="dataset", creators=creators)
    expected = Person(name="me", email="*****@*****.**")
    assert expected in dataset.creators

    # email check
    with pytest.raises(ValueError):
        Person(name="me", email="meexample.com")

    # creators must be a set or list of dicts or Person
    with pytest.raises(ValueError):
        Dataset(name="dataset", creators=["name"])
def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    dataset_file = DatasetFile(path='file', creator=creators)
    expected = Person(name='me', email='*****@*****.**')
    assert expected in dataset_file.creator

    # email check
    with pytest.raises(ValueError):
        Person(name='me', email='meexample.com')

    # creators must be a set or list of dicts or Person
    with pytest.raises(ValueError):
        dataset_file = DatasetFile(path='file', creator=['name'])
def test_project_creator_deserialization(client, project):
    """Check that the correct creator is returned on deserialization."""
    from renku.core.models.provenance.agents import Person

    # modify the project metadata to change the creator
    project = client.project
    project.creator = Person(email='*****@*****.**', name='Johnny Doe')
    project.to_yaml()
    client.repo.git.commit(
        '-a', '--amend', '-C', 'HEAD', '--author',
        'Johnny Doe <*****@*****.**>', '--no-verify')

    # the project creator should always be the one in the metadata
    assert '*****@*****.**' == client.project.creator.email
    assert 'Johnny Doe' == client.project.creator.name
    assert client.project.creator.label == client.project.creator.name

    # Remove the creator from metadata
    project = client.project
    project.creator = None
    project.to_yaml()
    client.repo.git.commit(
        '-a', '--amend', '-C', 'HEAD', '--author',
        'Jane Doe <*****@*****.**>', '--no-verify')

    # now the creator should be the one from the commit
    project = Project.from_yaml(client.renku_metadata_path, client=client)
    assert '*****@*****.**' == project.creator.email
    assert 'Jane Doe' == project.creator.name
    assert project.creator.label == project.creator.name
def test_data_add_recursive(directory_tree, client):
    """Test recursive data imports."""
    with client.with_dataset("dataset", create=True) as dataset:
        dataset.creators = [Person(name="me", email="*****@*****.**", id="me_id")]
        client.add_data_to_dataset(dataset, [directory_tree.join("dir2").strpath])

    assert os.path.basename(os.path.dirname(dataset.files[0].path)) == "dir2"
def create_dataset(
    client,
    short_name,
    title=None,
    description='',
    creators=None,
    keywords=None,
    commit_message=None
):
    """Create an empty dataset in the current repo.

    :raises: ``renku.core.errors.ParameterError``
    """
    if not creators:
        # Default to the git author of the current repository.
        creators = [Person.from_git(client.repo)]
    else:
        creators, _ = _construct_creators(creators)

    dataset, _, __ = client.create_dataset(
        short_name=short_name,
        title=title,
        description=description,
        creators=creators,
        keywords=keywords
    )
    return dataset
def init_repository(self, force=False):
    """Initialize an empty Renku repository."""
    from git import Repo
    from renku.core.models.provenance.agents import Person

    # verify if folder is empty
    if self.repo is not None and not force:
        raise errors.InvalidFileOperation(
            'Folder {0} already contains file. Use --force to overwrite'.
            format(self.repo.git_dir))

    # initialize repo
    repo_root = self.path.absolute()
    self.repo = Repo.init(str(repo_root))

    # verify if author information is available
    Person.from_git(self.repo)
def create_dataset(
    self,
    short_name=None,
    title=None,
    description=None,
    creators=None,
    keywords=None,
):
    """Create a dataset."""
    if not short_name:
        raise errors.ParameterError('Dataset short_name must be provided.')

    if not is_dataset_short_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset short_name "{}" is not valid.'.format(short_name))

    if self.load_dataset(short_name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    # The title defaults to the short name.
    if not title:
        title = short_name

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))
    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]
    keywords = keywords or ()

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            short_name=short_name,
            name=title,
            description=description,
            creator=creators,
            keywords=keywords,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)

    dataset.path = Path(dataset.path).relative_to(self.path)
    dataset.to_yaml()

    return dataset, path, dataset_ref
def test_data_add_recursive(directory_tree, client):
    """Test recursive data imports."""
    with client.with_dataset('dataset', create=True) as dataset:
        dataset.creator = [Person(name='me', email='*****@*****.**', id='me_id')]
        client.add_data_to_dataset(dataset, [directory_tree.join('dir2').strpath])

    assert os.path.basename(os.path.dirname(dataset.files[0].path)) == 'dir2'
def __attrs_post_init__(self):
    """Initialize computed attributes."""
    if not self.creator and self.client:
        if self.client.renku_metadata_path.exists():
            # Derive the creator from the first commit touching the metadata file.
            self.creator = Person.from_commit(
                self.client.find_previous_commit(
                    self.client.renku_metadata_path, return_first=True),
            )
        else:
            # this assumes the project is being newly created
            self.creator = Person.from_git(self.client.repo)

    try:
        self._id = self.project_id
    except ValueError:
        # Fallback to old behaviour.
        if self._id:
            pass
        elif self.client and self.client.is_project_set():
            self._id = self.client.project._id
        else:
            raise
def init_repository(self, force=False, user=None):
    """Initialize an empty Renku repository."""
    from git import Repo
    from renku.core.models.provenance.agents import Person

    # verify if folder is empty
    if self.repo is not None and not force:
        raise errors.InvalidFileOperation(
            "Folder {0} already contains file. Use --force to overwrite".
            format(self.repo.git_dir))

    # initialize repo and set user data
    repo_root = self.path.absolute()
    self.repo = Repo.init(str(repo_root))
    if user:
        config_writer = self.repo.config_writer()
        for key, value in user.items():
            config_writer.set_value("user", key, value)
        config_writer.release()

    # verify if author information is available
    Person.from_git(self.repo)
def test_data_add(scheme, path, overwrite, error, client, data_file, directory_tree, dataset_responses):
    """Test data import."""
    with raises(error):
        if path == 'temp':
            path = str(data_file)
        elif path == 'tempdir':
            path = str(directory_tree)

        with client.with_dataset('dataset', create=True) as d:
            d.creator = [Person(name='me', email='*****@*****.**', id='me_id')]
            client.add_data_to_dataset(d, ['{}{}'.format(scheme, path)], overwrite=overwrite)

        target_path = os.path.join(DATA_DIR, 'dataset', 'file')

        with open(target_path) as f:
            assert f.read() == '1234'

        assert d.find_file(target_path)

        # check that the imported file is read-only
        assert not os.access(target_path, stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)

        # check the linking
        if scheme in ('', 'file://'):
            shutil.rmtree('./data/dataset')
            with client.with_dataset('dataset') as d:
                d.creator = [Person(name='me', email='*****@*****.**', id='me_id')]
                client.add_data_to_dataset(d, ['{}{}'.format(scheme, path)], overwrite=True)
            assert os.path.exists(target_path)
def dataset(client):
    """Create a dataset."""
    from renku.core.models.provenance.agents import Person

    with client.with_dataset("dataset", create=True) as dataset:
        dataset.creators = [
            Person(
                affiliation="xxx",
                email="*****@*****.**",
                id="me_id",
                name="me",
            )
        ]
    return dataset
def dataset(client):
    """Create a dataset."""
    from renku.core.models.provenance.agents import Person

    with client.with_dataset('dataset', create=True) as dataset:
        dataset.creator = [
            Person(
                affiliation='xxx',
                email='*****@*****.**',
                id='me_id',
                name='me',
            )
        ]
    return dataset
def create_dataset(self, name, short_name=None, description='', creators=None):
    """Create a dataset."""
    if not name:
        raise errors.ParameterError('Dataset name must be provided.')

    if not short_name:
        short_name = generate_default_short_name(name, None)

    if not is_dataset_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset name "{}" is not valid.'.format(short_name))

    if self.load_dataset(name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name))

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path))
    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            name=name,
            short_name=short_name,
            description=description,
            creator=creators,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.to_yaml()

    return dataset, path, dataset_ref
def test_construct_person(value, has_name, has_email, has_affiliation):
    """Test construct person from string."""
    p = Person.from_string(value)

    # Missing parts fall back to "" for the name and None for the rest.
    assert p.name == ("John Doe" if has_name else "")
    assert p.email == ("*****@*****.**" if has_email else None)
    assert p.affiliation == ("Some Affiliation" if has_affiliation else None)
def _handle_sentry(self):
    """Handle exceptions using Sentry."""
    from sentry_sdk import capture_exception, configure_scope
    from sentry_sdk.utils import capture_internal_exceptions

    with configure_scope() as scope:
        with capture_internal_exceptions():
            from git import Repo
            from renku.core.commands import get_git_home
            from renku.core.models.provenance.agents import Person

            # Attach the git author to the Sentry scope so the report is attributed.
            repo = Repo(get_git_home())
            user = Person.from_git(repo)
            scope.user = {'name': user.name, 'email': user.email}

        event_id = capture_exception()

    click.echo(
        _BUG + 'Recorded in Sentry with ID: {0}\n'.format(event_id),
        err=True,
    )
    raise
def test_project_shacl(project, client):
    """Test project metadata structure."""
    from renku.core.models.provenance.agents import Person

    path = Path(__file__).parent.parent.parent / 'fixtures' / 'force_project_shacl.json'

    project = client.project
    project.creator = Person(email='*****@*****.**', name='Johnny Doe')

    g = project.as_jsonld()
    rdf = pyld.jsonld.to_rdf(
        g,
        options={
            'format': 'application/n-quads',
            'produceGeneralizedRdf': False
        })

    # Validate against the project-specific shapes, then the default ones.
    r, _, t = validate_graph(rdf, shacl_path=str(path))
    assert r is True, t

    r, _, t = validate_graph(rdf)
    assert r is True, t
def update_dataset_files(self, files, ref, delete=False):
    """Update files and dataset metadata according to their remotes.

    :param files: List of files to be updated
    :param delete: Indicates whether to delete files or not

    :return: List of files that should be deleted
    """
    from renku import LocalClient

    visited_repos = {}
    updated_files = []
    deleted_files = []

    for file_ in files:
        if not file_.based_on:
            continue

        file_.based_on = DatasetFile.from_jsonld(file_.based_on)
        based_on = file_.based_on
        url = based_on.url

        # Clone each remote at most once per call.
        if url in visited_repos:
            repo, repo_path, remote_client = visited_repos[url]
        else:
            repo, repo_path = self.prepare_git_repo(url, ref)
            remote_client = LocalClient(repo_path)
            visited_repos[url] = repo, repo_path, remote_client

        remote_file = self._fetch_file_metadata(remote_client, based_on.path)

        if not remote_file:
            try:
                remote_file = DatasetFile.from_revision(
                    remote_client, path=based_on.path, url=url, added=based_on.added)
            except KeyError:
                raise errors.ParameterError(
                    'Cannot find file {} in the repo {}'.format(based_on.url, url))

        commit_sha = self._get_commit_sha_from_label(based_on)
        remote_commit_sha = self._get_commit_sha_from_label(remote_file)
        if commit_sha != remote_commit_sha:
            src = Path(repo.working_dir) / based_on.path
            dst = self.renku_path.parent / file_.path

            if src.exists():
                # Fetch file if it is tracked by Git LFS
                self._fetch_lfs_files(repo_path, {based_on.path})
                if remote_client._is_external_file(src):
                    self.remove_file(dst)
                    self._create_external_file(src.resolve(), dst)
                else:
                    shutil.copy(src, dst)
                file_.based_on.commit = remote_file.commit
                file_.based_on._label = remote_file._label
                updated_files.append(file_)
            else:
                # File was removed or renamed
                if delete:
                    self.remove_file(dst)
                    deleted_files.append(file_)

    if not updated_files and (not delete or not deleted_files):
        # Nothing to commit or update
        return deleted_files

    # Commit changes in files
    file_paths = {str(f.path) for f in updated_files + deleted_files}
    # Force-add to include possible ignored files that are in datasets
    self.repo.git.add(*(file_paths), force=True)
    self.repo.index.commit(
        'renku dataset: updated {} files and deleted {} files'.format(
            len(updated_files), len(deleted_files)))

    # Update datasets' metadata
    modified_datasets = {}

    for file_ in updated_files:
        # Re-create list of creators
        creators = []
        # grab all the creators from the commit history
        for commit in repo.iter_commits(paths=file_.path):
            creator = Person.from_commit(commit)
            if creator not in creators:
                creators.append(creator)

        new_file = DatasetFile.from_revision(
            self, path=file_.path, based_on=file_.based_on, creator=creators)
        file_.dataset.update_files([new_file])
        modified_datasets[file_.dataset.name] = file_.dataset

    if delete:
        for file_ in deleted_files:
            file_.dataset.unlink_file(file_.path)
            modified_datasets[file_.dataset.name] = file_.dataset

    for dataset in modified_datasets.values():
        dataset.to_yaml()

    return deleted_files
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)
    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination, used_sources)
        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError('No such file or directory', param_hint=unused_sources)

    if destination.exists() and not destination.is_dir():
        if len(files) > 1:
            raise errors.ParameterError(
                'Cannot copy multiple files or directories to a file')

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:
                # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(
                    remote_client, path=path, url=url, creator=creators)

            path_in_dst_repo = dst.relative_to(self.path)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, 'symlink')
            else:
                operation = (src, dst, 'copy')

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on,
                'operation': operation
            })

    return results
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self._prepare_git_repo(url, ref)
    copied_sources = set()
    files = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination)
        if result:
            files.add(result)
            source = result[3]
            copied_sources.add(source)

    uncopied_sources = sources - copied_sources
    if uncopied_sources:
        uncopied_sources = {str(s) for s in uncopied_sources}
        raise errors.ParameterError('No such file or directory', param_hint=uncopied_sources)

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _, __ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            path = str(src.resolve().relative_to(repo_path))
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst, _ in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(remote_client, path=path, url=url)

            path_in_dst_repo = dst.relative_to(self.path)

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on
            })

            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(src), str(dst))

    return results
assert os.path.basename(os.path.dirname( dataset.files[0].path)) == 'dir2' def test_git_repo_import(client, dataset, tmpdir, data_repository): """Test an import from a git repository.""" # add data from local repo client.add_data_to_dataset( dataset, [os.path.join(os.path.dirname(data_repository.git_dir), 'dir2')]) assert os.stat('data/dataset/dir2/file2') assert dataset.files[0].path.endswith('dir2/file2') @pytest.mark.parametrize('creators', [ [Person(name='me', email='*****@*****.**')], [{ 'name': 'me', 'email': '*****@*****.**', }], ]) def test_creator_parse(creators, data_file): """Test that different options for specifying creators work.""" f = DatasetFile(path='file', creator=creators) creator = Person(name='me', email='*****@*****.**') assert creator in f.creator # email check with pytest.raises(ValueError): Person(name='me', email='meexample.com')
def test_git_repo_import(client, dataset, tmpdir, data_repository): """Test an import from a git repository.""" # add data from local repo client.add_data_to_dataset(dataset, [os.path.join(os.path.dirname(data_repository.git_dir), "dir2")]) path = os.path.join(DATA_DIR, "dataset", "dir2", "file2") assert os.stat(path) path = os.path.join("dir2", "file2") assert dataset.files[0].path.endswith(path) @pytest.mark.parametrize( "creators", [ [Person(name="me", email="*****@*****.**")], [{"http://schema.org/name": "me", "http://schema.org/email": "*****@*****.**",}], ], ) def test_creator_parse(creators, data_file): """Test that different options for specifying creators work.""" dataset = Dataset(name="dataset", creators=creators) creator = Person(name="me", email="*****@*****.**") assert creator in dataset.creators # email check with pytest.raises(ValueError): Person(name="me", email="meexample.com") # creators must be a set or list of dicts or Person with pytest.raises(ValueError):