def test_dataset_creator_email(dataset_metadata):
    """Check that creators without an email are assigned a blank node."""
    # modify the dataset metadata to change the creator
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient("."))
    dataset.creators[0]._id = "mailto:None"

    dataset_broken = Dataset.from_jsonld(dataset.as_jsonld(), client=LocalClient("."))
    assert "mailto:None" not in dataset_broken.creators[0]._id
def create_dataset(
    self, short_name=None, title=None, description=None, creators=None, keywords=None,
):
    """Create a dataset."""
    if not short_name:
        raise errors.ParameterError('Dataset short_name must be provided.')

    if not is_dataset_short_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset short_name "{}" is not valid.'.format(short_name)
        )

    if self.load_dataset(short_name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name)
        )

    if not title:
        title = short_name

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path)
        )

    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]

    keywords = keywords or ()

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            short_name=short_name,
            name=title,
            description=description,
            creator=creators,
            keywords=keywords,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.path = Path(dataset.path).relative_to(self.path)
    dataset.to_yaml()

    return dataset, path, dataset_ref
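# Hypothetical usage sketch for create_dataset above, assuming `client` is a
# client object (e.g. LocalClient) inside an initialized Renku repository;
# the keyword arguments mirror the signature shown here, the values are
# illustrative.
dataset, path, ref = client.create_dataset(
    short_name="my-data",
    title="My Data",
    description="An example dataset",
    keywords=["example"],
)
# `path` lives under client.renku_datasets_path / <uuid> / client.METADATA,
# and `ref` is the 'datasets/my-data' link reference pointing at it.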
def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    dataset = Dataset(name="dataset", creators=creators)
    creator = Person(name="me", email="*****@*****.**")
    assert creator in dataset.creators

    # email check
    with pytest.raises(ValueError):
        Person(name="me", email="meexample.com")

    # creators must be a set or list of dicts or Person
    with pytest.raises(ValueError):
        Dataset(name="dataset", creators=["name"])
def create_dataset(self, name, short_name=None, description='', creators=None):
    """Create a dataset."""
    if not name:
        raise errors.ParameterError('Dataset name must be provided.')

    if not short_name:
        short_name = generate_default_short_name(name, None)

    if not is_dataset_name_valid(short_name):
        raise errors.ParameterError(
            'Dataset name "{}" is not valid.'.format(short_name)
        )

    if self.load_dataset(name=short_name):
        raise errors.DatasetExistsError(
            'Dataset exists: "{}".'.format(short_name)
        )

    identifier = str(uuid.uuid4())
    path = self.renku_datasets_path / identifier / self.METADATA
    if path.exists():
        raise errors.DatasetExistsError(
            'Dataset with reference {} exists'.format(path)
        )

    path.parent.mkdir(parents=True, exist_ok=True)

    if creators is None:
        creators = [Person.from_git(self.repo)]

    with with_reference(path):
        dataset = Dataset(
            client=self,
            identifier=identifier,
            name=name,
            short_name=short_name,
            description=description,
            creator=creators,
        )

    dataset_ref = LinkReference.create(client=self, name='datasets/' + short_name)
    dataset_ref.set_reference(path)
    dataset.to_yaml()

    return dataset, path, dataset_ref
def test_dataset_files_empty_metadata(dataset_metadata):
    """Check parsing metadata of dataset files with empty filename."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient("."))
    files = [file.filename for file in dataset.files if not file.filename]

    if files:
        assert None in files
def as_dataset(self, client):
    """Deserialize `DataverseRecordSerializer` to `Dataset`."""
    files = self.get_files()
    dataset = Dataset.from_jsonld(
        self._json, client=client, schema_class=_DataverseDatasetSchema
    )

    if dataset.description and not dataset.description.strip():
        dataset.description = None

    for creator in dataset.creator:
        if creator.affiliation == '':
            creator.affiliation = None

    serialized_files = []
    for file_ in files:
        remote_ = file_.remote_url
        dataset_file = DatasetFile(
            url=remote_.geturl(),
            id=file_._id if file_._id else file_.name,
            filename=file_.name,
            filesize=file_.content_size,
            filetype=file_.file_format,
            path='',
        )
        serialized_files.append(dataset_file)

    dataset.files = serialized_files

    return dataset
def as_dataset(self, client):
    """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
    files = self.get_files()
    metadata = self.get_jsonld()
    dataset = Dataset.from_jsonld(metadata, client=client)

    serialized_files = []
    for file_ in files:
        remote_ = file_.remote_url
        dataset_file = DatasetFile(
            url=remote_.geturl(),
            id=file_.id,
            checksum=file_.checksum,
            filename=file_.filename,
            filesize=file_.filesize,
            filetype=file_.type,
            path='',
        )
        serialized_files.append(dataset_file)

    dataset.files = serialized_files

    if isinstance(dataset.url, dict) and '_id' in dataset.url:
        dataset.url = urllib.parse.urlparse(dataset.url.pop('_id'))
        dataset.url = dataset.url.geturl()

    return dataset
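# The final url handling in as_dataset above round-trips the record id through
# urllib; a minimal stdlib-only sketch of that normalization (the dict value
# is illustrative):
import urllib.parse

url = {"_id": "https://zenodo.org/record/1234"}
if isinstance(url, dict) and "_id" in url:
    url = urllib.parse.urlparse(url.pop("_id"))
    url = url.geturl()
assert url == "https://zenodo.org/record/1234"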
def test_uuid_migration(dataset_metadata, client):
    """Test migration of id with UUID."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=client)

    assert is_uuid(dataset.identifier)
    assert urljoin('https://localhost/datasets/', dataset.identifier) == dataset._id
def migrate_datasets_pre_v0_3(client):
    """Migrate datasets from Renku 0.3.x."""
    for old_path in dataset_pre_0_3(client):
        name = str(old_path.parent.relative_to(client.path / 'data'))

        dataset = Dataset.from_yaml(old_path, client=client)
        new_path = client.renku_datasets_path / dataset.uid / client.METADATA
        new_path.parent.mkdir(parents=True, exist_ok=True)

        with client.with_metadata(read_only=True) as meta:
            for module in client.repo.submodules:
                if Path(module.url).name == meta.name:
                    module.remove()

        for file_ in dataset.files:
            if not Path(file_.path).exists():
                expected_path = client.path / 'data' / dataset.name / file_.path
                if expected_path.exists():
                    file_.path = expected_path.relative_to(client.path)

        dataset.__reference__ = new_path
        dataset.to_yaml()

        Path(old_path).unlink()
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(name),
            force=True,
        )
        ref.set_reference(new_path)
def test_migration_broken_urls(dataset_metadata):
    """Check that broken dataset file URLs are migrated to strings."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient('.'))

    for file_ in dataset.files:
        assert isinstance(url_to_string(file_.url), str)
def test_doi_migration(dataset_metadata):
    """Test migration of id with DOI."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient('.'))

    assert is_doi(dataset.identifier)
    assert urljoin(
        'https://localhost', 'datasets/' + quote(dataset.identifier, safe='')
    ) == dataset._id
    assert dataset.same_as == urljoin('https://doi.org', dataset.identifier)
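# The DOI test above relies on quote() percent-encoding the DOI's slash before
# it is joined into the dataset id; a stdlib-only sketch with an illustrative DOI:
from urllib.parse import quote, urljoin

doi = "10.5281/zenodo.1234"
assert quote(doi, safe="") == "10.5281%2Fzenodo.1234"
assert urljoin("https://localhost", "datasets/" + quote(doi, safe="")) == \
    "https://localhost/datasets/10.5281%2Fzenodo.1234"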
def test_calamus(client, dataset_metadata_before_calamus):
    """Check Calamus loads project correctly."""
    dataset = Dataset.from_jsonld(dataset_metadata_before_calamus, client=LocalClient("."))

    file_ = dataset.find_file("data/dataverse/external/data.txt")
    assert file_.external is True
    assert "file://../../../../tmp/data.txt" == file_.url

    file_ = dataset.find_file("data/dataverse/local/result.csv")
    assert file_.external is False
    assert "file://../../../../tmp/result.csv" == file_.url
def edit_dataset(client, dataset_id, transform_fn, commit_message=None):
    """Edit dataset metadata."""
    dataset = client.load_dataset(dataset_id)

    if not dataset:
        raise DatasetNotFound()

    edited = yaml.safe_load(transform_fn(dataset))
    updated_ = Dataset(client=client, **edited)
    dataset.update_metadata(updated_)
    dataset.to_yaml()
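# Sketch of a transform_fn for edit_dataset above: it receives the loaded
# dataset and must return YAML text, which edit_dataset parses back into
# keyword arguments for a new Dataset instance. The helper and the fields it
# emits are hypothetical; a real editor would round-trip the full metadata.
import yaml

def set_description(dataset):
    """Return edited metadata as a YAML string."""
    return yaml.safe_dump({"name": dataset.name, "description": "Updated description"})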
def test_dataset_doi_metadata(dataset_metadata):
    """Check dataset metadata for correct DOI."""
    from renku.core.utils.doi import is_doi

    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient('.'))

    if is_doi(dataset.identifier):
        assert urljoin('https://doi.org', dataset.identifier) == dataset.same_as

    assert dataset._id.endswith(
        'datasets/{}'.format(quote(dataset.identifier, safe=''))
    )
def test_creators_with_same_email(tmp_path):
    """Test creators with different names and same email address."""
    creators = [
        Person(name="me", email="*****@*****.**"),
        Person(name="me2", email="*****@*****.**"),
    ]
    dataset = Dataset(name="dataset", creators=creators)
    path = tmp_path / "dataset.yml"
    dataset.__reference__ = path
    dataset.to_yaml()

    dataset = Dataset.from_yaml(path)
    assert 1 == len(dataset.creators)
    assert dataset.creators[0].name in ["me", "me2"]
def datasets_from_commit(self, commit=None):
    """Return datasets defined in a commit."""
    commit = commit or self.repo.head.commit

    try:
        datasets = commit.tree / self.renku_home / self.DATASETS
    except KeyError:
        return

    for tree in datasets:
        try:
            blob = tree / self.METADATA
        except KeyError:
            continue
        dataset = Dataset.from_yaml(self.path / Path(blob.path), client=self)
        dataset.commit = commit
        yield dataset
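# Hypothetical usage of datasets_from_commit above: walk the history with
# GitPython's iter_commits and report which datasets each commit defines.
# Assumes `client` exposes `repo` and the method as shown in this section.
for commit in client.repo.iter_commits():
    for dataset in client.datasets_from_commit(commit):
        print(commit.hexsha[:7], dataset.name)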
def test_calamus(client, dataset_metadata_before_calamus):
    """Check Calamus loads project correctly."""
    dataset = Dataset.from_jsonld(dataset_metadata_before_calamus, client=LocalClient('.'))
    assert 'Open Source at Harvard' == dataset.name
    assert '51db02ad-3cba-47e2-84d0-5ee5914bd654' == dataset.identifier
    assert '51db02ad-3cba-47e2-84d0-5ee5914bd654' == dataset._label
    assert 'Harvard University' == dataset.creator[0].affiliation
    assert 'Durbin, Philip' == dataset.creator[0].name
    assert 'Durbin, Philip' == dataset.creator[0].label
    assert dataset.created is None
    assert '2019-07-03T00:00:00' == dataset.date_published.isoformat('T')
    assert 'The tabular file contains information' in dataset.description
    assert 'https://doi.org/10.7910/DVN/TJCLKP' == dataset.same_as.url
    assert '3' == dataset.tags[0].name
    assert 'Tag 3 created by renku import' == dataset.tags[0].description
    assert isinstance(dataset.license, dict)
    assert 'https://creativecommons.org/publicdomain/zero/1.0/' in str(dataset.license)

    file_ = dataset.find_file('data/dataverse/IQSS-UNF.json')
    assert 'https://dataverse.harvard.edu/api/access/datafile/3371500' == file_.url
    assert '2020-06-15T08:37:04.571573+00:00' == file_.added.isoformat('T')
    assert 'https://orcid.org/0000-0002-9528-9470' == file_.creator[0]._id
    assert file_.based_on is None

    file_ = dataset.find_file('data/dataverse/git/index.ipynb')
    assert 'https://github.com/SwissDataScienceCenter/r10e-ds-py.git' == file_.based_on.url
    assert 'notebooks/index.ipynb@f98325d81c700f4b86ee05c2154e94d43ca068b8' == file_.based_on._label
    assert file_.based_on.based_on is None
    assert 'mailto:cramakri@' in file_.based_on.creator[0]._id
    assert 'https://github.com/SwissDataScienceCenter/r10e-ds-py.git' == file_.url

    file_ = dataset.find_file('data/dataverse/external/data.txt')
    assert file_.external is True
    assert 'file://../../../../tmp/data.txt' == file_.url

    file_ = dataset.find_file('data/dataverse/local/result.csv')
    assert file_.external is False
    assert 'file://../../../../tmp/result.csv' == file_.url
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""
    dataset_metadata = dataset.asjsonld()
    dataset = Dataset.from_jsonld(dataset_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.created
    assert dataset.creator
    assert dataset.identifier
    assert dataset.name
    assert dataset.path
    assert dataset._project

    # check values
    assert str(dataset.created.isoformat()) == dataset_metadata.get('created')
    assert dataset.creator[0].email == dataset_metadata.get('creator')[0].get('email')
    assert dataset.identifier == dataset_metadata.get('identifier')
    assert dataset.name == dataset_metadata.get('name')
    assert dataset.path == dataset_metadata.get('path')
def as_dataset(self, client):
    """Deserialize `DataverseRecordSerializer` to `Dataset`."""
    files = self.get_files()
    dataset = Dataset.from_jsonld(self._json, client=client)

    serialized_files = []
    for file_ in files:
        remote_ = file_.remote_url
        dataset_file = DatasetFile(
            url=remote_.geturl(),
            id=file_._id if file_._id else file_.name,
            filename=file_.name,
            filesize=file_.content_size,
            filetype=file_.file_format,
            path='',
        )
        serialized_files.append(dataset_file)

    dataset.files = serialized_files

    return dataset
def test_dataset_deserialization(client, dataset):
    """Test Dataset deserialization."""
    dataset_ = Dataset.from_yaml(client.get_dataset_path("dataset"), client=client)

    dataset_types = {
        "date_created": [datetime.datetime],
        "creators": [list],
        "description": [str, type(None)],
        "files": [list],
        "identifier": [str],
        "keywords": [list],
    }

    for attribute, type_ in dataset_types.items():
        assert type(dataset_.__getattribute__(attribute)) in type_

    creator_types = {"email": str, "_id": str, "name": str, "affiliation": str}

    creator = dataset.creators[0]

    for attribute, type_ in creator_types.items():
        assert type(getattr(creator, attribute)) is type_
def _migrate_datasets_pre_v0_3(client):
    """Migrate datasets from Renku 0.3.x."""

    def _dataset_pre_0_3(client):
        """Return paths of dataset metadata for pre 0.3.4."""
        project_is_pre_0_3 = int(client.project.version) < 2
        if project_is_pre_0_3:
            return (client.path / DATA_DIR).rglob(client.METADATA)
        return []

    for old_path in _dataset_pre_0_3(client):
        name = str(old_path.parent.relative_to(client.path / DATA_DIR))

        dataset = Dataset.from_yaml(old_path, client=client)
        new_path = client.renku_datasets_path / dataset.uid / client.METADATA
        new_path.parent.mkdir(parents=True, exist_ok=True)

        with client.with_metadata(read_only=True) as meta:
            for module in client.repo.submodules:
                if Path(module.url).name == meta.name:
                    module.remove()

        for file_ in dataset.files:
            if not Path(file_.path).exists():
                expected_path = client.path / DATA_DIR / dataset.name / file_.path
                if expected_path.exists():
                    file_.path = expected_path.relative_to(client.path)

        dataset.__reference__ = new_path.relative_to(client.path)
        dataset.to_yaml()

        Path(old_path).unlink()
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(name),
            force=True,
        )
        ref.set_reference(new_path)
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""
    dataset_metadata = dataset.asjsonld()
    dataset = Dataset.from_jsonld(dataset_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.created
    assert dataset.creator
    assert dataset.identifier
    assert dataset.name
    assert dataset.path
    assert dataset._project

    # check values
    assert str(dataset.created.isoformat()) == dataset_metadata.get(
        'http://schema.org/dateCreated'
    )
    assert dataset.creator[0].email == dataset_metadata.get(
        'http://schema.org/creator'
    )[0].get('http://schema.org/email')
    assert dataset.identifier == dataset_metadata.get('http://schema.org/identifier')
    assert dataset.name == dataset_metadata.get('http://schema.org/name')
    assert dataset.path == dataset_metadata.get('http://www.w3.org/ns/prov#atLocation')
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""

    def read_value(key):
        return dataset_metadata.get(key)[0].get("@value")

    flattened_metadata = dataset.as_jsonld()
    dataset = Dataset.from_jsonld(flattened_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.date_created
    assert dataset.creators
    assert dataset.identifier
    assert dataset.title
    assert dataset.path
    assert dataset._project

    dataset_metadata = [m for m in flattened_metadata if "Dataset" in str(m["@type"])][0]

    # check values
    assert str(dataset.date_created.isoformat()) == read_value("http://schema.org/dateCreated")
    assert dataset.identifier == read_value("http://schema.org/identifier")
    assert dataset.title == read_value("http://schema.org/name")
    assert dataset.path == read_value("http://www.w3.org/ns/prov#atLocation")
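# as_jsonld above returns flattened JSON-LD: a list of node objects whose
# attribute values are wrapped in [{"@value": ...}] lists. A stdlib-only
# sketch of the lookup the test's read_value helper performs (node contents
# are illustrative):
flattened = [
    {
        "@type": ["http://schema.org/Dataset"],
        "http://schema.org/name": [{"@value": "dataset"}],
    }
]
node = next(m for m in flattened if "Dataset" in str(m["@type"]))
assert node["http://schema.org/name"][0]["@value"] == "dataset"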
def test_dataset_deserialization(client, dataset):
    """Test Dataset deserialization."""
    from renku.core.models.datasets import Dataset

    dataset_ = Dataset.from_yaml(client.get_dataset_path('dataset'), client=client)

    dataset_types = {
        'created': [datetime.datetime],
        'creator': [list],
        'description': [str, type(None)],
        'files': [list],
        'identifier': [str],
        'keywords': [list],
    }

    for attribute, type_ in dataset_types.items():
        assert type(dataset_.__getattribute__(attribute)) in type_

    creator_types = {'email': str, '_id': str, 'name': str, 'affiliation': str}

    creator = dataset.creator[0]

    for attribute, type_ in creator_types.items():
        assert type(getattr(creator, attribute)) is type_
def load_dataset_from_path(self, path, commit=None):
    """Return a dataset from a given path."""
    path = Path(path)
    if not path.is_absolute():
        path = self.path / path
    return Dataset.from_yaml(path, client=self, commit=commit)
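# Usage sketch for load_dataset_from_path above: relative paths are resolved
# against the repository root, so both calls below load the same metadata file
# (the concrete path is illustrative; <uuid> stands in for a real identifier).
d1 = client.load_dataset_from_path(".renku/datasets/<uuid>/metadata.yml")
d2 = client.load_dataset_from_path(client.path / ".renku/datasets/<uuid>/metadata.yml")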