Example #1
def test_dataset_creator_email(dataset_metadata):
    """Check that creators without an email are assigned a blank node."""
    # modify the dataset metadata to change the creator
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient("."),)

    dataset.creators[0]._id = "mailto:None"
    dataset_broken = Dataset.from_jsonld(dataset.as_jsonld(), client=LocalClient("."))
    assert "mailto:None" not in dataset_broken.creators[0]._id
Example #2
    def create_dataset(
        self,
        short_name=None,
        title=None,
        description=None,
        creators=None,
        keywords=None,
    ):
        """Create a dataset."""
        if not short_name:
            raise errors.ParameterError('Dataset short_name must be provided.')

        if not is_dataset_short_name_valid(short_name):
            raise errors.ParameterError(
                'Dataset short_name "{}" is not valid.'.format(short_name))

        if self.load_dataset(short_name=short_name):
            raise errors.DatasetExistsError(
                'Dataset exists: "{}".'.format(short_name))

        if not title:
            title = short_name

        identifier = str(uuid.uuid4())

        path = self.renku_datasets_path / identifier / self.METADATA

        if path.exists():
            raise errors.DatasetExistsError(
                'Dataset with reference {} exists'.format(path))

        path.parent.mkdir(parents=True, exist_ok=True)

        if creators is None:
            creators = [Person.from_git(self.repo)]

        keywords = keywords or ()

        with with_reference(path):
            dataset = Dataset(
                client=self,
                identifier=identifier,
                short_name=short_name,
                name=title,
                description=description,
                creator=creators,
                keywords=keywords,
            )

        dataset_ref = LinkReference.create(client=self,
                                           name='datasets/' + short_name)

        dataset_ref.set_reference(path)
        dataset.path = Path(dataset.path).relative_to(self.path)
        dataset.to_yaml()

        return dataset, path, dataset_ref
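
A hedged usage sketch for the method above; `client` stands for an instance of the (unshown) class that provides create_dataset and load_dataset, and `errors` is the same module the snippet raises from:

def get_or_create(client, short_name):
    """Return an existing dataset, or create one with sensible defaults."""
    try:
        dataset, path, ref = client.create_dataset(
            short_name=short_name,
            title=None,       # falls back to short_name
            creators=None,    # falls back to Person.from_git(client.repo)
            keywords=("example",),
        )
        return dataset
    except errors.DatasetExistsError:
        return client.load_dataset(short_name=short_name)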
Example #3
def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    dataset = Dataset(name="dataset", creators=creators)
    creator = Person(name="me", email="*****@*****.**")
    assert creator in dataset.creators

    # email check
    with pytest.raises(ValueError):
        Person(name="me", email="meexample.com")

    # creators must be a set or list of dicts or Person
    with pytest.raises(ValueError):
        Dataset(name="dataset", creators=["name"])
Example #4
    def create_dataset(self,
                       name,
                       short_name=None,
                       description='',
                       creators=None):
        """Create a dataset."""
        if not name:
            raise errors.ParameterError('Dataset name must be provided.')

        if not short_name:
            short_name = generate_default_short_name(name, None)

        if not is_dataset_name_valid(short_name):
            raise errors.ParameterError(
                'Dataset name "{}" is not valid.'.format(short_name))

        if self.load_dataset(name=short_name):
            raise errors.DatasetExistsError(
                'Dataset exists: "{}".'.format(short_name))

        identifier = str(uuid.uuid4())

        path = self.renku_datasets_path / identifier / self.METADATA

        if path.exists():
            raise errors.DatasetExistsError(
                'Dataset with reference {} exists'.format(path))

        path.parent.mkdir(parents=True, exist_ok=True)

        if creators is None:
            creators = [Person.from_git(self.repo)]

        with with_reference(path):
            dataset = Dataset(client=self,
                              identifier=identifier,
                              name=name,
                              short_name=short_name,
                              description=description,
                              creator=creators)

        dataset_ref = LinkReference.create(client=self,
                                           name='datasets/' + short_name)

        dataset_ref.set_reference(path)
        dataset.to_yaml()

        return dataset, path, dataset_ref
Example #5
def test_dataset_files_empty_metadata(dataset_metadata):
    """Check parsing metadata of dataset files with empty filename."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient("."),)
    files = [file.filename for file in dataset.files if not file.filename]

    if files:
        assert None in files
Example #6
    def as_dataset(self, client):
        """Deserialize `DataverseRecordSerializer` to `Dataset`."""
        files = self.get_files()
        dataset = Dataset.from_jsonld(self._json,
                                      client=client,
                                      schema_class=_DataverseDatasetSchema)

        if dataset.description and not dataset.description.strip():
            dataset.description = None

        for creator in dataset.creator:
            if creator.affiliation == '':
                creator.affiliation = None

        serialized_files = []
        for file_ in files:
            remote_ = file_.remote_url
            dataset_file = DatasetFile(
                url=remote_.geturl(),
                id=file_._id if file_._id else file_.name,
                filename=file_.name,
                filesize=file_.content_size,
                filetype=file_.file_format,
                path='',
            )
            serialized_files.append(dataset_file)

        dataset.files = serialized_files

        return dataset
Example #7
    def as_dataset(self, client):
        """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
        files = self.get_files()
        metadata = self.get_jsonld()
        dataset = Dataset.from_jsonld(metadata, client=client)

        serialized_files = []
        for file_ in files:
            remote_ = file_.remote_url
            dataset_file = DatasetFile(
                url=remote_.geturl(),
                id=file_.id,
                checksum=file_.checksum,
                filename=file_.filename,
                filesize=file_.filesize,
                filetype=file_.type,
                path='',
            )
            serialized_files.append(dataset_file)

        dataset.files = serialized_files

        if isinstance(dataset.url, dict) and '_id' in dataset.url:
            dataset.url = urllib.parse.urlparse(dataset.url.pop('_id'))
            dataset.url = dataset.url.geturl()

        return dataset
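
The URL normalization at the end of as_dataset is plain standard-library code; a standalone sketch with an illustrative record id:

import urllib.parse

url = {"_id": "https://zenodo.org/record/1234"}
if isinstance(url, dict) and "_id" in url:
    url = urllib.parse.urlparse(url.pop("_id")).geturl()
assert url == "https://zenodo.org/record/1234"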
Example #8
def test_uuid_migration(dataset_metadata, client):
    """Test migration of id with UUID."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=client)

    assert is_uuid(dataset.identifier)
    assert urljoin('https://localhost/datasets/',
                   dataset.identifier) == dataset._id
Example #9
def migrate_datasets_pre_v0_3(client):
    """Migrate datasets from Renku 0.3.x."""
    for old_path in dataset_pre_0_3(client):
        name = str(old_path.parent.relative_to(client.path / 'data'))

        dataset = Dataset.from_yaml(old_path, client=client)
        new_path = (client.renku_datasets_path / dataset.uid / client.METADATA)
        new_path.parent.mkdir(parents=True, exist_ok=True)

        with client.with_metadata(read_only=True) as meta:
            for module in client.repo.submodules:
                if Path(module.url).name == meta.name:
                    module.remove()

        for file_ in dataset.files:
            if not Path(file_.path).exists():
                expected_path = (
                    client.path / 'data' / dataset.name / file_.path
                )
                if expected_path.exists():
                    file_.path = expected_path.relative_to(client.path)

        dataset.__reference__ = new_path
        dataset.to_yaml()

        Path(old_path).unlink()
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(name),
            force=True,
        )
        ref.set_reference(new_path)
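
The LinkReference calls at the end are reusable on their own to (re)point a dataset name at a new metadata file; a sketch assuming the same client object as above:

def repoint_dataset(client, name, new_path):
    """Replace the datasets/<name> reference so that it targets new_path."""
    ref = LinkReference.create(
        client=client,
        name='datasets/{0}'.format(name),
        force=True,
    )
    ref.set_reference(new_path)
    return ref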
Example #10
def test_migration_broken_urls(dataset_metadata):
    """Check that broken dataset file URLs are migrated to strings."""
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )

    for file_ in dataset.files:
        assert isinstance(url_to_string(file_.url), str)
Example #11
def test_doi_migration(dataset_metadata):
    """Test migration of id with doi."""
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )
    assert is_doi(dataset.identifier)
    assert urljoin('https://localhost', 'datasets/' +
                   quote(dataset.identifier, safe='')) == dataset._id
    assert dataset.same_as == urljoin('https://doi.org', dataset.identifier)
Example #12
def test_calamus(client, dataset_metadata_before_calamus):
    """Check Calamus loads project correctly."""
    dataset = Dataset.from_jsonld(dataset_metadata_before_calamus, client=LocalClient("."))

    file_ = dataset.find_file("data/dataverse/external/data.txt")
    assert file_.external is True
    assert "file://../../../../tmp/data.txt" == file_.url

    file_ = dataset.find_file("data/dataverse/local/result.csv")
    assert file_.external is False
    assert "file://../../../../tmp/result.csv" == file_.url
Example #13
def edit_dataset(client, dataset_id, transform_fn, commit_message=None):
    """Edit dataset metadata."""
    dataset = client.load_dataset(dataset_id)

    if not dataset:
        raise DatasetNotFound()

    edited = yaml.safe_load(transform_fn(dataset))
    updated_ = Dataset(client=client, **edited)
    dataset.update_metadata(updated_)
    dataset.to_yaml()
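
A usage sketch for edit_dataset; `client` and the dataset name are assumptions, and the transform function is hypothetical, simply returning the edited fields as a YAML string to match the yaml.safe_load call above:

import yaml

def set_description(dataset):
    """Hypothetical transform: emit the edited fields as YAML text."""
    return yaml.safe_dump({"description": "An updated description."})

edit_dataset(client, "my-dataset", set_description, commit_message="update description")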
Example #14
def test_dataset_doi_metadata(dataset_metadata):
    """Check dataset metadata for correct DOI."""
    from renku.core.utils.doi import is_doi
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )

    if is_doi(dataset.identifier):
        assert urljoin('https://doi.org',
                       dataset.identifier) == dataset.same_as

    assert dataset._id.endswith('datasets/{}'.format(
        quote(dataset.identifier, safe='')))
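
The URL arithmetic behind these assertions can be checked in isolation; a standalone sketch using an illustrative DOI borrowed from Example #17:

from urllib.parse import quote, urljoin

identifier = "10.7910/DVN/TJCLKP"
assert urljoin("https://doi.org", identifier) == "https://doi.org/10.7910/DVN/TJCLKP"
# quote(..., safe="") also escapes the slashes, as the _id assertion expects
assert quote(identifier, safe="") == "10.7910%2FDVN%2FTJCLKP"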
Example #15
def test_creators_with_same_email(tmp_path):
    """Test creators with different names and same email address."""
    creators = [Person(name="me", email="*****@*****.**"), Person(name="me2", email="*****@*****.**")]
    dataset = Dataset(name="dataset", creators=creators)
    path = tmp_path / "dataset.yml"
    dataset.__reference__ = path
    dataset.to_yaml()

    dataset = Dataset.from_yaml(path)
    assert 1 == len(dataset.creators)
    assert dataset.creators[0].name in ["me", "me2"]
Example #16
    def datasets_from_commit(self, commit=None):
        """Return datasets defined in a commit."""
        commit = commit or self.repo.head.commit

        try:
            datasets = commit.tree / self.renku_home / self.DATASETS
        except KeyError:
            return

        for tree in datasets:
            try:
                blob = tree / self.METADATA
            except KeyError:
                continue
            dataset = Dataset.from_yaml(self.path / Path(blob.path), client=self)
            dataset.commit = commit
            yield dataset
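
The commit-tree lookup in datasets_from_commit is plain GitPython; a standalone sketch, where the ".renku/datasets" layout and the "metadata.yml" filename are assumptions about the client's configuration:

from git import Repo

repo = Repo(".")
commit = repo.head.commit
try:
    datasets = commit.tree / ".renku" / "datasets"
except KeyError:
    datasets = ()

for tree in datasets:
    try:
        blob = tree / "metadata.yml"
    except KeyError:
        continue
    print(blob.path)  # metadata path of each dataset present in the commit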
Example #17
def test_calamus(client, dataset_metadata_before_calamus):
    """Check that Calamus loads the dataset correctly."""
    dataset = Dataset.from_jsonld(dataset_metadata_before_calamus,
                                  client=LocalClient('.'))
    assert 'Open Source at Harvard' == dataset.name
    assert '51db02ad-3cba-47e2-84d0-5ee5914bd654' == dataset.identifier
    assert '51db02ad-3cba-47e2-84d0-5ee5914bd654' == dataset._label
    assert 'Harvard University' == dataset.creator[0].affiliation
    assert 'Durbin, Philip' == dataset.creator[0].name
    assert 'Durbin, Philip' == dataset.creator[0].label
    assert dataset.created is None
    assert '2019-07-03T00:00:00' == dataset.date_published.isoformat('T')
    assert 'The tabular file contains information' in dataset.description
    assert 'https://doi.org/10.7910/DVN/TJCLKP' == dataset.same_as.url
    assert '3' == dataset.tags[0].name
    assert 'Tag 3 created by renku import' == dataset.tags[0].description
    assert isinstance(dataset.license, dict)
    assert ('https://creativecommons.org/publicdomain/zero/1.0/'
            in str(dataset.license))

    file_ = dataset.find_file('data/dataverse/IQSS-UNF.json')
    assert ('https://dataverse.harvard.edu/api/access/datafile/3371500' ==
            file_.url)
    assert '2020-06-15T08:37:04.571573+00:00' == file_.added.isoformat('T')
    assert 'https://orcid.org/0000-0002-9528-9470' == file_.creator[0]._id
    assert file_.based_on is None

    file_ = dataset.find_file('data/dataverse/git/index.ipynb')
    assert ('https://github.com/SwissDataScienceCenter/r10e-ds-py.git' ==
            file_.based_on.url)
    assert ('notebooks/index.ipynb@f98325d81c700f4b86ee05c2154e94d43ca068b8' ==
            file_.based_on._label)
    assert file_.based_on.based_on is None
    assert 'mailto:cramakri@' in file_.based_on.creator[0]._id
    assert ('https://github.com/SwissDataScienceCenter/r10e-ds-py.git' ==
            file_.url)

    file_ = dataset.find_file('data/dataverse/external/data.txt')
    assert file_.external is True
    assert 'file://../../../../tmp/data.txt' == file_.url

    file_ = dataset.find_file('data/dataverse/local/result.csv')
    assert file_.external is False
    assert 'file://../../../../tmp/result.csv' == file_.url
Example #18
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""
    dataset_metadata = dataset.asjsonld()
    dataset = Dataset.from_jsonld(dataset_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.created
    assert dataset.creator
    assert dataset.identifier
    assert dataset.name
    assert dataset.path
    assert dataset._project

    # check values
    assert str(dataset.created.isoformat()) == dataset_metadata.get('created')
    assert dataset.creator[0].email == dataset_metadata.get('creator')[0].get(
        'email')
    assert dataset.identifier == dataset_metadata.get('identifier')
    assert dataset.name == dataset_metadata.get('name')
    assert dataset.path == dataset_metadata.get('path')
Example #19
    def as_dataset(self, client):
        """Deserialize `DataverseRecordSerializer` to `Dataset`."""
        files = self.get_files()
        dataset = Dataset.from_jsonld(self._json, client=client)

        serialized_files = []
        for file_ in files:
            remote_ = file_.remote_url
            dataset_file = DatasetFile(
                url=remote_.geturl(),
                id=file_._id if file_._id else file_.name,
                filename=file_.name,
                filesize=file_.content_size,
                filetype=file_.file_format,
                path='',
            )
            serialized_files.append(dataset_file)

        dataset.files = serialized_files

        return dataset
Example #20
def test_dataset_deserialization(client, dataset):
    """Test Dataset deserialization."""
    dataset_ = Dataset.from_yaml(client.get_dataset_path("dataset"), client=client)

    dataset_types = {
        "date_created": [datetime.datetime],
        "creators": [list],
        "description": [str, type(None)],
        "files": [list],
        "identifier": [str],
        "keywords": [list],
    }

    for attribute, type_ in dataset_types.items():
        assert type(dataset_.__getattribute__(attribute)) in type_

    creator_types = {"email": str, "_id": str, "name": str, "affiliation": str}

    creator = dataset.creators[0]

    for attribute, type_ in creator_types.items():
        assert type(getattr(creator, attribute)) is type_
Example #21
def _migrate_datasets_pre_v0_3(client):
    """Migrate datasets from Renku 0.3.x."""
    def _dataset_pre_0_3(client):
        """Return paths of dataset metadata for pre 0.3.4."""
        project_is_pre_0_3 = int(client.project.version) < 2
        if project_is_pre_0_3:
            return (client.path / DATA_DIR).rglob(client.METADATA)
        return []

    for old_path in _dataset_pre_0_3(client):
        name = str(old_path.parent.relative_to(client.path / DATA_DIR))

        dataset = Dataset.from_yaml(old_path, client=client)
        new_path = (client.renku_datasets_path / dataset.uid / client.METADATA)
        new_path.parent.mkdir(parents=True, exist_ok=True)

        with client.with_metadata(read_only=True) as meta:
            for module in client.repo.submodules:
                if Path(module.url).name == meta.name:
                    module.remove()

        for file_ in dataset.files:
            if not Path(file_.path).exists():
                expected_path = (client.path / DATA_DIR / dataset.name /
                                 file_.path)
                if expected_path.exists():
                    file_.path = expected_path.relative_to(client.path)

        dataset.__reference__ = new_path.relative_to(client.path)
        dataset.to_yaml()

        Path(old_path).unlink()
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(name),
            force=True,
        )
        ref.set_reference(new_path)
Example #22
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""
    dataset_metadata = dataset.asjsonld()
    dataset = Dataset.from_jsonld(dataset_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.created
    assert dataset.creator
    assert dataset.identifier
    assert dataset.name
    assert dataset.path
    assert dataset._project

    # check values
    assert str(dataset.created.isoformat()) == dataset_metadata.get(
        'http://schema.org/dateCreated')
    assert dataset.creator[0].email == dataset_metadata.get(
        'http://schema.org/creator')[0].get('http://schema.org/email')
    assert dataset.identifier == dataset_metadata.get(
        'http://schema.org/identifier')
    assert dataset.name == dataset_metadata.get('http://schema.org/name')
    assert dataset.path == dataset_metadata.get(
        'http://www.w3.org/ns/prov#atLocation')
Example #23
def test_dataset_serialization(dataset):
    """Test dataset (de)serialization."""

    def read_value(key):
        return dataset_metadata.get(key)[0].get("@value")

    flattened_metadata = dataset.as_jsonld()
    dataset = Dataset.from_jsonld(flattened_metadata)

    # assert that all attributes found in metadata are set in the instance
    assert dataset.date_created
    assert dataset.creators
    assert dataset.identifier
    assert dataset.title
    assert dataset.path
    assert dataset._project

    dataset_metadata = [m for m in flattened_metadata if "Dataset" in str(m["@type"])][0]

    # check values
    assert str(dataset.date_created.isoformat()) == read_value("http://schema.org/dateCreated")
    assert dataset.identifier == read_value("http://schema.org/identifier")
    assert dataset.title == read_value("http://schema.org/name")
    assert dataset.path == read_value("http://www.w3.org/ns/prov#atLocation")
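
The flattened JSON-LD access pattern used by read_value works standalone; a sketch with illustrative data:

flattened = [
    {
        "@type": ["http://schema.org/Dataset"],
        "http://schema.org/name": [{"@value": "dataset"}],
    }
]
node = [m for m in flattened if "Dataset" in str(m["@type"])][0]
assert node["http://schema.org/name"][0]["@value"] == "dataset"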
Example #24
def test_dataset_deserialization(client, dataset):
    """Test Dataset deserialization."""
    from renku.core.models.datasets import Dataset
    dataset_ = Dataset.from_yaml(client.get_dataset_path('dataset'),
                                 client=client)

    dataset_types = {
        'created': [datetime.datetime],
        'creator': [list],
        'description': [str, type(None)],
        'files': [list],
        'identifier': [str],
        'keywords': [list],
    }

    for attribute, type_ in dataset_types.items():
        assert type(dataset_.__getattribute__(attribute)) in type_

    creator_types = {'email': str, '_id': str, 'name': str, 'affiliation': str}

    creator = dataset.creator[0]

    for attribute, type_ in creator_types.items():
        assert type(getattr(creator, attribute)) is type_
Example #25
    def load_dataset_from_path(self, path, commit=None):
        """Return a dataset from a given path."""
        path = Path(path)
        if not path.is_absolute():
            path = self.path / path
        return Dataset.from_yaml(path, client=self, commit=commit)
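
The relative-path resolution above is plain pathlib; a standalone sketch of the same behavior:

from pathlib import Path

def resolve_under_root(root, path):
    """Resolve path against root unless it is already absolute."""
    path = Path(path)
    return path if path.is_absolute() else Path(root) / path

assert resolve_under_root("/repo", "datasets/metadata.yml") == Path("/repo/datasets/metadata.yml")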