def test_unicode(self):
    '''Accented characters must survive extraction for both dataset and resource.'''
    title = 'ééé'
    description = 'éééé'
    graph = Graph()

    dataset_node = BNode()
    graph.add((dataset_node, RDF.type, DCAT.Dataset))
    graph.add((dataset_node, DCT.title, Literal(title)))
    graph.add((dataset_node, DCT.description, Literal(description)))

    dist_node = BNode()
    graph.add((dist_node, RDF.type, DCAT.Distribution))
    graph.add((dist_node, DCT.title, Literal(title)))
    graph.add((dist_node, DCT.description, Literal(description)))
    graph.add((dist_node, DCAT.downloadURL, URIRef(faker.uri())))
    graph.add((dataset_node, DCAT.distribution, dist_node))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert dataset.title == title
    assert dataset.description == description
    resource = dataset.resources[0]
    assert resource.title == title
    assert resource.description == description
def parse_url(url, quiet=False, rid=''):
    '''Parse the datasets in a DCAT format located at URL (debug).

    :param url: URL of the DCAT document to parse.
    :param quiet: silence the noisy rdflib/harvester loggers.
    :param rid: if given, only process items whose remote id contains it.
    '''
    if quiet:
        # A plain loop, not a side-effect comprehension, and no builtin-shadowing
        # loop variable.
        for logger_name in ('rdflib', 'udata.core.dataset'):
            logging.getLogger(logger_name).setLevel(logging.ERROR)

    class MockSource:
        url = ''

    class MockJob:
        items = []

    class MockDatasetFactory(DatasetFactory):
        '''Use DatasetFactory without .save()'''
        @classmethod
        def _create(cls, model_class, *args, **kwargs):
            return model_class(*args, **kwargs)

    echo(cyan('Parsing url {}'.format(url)))
    source = MockSource()
    source.url = url
    backend = DcatBackend(source, dryrun=True)
    backend.job = MockJob()
    # `fmt` rather than `format`: avoid shadowing the builtin.
    fmt = backend.get_format()
    echo(yellow('Detected format: {}'.format(fmt)))
    graph = backend.parse_graph(url, fmt)
    # serialize/unserialize graph like in the job mechanism
    _graph = graph.serialize(format=fmt, indent=None)
    graph = Graph(namespace_manager=namespace_manager)
    graph.parse(data=_graph, format=fmt)

    for item in backend.job.items:
        if not rid or rid in item.remote_id:
            echo(magenta('Processing item {}'.format(item.remote_id)))
            echo('Item kwargs: {}'.format(yellow(item.kwargs)))
            node = backend.get_node_from_item(item)
            dataset = MockDatasetFactory()
            dataset = dataset_from_rdf(graph, dataset, node=node)
            echo('')
            echo(green('Dataset found!'))
            echo('Title: {}'.format(yellow(dataset)))
            echo('License: {}'.format(yellow(dataset.license)))
            echo('Description: {}'.format(yellow(dataset.description)))
            echo('Tags: {}'.format(yellow(dataset.tags)))
            echo('Resources: {}'.format(yellow([(r.title, r.format, r.url)
                                                for r in dataset.resources])))
            try:
                dataset.validate()
            except mongoengine.errors.ValidationError as e:
                log.error(e, exc_info=True)
            else:
                echo(green('Dataset is valid ✅'))
            echo('')
def process(self, item):
    '''Rebuild the graph stored on the job and extract this item's dataset.'''
    graph = Graph(namespace_manager=namespace_manager)
    serialized = self.job.data['graph']
    graph_format = self.job.data['format']
    node = self.get_node_from_item(item)
    graph.parse(data=bytes(serialized, encoding='utf8'), format=graph_format)
    existing = self.get_dataset(item.remote_id)
    return dataset_from_rdf(graph, existing, node=node)
def test_minimal(self):
    '''A type and a title are enough to extract a valid dataset.'''
    graph = Graph()
    node = BNode()
    expected_title = faker.sentence()
    graph.add((node, RDF.type, DCAT.Dataset))
    graph.add((node, DCT.title, Literal(expected_title)))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.title == expected_title
def test_html_description(self):
    '''HTML markup in the description is stripped down to plain text.'''
    graph = Graph()
    node = BNode()
    triples = (
        (RDF.type, DCAT.Dataset),
        (DCT.identifier, Literal(faker.uuid4())),
        (DCT.title, Literal(faker.sentence())),
        (DCT.description, Literal('<div>a description</div>')),
    )
    for predicate, value in triples:
        graph.add((node, predicate, value))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.description == 'a description'
def process(self, item):
    '''Parse the item's JSON-LD graph (with a legacy fallback to the job
    graph) and extract its dataset.'''
    graph = Graph(namespace_manager=namespace_manager)
    # handles legacy graphs
    data = item.kwargs.get('graph', self.job.data['graph'])
    graph.parse(data=data, format='json-ld')

    node = None
    if 'nid' in item.kwargs and 'type' in item.kwargs:
        nid = item.kwargs['nid']
        if item.kwargs['type'] == 'uriref':
            node = URIRef(nid)
        else:
            node = BNode(nid)

    existing = self.get_dataset(item.remote_id)
    return dataset_from_rdf(graph, existing, node=node)
def test_update(self):
    '''Extracting over an existing dataset keeps its identity but updates fields.'''
    original = DatasetFactory()
    node = URIRef('https://test.org/dataset')
    graph = Graph()
    updated_title = faker.sentence()
    graph.add((node, RDF.type, DCAT.Dataset))
    graph.add((node, DCT.title, Literal(updated_title)))

    dataset = dataset_from_rdf(graph, dataset=original)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.id == original.id
    assert dataset.title == updated_title
def test_match_license_from_rights_uri(self):
    '''A known license URL in dct:rights on a distribution is matched.'''
    license = LicenseFactory()
    graph = Graph()
    dataset_node = BNode()
    graph.set((dataset_node, RDF.type, DCAT.Dataset))
    graph.set((dataset_node, DCT.title, Literal(faker.sentence())))

    dist_node = BNode()
    graph.set((dist_node, RDF.type, DCAT.Distribution))
    graph.set((dist_node, DCAT.downloadURL, URIRef(faker.uri())))
    graph.set((dist_node, DCT.rights, URIRef(license.url)))
    graph.add((dataset_node, DCAT.distribution, dist_node))

    dataset = dataset_from_rdf(graph)

    assert isinstance(dataset.license, License)
    assert dataset.license == license
def test_dataset_has_resources_from_literal_instead_of_uriref(self):
    '''A download URL given as a Literal (not a URIRef) still yields a resource.'''
    graph = Graph()
    dataset_node = BNode()
    graph.add((dataset_node, RDF.type, DCAT.Dataset))
    graph.add((dataset_node, DCT.title, Literal(faker.sentence())))

    dist_node = BNode()
    graph.set((dist_node, RDF.type, DCAT.Distribution))
    # Resource URL is expressed as a Literal
    graph.set((dist_node, DCAT.downloadURL, Literal(faker.uri())))
    graph.add((dataset_node, DCAT.distribution, dist_node))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert len(dataset.resources) == 1
def test_dataset_has_resources_from_buggy_plural_distribution(self):
    '''Try to extract resources from the wrong distributions attribute'''
    graph = Graph()
    dataset_node = BNode()
    graph.add((dataset_node, RDF.type, DCAT.Dataset))
    graph.add((dataset_node, DCT.title, Literal(faker.sentence())))

    dist_node = BNode()
    graph.set((dist_node, RDF.type, DCAT.Distribution))
    graph.set((dist_node, DCAT.downloadURL, URIRef(faker.uri())))
    graph.add((dataset_node, DCAT.distributions, dist_node))  # use plural name

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert len(dataset.resources) == 1
def test_dataset_has_resources(self):
    '''Every distribution with a download URL becomes a resource.'''
    graph = Graph()
    dataset_node = BNode()
    graph.add((dataset_node, RDF.type, DCAT.Dataset))
    graph.add((dataset_node, DCT.title, Literal(faker.sentence())))
    for _ in range(3):
        dist_node = BNode()
        graph.set((dist_node, RDF.type, DCAT.Distribution))
        graph.set((dist_node, DCAT.downloadURL, URIRef(faker.uri())))
        graph.add((dataset_node, DCAT.distribution, dist_node))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert len(dataset.resources) == 3
def test_theme_and_tags(self):
    '''Keywords and themes are merged into the dataset tags.'''
    graph = Graph()
    node = BNode()
    keywords = faker.words(nb=3)
    theme_labels = faker.words(nb=3)
    graph.add((node, RDF.type, DCAT.Dataset))
    graph.add((node, DCT.title, Literal(faker.sentence())))
    for keyword in keywords:
        graph.add((node, DCAT.keyword, Literal(keyword)))
    for label in theme_labels:
        graph.add((node, DCAT.theme, Literal(label)))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert set(dataset.tags) == set(keywords + theme_labels)
def test_minimal_from_multiple(self):
    '''Passing an explicit node selects the right dataset among several.'''
    graph = Graph()
    target = BNode()
    expected_title = faker.sentence()
    graph.add((target, RDF.type, DCAT.Dataset))
    graph.add((target, DCT.title, Literal(expected_title)))
    # Add some unrelated datasets as noise.
    for _ in range(3):
        decoy = BNode()
        graph.add((decoy, RDF.type, DCAT.Dataset))
        graph.add((decoy, DCT.title, Literal(faker.sentence())))

    dataset = dataset_from_rdf(graph, node=target)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.title == expected_title
def test_minimal_from_multiple(self):
    '''With several datasets in the graph, the requested node is extracted.'''
    graph = Graph()
    wanted = BNode()
    title = faker.sentence()
    graph.add((wanted, RDF.type, DCAT.Dataset))
    graph.add((wanted, DCT.title, Literal(title)))
    for _ in range(3):
        # Extra datasets that must be ignored.
        sibling = BNode()
        graph.add((sibling, RDF.type, DCAT.Dataset))
        graph.add((sibling, DCT.title, Literal(faker.sentence())))

    dataset = dataset_from_rdf(graph, node=wanted)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.title == title
def test_match_license_from_license_title(self):
    '''A known license title in dct:license on a distribution is matched.'''
    license = LicenseFactory()
    node = BNode()
    g = Graph()
    g.set((node, RDF.type, DCAT.Dataset))
    g.set((node, DCT.title, Literal(faker.sentence())))

    rnode = BNode()
    g.set((rnode, RDF.type, DCAT.Distribution))
    g.set((rnode, DCAT.downloadURL, URIRef(faker.uri())))
    g.set((rnode, DCT.license, Literal(license.title)))
    g.add((node, DCAT.distribution, rnode))

    dataset = dataset_from_rdf(g)
    dataset.validate()

    # Plain asserts for consistency with the rest of the suite
    # (the sibling tests use pytest-style assertions, not unittest ones).
    assert isinstance(dataset.license, License)
    assert dataset.license == license
def test_all_fields(self):
    '''Every core field is mapped from its RDF predicate.'''
    uri = 'https://test.org/dataset'
    node = URIRef(uri)
    g = Graph()

    # `identifier` rather than `id`: avoid shadowing the builtin.
    identifier = faker.uuid4()
    title = faker.sentence()
    acronym = faker.word()
    description = faker.paragraph()
    tags = faker.words(nb=3)
    start = faker.past_date(start_date='-30d')
    end = faker.future_date(end_date='+30d')

    g.set((node, RDF.type, DCAT.Dataset))
    g.set((node, DCT.identifier, Literal(identifier)))
    g.set((node, DCT.title, Literal(title)))
    g.set((node, SKOS.altLabel, Literal(acronym)))
    g.set((node, DCT.description, Literal(description)))
    g.set((node, DCT.accrualPeriodicity, FREQ.daily))
    pot = BNode()
    g.add((node, DCT.temporal, pot))
    g.set((pot, RDF.type, DCT.PeriodOfTime))
    g.set((pot, SCHEMA.startDate, Literal(start)))
    g.set((pot, SCHEMA.endDate, Literal(end)))
    for tag in tags:
        g.add((node, DCAT.keyword, Literal(tag)))

    dataset = dataset_from_rdf(g)
    dataset.validate()

    # Plain asserts for consistency with the rest of the suite
    # (the sibling tests use pytest-style assertions, not unittest ones).
    assert isinstance(dataset, Dataset)
    assert dataset.title == title
    assert dataset.acronym == acronym
    assert dataset.description == description
    assert dataset.frequency == 'daily'
    assert set(dataset.tags) == set(tags)
    assert isinstance(dataset.temporal_coverage, db.DateRange)
    assert dataset.temporal_coverage.start == start
    assert dataset.temporal_coverage.end == end

    extras = dataset.extras
    assert 'dct:identifier' in extras
    assert extras['dct:identifier'] == identifier
    assert 'uri' in extras
    assert extras['uri'] == uri
def test_all_fields(self):
    '''Every core field is mapped from its RDF predicate.'''
    dataset_uri = 'https://test.org/dataset'
    node = URIRef(dataset_uri)
    graph = Graph()

    identifier = faker.uuid4()
    title = faker.sentence()
    acronym = faker.word()
    description = faker.paragraph()
    tags = faker.words(nb=3)
    start = faker.past_date(start_date='-30d')
    end = faker.future_date(end_date='+30d')

    graph.set((node, RDF.type, DCAT.Dataset))
    graph.set((node, DCT.identifier, Literal(identifier)))
    graph.set((node, DCT.title, Literal(title)))
    graph.set((node, SKOS.altLabel, Literal(acronym)))
    graph.set((node, DCT.description, Literal(description)))
    graph.set((node, DCT.accrualPeriodicity, FREQ.daily))

    # Temporal coverage lives on a dedicated PeriodOfTime node.
    period = BNode()
    graph.add((node, DCT.temporal, period))
    graph.set((period, RDF.type, DCT.PeriodOfTime))
    graph.set((period, SCHEMA.startDate, Literal(start)))
    graph.set((period, SCHEMA.endDate, Literal(end)))

    for tag in tags:
        graph.add((node, DCAT.keyword, Literal(tag)))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.title == title
    assert dataset.acronym == acronym
    assert dataset.description == description
    assert dataset.frequency == 'daily'
    assert set(dataset.tags) == set(tags)
    assert isinstance(dataset.temporal_coverage, db.DateRange)
    assert dataset.temporal_coverage.start == start
    assert dataset.temporal_coverage.end == end

    extras = dataset.extras
    assert 'dct:identifier' in extras
    assert extras['dct:identifier'] == identifier
    assert 'uri' in extras
    assert extras['uri'] == dataset_uri
def process(self, item):
    '''Parse this item's JSON-LD graph and extract its dataset.'''
    graph = Graph(namespace_manager=namespace_manager)
    graph.parse(data=item.kwargs['graph'], format='json-ld')
    existing = self.get_dataset(item.remote_id)
    return dataset_from_rdf(graph, existing)