def test_spatial_multiple_dct_spatial_instances(self):
    """Each spatial extra is picked from whichever dct:spatial node carries it."""
    g = Graph()
    dataset = URIRef("http://example.org/datasets/1")
    g.add((dataset, RDF.type, DCAT.Dataset))
    # First spatial value: a plain URI reference
    spatial_uri = URIRef("http://geonames/Newark")
    g.add((dataset, DCT.spatial, spatial_uri))
    # Second spatial value: a Location node carrying only a geometry
    geometry_node = BNode()
    g.add((geometry_node, RDF.type, DCT.Location))
    g.add((dataset, DCT.spatial, geometry_node))
    g.add((geometry_node, LOCN.geometry,
           Literal('{"type": "Point", "coordinates": [23, 45]}',
                   datatype=GEOJSON_IMT)))
    # Third spatial value: a Location node carrying only a label
    label_node = BNode()
    g.add((label_node, RDF.type, DCT.Location))
    g.add((dataset, DCT.spatial, label_node))
    g.add((label_node, SKOS.prefLabel, Literal("Newark")))

    p = RDFParser(profiles=["euro_dcat_ap"])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    eq_(extras["spatial_uri"], "http://geonames/Newark")
    eq_(extras["spatial_text"], "Newark")
    eq_(extras["spatial"], '{"type": "Point", "coordinates": [23, 45]}')
def test_dataset_ttl(self):
    """The dataset endpoint serves a parseable Turtle serialization."""
    dataset = factories.Dataset(notes='Test dataset')
    url = url_for('dcat_dataset', _id=dataset['id'], _format='ttl')
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers['Content-Type'], 'text/turtle')

    # Parse the response body back to prove it is a real serialization
    p = RDFParser()
    p.parse(response.body, _format='turtle')
    dcat_datasets = list(p.datasets())
    eq_(len(dcat_datasets), 1)
    dcat_dataset = dcat_datasets[0]
    eq_(dcat_dataset['title'], dataset['title'])
    eq_(dcat_dataset['notes'], dataset['notes'])
def test_dataset_ttl(self):
    """The dataset endpoint serves a parseable Turtle serialization."""
    dataset = factories.Dataset(notes="Test dataset")
    url = url_for("dcat_dataset", _id=dataset["id"], _format="ttl")
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers["Content-Type"], "text/turtle")

    # Parse the response body back to prove it is a real serialization
    p = RDFParser()
    p.parse(response.body, _format="turtle")
    dcat_datasets = list(p.datasets())
    eq_(len(dcat_datasets), 1)
    dcat_dataset = dcat_datasets[0]
    eq_(dcat_dataset["title"], dataset["title"])
    eq_(dcat_dataset["notes"], dataset["notes"])
def test_dataset_json_ld_1(self):
    """A POD-style JSON-LD catalog parses into one dataset with expected fields."""
    contents = self._get_file_contents("catalog_pod.jsonld")
    p = RDFParser(profiles=["euro_dcat_ap"])
    p.parse(contents, _format="json-ld")

    datasets = list(p.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]
    extras = dict((e["key"], e["value"]) for e in dataset["extras"])

    eq_(dataset["title"], "U.S. Widget Manufacturing Statistics")
    eq_(extras["contact_name"], "Jane Doe")
    eq_(extras["contact_email"], "mailto:[email protected]")
    eq_(extras["publisher_name"], "Widget Services")
    eq_(extras["publisher_email"], "*****@*****.**")

    eq_(len(dataset["resources"]), 4)
    resource = [r for r in dataset["resources"] if r["name"] == "widgets.csv"][0]
    eq_(resource["name"], u"widgets.csv")
    eq_(resource["url"],
        u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
    eq_(resource["download_url"],
        u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
def test_datasets_none_found(self):
    """An empty graph yields no datasets."""
    p = RDFParser()
    p.g = Graph()
    eq_(len(list(p.datasets())), 0)
def test_distribution_format_format_normalized(self):
    """With both mediaType and a descriptive format, format is normalized on CKAN >= 2.3."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCAT.mediaType, Literal('text/csv')))
    g.add((dist_ref, DCT['format'], Literal('Comma Separated Values')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    if toolkit.check_ckan_version(min_version='2.3'):
        # Normalization against the format registry is only available >= 2.3
        eq_(resource['format'], u'CSV')
        eq_(resource['mimetype'], u'text/csv')
    else:
        eq_(resource['format'], u'Comma Separated Values')
def test_dataset_json_ld_1(self):
    """A POD-style JSON-LD catalog parses into one dataset with expected fields."""
    contents = self._get_file_contents('catalog_pod.jsonld')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='json-ld')

    datasets = list(p.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]
    extras = dict((e['key'], e['value']) for e in dataset['extras'])

    eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')
    eq_(extras['contact_name'], 'Jane Doe')
    eq_(extras['contact_email'], 'mailto:[email protected]')
    eq_(extras['publisher_name'], 'Widget Services')
    eq_(extras['publisher_email'], '*****@*****.**')

    eq_(len(dataset['resources']), 4)
    resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0]
    eq_(resource['name'], u'widgets.csv')
    eq_(resource['url'],
        u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
    eq_(resource['download_url'],
        u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
def test_distribution_format_IMT_field(self):
    """A dct:IMT node supplies both the format label and the mimetype."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")

    # dct:format pointing at an IMT node (value = mimetype, label = format)
    imt = BNode()
    g.add((imt, RDF.type, DCT.IMT))
    g.add((imt, RDF.value, Literal('text/turtle')))
    g.add((imt, RDFS.label, Literal('Turtle')))

    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCT['format'], imt))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    eq_(resource['format'], u'Turtle')
    eq_(resource['mimetype'], u'text/turtle')
def test_spatial_one_dct_spatial_instance_no_uri(self):
    """A blank-node Location yields text and geometry but no spatial_uri."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))

    location_ref = BNode()
    g.add((dataset, DCT.spatial, location_ref))
    g.add((location_ref, RDF.type, DCT.Location))
    g.add((location_ref, LOCN.geometry,
           Literal('{"type": "Point", "coordinates": [23, 45]}',
                   datatype=GEOJSON_IMT)))
    g.add((location_ref, SKOS.prefLabel, Literal('Newark')))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    # Blank node has no URI, so spatial_uri must be absent
    assert_true('spatial_uri' not in extras)
    eq_(extras['spatial_text'], 'Newark')
    eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
def test_spatial_both_geojson_and_wkt(self):
    """When GeoJSON and WKT geometries coexist, GeoJSON wins."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))
    g.add((spatial_uri, RDF.type, DCT.Location))
    g.add((spatial_uri, LOCN.geometry,
           Literal('{"type": "Point", "coordinates": [23, 45]}',
                   datatype=GEOJSON_IMT)))
    g.add((spatial_uri, LOCN.geometry,
           Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
def test_spatial_wrong_geometries(self):
    """Unparseable GeoJSON and WKT geometries produce no spatial extra."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))
    g.add((spatial_uri, RDF.type, DCT.Location))
    g.add((spatial_uri, LOCN.geometry,
           Literal('Not GeoJSON', datatype=GEOJSON_IMT)))
    g.add((spatial_uri, LOCN.geometry,
           Literal('Not WKT', datatype=GSP.wktLiteral)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    assert_true('spatial' not in extras)
def test_catalog_modified_date(self):
    """modified_since filters the catalog down to datasets changed after the cutoff."""
    factories.Dataset(title='First dataset')
    # Ensure the two datasets have distinct modification timestamps
    time.sleep(1)
    dataset2 = factories.Dataset(title='Second dataset')

    url = url_for('dcat_catalog', _format='ttl',
                  modified_since=dataset2['metadata_modified'])
    app = self._get_test_app()
    response = app.get(url)

    p = RDFParser()
    p.parse(response.body, _format='turtle')
    dcat_datasets = list(p.datasets())
    eq_(len(dcat_datasets), 1)
    eq_(dcat_datasets[0]['title'], dataset2['title'])
def test_distribution_format_format_normalized(self):
    """With both mediaType and a descriptive format, format is normalized on CKAN >= 2.3."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCAT.mediaType, Literal("text/csv")))
    g.add((dist_ref, DCT["format"], Literal("Comma Separated Values")))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=["euro_dcat_ap"])
    p.g = g

    resource = list(p.datasets())[0]["resources"][0]
    if toolkit.check_ckan_version(min_version="2.3"):
        # Normalization against the format registry is only available >= 2.3
        eq_(resource["format"], u"CSV")
        eq_(resource["mimetype"], u"text/csv")
    else:
        eq_(resource["format"], u"Comma Separated Values")
def test_parse_subcatalog(self):
    """source_catalog_* extras round-trip through dct:hasPart subcatalogs."""
    publisher = {'name': 'Publisher',
                 'email': '*****@*****.**',
                 'type': 'Publisher',
                 'uri': 'http://pub.lish.er'}
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'test dataset',
        'extras': [
            {'key': 'source_catalog_title', 'value': 'Subcatalog example'},
            {'key': 'source_catalog_homepage', 'value': 'http://subcatalog.example'},
            {'key': 'source_catalog_description', 'value': 'Subcatalog example description'},
            {'key': 'source_catalog_language', 'value': 'http://publications.europa.eu/resource/authority/language/ITA'},
            {'key': 'source_catalog_modified', 'value': '2000-01-01'},
            {'key': 'source_catalog_publisher', 'value': json.dumps(publisher)},
        ],
    }
    catalog_dict = {
        'title': 'My Catalog',
        'description': 'An Open Data Catalog',
        'homepage': 'http://example.com',
        'language': 'de',
    }

    s = RDFSerializer()
    s.serialize_catalog(catalog_dict, dataset_dicts=[dataset])
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = s.g

    # at least one subcatalog with hasPart
    subcatalogs = list(p.g.objects(None, DCT.hasPart))
    assert_true(subcatalogs)

    # at least one dataset in subcatalogs
    subdatasets = []
    for subcatalog in subcatalogs:
        for member in p.g.objects(subcatalog, DCAT.dataset):
            subdatasets.append((member, subcatalog,))
    assert_true(subdatasets)

    parsed_by_title = dict((d['title'], d) for d in p.datasets())
    for subdataset, subcatalog in subdatasets:
        title = unicode(list(p.g.objects(subdataset, DCT.title))[0])
        parsed = parsed_by_title[title]
        has_subcat = False
        for extra in parsed['extras']:
            if extra['key'] == 'source_catalog_homepage':
                has_subcat = True
                eq_(extra['value'], unicode(subcatalog))
        # check if we had subcatalog in extras
        assert_true(has_subcat)
def test_profiles_are_called_on_datasets(self):
    """Every registered profile contributes to each parsed dataset dict."""
    p = RDFParser()
    p._profiles = [MockRDFProfile1, MockRDFProfile2]
    p.g = _default_graph()

    for dataset in p.datasets():
        # Each mock profile stamps its own marker key
        assert dataset['profile_1']
        assert dataset['profile_2']
def test_tags_with_commas(self):
    """A comma-separated dcat:keyword literal is split into individual tags."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    g.add((dataset, DCAT.keyword, Literal('Tree, forest, shrub')))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    eq_(len(datasets[0]['tags']), 3)
def test_datasets(self):
    """The default graph yields three datasets, each with a title."""
    p = RDFParser()
    p.g = _default_graph()

    datasets = []
    for dataset in p.datasets():
        assert 'title' in dataset
        datasets.append(dataset)
    eq_(len(datasets), 3)
def test_tags_with_commas_clean_tags_on(self):
    """With tag cleaning enabled, invalid keywords are sanitized."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    assert_true(self.VALID_TAG in datasets[0]['tags'])
    assert_true(self.INVALID_TAG not in datasets[0]['tags'])
def test_tags_with_commas_clean_tags_off(self):
    """With tag cleaning disabled, invalid keywords pass through unchanged."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    # when config flag is set to false, bad tags can happen
    datasets = list(p.datasets())
    assert_true(self.VALID_TAG not in datasets[0]['tags'])
    assert_true({'name': self.INVALID_TAG} in datasets[0]['tags'])
def test_dataset_version_adms(self):
    """adms:version maps onto the dataset's version field."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    g.add((dataset_ref, ADMS.version, Literal('2.3a')))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    dataset = list(p.datasets())[0]
    eq_(dataset['version'], u'2.3a')
def test_catalog_xml_rdf(self):
    """An RDF/XML catalog file parses into two datasets with expected contents."""
    contents = self._get_file_contents("catalog.rdf")
    p = RDFParser(profiles=["euro_dcat_ap"])
    p.parse(contents)

    datasets = list(p.datasets())
    eq_(len(datasets), 2)

    # Parser order is not guaranteed; pick the known dataset either way
    if datasets[0]["title"] == "Example dataset 1":
        dataset = datasets[0]
    else:
        dataset = datasets[1]
    eq_(dataset["title"], "Example dataset 1")
    eq_(len(dataset["resources"]), 3)
    eq_(len(dataset["tags"]), 2)
def test_catalog_xml_rdf(self):
    """An RDF/XML catalog file parses into two datasets with expected contents."""
    contents = self._get_file_contents('catalog.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)

    datasets = list(p.datasets())
    eq_(len(datasets), 2)

    # Parser order is not guaranteed; pick the known dataset either way
    if datasets[0]['title'] == 'Example dataset 1':
        dataset = datasets[0]
    else:
        dataset = datasets[1]
    eq_(dataset['title'], 'Example dataset 1')
    eq_(len(dataset['resources']), 3)
    eq_(len(dataset['tags']), 2)
def test_dataset_license_from_distribution_by_uri(self):
    """license_id is derived from the URI of the distribution's dct:license."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dataset_ref, DCAT.distribution, dist_ref))
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCT.license,
           URIRef("http://www.opendefinition.org/licenses/cc-by")))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    dataset = list(p.datasets())[0]
    eq_(dataset['license_id'], 'cc-by')
def test_spatial_uri_only(self):
    """A bare spatial URI yields spatial_uri but neither text nor geometry."""
    g = Graph()
    dataset = URIRef("http://example.org/datasets/1")
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef("http://geonames/Newark")
    g.add((dataset, DCT.spatial, spatial_uri))

    p = RDFParser(profiles=["euro_dcat_ap"])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    eq_(extras["spatial_uri"], "http://geonames/Newark")
    assert_true("spatial_text" not in extras)
    assert_true("spatial" not in extras)
def test_spatial_uri_only(self):
    """A bare spatial URI yields spatial_uri but neither text nor geometry."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    eq_(extras['spatial_uri'], 'http://geonames/Newark')
    assert_true('spatial_text' not in extras)
    assert_true('spatial' not in extras)
def test_dataset_license_from_distribution_by_title(self):
    """license_id is derived from the dct:title of the distribution's dct:license."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    # License given only as a titled blank node
    license = BNode()
    g.add((dist_ref, DCT.license, license))
    g.add((license, DCT.title, Literal("Creative Commons Attribution")))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    dataset = list(p.datasets())[0]
    eq_(dataset['license_id'], 'cc-by')
def test_dataset_show_without_format(self):
    """dcat_dataset_show with no format still returns a parseable serialization."""
    dataset = factories.Dataset(notes='Test dataset')
    content = helpers.call_action('dcat_dataset_show', id=dataset['id'])

    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content)
    dcat_datasets = list(p.datasets())
    eq_(len(dcat_datasets), 1)
    dcat_dataset = dcat_datasets[0]
    eq_(dcat_dataset['title'], dataset['title'])
    eq_(dcat_dataset['notes'], dataset['notes'])
def test_distribution_format_format_only(self):
    """A lone dct:format literal maps directly onto the resource format."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCT['format'], Literal('CSV')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    eq_(resource['format'], u'CSV')
def test_dataset_turtle_1(self):
    """A Turtle dataset file parses into one dataset with its CSV resource."""
    contents = self._get_file_contents('dataset_deri.ttl')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='n3')

    datasets = list(p.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]
    eq_(dataset['title'], 'Abandoned Vehicles')
    eq_(len(dataset['resources']), 1)

    resource = dataset['resources'][0]
    eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
    eq_(resource['url'],
        u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
    eq_(resource['uri'],
        u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
def test_spatial_rdfs_label(self):
    """rdfs:label on the Location node populates spatial_text."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))
    g.add((spatial_uri, RDF.type, DCT.Location))
    g.add((spatial_uri, RDFS.label, Literal('Newark')))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    eq_(extras['spatial_text'], 'Newark')
def _build_and_parse_format_mediatype_graph(self, format_item=None,
                                            mediatype_item=None):
    """ Creates a minimal graph with a distribution having the specified
    dct:format and dcat:mediaType nodes. At least one of those nodes has to
    be given. After creating the graph, it is parsed using the euro_dcat_ap
    profile.

    :param format_item: Literal or URIRef object for dct:format. None if
        the node should be omitted.
    :param mediatype_item: Literal or URIRef object for dcat:mediaType.
        None if the node should be omitted.

    :returns: The parsed resource dict
    """
    if format_item is None and mediatype_item is None:
        raise AssertionError(
            'At least one of format or mediaType is required!')

    g = Graph()
    dataset = URIRef("http://example.org/datasets/1")
    g.add((dataset, RDF.type, DCAT.Dataset))
    distribution = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dataset, DCAT.distribution, distribution))
    g.add((distribution, RDF.type, DCAT.Distribution))
    if format_item:
        g.add((distribution, DCT['format'], format_item))
    if mediatype_item:
        g.add((distribution, DCAT.mediaType, mediatype_item))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    parsed = list(p.datasets())[0]
    return parsed.get('resources')
def test_dataset_issued_with_year_before_1900(self):
    """Pre-1900 issued dates survive as (negative) epoch timestamps."""
    contents = self._get_file_contents('1894.xml')
    p = RDFParser(profiles=['swiss_dcat_ap'])
    p.parse(contents)

    datasets = list(p.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]

    # Check date values: issued is before the epoch, hence negative
    eq_(dataset['issued'], -2398377600)
    issued = datetime.fromtimestamp(dataset['issued'])
    eq_(issued.date().isoformat(), u'1893-12-31')

    eq_(dataset['modified'], 1524528000)
    modified = datetime.fromtimestamp(dataset['modified'])
    eq_(modified.date().isoformat(), u'2018-04-24')
def test_distribution_format_imt_and_format(self):
    """When both mediaType and format literals exist, both fields are filled."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCAT.mediaType, Literal('text/csv')))
    g.add((dist_ref, DCT['format'], Literal('CSV')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    eq_(resource['format'], u'CSV')
    eq_(resource['mimetype'], u'text/csv')
def test_distribution_download_url(self):
    """dcat:downloadURL fills both url and download_url on the resource."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCAT.downloadURL, Literal('http://download.url.org')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    eq_(resource['url'], u'http://download.url.org')
    eq_(resource['download_url'], u'http://download.url.org')
def _build_and_parse_format_mediatype_graph(self, format_item=None,
                                            mediatype_item=None):
    """Build a one-distribution graph with the given dct:format and/or
    dcat:mediaType nodes, parse it with the euro_dcat_ap + dcatap_de
    profiles, and return the resulting resource dicts.

    At least one of the two items must be provided.
    """
    if format_item is None and mediatype_item is None:
        raise AssertionError('At least one of format or mediaType is required!')

    g = Graph()
    dataset = URIRef("http://example.org/datasets/1")
    g.add((dataset, RDF.type, self.DCAT.Dataset))
    distribution = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dataset, self.DCAT.distribution, distribution))
    g.add((distribution, RDF.type, self.DCAT.Distribution))
    if format_item:
        g.add((distribution, self.DCT['format'], format_item))
    if mediatype_item:
        g.add((distribution, self.DCAT.mediaType, mediatype_item))

    p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])
    p.g = g

    parsed = list(p.datasets())[0]
    return parsed.get('resources')
def test_dataset_turtle_1(self):
    """A Turtle dataset file parses into one dataset with its CSV resource."""
    contents = self._get_file_contents('dataset_deri.ttl')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='n3')

    datasets = list(p.datasets())
    assert len(datasets) == 1
    dataset = datasets[0]
    assert dataset['title'] == 'Abandoned Vehicles'
    assert len(dataset['resources']) == 1

    resource = dataset['resources'][0]
    assert resource['name'] == u'CSV distribution of: Abandoned Vehicles'
    assert resource['url'] == (
        u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
    assert resource['uri'] == (
        u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
def test_catalog_ttl(self):
    """The catalog endpoint serializes every dataset to Turtle."""
    for _ in xrange(4):
        factories.Dataset()

    url = url_for('dcat_catalog', _format='ttl')
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers['Content-Type'], 'text/turtle')

    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(response.body, _format='turtle')
    dcat_datasets = list(p.datasets())
    eq_(len(dcat_datasets), 4)
def test_distribution_format_format_only_without_slash_normalize_false(self):
    """A non-IMT format literal is kept verbatim and sets no mimetype."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCT['format'], Literal('Comma Separated Values')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    assert resource['format'] == u'Comma Separated Values'
    assert 'mimetype' not in resource
def test_spatial_wkt_only(self):
    """A WKT-only geometry is converted to GeoJSON for the spatial extra.

    Fixes a broken assertion: the original read
    ``assert extras['spatial'], '...[67.0 == 89.0]}'`` — the ``==`` had been
    displaced into the string literal, turning the expected value into a mere
    assert *message*, so the test passed for any non-empty value.
    """
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))
    g.add((spatial_uri, RDF.type, DCT.Location))
    g.add((spatial_uri, LOCN.geometry,
           Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = [d for d in p.datasets()]
    extras = self._extras(datasets[0])
    # NOTE: geomet returns floats for coordinates on WKT -> GeoJSON
    eq_(extras['spatial'], '{"type": "Point", "coordinates": [67.0, 89.0]}')
def test_distribution_format_imt_only(self):
    """A lone dcat:mediaType drives format/mimetype, version-dependent."""
    g = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    g.add((dataset_ref, RDF.type, DCAT.Dataset))
    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    g.add((dist_ref, RDF.type, DCAT.Distribution))
    g.add((dist_ref, DCAT.mediaType, Literal('text/csv')))
    g.add((dataset_ref, DCAT.distribution, dist_ref))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    resource = list(p.datasets())[0]['resources'][0]
    if toolkit.check_ckan_version(min_version='2.3'):
        # Normalization against the format registry is only available >= 2.3
        assert resource['format'] == u'CSV'
        assert resource['mimetype'] == u'text/csv'
    else:
        assert resource['format'] == u'text/csv'
def test_dataset_compatibility_mode(self):
    """compatibility_mode emits legacy dcat_* extras instead of native keys."""
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)
    p.parse(contents)

    datasets = list(p.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]

    def _get_extra_value(key):
        # Return the first matching extra value, or None when absent
        v = [extra['value'] for extra in dataset['extras']
             if extra['key'] == key]
        return v[0] if v else None

    eq_(_get_extra_value('dcat_issued'), u'2012-05-10')
    eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('dcat_publisher_name'),
        'Publishing Organization for dataset 1')
    eq_(_get_extra_value('dcat_publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('language'), 'ca,en,es')
def test_spatial_wrong_geometries(self):
    """Unparseable GeoJSON and WKT geometries produce no spatial extra."""
    g = Graph()
    dataset = URIRef('http://example.org/datasets/1')
    g.add((dataset, RDF.type, DCAT.Dataset))
    spatial_uri = URIRef('http://geonames/Newark')
    g.add((dataset, DCT.spatial, spatial_uri))
    g.add((spatial_uri, RDF.type, DCT.Location))
    g.add((spatial_uri, LOCN.geometry,
           Literal('Not GeoJSON', datatype=GEOJSON_IMT)))
    g.add((spatial_uri, LOCN.geometry,
           Literal('Not WKT', datatype=GSP.wktLiteral)))

    p = RDFParser(profiles=['euro_dcat_ap'])
    p.g = g

    datasets = list(p.datasets())
    extras = self._extras(datasets[0])
    assert 'spatial' not in extras
def test_parse_dataset_default_lang_de(self):
    """With default language "de", German literals win for all core fields."""
    maxrdf = self._get_max_rdf()
    p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])
    p.parse(maxrdf)
    self._add_basic_fields_with_languages(p)

    datasets = list(p.datasets())
    self.assertEqual(len(datasets), 1)
    dataset = datasets[0]

    # Title and description to be in default language "de"
    self.assertEqual(dataset.get('title'),
                     u'Naturräume Geest und Marsch (DE)')
    self.assertEqual(
        dataset.get('notes'),
        u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt. (DE)')

    # Publisher and ContactPoint
    extras = dataset.get('extras')
    self.assertTrue(len(extras) > 0)
    self._assert_extras_string(
        extras, 'publisher_name',
        u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz (DE)')
    self._assert_extras_string(
        extras, 'contact_name', u'Herr Dr. Michael Schröder (DE)')

    # Resources
    self._assert_resource_lang(dataset, 'DE')
def test_conforms_to(self):
    """conforms_to entries survive a serialize/parse round trip intact."""
    conforms_to_in = [
        {'identifier': 'CONF1',
         'uri': 'http://conf01/abc',
         'title': {'en': 'title', 'it': 'title'},
         'referenceDocumentation': ['http://abc.efg/']},
        {'identifier': 'CONF2',
         'title': {'en': 'title', 'it': 'title'},
         'description': {'en': 'descen', 'it': 'descit'},
         'referenceDocumentation': ['http://abc.efg/']},
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'identifier': 'ISBN',
        'temporal_start': '2016-11-01',
        'temporal_end': '2016-11-30',
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': '{ECON,ENVI}',
        'geographical_geonames_url': 'http://www.geonames.org/3181913',
        'language': '{DEU,ENG,ITA}',
        'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
        'conforms_to': json.dumps(conforms_to_in),
    }

    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)

    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]

    conforms_to = dict((c['identifier'], c) for c in conforms_to_in)
    dataset_conforms_to = json.loads(d['conforms_to'])
    assert len(dataset_conforms_to) == len(conforms_to_in), \
        "got {}, should be {}".format(len(d['conforms_to']),
                                      len(conforms_to_in))

    for conf in dataset_conforms_to:
        check = conforms_to[conf['identifier']]
        # Every source value must come back unchanged
        for k, v in check.items():
            # there should be no empty uri
            if k == 'uri' and not v:
                assert conf.get(k) is None
            else:
                assert conf.get(k) == v
        # No spurious values may appear on the parsed side
        for k, v in conf.items():
            src_v = check.get(k)
            # ref may be extracted from rdf, but it can be
            # generated by serializer
            if not src_v and k == 'uri':
                continue
            # no value, may be missing key in source
            elif not src_v:
                assert not check.get(k)
            else:
                assert check[k] == v
def test_license(self):
    """Resources get the default license for invalid types and keep valid ones."""
    def get_path(fname):
        # Resolve files in the repository's examples/ directory
        return os.path.join(os.path.dirname(__file__),
                            '..', '..', '..', 'examples', fname)

    licenses = get_path('licenses.rdf')
    load_from_graph(path=licenses)
    Session.flush()

    dataset = {
        'title': 'some title',
        'id': 'sometitle',
        'resources': [
            {
                'id': 'resource/1111',
                'uri': 'http://resource/1111',
                'license_type': 'invalid',
            },
            {
                'id': 'resource/2222',
                'uri': 'http://resource/2222',
                'license_type': 'https://w3id.org/italia/controlled-vocabulary/licences/A311_GFDL13',
            },
        ],
    }

    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
    s = RDFSerializer()
    dataset_ref = s.graph_from_dataset(dataset)
    g = s.g
    r1 = URIRef(dataset['resources'][0]['uri'])
    r2 = URIRef(dataset['resources'][1]['uri'])

    # Invalid license type falls back to the default (unknown) license
    unknown = License.get(License.DEFAULT_LICENSE)
    license_ref = g.value(r1, DCT.license)
    assert license_ref is not None
    assert str(license_ref) == unknown.uri,\
        "got license {}, instead of {}".format(license_ref,
                                               unknown.license_type)

    # Valid license type is serialized with its document URI and type
    gpl = License.get(dataset['resources'][1]['license_type'])
    assert gpl is not None
    license_ref = g.value(r2, DCT.license)
    license_type = g.value(license_ref, DCT.type)
    assert license_ref is not None
    assert str(license_ref) == gpl.document_uri
    assert str(license_type) == gpl.license_type

    # Round-trip: parse the serialization back and re-check both resources
    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    resources = datasets[0]['resources']

    def _find_res(res_uri):
        for res in resources:
            if res_uri == res['uri']:
                return res
        raise ValueError("No resource for {}".format(res_uri))

    new_res_unknown = _find_res(str(r1))
    new_res_gpl = _find_res(str(r2))
    assert new_res_unknown['license_type'] == unknown.uri, \
        (new_res_unknown['license_type'], unknown.uri,)
    assert new_res_gpl['license_type'] == dataset['resources'][1]['license_type']
def test_mapping(self):
    """Exercise the dcatapit theme-to-group mapper end to end.

    Verifies that:
    - with no mapping file configured, themes are stored but no groups
      are assigned;
    - with a mapping file, only already-existing groups are attached;
    - with ADD_NEW_GROUPS enabled, missing groups are created and attached;
    - repeated updates do not produce duplicate group assignments.

    Fix: ``unicode(uuid.uuid4)`` stringified the *function object* rather
    than a fresh UUID; it must be called: ``unicode(uuid.uuid4())``.
    """
    # multilang requires lang to be set
    from pylons.i18n.translation import set_lang, get_lang
    import pylons

    class dummyreq(object):
        # Minimal stand-in for a pylons request so set_lang() works
        # outside a real request cycle.
        class p(object):
            translator = object()
        environ = {'pylons.pylons': p()}
    pylons.request = dummyreq()
    pylons.translator.pylons_lang = ['en_GB']
    set_lang('en_GB')
    assert get_lang() == ['en_GB']
    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], "No dcatapit_theme_group_mapper plugin in config"

    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    package_dict = datasets[0]

    # Ensure a user, an organization and one pre-existing group exist.
    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema
    _p = {'frequency': 'manual',
          'publisher_name': 'dummy',
          'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
          'groups': [],
          'title': 'dummy',
          'holder_name': 'dummy',
          'holder_identifier': 'dummy',
          'name': 'dummy',
          'notes': 'dummy',
          'owner_org': 'dummy',
          'modified': datetime.now(),
          'publisher_identifier': 'dummy',
          'metadata_created': datetime.now(),
          'metadata_modified': datetime.now(),
          # FIX: call uuid4() — previously the function object itself was
          # stringified, yielding "<function uuid4 ...>" as the guid.
          'guid': unicode(uuid.uuid4()),
          'identifier': 'dummy'}
    package_dict.update(_p)

    # No mapping configured: themes stored as extras, no groups assigned.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    package_data = call_action('package_create', context=context, **package_dict)
    p = Package.get(package_data['id'])
    # no groups should be assigned at this point (no map applied)
    assert {'theme': ['non-mappable', 'thememap1']} == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), 'should be {}, got {}'.format([], p.get_groups(group_type='group'))

    package_data = call_action('package_show', context=context, id=package_data['id'])

    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples', 'test_map.ini')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    package_dict['theme'] = ['non-mappable', 'thememap1']

    expected_groups_existing = ['existing-group']
    expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
    expected_groups_multi = expected_groups_new + ['othergroup']

    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p
    package_data = call_action('package_update', context=context, **package_dict)

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

    # Now allow the mapper to create missing groups.
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'
    package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
    assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

    package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    meta.Session.rollback()
def test_dataset_all_fields(self):
    """Parse the euro_dcat_ap fixture and check every mapped field.

    Covers basic package fields, tags, simple and list-typed extras,
    the dataset URI extra, and the single distribution's fields.
    """
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]

    # Basic fields
    eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
    eq_(
        dataset['notes'],
        u'During the period 1982-86 a team of geologists from the British Geological Survey ...'
    )
    eq_(dataset['url'], 'http://dataset.info.org')
    eq_(dataset['version'], '2.3')

    # Tags
    eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{
        'name': u'exploration'
    }, {
        'name': u'geochemistry'
    }, {
        'name': u'geology'
    }])

    # Extras
    def _get_extra_value(key):
        # First matching extra value, or None when the key is absent.
        v = [
            extra['value'] for extra in dataset['extras']
            if extra['key'] == key
        ]
        return v[0] if v else None

    def _get_extra_value_as_list(key):
        # List-typed extras are stored JSON-serialized.
        value = _get_extra_value(key)
        return json.loads(value) if value else []

    # Simple values
    eq_(_get_extra_value('issued'), u'2012-05-10')
    eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98')
    eq_(_get_extra_value('alternate_identifier'), u'alternate-identifier-x343')
    eq_(_get_extra_value('version_notes'), u'New schema added')
    eq_(_get_extra_value('temporal_start'), '1905-03-01')
    eq_(_get_extra_value('temporal_end'), '2013-01-05')
    eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
    eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE')
    eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org')
    eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1')
    eq_(_get_extra_value('publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('publisher_url'), 'http://some.org')
    eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
    eq_(_get_extra_value('contact_name'), 'Point of Contact')
    eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')

    # Lists
    eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en', u'es'])
    eq_(sorted(_get_extra_value_as_list('theme')), [
        u'Earth Sciences', u'http://eurovoc.europa.eu/100142',
        u'http://eurovoc.europa.eu/209065'
    ])
    eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2'])

    # Dataset URI
    eq_(
        _get_extra_value('uri'),
        u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98'
    )

    # Resources
    eq_(len(dataset['resources']), 1)
    resource = dataset['resources'][0]

    # Simple values
    eq_(resource['name'], u'Some website')
    eq_(resource['description'], u'A longer description')
    eq_(resource['format'], u'HTML')
    eq_(resource['mimetype'], u'text/html')
    eq_(resource['issued'], u'2012-05-11')
    eq_(resource['modified'], u'2012-05-01T00:04:06')
    eq_(resource['status'], u'http://purl.org/adms/status/Completed')
    # These two are likely to need clarification
    eq_(resource['license'], u'http://creativecommons.org/licenses/by/3.0/')
    eq_(resource['rights'], u'Some statement about rights')
    eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
    # accessURL-only distribution: no download_url must be derived
    assert 'download_url' not in resource
    eq_(resource['size'], 12323)

    # Distribution URI
    eq_(
        resource['uri'],
        u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1'
    )
def test_parse_dataset(self):
    """Parse the maximal DCAT-AP.de RDF fixture and verify all mappings.

    Checks dcatde:* vocabulary fields (maintainer, contributorID,
    political geocoding, legal basis, quality process), per-distribution
    extras, plus a sample of non-dcatde fields, groups, keywords,
    spatial coverage and the landing page.
    """
    maxrdf = self._get_max_rdf()
    p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])
    p.parse(maxrdf)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    dataset = datasets[0]
    extras = dataset.get('extras')
    self.assertTrue(len(extras) > 0)
    resources = dataset.get('resources')
    self.assertEqual(len(resources), 2)
    # identify resources to be independent of their order
    if u'Distribution 1' in resources[0].get('description'):
        dist1 = resources[0]
        dist2 = resources[1]
    else:
        dist1 = resources[1]
        dist2 = resources[0]
    # list values are serialized by parser
    # dcatde:maintainer
    self.assertEqual(dataset.get('maintainer'), u'Peter Schröder')
    self._assert_extras_string(extras, 'maintainer_contacttype', u'Person')
    # dcatde:contributorID
    self._assert_extras_list_serialized(
        extras, 'contributorID',
        ['http://dcat-ap.de/def/contributors/transparenzportalHamburg'])
    # dcatde:originator
    self._assert_extras_string(extras, 'originator_name', u'Peter Schröder originator')
    self._assert_extras_string(extras, 'originator_contacttype', u'Person')
    # dcatde:politicalGeocodingURI
    self._assert_extras_list_serialized(extras, 'politicalGeocodingURI', [
        'http://dcat-ap.de/def/politicalGeocoding/regionalKey/020000000000',
        'http://dcat-ap.de/def/politicalGeocoding/stateKey/02'
    ])
    # dcatde:politicalGeocodingLevelURI
    self._assert_extras_string(
        extras, 'politicalGeocodingLevelURI',
        'http://dcat-ap.de/def/politicalGeocoding/Level/state')
    # dcatde:legalbasisText
    self._assert_extras_list_serialized(extras, 'legalbasisText', ['Umweltinformationsgesetz (UIG)'])
    # dcatde:geocodingText
    self._assert_extras_list_serialized(extras, 'geocodingText', ['Hamburg'])
    # dcatde:qualityProcessURI
    self._assert_extras_string(extras, 'qualityProcessURI', 'https://www.example.com/')
    # resource checks
    self.assertEqual(
        dist1['__extras'].get('plannedAvailability'),
        'http://dcat-ap.de/def/plannedAvailability/experimental')
    self.assertEqual(
        dist1['__extras'].get('licenseAttributionByText'),
        u'Freie und Hansestadt Hamburg, Behörde für Umwelt und Energie, 2016'
    )
    self.assertEqual(dist1.get('license'), "http://dcat-ap.de/def/licenses/dl-by-de/2_0")
    self.assertEqual(dist1.get('size'), 685246)
    self.assertEqual(
        dist2['__extras'].get('plannedAvailability'),
        'http://dcat-ap.de/def/plannedAvailability/available')
    self.assertEqual(
        dist2['__extras'].get('licenseAttributionByText'),
        u'Freie und Hansestadt Hamburg, Behörde für Umwelt und Energie, 2015'
    )
    self.assertEqual(dist2.get('license'), "http://dcat-ap.de/def/licenses/dl-by-de/2_0")
    self.assertEqual(dist2.get('size'), 222441)
    # some non-dcatde fields
    self._assert_extras_list_serialized(
        extras, 'alternate_identifier',
        ['4635D337-4805-4C32-A211-13F8C038BF27'])
    # dcat:contactPoint
    self._assert_extras_string(extras, 'contact_email', u'*****@*****.**')
    self._assert_extras_string(extras, 'contact_name', u'Herr Dr. Michael Schröder')
    self._assert_extras_string(extras, 'maintainer_tel', u'+49 40 4 28 40 - 3494')
    self._assert_extras_string(extras, 'maintainer_street', u'Beispielstraße 4')
    self._assert_extras_string(extras, 'maintainer_city', u'Beispielort')
    self._assert_extras_string(extras, 'maintainer_zip', u'12345')
    self._assert_extras_string(extras, 'maintainer_country', u'DE')
    # Groups
    self.assertEqual(len(dataset['groups']), 2)
    self.assertTrue({'id': 'envi', 'name': 'envi'} in dataset['groups'])
    self.assertTrue({'id': 'agri', 'name': 'agri'} in dataset['groups'])
    # Keywords
    self._assert_tag_list(dataset, [
        u'Karte', u'hmbtg_09_geodaten', u'Grundwasser', u'Bodenschutz',
        u'Geodaten', u'Umwelt und Klima', u'hmbtg', u'opendata',
        u'Thematische Karte'
    ])
    # dct:location
    self._assert_extras_dict_serialized(
        extras, 'spatial', {
            "type": "Polygon",
            "coordinates": [[[10.3263, 53.3949], [10.3263, 53.9641],
                             [8.4205, 53.9641], [8.4205, 53.3949],
                             [10.3263, 53.3949]]]
        })
    # dcat:landingPage
    self._assert_extras_string(
        extras, 'metadata_original_html',
        'https://www.govdata.de/web/guest/daten/-/details/naturraume-geest-und-marsch3'
    )
def test_dataset_all_fields(self):
    """Parse the swiss_dcat_ap '1901' fixture and verify every field.

    Multilingual fields (title, description, keywords) are dicts keyed by
    language code; temporal values are Unix epoch integers (negative for
    pre-1970 dates).
    """
    contents = self._get_file_contents('1901.xml')
    p = RDFParser(profiles=['swiss_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]
    extras = self._extras(dataset)

    # Basic fields
    assert all(
        l in dataset['title']
        for l in ['de', 'fr', 'it', 'en']), "title contains all languages"
    eq_(dataset['title']['de'], u'Statistisches Jahrbuch der Schweiz 1901')
    eq_(dataset['title']['fr'], u'Annuaire statistique de la Suisse 1901')
    assert all(l in dataset['description']
               for l in ['de', 'fr', 'it', 'en'
                         ]), "description contains all languages"
    # Languages without a literal in the source are present but empty.
    eq_(dataset['description']['de'], u'')
    eq_(dataset['url'], u'https://www.bfs.admin.ch/bfs/de/home/statistiken.html')

    # Keywords
    assert all(l in dataset['keywords']
               for l in ['de', 'fr', 'it', 'en']), "keywords contains all languages"
    eq_(sorted(dataset['keywords']['de']),
        ['publikation', 'statistische-grundlagen-und-ubersichten'])
    eq_(sorted(dataset['keywords']['fr']),
        ['bases-statistiques-et-generalites', 'publication'])
    eq_(sorted(dataset['keywords']['it']),
        ['basi-statistiche-e-presentazioni-generali', 'pubblicazione'])
    eq_(sorted(dataset['keywords']['en']),
        ['publication', 'statistical-basis-and-overviews'])
    # Tags are the flattened, munged keywords of all languages.
    eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{
        'name': 'basas-statisticas-e-survistas'
    }, {
        'name': 'bases-statistiques-et-generalites'
    }, {
        'name': 'basi-statistiche-e-presentazioni-generali'
    }, {
        'name': 'pubblicazione'
    }, {
        'name': 'publication'
    }, {
        'name': 'publication'
    }, {
        'name': 'publikation'
    }, {
        'name': 'statistical-basis-and-overviews'
    }, {
        'name': 'statistische-grundlagen-und-ubersichten'
    }])

    # Simple values (epoch seconds; 1901 dates are negative)
    eq_(dataset['issued'], -2177539200)
    eq_(dataset['modified'], 1524528000)
    eq_(dataset['identifier'], u'346266@bundesamt-fur-statistik-bfs')
    eq_(dataset['spatial'], 'Schweiz')

    # Temporals
    temporal = dataset['temporals'][0]
    eq_(temporal['end_date'], -2146003200)
    end_date = datetime.fromtimestamp(temporal['end_date'])
    eq_(end_date.date().isoformat(), '1901-12-31')
    eq_(temporal['start_date'], -2177452800)
    start_date = datetime.fromtimestamp(temporal['start_date'])
    eq_(start_date.date().isoformat(), '1901-01-01')

    # Publisher
    publisher = dataset['publishers'][0]
    eq_(publisher['label'], 'BFS/OFS')

    # Contact points
    contact_point = dataset['contact_points'][0]
    eq_(contact_point['name'], '*****@*****.**')
    eq_(contact_point['email'], '*****@*****.**')

    # See alsos
    see_also = dataset['see_alsos'][0]
    eq_(see_also['dataset_identifier'], u'4682791@bundesamt-fur-statistik-bfs')

    # Lists
    eq_(sorted(dataset['language']), [u'de', u'fr'])
    eq_(sorted(dataset['groups']), [{'name': u'statistical-basis'}])

    # Dataset URI
    eq_(
        extras['uri'],
        u'https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585'
    )

    # Resources
    eq_(len(dataset['resources']), 1)
    resource = dataset['resources'][0]

    # Simple values
    assert all(l in resource['title']
               for l in ['de', 'fr', 'it', 'en'
                         ]), "resource title contains all languages"
    eq_(resource['title']['fr'], u'Annuaire statistique de la Suisse 1901')
    eq_(resource['title']['de'], u'')
    assert all(l in resource['description']
               for l in ['de', 'fr', 'it', 'en'
                         ]), "resource description contains all languages"
    eq_(resource['description']['de'], u'')
    eq_(resource['format'], u'HTML')
    eq_(resource['mimetype'], u'text/html')
    eq_(resource['media_type'], u'text/html')
    eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs')
    eq_(
        resource['rights'],
        u'NonCommercialAllowed-CommercialWithPermission-ReferenceRequired')
    eq_(resource['language'], [u'fr'])
    eq_(resource['issued'], -2177539200)
    eq_(resource['url'], u'https://www.bfs.admin.ch/asset/fr/hs-b-00.01-jb-1901')
    assert 'download_url' not in resource, "download_url not available on resource"

    # Distribution URI
    eq_(
        resource['uri'],
        u'https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585/resource/c8ec6ca0-6923-4cf3-92f2-95a10e6f8e25'
    )
def test_graph_to_dataset(self):
    """Parse the it_dcat_ap fixture and verify all mapped fields,
    including the normalized set-style fields (geographical_name,
    language), JSON-serialized fields (alternate_identifier, theme,
    conforms_to) and the DCATAPIT multilang payload.

    Fix: removed a dead assignment (``theme = dataset['theme']`` was
    immediately overwritten by the ``json.loads`` result).
    """
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]

    # Basic fields
    eq_(dataset['title'], u'Dataset di test DCAT_AP-IT')
    eq_(dataset['notes'], u'dcatapit dataset di test')

    # Simple values
    eq_(dataset['issued'], u'2016-11-29')
    eq_(dataset['modified'], u'2016-11-29')
    eq_(dataset['identifier'], u'ISBN')
    #eq_(dataset['temporal_start'], '2016-11-01')
    #eq_(dataset['temporal_end'], '2016-11-30')
    eq_(dataset['frequency'], 'UPDATE_CONT')

    # Normalize the '{A,B}' set-literal string so the comparison is
    # independent of element order.
    geographical_name = dataset['geographical_name'][1:-1].split(',') if ',' in dataset['geographical_name'] else [dataset['geographical_name']]
    geographical_name.sort()
    geographical_name = '{' + ','.join([str(x) for x in geographical_name]) + '}'
    eq_(geographical_name, '{ITA_BZO}')

    eq_(dataset['publisher_name'], 'bolzano it')
    eq_(dataset['publisher_identifier'], '234234234')
    eq_(dataset['creator_name'], 'test')
    eq_(dataset['creator_identifier'], '412946129')
    eq_(dataset['holder_name'], 'bolzano')
    eq_(dataset['holder_identifier'], '234234234')

    alternate_identifier = set([i['identifier'] for i in json.loads(dataset['alternate_identifier'])])
    eq_(alternate_identifier, set(['ISBN:123', 'TEST']))

    # Themes are stored JSON-serialized as a list of {'theme': ...} dicts.
    theme = json.loads(dataset['theme'])
    allowed_themes = ('ECON', 'ENVI',)
    assert theme, 'got {}'.format(dataset['theme'])
    for t in theme:
        assert t.get('theme') in allowed_themes, "themes {} not in {}".format(theme, allowed_themes)

    eq_(dataset['geographical_geonames_url'], 'http://www.geonames.org/3181913')

    # Same order-independent normalization as geographical_name above.
    language = dataset['language'][1:-1].split(',') if ',' in dataset['language'] else [dataset['language']]
    language.sort()
    language = '{' + ','.join([str(x) for x in language]) + '}'
    eq_(language, '{DEU,ENG,ITA}')

    eq_(dataset['is_version_of'], 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2')

    conforms_to = json.loads(dataset['conforms_to'])
    conforms_to_ids = set([c['identifier'] for c in conforms_to])
    eq_(conforms_to_ids, set('CONF1,CONF2,CONF3'.split(',')))

    # Multilang values
    ok_(dataset['DCATAPIT_MULTILANG_BASE'])

    multilang_notes = dataset['DCATAPIT_MULTILANG_BASE'].get('notes', None)
    ok_(multilang_notes)
    eq_(multilang_notes['de'], u'dcatapit test-dataset')
    eq_(multilang_notes['it'], u'dcatapit dataset di test')
    eq_(multilang_notes['en_GB'], u'dcatapit dataset test')

    multilang_holder_name = dataset['DCATAPIT_MULTILANG_BASE'].get('holder_name', None)
    ok_(multilang_holder_name)
    eq_(multilang_holder_name['de'], u'bolzano')
    eq_(multilang_holder_name['it'], u'bolzano')
    eq_(multilang_holder_name['en_GB'], u'bolzano')

    multilang_title = dataset['DCATAPIT_MULTILANG_BASE'].get('title', None)
    ok_(multilang_title)
    eq_(multilang_title['de'], u'Dcatapit Test-Dataset')
    eq_(multilang_title['it'], u'Dataset di test DCAT_AP-IT')
    eq_(multilang_title['en_GB'], u'DCAT_AP-IT test dataset')

    multilang_pub_name = dataset['DCATAPIT_MULTILANG_BASE'].get('publisher_name', None)
    ok_(multilang_pub_name)
    eq_(multilang_pub_name['en_GB'], u'bolzano en')
    eq_(multilang_pub_name['it'], u'bolzano it it')
def test_subthemes(self):
    """Round-trip aggregated themes/subthemes through serializer+parser.

    Serializes a dataset carrying FIELD_THEMES_AGGREGATE (AGRI with two
    EuroVoc subthemes, ENVI with none) and checks that both the flat
    'theme' extra and the aggregated structure survive the round trip.

    Fix: replaced deprecated ``assertEquals`` (removed in Python 3.12)
    with ``assertEqual``.
    """
    load_themes()
    subthemes = [{
        'theme': 'AGRI',
        'subthemes': [
            'http://eurovoc.europa.eu/100253',
            'http://eurovoc.europa.eu/100258'
        ]
    }, {
        'theme': 'ENVI',
        'subthemes': []
    }]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{
            'name': 'Tag 1'
        }, {
            'name': 'Tag 2'
        }],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        FIELD_THEMES_AGGREGATE: json.dumps(subthemes),
        'theme': theme_aggr_to_theme_uris(
            subthemes
        )  # this is added dinamically when retrieving datasets from the db
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    parsed_dataset = datasets[0]

    # test themes
    parsed_themes_raw = _get_extra_value(parsed_dataset.get('extras'), 'theme')
    self.assertIsNotNone(
        parsed_themes_raw,
        f'Themes not found in parsed dataset {parsed_dataset}')
    parsed_themes = json.loads(parsed_themes_raw)
    self.assertEqual(2, len(parsed_themes))
    self.assertSetEqual(set(theme_names_to_uris(['AGRI', 'ENVI'])),
                        set(parsed_themes))

    # test aggregated themes
    parsed_aggr_raw = parsed_dataset.get(FIELD_THEMES_AGGREGATE, None)
    self.assertIsNotNone(
        parsed_aggr_raw,
        f'Aggregated themes not found in parsed dataset {parsed_dataset}')
    parsed_aggr = json.loads(parsed_aggr_raw)
    self.assertIsNotNone(parsed_aggr, 'Aggregate is None')
    # assertEqual (not the removed assertEquals alias)
    self.assertEqual(2, len(parsed_aggr))
    for t in parsed_aggr:
        if t['theme'] == 'ENVI':
            self.assertSetEqual(set([]), set(t['subthemes']))
        elif t['theme'] == 'AGRI':
            self.assertSetEqual(set(subthemes[0]['subthemes']),
                                set(t['subthemes']))
        else:
            self.fail(f'Unknown theme: {t}')
def test_temporal_coverage(self):
    """Round-trip multiple dct:temporal intervals through the serializer.

    The dataset carries both the legacy single temporal_start/temporal_end
    pair and a JSON 'temporal_coverage' list; after the round trip the
    combined coverage must pass the dcatapit_temporal_coverage validator.

    Fix: ``except validators.Invalid, err:`` is Python-2-only syntax and a
    SyntaxError on Python 3; use ``except ... as err``.
    """
    load_themes()
    temporal_coverage = [
        {
            'temporal_start': '2001-01-01T00:00:00',
            'temporal_end': '2001-02-01T10:11:12'
        },
        {
            'temporal_start': '2001-01-01T00:00:00',
            'temporal_end': '2001-02-01T10:11:12'
        },
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{
            'name': 'Tag 1'
        }, {
            'name': 'Tag 2'
        }],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'identifier': 'ISBN',
        'temporal_start': '2016-11-01T00:00:00',
        'temporal_end': '2016-11-30T00:00:00',
        'temporal_coverage': json.dumps(temporal_coverage),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': '{ECON,ENVI}',
        'geographical_geonames_url': 'http://www.geonames.org/3181913',
        'language': '{DEU,ENG,ITA}',
        'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]

    # The legacy start/end pair is expected to be merged into the
    # temporal_coverage list on parse.
    temporal_coverage.append({
        'temporal_start': dataset['temporal_start'],
        'temporal_end': dataset['temporal_end']
    })
    try:
        # this should not raise an exception
        validators.dcatapit_temporal_coverage(d['temporal_coverage'], {})
    except validators.Invalid as err:
        assert False, "Temporal coverage should be valid: {}".format(err)
def test_creators(self):
    """Round-trip multiple dct:creator agents through the serializer.

    The dataset declares two creators as a JSON 'creator' list plus the
    legacy flat creator_name/creator_identifier pair; after serialization
    and re-parsing, all three must be present with their multilang names
    intact.
    """
    creators = [
        {
            'creator_name': {
                DEFAULT_LANG: 'abc',
                'it': 'abc it'
            },
            'creator_identifier': "ABC"
        },
        {
            'creator_name': {
                DEFAULT_LANG: 'cde',
                'it': 'cde it'
            },
            'creator_identifier': "CDE"
        },
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{
            'name': 'Tag 1'
        }, {
            'name': 'Tag 2'
        }],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'identifier': 'ISBN',
        'temporal_start': '2016-11-01',
        'temporal_end': '2016-11-30',
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': '{ECON,ENVI}',
        'geographical_geonames_url': 'http://www.geonames.org/3181913',
        'language': '{DEU,ENG,ITA}',
        'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
        'creator': json.dumps(creators)
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]

    # The flat creator_name/creator_identifier pair is serialized as a
    # third creator, so add it to the expected set.
    creators.append({
        'creator_identifier': dataset['creator_identifier'],
        'creator_name': {
            DEFAULT_LANG: dataset['creator_name']
        }
    })
    creators_dict = dict((v['creator_identifier'], v) for v in creators)

    creators_in = json.loads(d['creator'])

    # Every parsed creator must be expected, with matching names...
    for c in creators_in:
        assert c['creator_identifier'] in creators_dict.keys(
        ), "no {} key in {}".format(c['creator_identifier'], creators_dict.keys())
        assert c['creator_name'] == creators_dict[c['creator_identifier']]['creator_name'],\
            "{} vs {}".format(c['creator_name'], creators_dict[c['creator_identifier']]['creator_name'])
    # ...and every expected creator must have been parsed back.
    for c in creators_dict.keys():
        assert c in [_c['creator_identifier'] for _c in creators_in]
        cdata = creators_dict[c]
        assert cdata in creators_in
def test_dataset_all_fields(self):
    """Parse the euro_dcat_ap fixture and check every mapped field
    (assert-style variant, including access_rights/provenance/dcat_type,
    checksum fields and per-distribution list extras).

    Fix: three assertions had a misplaced ``==`` inside the expected list
    (``assert sorted(...), [u'a', u'b' == u'c']``), which asserts a truthy
    list with a nonsense message instead of comparing anything. They now
    perform real equality checks for 'language', 'conforms_to' and
    'alternate_identifier'.
    """
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    dataset = datasets[0]

    # Basic fields
    assert dataset['title'] == u'Zimbabwe Regional Geochemical Survey.'
    assert dataset[
        'notes'] == u'During the period 1982-86 a team of geologists from the British Geological Survey ...'
    assert dataset['url'] == 'http://dataset.info.org'
    assert dataset['version'] == '2.3'
    assert dataset['license_id'] == 'cc-nc'

    # Tags
    assert (sorted(dataset['tags'], key=lambda k: k['name']) == [{
        'name': u'exploration'
    }, {
        'name': u'geochemistry'
    }, {
        'name': u'geology'
    }])

    # Extras
    def _get_extra_value(key):
        # First matching extra value, or None when the key is absent.
        v = [
            extra['value'] for extra in dataset['extras']
            if extra['key'] == key
        ]
        return v[0] if v else None

    def _get_extra_value_as_list(key):
        # List-typed extras are stored JSON-serialized.
        value = _get_extra_value(key)
        return json.loads(value) if value else []

    # Simple values
    assert _get_extra_value('issued') == u'2012-05-10'
    assert _get_extra_value('modified') == u'2012-05-10T21:04:00'
    assert _get_extra_value(
        'identifier') == u'9df8df51-63db-37a8-e044-0003ba9b0d98'
    assert _get_extra_value('version_notes') == u'New schema added'
    assert _get_extra_value('temporal_start') == '1905-03-01'
    assert _get_extra_value('temporal_end') == '2013-01-05'
    assert _get_extra_value(
        'frequency') == 'http://purl.org/cld/freq/daily'
    assert _get_extra_value(
        'spatial_uri'
    ) == 'http://publications.europa.eu/mdr/authority/country/ZWE'
    assert _get_extra_value(
        'publisher_uri') == 'http://orgs.vocab.org/some-org'
    assert _get_extra_value(
        'publisher_name') == 'Publishing Organization for dataset 1'
    assert _get_extra_value('publisher_email') == '*****@*****.**'
    assert _get_extra_value('publisher_url') == 'http://some.org'
    assert _get_extra_value(
        'publisher_type'
    ) == 'http://purl.org/adms/publishertype/NonProfitOrganisation'
    assert _get_extra_value('contact_name') == 'Point of Contact'
    # mailto gets removed for storage and is added again on output
    assert _get_extra_value('contact_email') == '*****@*****.**'
    assert _get_extra_value('access_rights') == 'public'
    assert _get_extra_value(
        'provenance') == 'Some statement about provenance'
    assert _get_extra_value('dcat_type') == 'test-type'

    # Lists
    # FIX: was `assert sorted(...), [u'ca', u'en' == u'es']` — vacuous.
    assert sorted(_get_extra_value_as_list('language')) == [u'ca', u'en', u'es']
    assert (sorted(_get_extra_value_as_list('theme')) == [
        u'Earth Sciences', u'http://eurovoc.europa.eu/100142',
        u'http://eurovoc.europa.eu/209065'
    ])
    # FIX: was `assert sorted(...), [u'Standard 1' == u'Standard 2']`.
    assert sorted(_get_extra_value_as_list('conforms_to')) == [
        u'Standard 1', u'Standard 2'
    ]
    # FIX: same misplaced `==` pattern.
    assert sorted(_get_extra_value_as_list('alternate_identifier')) == [
        u'alternate-identifier-1', u'alternate-identifier-2'
    ]
    assert (sorted(_get_extra_value_as_list('documentation')) == [
        u'http://dataset.info.org/doc1', u'http://dataset.info.org/doc2'
    ])
    assert (sorted(_get_extra_value_as_list('related_resource')) == [
        u'http://dataset.info.org/related1',
        u'http://dataset.info.org/related2'
    ])
    assert (sorted(_get_extra_value_as_list('has_version')) == [
        u'https://data.some.org/catalog/datasets/derived-dataset-1',
        u'https://data.some.org/catalog/datasets/derived-dataset-2'
    ])
    assert sorted(_get_extra_value_as_list('is_version_of')) == [
        u'https://data.some.org/catalog/datasets/original-dataset'
    ]
    assert (sorted(_get_extra_value_as_list('source')) == [
        u'https://data.some.org/catalog/datasets/source-dataset-1',
        u'https://data.some.org/catalog/datasets/source-dataset-2'
    ])
    assert sorted(_get_extra_value_as_list('sample')) == [
        u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample'
    ]

    # Dataset URI
    assert _get_extra_value(
        'uri'
    ) == u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98'

    # Resources
    assert len(dataset['resources']) == 1
    resource = dataset['resources'][0]

    # Simple values
    assert resource['name'] == u'Some website'
    assert resource['description'] == u'A longer description'
    assert resource['format'] == u'HTML'
    assert resource['mimetype'] == u'text/html'
    assert resource['issued'] == u'2012-05-11'
    assert resource['modified'] == u'2012-05-01T00:04:06'
    assert resource['status'] == u'http://purl.org/adms/status/Completed'
    assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a'
    assert resource[
        'hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1'

    # Lists
    for item in [
        ('documentation', [
            u'http://dataset.info.org/distribution1/doc1',
            u'http://dataset.info.org/distribution1/doc2'
        ]),
        ('language', [u'ca', u'en', u'es']),
        ('conforms_to', [u'Standard 1', u'Standard 2']),
    ]:
        assert sorted(json.loads(resource[item[0]])) == item[1]

    # These two are likely to need clarification
    assert resource[
        'license'] == u'http://creativecommons.org/licenses/by-nc/2.0/'
    assert resource['rights'] == u'Some statement about rights'
    assert resource[
        'url'] == u'http://www.bgs.ac.uk/gbase/geochemcd/home.html'
    # accessURL-only distribution: no download_url must be derived
    assert 'download_url' not in resource
    assert resource['size'] == 12323

    # Distribution URI
    assert resource[
        'uri'] == u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1'
def test_parse_subcatalog(self):
    """Serialize a catalog containing a dataset harvested from a
    subcatalog, then verify the dct:hasPart subcatalog structure is
    emitted and that parsed datasets keep the source_catalog_homepage
    extra pointing at their subcatalog node.
    """
    publisher = {
        'name': 'Publisher',
        'email': '*****@*****.**',
        'type': 'Publisher',
        'uri': 'http://pub.lish.er'
    }
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'test dataset',
        # source_catalog_* extras describe the upstream catalog this
        # dataset was harvested from; the serializer turns them into a
        # dct:hasPart subcatalog node.
        'extras': [{
            'key': 'source_catalog_title',
            'value': 'Subcatalog example'
        }, {
            'key': 'source_catalog_homepage',
            'value': 'http://subcatalog.example'
        }, {
            'key': 'source_catalog_description',
            'value': 'Subcatalog example description'
        }, {
            'key': 'source_catalog_language',
            'value': 'http://publications.europa.eu/resource/authority/language/ITA'
        }, {
            'key': 'source_catalog_modified',
            'value': '2000-01-01'
        }, {
            'key': 'source_catalog_publisher',
            'value': json.dumps(publisher)
        }]
    }
    catalog_dict = {
        'title': 'My Catalog',
        'description': 'An Open Data Catalog',
        'homepage': 'http://example.com',
        'language': 'de',
    }

    s = RDFSerializer()
    s.serialize_catalog(catalog_dict, dataset_dicts=[dataset])
    g = s.g

    p = RDFParser(profiles=['euro_dcat_ap'])
    # Feed the serializer's graph straight into the parser.
    p.g = g

    # at least one subcatalog with hasPart
    subcatalogs = list(p.g.objects(None, DCT.hasPart))
    assert subcatalogs

    # at least one dataset in subcatalogs
    subdatasets = []
    for subcatalog in subcatalogs:
        # NOTE(review): the loop variables below shadow the outer
        # `dataset` dict and rebind `datasets`; harmless here because
        # both names are reassigned before further use, but fragile.
        datasets = p.g.objects(subcatalog, DCAT.dataset)
        for dataset in datasets:
            subdatasets.append((
                dataset,
                subcatalog,
            ))
    assert subdatasets

    datasets = dict([(d['title'], d) for d in p.datasets()])

    for subdataset, subcatalog in subdatasets:
        title = str(list(p.g.objects(subdataset, DCT.title))[0])
        dataset = datasets[title]
        has_subcat = False
        for ex in dataset['extras']:
            exval = ex['value']
            exkey = ex['key']
            if exkey == 'source_catalog_homepage':
                has_subcat = True
                # The extra must point at the subcatalog node URI.
                assert exval == str(subcatalog)
        # check if we had subcatalog in extras
        assert has_subcat
def gather_stage(self, harvest_job):
    """Download the remote RDF document (following pagination), parse it,
    and create one HarvestObject per dataset found.

    Returns the list of HarvestObject ids to fetch/import, including
    objects marked for deletion for datasets gone from the source.
    Plugin hooks (IDCATRDFHarvester) run before/after download and after
    parsing; any hook returning a falsy URL/content/parser aborts the run.
    """
    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(
            harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []
    last_content_hash = None
    # Track names assigned during this run so paginated pages don't
    # produce duplicate dataset names.
    self._names_taken = []

    while next_page_url:

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(
                next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(
            next_page_url, harvest_job, 1, content_type=rdf_format)

        # Hash the page content: identical consecutive pages indicate a
        # broken paginated endpoint, so the loop breaks instead of spinning.
        content_hash = hashlib.md5()
        if content:
            if six.PY2:
                content_hash.update(content)
            else:
                # md5 needs bytes on Python 3
                content_hash.update(content.encode('utf8'))

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning(
                    'Remote content was the same even when using a paginated URL, skipping'
                )
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            parser, after_parsing_errors = harvester.after_parsing(
                parser, harvest_job)

            for error_msg in after_parsing_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not parser:
            return []

        try:
            source_dataset = model.Package.get(harvest_job.source.id)

            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])
                if dataset['name'] in self._names_taken:
                    # Disambiguate with a numeric suffix based on how many
                    # suffixed variants of this name already exist.
                    suffix = len([
                        i for i in self._names_taken
                        if i.startswith(dataset['name'] + '-')
                    ]) + 1
                    dataset['name'] = '{}-{}'.format(
                        dataset['name'], suffix)
                self._names_taken.append(dataset['name'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset, source_url=source_dataset.url)
                if not guid:
                    self._save_gather_error(
                        'Could not get a unique identifier for dataset: {0}'
                        .format(dataset), harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid, job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)
        except Exception as e:
            self._save_gather_error(
                'Error when processsing dataset: %r / %s'
                % (e, traceback.format_exc()), harvest_job)
            return []

        # get the next page
        next_page_url = parser.next_page()

    # Check if some datasets need to be deleted
    object_ids_to_delete = self._mark_datasets_for_deletion(
        guids_in_source, harvest_job)

    object_ids.extend(object_ids_to_delete)

    return object_ids
def gather_stage(self, harvest_job):
    """Download the remote RDF document (following pagination), parse it,
    and create one HarvestObject per dataset found.

    Returns the list of ids of the created HarvestObjects for the
    fetch/import stages.
    """
    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(
            harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []
    last_content_hash = None

    while next_page_url:

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(
                next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(
            next_page_url, harvest_job, 1, content_type=rdf_format)

        # Hash the page content: identical consecutive pages indicate a
        # broken paginated endpoint, so the loop breaks instead of spinning.
        content_hash = hashlib.md5()
        if content:
            # md5 needs bytes; harvested content may arrive as text on
            # Python 3, and may be None on a failed download (guard above).
            content_hash.update(
                content if isinstance(content, bytes)
                else content.encode('utf8'))

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning(
                    'Remote content was the same even when using a paginated URL, skipping'
                )
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        # "except X as e" works on Python 2.6+ and 3; the old
        # "except X, e" form is a SyntaxError on Python 3.
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for dataset in parser.datasets():
            if not dataset.get('name'):
                dataset['name'] = self._gen_new_name(dataset['title'])

            # Unless already set by the parser, get the owner organization (if any)
            # from the harvest source dataset
            if not dataset.get('owner_org'):
                source_dataset = model.Package.get(harvest_job.source.id)
                if source_dataset.owner_org:
                    dataset['owner_org'] = source_dataset.owner_org

            # Try to get a unique identifier for the harvested dataset
            guid = self._get_guid(dataset)
            if not guid:
                self._save_gather_error(
                    'Could not get a unique identifier for dataset: {0}'.
                    format(dataset), harvest_job)
                continue

            dataset['extras'].append({'key': 'guid', 'value': guid})
            guids_in_source.append(guid)

            obj = HarvestObject(guid=guid, job=harvest_job,
                                content=json.dumps(dataset))

            obj.save()
            object_ids.append(obj.id)

        # get the next page
        next_page_url = parser.next_page()

    # Return the ids of the created objects so the fetch stage can run;
    # falling off the end would implicitly return None.
    return object_ids
def test_theme_to_group_mapping(self):
    """End-to-end check of the theme->group mapper: with no map configured
    no groups are assigned; with a map file only existing groups are
    assigned unless ADD_NEW_GROUPS is enabled; repeated updates must not
    produce duplicate group assignments.
    """
    # multilang requires lang to be set
    # class dummyreq(object):
    #     class p(object):
    #         translator = object()
    #     environ = {'pylons.pylons': p()}
    # CKANRequest(dummyreq)
    # pylons.request = dummyreq()
    # pylons.translator.pylons_lang = ['en_GB']
    #set_lang('en_GB')
    #assert get_lang() == ['en_GB']
    assert 'dcatapit_theme_group_mapper' in config[
        'ckan.plugins'], 'No dcatapit_theme_group_mapper plugin in config'
    # Parse a sample RDF file into exactly one dataset dict.
    with open(get_example_file('dataset.rdf'), 'r') as f:
        contents = f.read()

    p = RDFParser(profiles=['it_dcat_ap'])

    p.parse(contents)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    package_dict = datasets[0]

    # Ensure the fixture user, organization and an existing group are
    # present (created lazily so the test can be re-run).
    user = User.get('dummy')

    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******',
               'ignore_auth': True,
               'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema
    # Minimal dcatapit package payload; theme extra carries one mappable
    # ('thememap1') and one non-mappable value.
    _p = {
        'frequency': 'manual',
        'publisher_name': 'dummy',
        'extras': [{
            'key': 'theme',
            'value': ['non-mappable', 'thememap1']
        }],
        'groups': [],  # [{'name':existing_g.name}],
        'title': 'dummy',
        'holder_name': 'dummy',
        'holder_identifier': 'dummy',
        'name': 'dummy-' + uuid4().hex,
        'identifier': 'dummy' + uuid4().hex,
        'notes': 'dummy',
        'owner_org': 'dummy',
        'modified': datetime.now(),
        'publisher_identifier': 'dummy',
        'metadata_created': datetime.now(),
        'metadata_modified': datetime.now(),
        'guid': str(uuid.uuid4),
    }

    package_dict.update(_p)

    # Phase 1: no mapping file configured, no group auto-creation.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    package_data = call_action('package_create',
                               context=context,
                               **package_dict)

    p = Package.get(package_data['id'])

    # no groups should be assigned at this point (no map applied)
    assert {
        'theme': ['non-mappable', 'thememap1']
    } == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(
        group_type='group'), 'should be {}, got {}'.format(
            [], p.get_groups(group_type='group'))

    package_data = call_action('package_show',
                               context=context,
                               id=package_data['id'])

    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                 '..', 'examples', 'test_map.ini')

    # Phase 2: mapping file configured but new groups are NOT created.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p
    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to existing group, and 2 other unexisting groups that will not be created
    expected_groups = ['existing-group']
    self.assertSetEqual(set(expected_groups), set(groups),
                        'Error in assigned groups')

    # Phase 3: same map, but now missing groups get auto-created.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to existing group and 2 other groups that have been automatically created
    expected_groups = expected_groups + ['somegroup1', 'somegroup2']
    self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

    # Phase 4: add a multi-target theme and verify no duplicate groups.
    package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
    aggr.append({'theme': 'thememap-multi', 'subthemes': []})
    package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # added theme 'thememap-multi', that maps to 'othergroup' and other already exisintg groups
    expected_groups = expected_groups + ['othergroup']
    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups),
                        'New groups differ')

    # Phase 5: an identical update must be idempotent w.r.t. groups.
    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups),
                        'New groups differ')

    meta.Session.rollback()