def test_spatial_multiple_dct_spatial_instances(self):
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef("http://geonames/Newark")
        g.add((dataset, DCT.spatial, spatial_uri))

        location_ref = BNode()
        g.add((location_ref, RDF.type, DCT.Location))
        g.add((dataset, DCT.spatial, location_ref))
        g.add(
            (location_ref, LOCN.geometry, Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT))
        )

        location_ref = BNode()
        g.add((location_ref, RDF.type, DCT.Location))
        g.add((dataset, DCT.spatial, location_ref))
        g.add((location_ref, SKOS.prefLabel, Literal("Newark")))

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        eq_(extras["spatial_uri"], "http://geonames/Newark")
        eq_(extras["spatial_text"], "Newark")
        eq_(extras["spatial"], '{"type": "Point", "coordinates": [23, 45]}')
    def test_dataset_ttl(self):

        dataset = factories.Dataset(
            notes='Test dataset'
        )

        url = url_for('dcat_dataset', _id=dataset['id'], _format='ttl')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'text/turtle')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
    def test_dataset_ttl(self):

        dataset = factories.Dataset(notes="Test dataset")

        url = url_for("dcat_dataset", _id=dataset["id"], _format="ttl")

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers["Content-Type"], "text/turtle")

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format="turtle")

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset["title"], dataset["title"])
        eq_(dcat_dataset["notes"], dataset["notes"])
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents("catalog_pod.jsonld")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents, _format="json-ld")

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = dict((e["key"], e["value"]) for e in dataset["extras"])

        eq_(dataset["title"], "U.S. Widget Manufacturing Statistics")

        eq_(extras["contact_name"], "Jane Doe")
        eq_(extras["contact_email"], "mailto:[email protected]")
        eq_(extras["publisher_name"], "Widget Services")
        eq_(extras["publisher_email"], "*****@*****.**")

        eq_(len(dataset["resources"]), 4)

        resource = [r for r in dataset["resources"] if r["name"] == "widgets.csv"][0]
        eq_(resource["name"], u"widgets.csv")
        eq_(resource["url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
        eq_(resource["download_url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
    def test_datasets_none_found(self):

        p = RDFParser()

        p.g = Graph()

        eq_(len([d for d in p.datasets()]), 0)
    def test_distribution_format_format_normalized(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCAT.mediaType, Literal('text/csv')))
        g.add((distribution1_1, DCT['format'], Literal('Comma Separated Values')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        if toolkit.check_ckan_version(min_version='2.3'):
            eq_(resource['format'], u'CSV')
            eq_(resource['mimetype'], u'text/csv')
        else:
            eq_(resource['format'], u'Comma Separated Values')
    def test_dataset_json_ld_1(self):

        contents = self._get_file_contents('catalog_pod.jsonld')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='json-ld')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = dict((e['key'], e['value']) for e in dataset['extras'])

        eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')

        eq_(extras['contact_name'], 'Jane Doe')
        eq_(extras['contact_email'], 'mailto:[email protected]')
        eq_(extras['publisher_name'], 'Widget Services')
        eq_(extras['publisher_email'], '*****@*****.**')

        eq_(len(dataset['resources']), 4)

        resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0]
        eq_(resource['name'], u'widgets.csv')
        eq_(resource['url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
        eq_(resource['download_url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
    def test_distribution_format_IMT_field(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")

        imt = BNode()

        g.add((imt, RDF.type, DCT.IMT))
        g.add((imt, RDF.value, Literal('text/turtle')))
        g.add((imt, RDFS.label, Literal('Turtle')))

        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCT['format'], imt))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        eq_(resource['format'], u'Turtle')
        eq_(resource['mimetype'], u'text/turtle')
    def test_spatial_one_dct_spatial_instance_no_uri(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        location_ref = BNode()
        g.add((dataset, DCT.spatial, location_ref))

        g.add((location_ref, RDF.type, DCT.Location))
        g.add((location_ref,
               LOCN.geometry,
               Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT)))
        g.add((location_ref, SKOS.prefLabel, Literal('Newark')))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        assert_true('spatial_uri' not in extras)
        eq_(extras['spatial_text'], 'Newark')
        eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
    def test_spatial_both_geojson_and_wkt(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))

        g.add((spatial_uri, RDF.type, DCT.Location))
        g.add((spatial_uri,
               LOCN.geometry,
               Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT)))
        g.add((spatial_uri,
               LOCN.geometry,
               Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
    def test_spatial_wrong_geometries(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))

        g.add((spatial_uri, RDF.type, DCT.Location))
        g.add((spatial_uri,
               LOCN.geometry,
               Literal('Not GeoJSON', datatype=GEOJSON_IMT)))
        g.add((spatial_uri,
               LOCN.geometry,
               Literal('Not WKT', datatype=GSP.wktLiteral)))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        assert_true('spatial' not in extras)
    def test_catalog_modified_date(self):

        dataset1 = factories.Dataset(title='First dataset')
        time.sleep(1)
        dataset2 = factories.Dataset(title='Second dataset')

        url = url_for('dcat_catalog',
                      _format='ttl',
                      modified_since=dataset2['metadata_modified'])

        app = self._get_test_app()

        response = app.get(url)

        content = response.body

        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        eq_(dcat_datasets[0]['title'], dataset2['title'])
    def test_distribution_format_format_normalized(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCAT.mediaType, Literal("text/csv")))
        g.add((distribution1_1, DCT["format"], Literal("Comma Separated Values")))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]["resources"][0]

        if toolkit.check_ckan_version(min_version="2.3"):
            eq_(resource["format"], u"CSV")
            eq_(resource["mimetype"], u"text/csv")
        else:
            eq_(resource["format"], u"Comma Separated Values")
    def test_parse_subcatalog(self):
        publisher = {'name': 'Publisher',
                     'email': '*****@*****.**',
                     'type': 'Publisher',
                     'uri': 'http://pub.lish.er'}
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'test dataset',
            'extras': [
                {'key': 'source_catalog_title', 'value': 'Subcatalog example'},
                {'key': 'source_catalog_homepage', 'value': 'http://subcatalog.example'},
                {'key': 'source_catalog_description', 'value': 'Subcatalog example description'},
                {'key': 'source_catalog_language', 'value': 'http://publications.europa.eu/resource/authority/language/ITA'},
                {'key': 'source_catalog_modified', 'value': '2000-01-01'},
                {'key': 'source_catalog_publisher', 'value': json.dumps(publisher)}
            ]
        }        
        catalog_dict = {
            'title': 'My Catalog',
            'description': 'An Open Data Catalog',
            'homepage': 'http://example.com',
            'language': 'de',
        }

        s = RDFSerializer()
        s.serialize_catalog(catalog_dict, dataset_dicts=[dataset])
        g = s.g

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        # at least one subcatalog with hasPart
        subcatalogs = list(p.g.objects(None, DCT.hasPart))
        assert_true(subcatalogs)

        # at least one dataset in subcatalogs
        subdatasets = []
        for subcatalog in subcatalogs:
            datasets = p.g.objects(subcatalog, DCAT.dataset)
            for dataset in datasets:
                subdatasets.append((dataset, subcatalog))
        assert_true(subdatasets)
        
        datasets = dict([(d['title'], d) for d in p.datasets()])

        for subdataset, subcatalog in subdatasets:
            title = unicode(list(p.g.objects(subdataset, DCT.title))[0])
            dataset = datasets[title]
            has_subcat = False
            for ex in dataset['extras']:
                exval = ex['value']
                exkey = ex['key']
                if exkey == 'source_catalog_homepage':
                    has_subcat = True
                    eq_(exval, unicode(subcatalog))
            # check if we had subcatalog in extras
            assert_true(has_subcat)
    def test_profiles_are_called_on_datasets(self):

        p = RDFParser()

        p._profiles = [MockRDFProfile1, MockRDFProfile2]

        p.g = _default_graph()

        for dataset in p.datasets():
            assert dataset['profile_1']
            assert dataset['profile_2']
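
# The MockRDFProfile1/MockRDFProfile2 classes used above are not part of this
# listing. A hypothetical sketch, assuming they follow the ckanext-dcat
# RDFProfile interface, whose parse_dataset() hook receives and mutates the
# dataset dict being built:
from ckanext.dcat.profiles import RDFProfile


class MockRDFProfile1(RDFProfile):
    def parse_dataset(self, dataset_dict, dataset_ref):
        # Flag the dataset so the test can verify this profile ran
        dataset_dict['profile_1'] = True
        return dataset_dict


class MockRDFProfile2(RDFProfile):
    def parse_dataset(self, dataset_dict, dataset_ref):
        dataset_dict['profile_2'] = True
        return dataset_dict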
    def test_tags_with_commas(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))
        g.add((dataset, DCAT.keyword, Literal('Tree, forest, shrub')))
        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]
        
        eq_(len(datasets[0]['tags']), 3)
    def test_datasets(self):

        p = RDFParser()

        p.g = _default_graph()

        datasets = []
        for dataset in p.datasets():

            assert 'title' in dataset

            datasets.append(dataset)

        eq_(len(datasets), 3)
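
# _default_graph() is not included in this listing. A minimal sketch, assuming
# it builds a graph with three dcat:Dataset nodes that each carry a dct:title,
# which is what test_datasets() above relies on:
def _default_graph():
    g = Graph()
    for num in range(1, 4):
        dataset_ref = URIRef('http://example.org/datasets/{0}'.format(num))
        g.add((dataset_ref, RDF.type, DCAT.Dataset))
        g.add((dataset_ref, DCT.title, Literal('Example dataset {0}'.format(num))))
    return g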
    def test_tags_with_commas_clean_tags_on(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))
        g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG)))
        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        assert_true(self.VALID_TAG in datasets[0]['tags'])
        assert_true(self.INVALID_TAG not in datasets[0]['tags'])
    def test_tags_with_commas_clean_tags_off(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))
        g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG)))
        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        # when config flag is set to false, bad tags can happen
        
        datasets = [d for d in p.datasets()]
        assert_true(self.VALID_TAG not in datasets[0]['tags'])
        assert_true({'name': self.INVALID_TAG} in datasets[0]['tags'])
    def test_dataset_version_adms(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        g.add((dataset1, ADMS.version, Literal('2.3a')))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        dataset = [d for d in p.datasets()][0]

        eq_(dataset['version'], u'2.3a')
    def test_catalog_xml_rdf(self):

        contents = self._get_file_contents("catalog.rdf")

        p = RDFParser(profiles=["euro_dcat_ap"])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 2)

        dataset = datasets[0] if datasets[0]["title"] == "Example dataset 1" else datasets[1]

        eq_(dataset["title"], "Example dataset 1")
        eq_(len(dataset["resources"]), 3)
        eq_(len(dataset["tags"]), 2)
    def test_catalog_xml_rdf(self):

        contents = self._get_file_contents('catalog.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 2)

        dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1'
                   else datasets[1])

        eq_(dataset['title'], 'Example dataset 1')
        eq_(len(dataset['resources']), 3)
        eq_(len(dataset['tags']), 2)
    def test_dataset_license_from_distribution_by_uri(self):
        # license_id retrieved from the URI of dcat:license object
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, DCAT.Dataset))

        distribution = URIRef("http://example.org/datasets/1/ds/1")
        g.add((dataset, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))
        g.add((distribution, DCT.license,
               URIRef("http://www.opendefinition.org/licenses/cc-by")))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        dataset = [d for d in p.datasets()][0]
        eq_(dataset['license_id'], 'cc-by')
    def test_spatial_uri_only(self):
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef("http://geonames/Newark")
        g.add((dataset, DCT.spatial, spatial_uri))
        p = RDFParser(profiles=["euro_dcat_ap"])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        eq_(extras["spatial_uri"], "http://geonames/Newark")
        assert_true("spatial_text" not in extras)
        assert_true("spatial" not in extras)
    def test_spatial_uri_only(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))
        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        eq_(extras['spatial_uri'], 'http://geonames/Newark')
        assert_true('spatial_text' not in extras)
        assert_true('spatial' not in extras)
    def test_dataset_license_from_distribution_by_title(self):
        # license_id retrieved from dct:title of dcat:license object
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, DCAT.Dataset))

        distribution = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution, RDF.type, DCAT.Distribution))
        g.add((dataset, DCAT.distribution, distribution))
        license = BNode()
        g.add((distribution, DCT.license, license))
        g.add((license, DCT.title, Literal("Creative Commons Attribution")))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        dataset = [d for d in p.datasets()][0]
        eq_(dataset['license_id'], 'cc-by')
    def test_dataset_show_without_format(self):
        dataset = factories.Dataset(
            notes='Test dataset'
        )

        content = helpers.call_action('dcat_dataset_show', id=dataset['id'])

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content)

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 1)

        dcat_dataset = dcat_datasets[0]

        eq_(dcat_dataset['title'], dataset['title'])
        eq_(dcat_dataset['notes'], dataset['notes'])
    def test_distribution_format_format_only(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCT['format'], Literal('CSV')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        eq_(resource['format'], u'CSV')
    def test_dataset_turtle_1(self):

        contents = self._get_file_contents('dataset_deri.ttl')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='n3')

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        eq_(dataset['title'], 'Abandoned Vehicles')
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]
        eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
        eq_(resource['url'], u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
        eq_(resource['uri'], u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
    def test_spatial_rdfs_label(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))

        g.add((spatial_uri, RDF.type, DCT.Location))
        g.add((spatial_uri, RDFS.label, Literal('Newark')))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        eq_(extras['spatial_text'], 'Newark')
    def _build_and_parse_format_mediatype_graph(self,
                                                format_item=None,
                                                mediatype_item=None):
        """
        Creates a minimal graph with a distribution having the specified dct:format and dcat:mediaType
        nodes. At least one of those nodes has to be given.

        After creating the graph, it is parsed using the euro_dcat_ap profile.

        :param format_item:
            Literal or URIRef object for dct:format. None if the node should be omitted.
        :param mediatype_item:
            Literal or URIRef object for dcat:mediaType. None if the node should be omitted.

        :returns:
            The parsed resource dict
        """
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, DCAT.Dataset))

        distribution = URIRef("http://example.org/datasets/1/ds/1")
        g.add((dataset, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))
        if format_item:
            g.add((distribution, DCT['format'], format_item))
        if mediatype_item:
            g.add((distribution, DCAT.mediaType, mediatype_item))
        if format_item is None and mediatype_item is None:
            raise AssertionError(
                'At least one of format or mediaType is required!')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        dataset = [d for d in p.datasets()][0]
        return dataset.get('resources')
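
    # A hypothetical usage sketch for the helper above (not taken from the
    # source): passing only a dct:format literal should populate the resource
    # 'format' field, mirroring test_distribution_format_format_only.
    def test_format_mediatype_format_only(self):
        resources = self._build_and_parse_format_mediatype_graph(
            format_item=Literal('CSV'))
        eq_(resources[0]['format'], u'CSV')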
    def test_dataset_issued_with_year_before_1900(self):

        contents = self._get_file_contents('1894.xml')

        p = RDFParser(profiles=['swiss_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Check date values
        eq_(dataset['issued'], -2398377600)
        issued = datetime.fromtimestamp(dataset['issued'])
        eq_(issued.date().isoformat(), u'1893-12-31')

        eq_(dataset['modified'], 1524528000)
        modified = datetime.fromtimestamp(dataset['modified'])
        eq_(modified.date().isoformat(), u'2018-04-24')
    def test_distribution_format_imt_and_format(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCAT.mediaType, Literal('text/csv')))
        g.add((distribution1_1, DCT['format'], Literal('CSV')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        eq_(resource['format'], u'CSV')
        eq_(resource['mimetype'], u'text/csv')
    def test_distribution_download_url(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCAT.downloadURL,
               Literal('http://download.url.org')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        eq_(resource['url'], u'http://download.url.org')
        eq_(resource['download_url'], u'http://download.url.org')
    def _build_and_parse_format_mediatype_graph(self, format_item=None, mediatype_item=None):
        g = Graph()

        dataset = URIRef("http://example.org/datasets/1")
        g.add((dataset, RDF.type, self.DCAT.Dataset))

        distribution = URIRef("http://example.org/datasets/1/ds/1")
        g.add((dataset, self.DCAT.distribution, distribution))
        g.add((distribution, RDF.type, self.DCAT.Distribution))
        if format_item:
            g.add((distribution, self.DCT['format'], format_item))
        if mediatype_item:
            g.add((distribution, self.DCAT.mediaType, mediatype_item))
        if format_item is None and mediatype_item is None:
            raise AssertionError('At least one of format or mediaType is required!')

        p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])

        p.g = g

        dataset = [d for d in p.datasets()][0]
        return dataset.get('resources')
    def test_dataset_turtle_1(self):

        contents = self._get_file_contents('dataset_deri.ttl')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents, _format='n3')

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset = datasets[0]

        assert dataset['title'] == 'Abandoned Vehicles'
        assert len(dataset['resources']) == 1

        resource = dataset['resources'][0]
        assert resource['name'] == u'CSV distribution of: Abandoned Vehicles'
        assert resource['url'] == u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv'
        assert resource['uri'] == u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv'
    def test_catalog_ttl(self):

        for i in xrange(4):
            factories.Dataset()

        url = url_for('dcat_catalog', _format='ttl')

        app = self._get_test_app()

        response = app.get(url)

        eq_(response.headers['Content-Type'], 'text/turtle')

        content = response.body

        # Parse the contents to check it's an actual serialization
        p = RDFParser()

        p.parse(content, _format='turtle')

        dcat_datasets = [d for d in p.datasets()]

        eq_(len(dcat_datasets), 4)
    def test_distribution_format_format_only_without_slash_normalize_false(
            self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCT['format'],
               Literal('Comma Separated Values')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]

        assert resource['format'] == u'Comma Separated Values'
        assert 'mimetype' not in resource
    def test_spatial_wkt_only(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))

        g.add((spatial_uri, RDF.type, DCT.Location))
        g.add((spatial_uri, LOCN.geometry,
               Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])
        # NOTE: geomet returns floats for coordinates on WKT -> GeoJSON
        assert extras[
            'spatial'] == '{"type": "Point", "coordinates": [67.0, 89.0]}'
    def test_distribution_format_imt_only(self):
        g = Graph()

        dataset1 = URIRef("http://example.org/datasets/1")
        g.add((dataset1, RDF.type, DCAT.Dataset))

        distribution1_1 = URIRef("http://example.org/datasets/1/ds/1")
        g.add((distribution1_1, RDF.type, DCAT.Distribution))
        g.add((distribution1_1, DCAT.mediaType, Literal('text/csv')))
        g.add((dataset1, DCAT.distribution, distribution1_1))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        resource = datasets[0]['resources'][0]
        if toolkit.check_ckan_version(min_version='2.3'):
            assert resource['format'] == u'CSV'
            assert resource['mimetype'] == u'text/csv'
        else:
            assert resource['format'] == u'text/csv'
    def test_dataset_compatibility_mode(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        def _get_extra_value(key):
            v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
            return v[0] if v else None

        eq_(_get_extra_value('dcat_issued'), u'2012-05-10')
        eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00')
        eq_(_get_extra_value('dcat_publisher_name'), 'Publishing Organization for dataset 1')
        eq_(_get_extra_value('dcat_publisher_email'), '*****@*****.**')
        eq_(_get_extra_value('language'), 'ca,en,es')
    def test_spatial_wrong_geometries(self):
        g = Graph()

        dataset = URIRef('http://example.org/datasets/1')
        g.add((dataset, RDF.type, DCAT.Dataset))

        spatial_uri = URIRef('http://geonames/Newark')
        g.add((dataset, DCT.spatial, spatial_uri))

        g.add((spatial_uri, RDF.type, DCT.Location))
        g.add((spatial_uri, LOCN.geometry,
               Literal('Not GeoJSON', datatype=GEOJSON_IMT)))
        g.add((spatial_uri, LOCN.geometry,
               Literal('Not WKT', datatype=GSP.wktLiteral)))

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        datasets = [d for d in p.datasets()]

        extras = self._extras(datasets[0])

        assert 'spatial' not in extras
    def test_parse_dataset_default_lang_de(self):
        maxrdf = self._get_max_rdf()

        p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])

        p.parse(maxrdf)
        self._add_basic_fields_with_languages(p)

        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        dataset = datasets[0]

        # Title and description to be in default language "de"
        self.assertEqual(dataset.get('title'), u'Naturräume Geest und Marsch (DE)')
        self.assertEqual(
            dataset.get('notes'),
            u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt. (DE)')
        # Publisher and ContactPoint
        extras = dataset.get('extras')
        self.assertTrue(len(extras) > 0)
        self._assert_extras_string(extras, 'publisher_name', u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz (DE)')
        self._assert_extras_string(extras, 'contact_name', u'Herr Dr. Michael Schröder (DE)')
        # Resources
        self._assert_resource_lang(dataset, 'DE')
    def test_conforms_to(self):

        conforms_to_in = [
            {'identifier': 'CONF1',
             'uri': 'http://conf01/abc',
             'title': {'en': 'title', 'it': 'title'},
             'referenceDocumentation': ['http://abc.efg/']},
            {'identifier': 'CONF2',
             'title': {'en': 'title', 'it': 'title'},
             'description': {'en': 'descen', 'it': 'descit'},
             'referenceDocumentation': ['http://abc.efg/']},
        ]
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued':'2016-11-29',
            'modified':'2016-11-29',
            'identifier':'ISBN',
            'temporal_start':'2016-11-01',
            'temporal_end':'2016-11-30',
            'frequency':'UPDATE_CONT',
            'publisher_name':'bolzano',
            'publisher_identifier':'234234234',
            'creator_name':'test',
            'creator_identifier':'412946129',
            'holder_name':'bolzano',
            'holder_identifier':'234234234',
            'alternate_identifier':'ISBN,TEST',
            'theme':'{ECON,ENVI}',
            'geographical_geonames_url':'http://www.geonames.org/3181913',
            'language':'{DEU,ENG,ITA}',
            'is_version_of':'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'conforms_to':json.dumps(conforms_to_in)
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
        
        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())
        
        assert len(datasets) == 1
        d = datasets[0]

        conforms_to = dict((d['identifier'], d) for d in conforms_to_in)
        dataset_conforms_to = json.loads(d['conforms_to'])

        assert len(dataset_conforms_to) == len(conforms_to_in), "got {}, should be {}".format(len(dataset_conforms_to), len(conforms_to_in))
        for conf in dataset_conforms_to:
            check = conforms_to[conf['identifier']]
            for k, v in check.items():
                # there should be no empty uri 
                if k == 'uri' and not v:
                    assert conf.get(k) is None
                else:
                    assert conf.get(k) == v
            for k, v in conf.items():
                src_v = check.get(k)
                # ref may be extracted from rdf, but it can be
                # generated by serializer
                if not src_v and k == 'uri':
                    continue
                # no value, may be missing key in source
                elif not src_v:
                    assert not check.get(k)
                else:
                    assert check[k] == v
    def test_license(self):
        
        def get_path(fname):
            return os.path.join(os.path.dirname(__file__),
                        '..', '..', '..', 'examples', fname)
        licenses = get_path('licenses.rdf')
        load_from_graph(path=licenses)
        Session.flush()


        dataset = {'title': 'some title',
                   'id': 'sometitle',
                   'resources': [
                            {
                                'id': 'resource/1111',
                                'uri': 'http://resource/1111',
                                'license_type': 'invalid',
                            },
                            {
                                'id': 'resource/2222',
                                'uri': 'http://resource/2222',
                                'license_type': 'https://w3id.org/italia/controlled-vocabulary/licences/A311_GFDL13'
                            }
                        ]
                    }
       

        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

        s = RDFSerializer()


        dataset_ref = s.graph_from_dataset(dataset)

        g = s.g

        r1 = URIRef(dataset['resources'][0]['uri'])
        r2 = URIRef(dataset['resources'][1]['uri'])

        unknown = License.get(License.DEFAULT_LICENSE)

        license_ref = g.value(r1, DCT.license)
        
        assert license_ref is not None
        assert str(license_ref) == unknown.uri,\
            "got license {}, instead of {}".format(license_ref, unknown.license_type)

        gpl = License.get(dataset['resources'][1]['license_type'])
        assert gpl is not None

        license_ref = g.value(r2, DCT.license)
        license_type = g.value(license_ref, DCT.type)
        
        assert license_ref is not None

        assert str(license_ref) == gpl.document_uri
        assert str(license_type) == gpl.license_type

        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())
        assert len(datasets) == 1
        new_dataset = datasets[0]
        resources = new_dataset['resources']

        def _find_res(res_uri):
            for res in resources:
                if res_uri == res['uri']:
                    return res
            raise ValueError("No resource for {}".format(res_uri))

        new_res_unknown = _find_res(str(r1))
        new_res_gpl = _find_res(str(r2))

        assert new_res_unknown['license_type'] == unknown.uri, (new_res_unknown['license_type'], unknown.uri,)
        assert new_res_gpl['license_type'] == dataset['resources'][1]['license_type']
    def test_mapping(self):

        # multilang requires lang to be set
        from pylons.i18n.translation import set_lang, get_lang
        import pylons
        class dummyreq(object):
            class p(object):
                translator = object()
            environ = {'pylons.pylons': p()}
        pylons.request = dummyreq()
        pylons.translator.pylons_lang = ['en_GB']
        set_lang('en_GB')
        assert get_lang() == ['en_GB']

        assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], "No dcatapit_theme_group_mapper plugin in config"
        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        datasets = [d for d in p.datasets()]
        eq_(len(datasets), 1)
        package_dict = datasets[0]


        user = User.get('dummy')
        
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org  = call_action('organization_create',
                                context={'user': user_name},
                                name='dummy',
                                identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g  = call_action('group_create',
                                      context={'user': user_name},
                                      name='existing-group')

        context = {'user': '******',
                   'ignore_auth': True,
                   'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {'frequency': 'manual',
              'publisher_name': 'dummy',
              'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
              'groups': [],
              'title': 'dummy',
              'holder_name': 'dummy',
              'holder_identifier': 'dummy',
              'name': 'dummy',
              'notes': 'dummy',
              'owner_org': 'dummy',
              'modified': datetime.now(),
              'publisher_identifier': 'dummy',
              'metadata_created' : datetime.now(),
              'metadata_modified': datetime.now(),
              'guid': unicode(uuid.uuid4()),
              'identifier': 'dummy'}
        
        package_dict.update(_p)
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        package_data = call_action('package_create', context=context, **package_dict)

        p = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {'theme': ['non-mappable', 'thememap1']} == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
        assert [] == p.get_groups(group_type='group'), 'should be {}, got {}'.format([], p.get_groups(group_type='group'))

        package_data = call_action('package_show', context=context, id=package_data['id'])

        # use test mapping, which replaces thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples', 'test_map.ini')
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file

        package_dict['theme'] = ['non-mappable', 'thememap1']

        expected_groups_existing = ['existing-group']
        expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
        expected_groups_multi = expected_groups_new + ['othergroup']

        package_dict.pop('extras', None)
        p = Package.get(package_data['id'])
        context['package'] = p 

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)
        
        #meta.Session.flush()
        #meta.Session.revision = repo.new_revision()

        # check - only existing group should be assigned
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'


        package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update', context=context, **package_dict)


        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - this time, new groups should appear
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
        assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

        package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there still should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        meta.Session.rollback()
    def test_dataset_all_fields(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]

        # Basic fields

        eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
        eq_(
            dataset['notes'],
            u'During the period 1982-86 a team of geologists from the British Geological Survey ...'
        )
        eq_(dataset['url'], 'http://dataset.info.org')
        eq_(dataset['version'], '2.3')

        # Tags

        eq_(sorted(dataset['tags'], key=lambda k: k['name']),
            [{
                'name': u'exploration'
            }, {
                'name': u'geochemistry'
            }, {
                'name': u'geology'
            }])

        # Extras

        def _get_extra_value(key):
            v = [
                extra['value'] for extra in dataset['extras']
                if extra['key'] == key
            ]
            return v[0] if v else None

        def _get_extra_value_as_list(key):
            value = _get_extra_value(key)
            return json.loads(value) if value else []

        #  Simple values
        eq_(_get_extra_value('issued'), u'2012-05-10')
        eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
        eq_(_get_extra_value('identifier'),
            u'9df8df51-63db-37a8-e044-0003ba9b0d98')
        eq_(_get_extra_value('alternate_identifier'),
            u'alternate-identifier-x343')
        eq_(_get_extra_value('version_notes'), u'New schema added')
        eq_(_get_extra_value('temporal_start'), '1905-03-01')
        eq_(_get_extra_value('temporal_end'), '2013-01-05')
        eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
        eq_(_get_extra_value('spatial_uri'),
            'http://publications.europa.eu/mdr/authority/country/ZWE')
        eq_(_get_extra_value('publisher_uri'),
            'http://orgs.vocab.org/some-org')
        eq_(_get_extra_value('publisher_name'),
            'Publishing Organization for dataset 1')
        eq_(_get_extra_value('publisher_email'), '*****@*****.**')
        eq_(_get_extra_value('publisher_url'), 'http://some.org')
        eq_(_get_extra_value('publisher_type'),
            'http://purl.org/adms/publishertype/NonProfitOrganisation')
        eq_(_get_extra_value('contact_name'), 'Point of Contact')
        eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')

        #  Lists
        eq_(sorted(_get_extra_value_as_list('language')),
            [u'ca', u'en', u'es'])
        eq_(sorted(_get_extra_value_as_list('theme')), [
            u'Earth Sciences', u'http://eurovoc.europa.eu/100142',
            u'http://eurovoc.europa.eu/209065'
        ])
        eq_(sorted(_get_extra_value_as_list('conforms_to')),
            [u'Standard 1', u'Standard 2'])

        # Dataset URI
        eq_(
            _get_extra_value('uri'),
            u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98'
        )

        # Resources
        eq_(len(dataset['resources']), 1)

        resource = dataset['resources'][0]

        #  Simple values
        eq_(resource['name'], u'Some website')
        eq_(resource['description'], u'A longer description')
        eq_(resource['format'], u'HTML')
        eq_(resource['mimetype'], u'text/html')
        eq_(resource['issued'], u'2012-05-11')
        eq_(resource['modified'], u'2012-05-01T00:04:06')
        eq_(resource['status'], u'http://purl.org/adms/status/Completed')

        # These two are likely to need clarification
        eq_(resource['license'],
            u'http://creativecommons.org/licenses/by/3.0/')
        eq_(resource['rights'], u'Some statement about rights')

        eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
        assert 'download_url' not in resource

        eq_(resource['size'], 12323)

        # Distribution URI
        eq_(
            resource['uri'],
            u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1'
        )
    def test_parse_dataset(self):
        maxrdf = self._get_max_rdf()

        p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])

        p.parse(maxrdf)

        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        dataset = datasets[0]

        extras = dataset.get('extras')
        self.assertTrue(len(extras) > 0)
        resources = dataset.get('resources')
        self.assertEqual(len(resources), 2)

        # identify resources to be independent of their order
        if u'Distribution 1' in resources[0].get('description'):
            dist1 = resources[0]
            dist2 = resources[1]
        else:
            dist1 = resources[1]
            dist2 = resources[0]

        # list values are serialized by parser

        # dcatde:maintainer
        self.assertEqual(dataset.get('maintainer'), u'Peter Schröder')
        self._assert_extras_string(extras, 'maintainer_contacttype', u'Person')

        # dcatde:contributorID
        self._assert_extras_list_serialized(
            extras, 'contributorID',
            ['http://dcat-ap.de/def/contributors/transparenzportalHamburg'])

        # dcatde:originator
        self._assert_extras_string(extras, 'originator_name',
                                   u'Peter Schröder originator')
        self._assert_extras_string(extras, 'originator_contacttype', u'Person')

        # dcatde:politicalGeocodingURI
        self._assert_extras_list_serialized(extras, 'politicalGeocodingURI', [
            'http://dcat-ap.de/def/politicalGeocoding/regionalKey/020000000000',
            'http://dcat-ap.de/def/politicalGeocoding/stateKey/02'
        ])

        # dcatde:politicalGeocodingLevelURI
        self._assert_extras_string(
            extras, 'politicalGeocodingLevelURI',
            'http://dcat-ap.de/def/politicalGeocoding/Level/state')

        # dcatde:legalbasisText
        self._assert_extras_list_serialized(extras, 'legalbasisText',
                                            ['Umweltinformationsgesetz (UIG)'])

        # dcatde:geocodingText
        self._assert_extras_list_serialized(extras, 'geocodingText',
                                            ['Hamburg'])

        # dcatde:qualityProcessURI
        self._assert_extras_string(extras, 'qualityProcessURI',
                                   'https://www.example.com/')

        # resource checks
        self.assertEqual(
            dist1['__extras'].get('plannedAvailability'),
            'http://dcat-ap.de/def/plannedAvailability/experimental')
        self.assertEqual(
            dist1['__extras'].get('licenseAttributionByText'),
            u'Freie und Hansestadt Hamburg, Behörde für Umwelt und Energie, 2016'
        )
        self.assertEqual(dist1.get('license'),
                         "http://dcat-ap.de/def/licenses/dl-by-de/2_0")
        self.assertEqual(dist1.get('size'), 685246)

        self.assertEqual(
            dist2['__extras'].get('plannedAvailability'),
            'http://dcat-ap.de/def/plannedAvailability/available')
        self.assertEqual(
            dist2['__extras'].get('licenseAttributionByText'),
            u'Freie und Hansestadt Hamburg, Behörde für Umwelt und Energie, 2015'
        )
        self.assertEqual(dist2.get('license'),
                         "http://dcat-ap.de/def/licenses/dl-by-de/2_0")
        self.assertEqual(dist2.get('size'), 222441)

        # some non-dcatde fields
        self._assert_extras_list_serialized(
            extras, 'alternate_identifier',
            ['4635D337-4805-4C32-A211-13F8C038BF27'])

        # dcat:contactPoint
        self._assert_extras_string(extras, 'contact_email',
                                   u'*****@*****.**')
        self._assert_extras_string(extras, 'contact_name',
                                   u'Herr Dr. Michael Schröder')
        self._assert_extras_string(extras, 'maintainer_tel',
                                   u'+49 40 4 28 40 - 3494')
        self._assert_extras_string(extras, 'maintainer_street',
                                   u'Beispielstraße 4')
        self._assert_extras_string(extras, 'maintainer_city', u'Beispielort')
        self._assert_extras_string(extras, 'maintainer_zip', u'12345')
        self._assert_extras_string(extras, 'maintainer_country', u'DE')

        # Groups
        self.assertEqual(len(dataset['groups']), 2)
        self.assertTrue({'id': 'envi', 'name': 'envi'} in dataset['groups'])
        self.assertTrue({'id': 'agri', 'name': 'agri'} in dataset['groups'])

        # Keywords
        self._assert_tag_list(dataset, [
            u'Karte', u'hmbtg_09_geodaten', u'Grundwasser', u'Bodenschutz',
            u'Geodaten', u'Umwelt und Klima', u'hmbtg', u'opendata',
            u'Thematische Karte'
        ])

        # dct:location
        self._assert_extras_dict_serialized(
            extras, 'spatial', {
                "type":
                "Polygon",
                "coordinates": [[[10.3263, 53.3949], [10.3263, 53.9641],
                                 [8.4205, 53.9641], [8.4205, 53.3949],
                                 [10.3263, 53.3949]]]
            })

        # dcat:landingPage
        self._assert_extras_string(
            extras, 'metadata_original_html',
            'https://www.govdata.de/web/guest/daten/-/details/naturraume-geest-und-marsch3'
        )
    def test_dataset_all_fields(self):

        contents = self._get_file_contents('1901.xml')

        p = RDFParser(profiles=['swiss_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        extras = self._extras(dataset)

        # Basic fields
        assert all(
            l in dataset['title']
            for l in ['de', 'fr', 'it', 'en']), "title contains all languages"
        eq_(dataset['title']['de'], u'Statistisches Jahrbuch der Schweiz 1901')
        eq_(dataset['title']['fr'], u'Annuaire statistique de la Suisse 1901')

        assert all(l in dataset['description']
                   for l in ['de', 'fr', 'it', 'en'
                             ]), "description contains all languages"
        eq_(dataset['description']['de'], u'')
        eq_(dataset['url'],
            u'https://www.bfs.admin.ch/bfs/de/home/statistiken.html')

        # Keywords
        assert all(l in dataset['keywords'] for l in
                   ['de', 'fr', 'it', 'en']), "keywords contains all languages"
        eq_(sorted(dataset['keywords']['de']),
            ['publikation', 'statistische-grundlagen-und-ubersichten'])
        eq_(sorted(dataset['keywords']['fr']),
            ['bases-statistiques-et-generalites', 'publication'])
        eq_(sorted(dataset['keywords']['it']),
            ['basi-statistiche-e-presentazioni-generali', 'pubblicazione'])
        eq_(sorted(dataset['keywords']['en']),
            ['publication', 'statistical-basis-and-overviews'])
        eq_(sorted(dataset['tags'], key=lambda k: k['name']),
            [{
                'name': 'basas-statisticas-e-survistas'
            }, {
                'name': 'bases-statistiques-et-generalites'
            }, {
                'name': 'basi-statistiche-e-presentazioni-generali'
            }, {
                'name': 'pubblicazione'
            }, {
                'name': 'publication'
            }, {
                'name': 'publication'
            }, {
                'name': 'publikation'
            }, {
                'name': 'statistical-basis-and-overviews'
            }, {
                'name': 'statistische-grundlagen-und-ubersichten'
            }])

        #  Simple values
        eq_(dataset['issued'], -2177539200)
        eq_(dataset['modified'], 1524528000)
        eq_(dataset['identifier'], u'346266@bundesamt-fur-statistik-bfs')
        eq_(dataset['spatial'], 'Schweiz')

        # Temporals
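        # dates before 1970 are represented as negative Unix timestamps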
        temporal = dataset['temporals'][0]
        eq_(temporal['end_date'], -2146003200)
        end_date = datetime.fromtimestamp(temporal['end_date'])
        eq_(end_date.date().isoformat(), '1901-12-31')

        eq_(temporal['start_date'], -2177452800)
        start_date = datetime.fromtimestamp(temporal['start_date'])
        eq_(start_date.date().isoformat(), '1901-01-01')

        # Publisher
        publisher = dataset['publishers'][0]
        eq_(publisher['label'], 'BFS/OFS')

        # Contact points
        contact_point = dataset['contact_points'][0]
        eq_(contact_point['name'], '*****@*****.**')
        eq_(contact_point['email'], '*****@*****.**')

        # See alsos
        see_also = dataset['see_alsos'][0]
        eq_(see_also['dataset_identifier'],
            u'4682791@bundesamt-fur-statistik-bfs')

        #  Lists
        eq_(sorted(dataset['language']), [u'de', u'fr'])
        eq_(sorted(dataset['groups']), [{'name': u'statistical-basis'}])

        # Dataset URI
        eq_(
            extras['uri'],
            u'https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585'
        )

        # Resources
        eq_(len(dataset['resources']), 1)
        resource = dataset['resources'][0]

        #  Simple values
        assert all(l in resource['title']
                   for l in ['de', 'fr', 'it', 'en'
                             ]), "resource title contains all languages"
        eq_(resource['title']['fr'], u'Annuaire statistique de la Suisse 1901')
        eq_(resource['title']['de'], u'')
        assert all(l in resource['description']
                   for l in ['de', 'fr', 'it', 'en'
                             ]), "resource description contains all languages"
        eq_(resource['description']['de'], u'')
        eq_(resource['format'], u'HTML')
        eq_(resource['mimetype'], u'text/html')
        eq_(resource['media_type'], u'text/html')
        eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs')
        eq_(
            resource['rights'],
            u'NonCommercialAllowed-CommercialWithPermission-ReferenceRequired')
        eq_(resource['language'], [u'fr'])
        eq_(resource['issued'], -2177539200)
        eq_(resource['url'],
            u'https://www.bfs.admin.ch/asset/fr/hs-b-00.01-jb-1901')
        assert 'download_url' not in resource, "download_url not available on resource"

        # Distribution URI
        eq_(
            resource['uri'],
            u'https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585/resource/c8ec6ca0-6923-4cf3-92f2-95a10e6f8e25'
        )
    def test_graph_to_dataset(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        eq_(len(datasets), 1)

        dataset = datasets[0]
        
        # Basic fields
        eq_(dataset['title'], u'Dataset di test DCAT_AP-IT')
        eq_(dataset['notes'], u'dcatapit dataset di test')

        #  Simple values
        eq_(dataset['issued'], u'2016-11-29')
        eq_(dataset['modified'], u'2016-11-29')
        eq_(dataset['identifier'], u'ISBN')
        #eq_(dataset['temporal_start'], '2016-11-01')
        #eq_(dataset['temporal_end'], '2016-11-30')
        eq_(dataset['frequency'], 'UPDATE_CONT')

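        # geographical_name is stored as a Postgres-style array string ('{A,B}');
        # strip the braces, sort the elements and rebuild the string so the
        # comparison below does not depend on element order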
        geographical_name = dataset['geographical_name'][1:-1].split(',') if ',' in dataset['geographical_name'] else [dataset['geographical_name']]
        geographical_name.sort()
        geographical_name = '{' + ','.join([str(x) for x in geographical_name]) + '}'
        eq_(geographical_name, '{ITA_BZO}')

        eq_(dataset['publisher_name'], 'bolzano it')
        eq_(dataset['publisher_identifier'], '234234234')
        eq_(dataset['creator_name'], 'test')
        eq_(dataset['creator_identifier'], '412946129')
        eq_(dataset['holder_name'], 'bolzano')
        eq_(dataset['holder_identifier'], '234234234')

        alternate_identifier = set([i['identifier'] for i in json.loads(dataset['alternate_identifier'])])
        eq_(alternate_identifier, set(['ISBN:123', 'TEST']))

        theme = json.loads(dataset['theme'])
        allowed_themes = ('ECON', 'ENVI',)
        assert theme, 'got {}'.format(dataset['theme'])
        for t in theme:
            assert t.get('theme') in allowed_themes, "themes {} not in {}".format(theme, allowed_themes)

        eq_(dataset['geographical_geonames_url'], 'http://www.geonames.org/3181913')

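        # same order-insensitive normalization for the language array string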
        language = dataset['language'][1:-1].split(',') if ',' in dataset['language'] else [dataset['language']]
        language.sort()
        language = '{' + ','.join([str(x) for x in language]) + '}'
        eq_(language, '{DEU,ENG,ITA}')
        
        eq_(dataset['is_version_of'], 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2')

        conforms_to = json.loads(dataset['conforms_to'])
        conforms_to_ids = set([c['identifier'] for c in conforms_to])
        eq_(conforms_to_ids, set('CONF1,CONF2,CONF3'.split(',')))

        # Multilang values
        ok_(dataset['DCATAPIT_MULTILANG_BASE'])

        multilang_notes = dataset['DCATAPIT_MULTILANG_BASE'].get('notes', None)
        ok_(multilang_notes)
        eq_(multilang_notes['de'], u'dcatapit test-dataset')
        eq_(multilang_notes['it'], u'dcatapit dataset di test')
        eq_(multilang_notes['en_GB'], u'dcatapit dataset test')

        multilang_holder_name = dataset['DCATAPIT_MULTILANG_BASE'].get('holder_name', None)
        ok_(multilang_holder_name)
        eq_(multilang_holder_name['de'], u'bolzano')
        eq_(multilang_holder_name['it'], u'bolzano')
        eq_(multilang_holder_name['en_GB'], u'bolzano')

        multilang_title = dataset['DCATAPIT_MULTILANG_BASE'].get('title', None)
        ok_(multilang_title)
        eq_(multilang_title['de'], u'Dcatapit Test-Dataset')
        eq_(multilang_title['it'], u'Dataset di test DCAT_AP-IT')
        eq_(multilang_title['en_GB'], u'DCAT_AP-IT test dataset')

        multilang_pub_name = dataset['DCATAPIT_MULTILANG_BASE'].get('publisher_name', None)
        ok_(multilang_pub_name)
        eq_(multilang_pub_name['en_GB'], u'bolzano en')
        eq_(multilang_pub_name['it'], u'bolzano it it')
Exemplo n.º 51
0
    def test_subthemes(self):

        load_themes()

        subthemes = [{
            'theme': 'AGRI',
            'subthemes': [
                'http://eurovoc.europa.eu/100253',
                'http://eurovoc.europa.eu/100258'
            ]
        }, {
            'theme': 'ENVI',
            'subthemes': []
        }]

        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            FIELD_THEMES_AGGREGATE: json.dumps(subthemes),
            'theme': theme_aggr_to_theme_uris(subthemes)  # this is added dynamically when retrieving datasets from the db
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())

        assert len(datasets) == 1
        parsed_dataset = datasets[0]

        # test themes
        parsed_themes_raw = _get_extra_value(parsed_dataset.get('extras'),
                                             'theme')
        self.assertIsNotNone(
            parsed_themes_raw,
            f'Themes not found in parsed dataset {parsed_dataset}')
        parsed_themes = json.loads(parsed_themes_raw)
        self.assertEqual(2, len(parsed_themes))
        self.assertSetEqual(set(theme_names_to_uris(['AGRI', 'ENVI'])),
                            set(parsed_themes))

        # test aggregated themes
        parsed_aggr_raw = parsed_dataset.get(FIELD_THEMES_AGGREGATE, None)
        self.assertIsNotNone(
            parsed_aggr_raw,
            f'Aggregated themes not found in parsed dataset {parsed_dataset}')
        parsed_aggr = json.loads(parsed_aggr_raw)
        self.assertIsNotNone(parsed_aggr, 'Aggregate is None')
        self.assertEqual(2, len(parsed_aggr))
        for t in parsed_aggr:
            if t['theme'] == 'ENVI':
                self.assertSetEqual(set([]), set(t['subthemes']))
            elif t['theme'] == 'AGRI':
                self.assertSetEqual(set(subthemes[0]['subthemes']),
                                    set(t['subthemes']))
            else:
                self.fail(f'Unknown theme: {t}')
Exemplo n.º 52
0
    def test_temporal_coverage(self):

        load_themes()
        temporal_coverage = [
            {
                'temporal_start': '2001-01-01T00:00:00',
                'temporal_end': '2001-02-01T10:11:12'
            },
            {
                'temporal_start': '2001-01-01T00:00:00',
                'temporal_end': '2001-02-01T10:11:12'
            },
        ]
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01T00:00:00',
            'temporal_end': '2016-11-30T00:00:00',
            'temporal_coverage': json.dumps(temporal_coverage),
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': '{ECON,ENVI}',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())

        assert len(datasets) == 1
        d = datasets[0]

        temporal_coverage.append({
            'temporal_start': dataset['temporal_start'],
            'temporal_end': dataset['temporal_end']
        })

        try:
            validators.dcatapit_temporal_coverage(d['temporal_coverage'], {})
            # this should not raise exception
            assert True
        except validators.Invalid as err:
            assert False, "Temporal coverage should be valid: {}".format(err)
Exemplo n.º 53
0
    def test_creators(self):

        creators = [
            {
                'creator_name': {
                    DEFAULT_LANG: 'abc',
                    'it': 'abc it'
                },
                'creator_identifier': "ABC"
            },
            {
                'creator_name': {
                    DEFAULT_LANG: 'cde',
                    'it': 'cde it'
                },
                'creator_identifier': "CDE"
            },
        ]
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
            'issued': '2016-11-29',
            'modified': '2016-11-29',
            'identifier': 'ISBN',
            'temporal_start': '2016-11-01',
            'temporal_end': '2016-11-30',
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '412946129',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            'alternate_identifier': 'ISBN,TEST',
            'theme': '{ECON,ENVI}',
            'geographical_geonames_url': 'http://www.geonames.org/3181913',
            'language': '{DEU,ENG,ITA}',
            'is_version_of':
            'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
            'creator': json.dumps(creators)
        }

        s = RDFSerializer()
        p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])

        serialized = s.serialize_dataset(dataset)

        p.parse(serialized)
        datasets = list(p.datasets())

        assert len(datasets) == 1
        d = datasets[0]
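        # include the dataset-level creator_* fields in the expected creators
        # before comparing against the parsed output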
        creators.append({
            'creator_identifier': dataset['creator_identifier'],
            'creator_name': {
                DEFAULT_LANG: dataset['creator_name']
            }
        })

        creators_dict = dict((v['creator_identifier'], v) for v in creators)

        creators_in = json.loads(d['creator'])

        for c in creators_in:
            assert c['creator_identifier'] in creators_dict.keys(
            ), "no {} key in {}".format(c['creator_identifier'],
                                        creators_dict.keys())
            assert c['creator_name'] == creators_dict[c['creator_identifier']]['creator_name'],\
                "{} vs {}".format(c['creator_name'], creators_dict[c['creator_identifier']]['creator_name'])
        for c in creators_dict.keys():
            assert c in [_c['creator_identifier'] for _c in creators_in]
            cdata = creators_dict[c]
            assert cdata in creators_in
    def test_dataset_all_fields(self):

        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.parse(contents)

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset = datasets[0]

        # Basic fields

        assert dataset['title'] == u'Zimbabwe Regional Geochemical Survey.'
        assert dataset[
            'notes'] == u'During the period 1982-86 a team of geologists from the British Geological Survey ...'
        assert dataset['url'] == 'http://dataset.info.org'
        assert dataset['version'] == '2.3'
        assert dataset['license_id'] == 'cc-nc'

        # Tags

        assert sorted(dataset['tags'], key=lambda k: k['name']) == [
            {'name': u'exploration'},
            {'name': u'geochemistry'},
            {'name': u'geology'},
        ]

        # Extras

        def _get_extra_value(key):
            v = [
                extra['value'] for extra in dataset['extras']
                if extra['key'] == key
            ]
            return v[0] if v else None

        def _get_extra_value_as_list(key):
            value = _get_extra_value(key)
            return json.loads(value) if value else []

        #  Simple values
        assert _get_extra_value('issued') == u'2012-05-10'
        assert _get_extra_value('modified') == u'2012-05-10T21:04:00'
        assert _get_extra_value(
            'identifier') == u'9df8df51-63db-37a8-e044-0003ba9b0d98'
        assert _get_extra_value('version_notes') == u'New schema added'
        assert _get_extra_value('temporal_start') == '1905-03-01'
        assert _get_extra_value('temporal_end') == '2013-01-05'
        assert _get_extra_value(
            'frequency') == 'http://purl.org/cld/freq/daily'
        assert _get_extra_value(
            'spatial_uri'
        ) == 'http://publications.europa.eu/mdr/authority/country/ZWE'
        assert _get_extra_value(
            'publisher_uri') == 'http://orgs.vocab.org/some-org'
        assert _get_extra_value(
            'publisher_name') == 'Publishing Organization for dataset 1'
        assert _get_extra_value('publisher_email') == '*****@*****.**'
        assert _get_extra_value('publisher_url') == 'http://some.org'
        assert _get_extra_value(
            'publisher_type'
        ) == 'http://purl.org/adms/publishertype/NonProfitOrganisation'
        assert _get_extra_value('contact_name') == 'Point of Contact'
        # mailto gets removed for storage and is added again on output
        assert _get_extra_value('contact_email') == '*****@*****.**'
        assert _get_extra_value('access_rights') == 'public'
        assert _get_extra_value(
            'provenance') == 'Some statement about provenance'
        assert _get_extra_value('dcat_type') == 'test-type'

        #  Lists
        assert sorted(_get_extra_value_as_list('language')) == [u'ca', u'en', u'es']
        assert (sorted(_get_extra_value_as_list('theme')) == [
            u'Earth Sciences', u'http://eurovoc.europa.eu/100142',
            u'http://eurovoc.europa.eu/209065'
        ])
        assert sorted(_get_extra_value_as_list('conforms_to')) == [
            u'Standard 1', u'Standard 2'
        ]

        assert sorted(_get_extra_value_as_list('alternate_identifier')) == [
            u'alternate-identifier-1', u'alternate-identifier-2'
        ]
        assert (sorted(_get_extra_value_as_list('documentation')) == [
            u'http://dataset.info.org/doc1', u'http://dataset.info.org/doc2'
        ])
        assert (sorted(_get_extra_value_as_list('related_resource')) == [
            u'http://dataset.info.org/related1',
            u'http://dataset.info.org/related2'
        ])
        assert (sorted(_get_extra_value_as_list('has_version')) == [
            u'https://data.some.org/catalog/datasets/derived-dataset-1',
            u'https://data.some.org/catalog/datasets/derived-dataset-2'
        ])
        assert sorted(_get_extra_value_as_list('is_version_of')) == [
            u'https://data.some.org/catalog/datasets/original-dataset'
        ]
        assert (sorted(_get_extra_value_as_list('source')) == [
            u'https://data.some.org/catalog/datasets/source-dataset-1',
            u'https://data.some.org/catalog/datasets/source-dataset-2'
        ])
        assert sorted(_get_extra_value_as_list('sample')) == [
            u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample'
        ]

        # Dataset URI
        assert _get_extra_value(
            'uri'
        ) == u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98'

        # Resources
        assert len(dataset['resources']) == 1

        resource = dataset['resources'][0]

        #  Simple values
        assert resource['name'] == u'Some website'
        assert resource['description'] == u'A longer description'
        assert resource['format'] == u'HTML'
        assert resource['mimetype'] == u'text/html'
        assert resource['issued'] == u'2012-05-11'
        assert resource['modified'] == u'2012-05-01T00:04:06'
        assert resource['status'] == u'http://purl.org/adms/status/Completed'

        assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a'
        assert resource[
            'hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1'

        # Lists
        for item in [
            ('documentation', [
                u'http://dataset.info.org/distribution1/doc1',
                u'http://dataset.info.org/distribution1/doc2'
            ]),
            ('language', [u'ca', u'en', u'es']),
            ('conforms_to', [u'Standard 1', u'Standard 2']),
        ]:
            assert sorted(json.loads(resource[item[0]])) == item[1]

        # These two are likely to need clarification
        assert resource[
            'license'] == u'http://creativecommons.org/licenses/by-nc/2.0/'
        assert resource['rights'] == u'Some statement about rights'

        assert resource[
            'url'] == u'http://www.bgs.ac.uk/gbase/geochemcd/home.html'
        assert 'download_url' not in resource

        assert resource['size'] == 12323

        # Distribution URI
        assert resource[
            'uri'] == u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1'
    def test_parse_subcatalog(self):
        publisher = {
            'name': 'Publisher',
            'email': '*****@*****.**',
            'type': 'Publisher',
            'uri': 'http://pub.lish.er'
        }
        dataset = {
            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset',
            'title': 'test dataset',
            'extras': [{
                'key': 'source_catalog_title',
                'value': 'Subcatalog example'
            }, {
                'key': 'source_catalog_homepage',
                'value': 'http://subcatalog.example'
            }, {
                'key': 'source_catalog_description',
                'value': 'Subcatalog example description'
            }, {
                'key': 'source_catalog_language',
                'value': 'http://publications.europa.eu/resource/authority/language/ITA'
            }, {
                'key': 'source_catalog_modified',
                'value': '2000-01-01'
            }, {
                'key': 'source_catalog_publisher',
                'value': json.dumps(publisher)
            }]
        }
        catalog_dict = {
            'title': 'My Catalog',
            'description': 'An Open Data Catalog',
            'homepage': 'http://example.com',
            'language': 'de',
        }

        s = RDFSerializer()
        s.serialize_catalog(catalog_dict, dataset_dicts=[dataset])
        g = s.g

        p = RDFParser(profiles=['euro_dcat_ap'])

        p.g = g

        # at least one subcatalog with hasPart
        subcatalogs = list(p.g.objects(None, DCT.hasPart))
        assert subcatalogs

        # at least one dataset in subcatalogs
        subdatasets = []
        for subcatalog in subcatalogs:
            datasets = p.g.objects(subcatalog, DCAT.dataset)
            for dataset in datasets:
                subdatasets.append((
                    dataset,
                    subcatalog,
                ))
        assert subdatasets

        datasets = dict([(d['title'], d) for d in p.datasets()])

        for subdataset, subcatalog in subdatasets:
            title = str(list(p.g.objects(subdataset, DCT.title))[0])
            dataset = datasets[title]
            has_subcat = False
            for ex in dataset['extras']:
                exval = ex['value']
                exkey = ex['key']
                if exkey == 'source_catalog_homepage':
                    has_subcat = True
                    assert exval == str(subcatalog)
            # check if we had subcatalog in extras
            assert has_subcat
Exemplo n.º 56
0
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")
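        # the harvest source config is plain JSON; an illustrative (assumed) value:
        #   {"rdf_format": "application/rdf+xml"}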

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None
        self._names_taken = []

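        # iterate over catalog pages until the parser returns no further page URL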
        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(
                    next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(
                next_page_url, harvest_job, 1, content_type=rdf_format)

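            # hash the downloaded content so that identical pages served for
            # successive pagination URLs can be detected and the loop stopped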
            content_hash = hashlib.md5()
            if content:
                if six.PY2:
                    content_hash.update(content)
                else:
                    content_hash.update(content.encode('utf8'))

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning(
                        'Remote content was the same even when using a paginated URL, skipping'
                    )
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(
                    content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error(
                    'Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                parser, after_parsing_errors = harvester.after_parsing(
                    parser, harvest_job)

                for error_msg in after_parsing_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not parser:
                return []

            try:

                source_dataset = model.Package.get(harvest_job.source.id)

                for dataset in parser.datasets():
                    if not dataset.get('name'):
                        dataset['name'] = self._gen_new_name(dataset['title'])
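                    # append a numeric suffix when the generated name clashes
                    # with one already taken in this harvest job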
                    if dataset['name'] in self._names_taken:
                        suffix = len([
                            i for i in self._names_taken
                            if i.startswith(dataset['name'] + '-')
                        ]) + 1
                        dataset['name'] = '{}-{}'.format(
                            dataset['name'], suffix)
                    self._names_taken.append(dataset['name'])

                    # Unless already set by the parser, get the owner organization (if any)
                    # from the harvest source dataset
                    if not dataset.get('owner_org'):
                        if source_dataset.owner_org:
                            dataset['owner_org'] = source_dataset.owner_org

                    # Try to get a unique identifier for the harvested dataset
                    guid = self._get_guid(dataset,
                                          source_url=source_dataset.url)

                    if not guid:
                        self._save_gather_error(
                            'Could not get a unique identifier for dataset: {0}'
                            .format(dataset), harvest_job)
                        continue

                    dataset['extras'].append({'key': 'guid', 'value': guid})
                    guids_in_source.append(guid)

                    obj = HarvestObject(guid=guid,
                                        job=harvest_job,
                                        content=json.dumps(dataset))

                    obj.save()
                    object_ids.append(obj.id)
            except Exception as e:
                self._save_gather_error(
                    'Error when processing dataset: %r / %s' %
                    (e, traceback.format_exc()), harvest_job)
                return []

            # get the next page
            next_page_url = parser.next_page()

        # Check if some datasets need to be deleted
        object_ids_to_delete = self._mark_datasets_for_deletion(
            guids_in_source, harvest_job)

        object_ids.extend(object_ids_to_delete)

        return object_ids
Exemplo n.º 57
0
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(
                    next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(
                next_page_url, harvest_job, 1, content_type=rdf_format)

            content_hash = hashlib.md5()
            content_hash.update(content)

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning(
                        'Remote content was the same even when using a paginated URL, skipping'
                    )
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(
                    content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error(
                    'Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    self._save_gather_error(
                        'Could not get a unique identifier for dataset: {0}'.
                        format(dataset), harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid,
                                    job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)

            # get the next page
            next_page_url = parser.next_page()
Exemplo n.º 58
0
    def test_theme_to_group_mapping(self):
        # multilang requires lang to be set
        # class dummyreq(object):
        #     class p(object):
        #         translator = object()
        #     environ = {'pylons.pylons': p()}

        # CKANRequest(dummyreq)
        # pylons.request = dummyreq()
        # pylons.translator.pylons_lang = ['en_GB']

        #set_lang('en_GB')
        #assert get_lang() == ['en_GB']
        assert 'dcatapit_theme_group_mapper' in config[
            'ckan.plugins'], 'No dcatapit_theme_group_mapper plugin in config'

        with open(get_example_file('dataset.rdf'), 'r') as f:
            contents = f.read()

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        package_dict = datasets[0]

        user = User.get('dummy')

        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='dummy',
                              identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {
            'frequency': 'manual',
            'publisher_name': 'dummy',
            'extras': [{
                'key': 'theme',
                'value': ['non-mappable', 'thememap1']
            }],
            'groups': [],  #  [{'name':existing_g.name}],
            'title': 'dummy',
            'holder_name': 'dummy',
            'holder_identifier': 'dummy',
            'name': 'dummy-' + uuid4().hex,
            'identifier': 'dummy' + uuid4().hex,
            'notes': 'dummy',
            'owner_org': 'dummy',
            'modified': datetime.now(),
            'publisher_identifier': 'dummy',
            'metadata_created': datetime.now(),
            'metadata_modified': datetime.now(),
            'guid': str(uuid.uuid4()),
        }

        package_dict.update(_p)

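        # first pass: no mapping file configured, so no groups should be assigned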
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        package_data = call_action('package_create',
                                   context=context,
                                   **package_dict)

        p = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {
            'theme': ['non-mappable', 'thememap1']
        } == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
        assert [] == p.get_groups(
            group_type='group'), 'should be {}, got {}'.format(
                [], p.get_groups(group_type='group'))

        package_data = call_action('package_show',
                                   context=context,
                                   id=package_data['id'])

        # use the test mapping, which maps thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                     '..', 'examples', 'test_map.ini')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        # package_dict['theme'] = ['non-mappable', 'thememap1']

        package_dict.pop('extras', None)
        p = Package.get(package_data['id'])
        context['package'] = p

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        # check - only existing group should be assigned
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # the map file maps ECON to the existing group and to two other, non-existing groups that will not be created
        expected_groups = ['existing-group']
        self.assertSetEqual(set(expected_groups), set(groups),
                            'Error in assigned groups')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

        # package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - this time, new groups should appear
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # the map file maps ECON to the existing group and to two other groups that have been created automatically
        expected_groups = expected_groups + ['somegroup1', 'somegroup2']
        self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

        # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
        aggr.append({'theme': 'thememap-multi', 'subthemes': []})
        package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # added theme 'thememap-multi', which maps to 'othergroup' and to other already existing groups
        expected_groups = expected_groups + ['othergroup']
        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there still should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        meta.Session.rollback()