Example #1
    def license_2_license_id(self, license_title, logger=None):
        """
        Gets the license ID based on the license title. If no license
        matches, returns an empty string.
        """

        # import is here, as it creates a dependency on ckan, which many importers won't want
        from ckan.model.license import LicenseRegister
        licenses = LicenseRegister()
        license_obj = licenses.get(license_title)

        if license_obj:
            return u'%s' % license_obj.id
        else:
            if logger:
                logger.warning('No license name matches %s. Ignoring license.'
                               % license_title)
            return u''
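
A minimal, self-contained sketch of how this helper behaves follows; the `LicenseLookup` host class is hypothetical, and the `ckan.model.license` import requires a CKAN install:

import logging

from ckan.model.license import LicenseRegister  # requires CKAN

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger('license-demo')


class LicenseLookup(object):
    """Hypothetical host class for the method above."""

    def license_2_license_id(self, license_title, logger=None):
        license_obj = LicenseRegister().get(license_title)
        if license_obj:
            return u'%s' % license_obj.id
        if logger:
            logger.warning('No license name matches %s. Ignoring license.'
                           % license_title)
        return u''


lookup = LicenseLookup()
# Titles are matched against the instance's license register
# (licenses.json), so actual results depend on its configuration.
print(lookup.license_2_license_id('Creative Commons Attribution', logger=log))
print(lookup.license_2_license_id('Not a real title', logger=log))  # -> u''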
Example #2
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Dataset
        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('title', DCT.title, None, Literal),
            ('notes', DCT.description, None, Literal),
            ('url', DCAT.landingPage, None, URIRef),
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, URIRef),
            ('subject', DCT.subject, None, URIRef),  # Mentioned in the vocabulary
            ('provenance', DCT.provenance, None, URIRef)
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Tags
        for tag in dataset_dict.get('tags', []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        #  Lists
        items = [('language', DCT.language, None, URIRef),
                 ('theme', DCAT.theme, None, URIRef),
                 ('spatial_uri', DCT.spatial, None, URIRef),
                 ('conforms_to', DCT.conformsTo, None, URIRef),
                 ('alternate_identifier', ADMS.identifier, None, Literal),
                 ('documentation', FOAF.page, None, URIRef),
                 ('access_rights', DCT.accessRights, None, URIRef),
                 ('related_resource', DCT.relation, None, URIRef),
                 ('has_version', DCT.hasVersion, None, Literal),
                 ('is_version_of', DCT.isVersionOf, None, Literal),
                 ('source', DCT.source, None, Literal),
                 ('sample', ADMS.sample, None, Literal)]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Contact details
        if any([
                self._get_dataset_value(dataset_dict, 'contact_uri'),
                self._get_dataset_value(dataset_dict, 'contact_name'),
                self._get_dataset_value(dataset_dict, 'contact_email'),
                self._get_dataset_value(dataset_dict, 'maintainer'),
                self._get_dataset_value(dataset_dict, 'maintainer_email'),
        ]):

            contact_uri = self._get_dataset_value(dataset_dict, 'contact_uri')
            if contact_uri:
                contact_details = URIRef(contact_uri)
            else:
                contact_details = BNode()

            g.add((contact_details, RDF.type, VCARD.Kind))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            items = [
                ('contact_name', VCARD.fn, ['maintainer'], Literal),
                ('contact_email', VCARD.hasEmail, ['maintainer_email'],
                 Literal),
            ]

            self._add_triples_from_dict(dataset_dict, contact_details, items)

        # Publisher
        if any([
                self._get_dataset_value(dataset_dict, 'publisher_uri'),
                self._get_dataset_value(dataset_dict, 'publisher_name'),
                self._get_dataset_value(dataset_dict, 'publisher_identifier'),
                dataset_dict.get('organization'),
        ]):

            publisher_uri = publisher_uri_from_dataset_dict(dataset_dict)
            if publisher_uri:
                publisher_details = URIRef(publisher_uri)
            else:
                # No organization nor publisher_uri
                publisher_details = BNode()

            g.add((publisher_details, RDF.type, FOAF.Agent))
            g.add((dataset_ref, DCT.publisher, publisher_details))

            publisher_name = self._get_dataset_value(dataset_dict,
                                                     'publisher_name')
            if not publisher_name and dataset_dict.get('organization'):
                publisher_name = dataset_dict['organization']['title']

            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
            # TODO: It would make sense to fallback these to organization
            # fields but they are not in the default schema and the
            # `organization` object in the dataset_dict does not include
            # custom fields
            items = [('publisher_email', FOAF.mbox, None, Literal),
                     ('publisher_identifier', DCT.identifier, None, Literal),
                     ('publisher_url', FOAF.homepage, None, URIRef),
                     ('publisher_type', DCT.type, None, Literal)]

            self._add_triples_from_dict(dataset_dict, publisher_details, items)

        # Temporal
        start = self._get_dataset_value(dataset_dict, 'temporal_start')
        end = self._get_dataset_value(dataset_dict, 'temporal_end')
        if start or end:
            temporal_extent = BNode()

            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # parts - has part/is part of

        if any([
                self._get_dataset_value(dataset_dict, 'has_part'),
                self._get_dataset_value(dataset_dict, 'is_part_of')
        ]):
            items = [('has_part', DCT.hasPart, None, URIRef),
                     ('is_part_of', DCT.isPartOf, None, URIRef)]

            self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Spatial
        spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri')
        spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text')
        spatial_geom = self._get_dataset_value(dataset_dict, 'spatial')

        if spatial_uri:
            spatial_uri = get_spatial_uri(spatial_uri)  # map from code to URI

        if spatial_uri or spatial_text or spatial_geom:
            if spatial_uri:
                spatial_ref = URIRef(spatial_uri)
            else:
                spatial_ref = BNode()

            g.add((spatial_ref, RDF.type, DCT.Location))
            g.add((dataset_ref, DCT.spatial, spatial_ref))

            if spatial_text:
                g.add((spatial_ref, SKOS.prefLabel, Literal(spatial_text)))

            if spatial_geom:
                # GeoJSON
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial_geom, datatype=GEOJSON_IMT)))
                # WKT, because GeoDCAT-AP says so
                try:
                    g.add((spatial_ref, LOCN.geometry,
                           Literal(wkt.dumps(json.loads(spatial_geom),
                                             decimals=4),
                                   datatype=GSP.wktLiteral)))
                except (TypeError, ValueError, InvalidGeoJSONException):
                    pass

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            if 'license' not in resource_dict and 'license_id' in dataset_dict:
                lr = LicenseRegister()
                _license = lr.get(dataset_dict['license_id'])
                if _license:
                    resource_dict['license'] = _license.url

            #  Simple values
            items = [
                ('name', DCT.title, None, Literal),
                ('description', DCT.description, None, Literal),
                ('status', ADMS.status, None, Literal),
                ('rights', DCT.rights, None, Literal),
                ('license', DCT.license, None, URIRef),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            #  Lists
            items = [
                ('documentation', FOAF.page, None, URIRef),
                ('language', DCT.language, None, URIRef),
                ('conforms_to', DCT.conformsTo, None, URIRef),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # Format
            if '/' in resource_dict.get('format', ''):
                g.add((distribution, DCAT.mediaType,
                       Literal(resource_dict['format'])))
            else:
                if resource_dict.get('format'):
                    g.add((distribution, DCT['format'],
                           Literal(resource_dict['format'])))

                if resource_dict.get('mimetype'):
                    g.add((distribution, DCAT.mediaType,
                           Literal(resource_dict['mimetype'])))

            # URL
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            if url and url != download_url:
                g.add((distribution, DCAT.accessURL, URIRef(url)))

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # Numbers
            if resource_dict.get('size'):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict['size']),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict['size'])))
            # Checksum
            if resource_dict.get('hash'):
                checksum = BNode()
                g.add((checksum, SPDX.checksumValue,
                       Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

                if resource_dict.get('hash_algorithm'):
                    if resource_dict['hash_algorithm'].startswith('http'):
                        g.add((checksum, SPDX.algorithm,
                               URIRef(resource_dict['hash_algorithm'])))
                    else:
                        g.add((checksum, SPDX.algorithm,
                               Literal(resource_dict['hash_algorithm'])))
                g.add((distribution, SPDX.checksum, checksum))
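
For context, `graph_from_dataset` is rarely called directly: in ckanext-dcat it is run for each configured profile by the `RDFSerializer`. A minimal usage sketch, assuming a working ckanext-dcat install (all dictionary values below are illustrative):

from ckanext.dcat.processors import RDFSerializer

dataset_dict = {
    'id': 'a1b2c3',
    'name': 'example-dataset',
    'title': 'Example dataset',
    'notes': 'A dataset used to demonstrate DCAT serialization.',
    'tags': [{'name': 'demo'}],
    'resources': [{
        'id': 'r1',
        'package_id': 'a1b2c3',
        'name': 'Data file',
        'url': 'http://example.com/data.csv',
        'format': 'CSV',
    }],
}

serializer = RDFSerializer()
# serialize_dataset() builds a graph by calling graph_from_dataset()
# on every configured profile, then serializes it.
print(serializer.serialize_dataset(dataset_dict, _format='ttl'))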
Example #3
def export_resource_to_rdf(resource_dict, dataset_dict, _format='xml'):
    """Export the resource in RDF format.

    Builds an RDF Graph containing only the selected resource and exports it to the
    selected format (default ``xml``).

    :param dict resource_dict: resource metadata.
    :param dict dataset_dict: dataset metadata.
    :param str _format: export format. Default is ``xml``.

    :returns: the serialized RDF graph of the resource.
    :rtype: str
    """
    g = Graph()

    distribution = URIRef(resource_uri(resource_dict))

    g.add((distribution, RDF.type, DCAT.Distribution))

    if 'license' not in resource_dict and 'license_id' in dataset_dict:
        lr = LicenseRegister()
        _license = lr.get(dataset_dict['license_id'])
        # Guard against unknown license ids, for which get() returns None
        if _license:
            resource_dict['license'] = _license.url

    #  Simple values
    items = [
        ('name', DCT.title, None, Literal),
        ('description', DCT.description, None, Literal),
        ('status', ADMS.status, None, Literal),
        ('rights', DCT.rights, None, Literal),
        ('license', DCT.license, None, URIRef),
    ]

    for key, rdf_prop, def_value, rdf_type in items:
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    #  Lists
    items = [
        ('documentation', FOAF.page, None, URIRef),
        ('language', DCT.language, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, URIRef),
    ]
    # Inline equivalent of RDFProfile._add_list_triples_from_dict
    for key, rdf_prop, def_value, rdf_type in items:
        value = resource_dict.get(key, def_value)
        if value:
            if isinstance(value, list):
                for val in value:
                    g.add((distribution, rdf_prop, rdf_type(val)))
            else:
                g.add((distribution, rdf_prop, rdf_type(value)))

    # Format
    if '/' in resource_dict.get('format', ''):
        g.add((distribution, DCAT.mediaType, Literal(resource_dict['format'])))
    else:
        if resource_dict.get('format'):
            g.add((distribution, DCT['format'],
                   Literal(resource_dict['format'])))

        if resource_dict.get('mimetype'):
            g.add((distribution, DCAT.mediaType,
                   Literal(resource_dict['mimetype'])))

    # URL
    url = resource_dict.get('url')
    download_url = resource_dict.get('download_url')
    if download_url:
        g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
    if url and url != download_url:
        g.add((distribution, DCAT.accessURL, URIRef(url)))

    # Dates
    items = [
        ('issued', DCT.issued, None, Literal),
        ('modified', DCT.modified, None, Literal),
    ]

    # Inline equivalent of RDFProfile._add_date_triples_from_dict
    for key, rdf_prop, def_value, rdf_type in items:
        value = resource_dict.get(key, def_value)
        if value:
            g.add((distribution, rdf_prop, rdf_type(value)))

    # Numbers
    if resource_dict.get('size'):
        try:
            g.add((distribution, DCAT.byteSize,
                   Literal(float(resource_dict['size']),
                           datatype=XSD.decimal)))
        except (ValueError, TypeError):
            g.add(
                (distribution, DCAT.byteSize, Literal(resource_dict['size'])))
    # Checksum
    if resource_dict.get('hash'):
        checksum = BNode()
        g.add((checksum, SPDX.checksumValue,
               Literal(resource_dict['hash'], datatype=XSD.hexBinary)))

        if resource_dict.get('hash_algorithm'):
            if resource_dict['hash_algorithm'].startswith('http'):
                g.add((checksum, SPDX.algorithm,
                       URIRef(resource_dict['hash_algorithm'])))
            else:
                g.add((checksum, SPDX.algorithm,
                       Literal(resource_dict['hash_algorithm'])))
        g.add((distribution, SPDX.checksum, checksum))

    return g.serialize(format=_format)
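
Finally, a short usage sketch for `export_resource_to_rdf`; the metadata dictionaries are illustrative stand-ins for real CKAN resource and dataset dicts:

resource_dict = {
    'id': 'r1',
    'package_id': 'a1b2c3',
    'name': 'Data file',
    'description': 'CSV dump of the example dataset.',
    'url': 'http://example.com/data.csv',
    'format': 'text/csv',  # contains '/', so emitted as dcat:mediaType
    'size': '2048',        # parses as a float, so typed as xsd:decimal
}
dataset_dict = {'id': 'a1b2c3', 'license_id': 'cc-by'}

print(export_resource_to_rdf(resource_dict, dataset_dict, _format='ttl'))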