def test_format():
    """Check the 'date' output of ``helpers.format`` for two input dates."""
    cases = (
        ('14-11-2011', '%Y-%m-%d', '2011-11-14'),
        # invalid value, but valid format
        # Python on linux has different behavior with year less than 1000
        ('14-11-0011', '%2Y-%m-%d', '11-11-14'),
    )
    for raw, pattern, expected in cases:
        eq_(helpers.format(raw, pattern, 'date'), expected)
示例#2
0
def test_format():
    """Check the 'date' output of ``helpers.format`` for two input dates."""
    cases = (
        ('14-11-2011', '%Y-%m-%d', '2011-11-14'),
        # invalid value, but valid format: the input is expected back as-is
        ('14-11-0011', '%Y-%m-%d', '14-11-0011'),
    )
    for raw, pattern, expected in cases:
        eq_(helpers.format(raw, pattern, 'date'), expected)
示例#3
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        """Enrich ``dataset_dict`` with the DCATAP-IT metadata of ``dataset_ref``.

        If ``dataset_ref`` is not typed as ``DCATAPIT.Dataset`` in ``self.g``
        the dict is returned untouched.  Otherwise the graph is read and the
        following keys are written (when a value is found, unless noted):

        * ``issued`` / ``modified`` (formatted as ``%Y-%m-%d``), ``identifier``,
          ``is_version_of``, ``frequency``, ``conforms_to`` (JSON),
          ``temporal_coverage`` (JSON), ``geographical_name``,
          ``geographical_geonames_url``;
        * ``alternate_identifier`` (JSON), ``language``, ``creator`` (JSON) and
          ``license_id`` are always written;
        * ``LOCALISED_DICT_NAME_BASE`` / ``LOCALISED_DICT_NAME_RESOURCES`` when
          multilang strings were collected;
        * per resource: ``url``, ``format`` and ``license_type``.

        :param dataset_dict: dataset dict being built; it is mutated in place.
        :param dataset_ref: graph node identifying the dataset in ``self.g``.
        :returns: the same ``dataset_dict`` object.
        """

        # check the dataset type
        if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
            # not a DCATAPIT dataset
            return dataset_dict

        # date info
        for predicate, key, logf in (
            (DCT.issued, 'issued', log.debug),
            (DCT.modified, 'modified', log.warn),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)

                value = helpers.format(value, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..1 predicates
        for predicate, key, logf in ((DCT.identifier, 'identifier',
                                      log.warn), ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..n predicates list (stored as a comma-separated string)
        for predicate, key, logf in ((DCT.isVersionOf, 'is_version_of',
                                      log.debug), ):
            valueList = self._object_value_list(dataset_ref, predicate)
            if valueList:
                self._remove_from_extra(dataset_dict, key)
                value = ','.join(valueList)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # alternate identifiers: always written, possibly as an empty JSON list
        alternate_identifiers = self.g.objects(dataset_ref, ADMS.identifier)
        alt_ids = []
        for alt_id in alternate_identifiers:
            alternate_id = self._alternate_id(dataset_ref, alt_id)
            if alternate_id:
                alt_ids.append(alternate_id)
        dataset_dict['alternate_identifier'] = json.dumps(alt_ids)

        # conformsTo
        self._remove_from_extra(dataset_dict, 'conforms_to')
        conform_list = []
        for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
            conform_list.append(self._conforms_to(conforms_to))
        if conform_list:
            dataset_dict['conforms_to'] = json.dumps(conform_list)
        else:
            log.debug('No DCT.conformsTo found for dataset "%s"',
                      dataset_dict.get('title', '---'))

        # Temporal
        temporal_coverage = self._get_temporal_coverage(dataset_ref)
        if temporal_coverage:
            dataset_dict['temporal_coverage'] = json.dumps(temporal_coverage)

        #start, end = self._time_interval(dataset_ref, DCT.temporal)

        # URI 0..1
        for predicate, key, base_uri in ((DCT.accrualPeriodicity, 'frequency',
                                          FREQ_BASE_URI), ):
            valueRef = self._object_value(dataset_ref, predicate)
            if valueRef:
                self._remove_from_extra(dataset_dict, key)
                value = self._strip_uri(valueRef, base_uri)
                dataset_dict[key] = value
            else:
                log.warn('No %s found for dataset "%s"', predicate,
                         dataset_dict.get('title', '---'))

        # URI lists: the key is always written; several values are wrapped
        # in braces, e.g. "{a,b}"
        for predicate, key, base_uri in ((DCT.language, 'language',
                                          LANG_BASE_URI), ):
            self._remove_from_extra(dataset_dict, key)
            valueRefList = self._object_value_list(dataset_ref, predicate)
            valueList = [
                self._strip_uri(valueRef, base_uri)
                for valueRef in valueRefList
            ]
            value = ','.join(valueList)
            if len(valueList) > 1:
                value = '{' + value + '}'
            dataset_dict[key] = value

        self._parse_themes(dataset_dict, dataset_ref)

        # Spatial
        spatial_tags = []
        geonames_url = None

        for spatial in self.g.objects(dataset_ref, DCT.spatial):
            for spatial_literal in self.g.objects(
                    spatial, DCATAPIT.geographicalIdentifier):
                spatial_value = spatial_literal.value
                if GEO_BASE_URI in spatial_value:
                    spatial_tags.append(
                        self._strip_uri(spatial_value, GEO_BASE_URI))
                else:
                    # any other identifier is taken as the GeoNames URL;
                    # only the first one encountered is kept
                    if geonames_url:
                        log.warn(
                            "GeoName URL is already set to %s, value %s will not be imported",
                            geonames_url, spatial_value)
                    else:
                        geonames_url = spatial_value

        if len(spatial_tags) > 0:
            value = ','.join(spatial_tags)
            if len(spatial_tags) > 1:
                value = '{' + value + '}'
            dataset_dict['geographical_name'] = value

        if geonames_url:
            dataset_dict['geographical_geonames_url'] = geonames_url

        ### Collect strings from multilang fields

        # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...}
        localized_dict = {}

        for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
        ):
            self._collect_multilang_strings(dataset_dict, key, dataset_ref,
                                            predicate, localized_dict)

        # Agents
        for predicate, basekey in (
            (DCT.publisher, 'publisher'),
            (DCT.rightsHolder, 'holder'),
                # for backward compatibility only,
                # new format is handled with self._parse_creators() below
            (DCT.creator, 'creator'),
        ):
            agent_dict, agent_loc_dict = self._parse_agent(
                dataset_ref, predicate, basekey)
            for key, v in agent_dict.iteritems():
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = v
            localized_dict.update(agent_loc_dict)

        creators = self._parse_creators(dataset_ref)

        # use data from old method to populate new format
        from_old = {}
        if dataset_dict.get('creator_name'):
            from_old['creator_name'] = {
                DEFAULT_LANG: dataset_dict['creator_name']
            }
        if dataset_dict.get('creator_identifier'):
            from_old['creator_identifier'] = dataset_dict['creator_identifier']

        # do not add old format if the same identifier is in new data
        # this will avoid duplicates in re-harvesting
        from_old_add = False
        if from_old:
            from_old_add = True
            if from_old.get('creator_identifier'):
                for cr in creators:
                    cid = cr.get('creator_identifier')
                    if cid is None:
                        continue
                    if cid == from_old['creator_identifier']:
                        from_old_add = False
                        break
        if from_old_add:
            creators.append(from_old)
        dataset_dict['creator'] = json.dumps(creators)

        # when all localized data have been parsed, check if there really any and add it to the dict
        if len(localized_dict) > 0:
            log.debug('Found multilang metadata')
            dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict

        ### Resources

        resources_loc_dict = {}

        # In ckan, the license is a dataset property, not resource's
        # We'll collect all of the resources' licenses, then we will postprocess them
        licenses = []  # contains tuples (license uri, license name, document uri)

        for resource_dict in dataset_dict.get('resources', []):
            resource_uri = resource_dict['uri']
            if not resource_uri:
                log.warn("URI not defined for resource %s",
                         resource_dict['name'])
                continue

            distribution = URIRef(resource_uri)
            if not (dataset_ref, DCAT.distribution, distribution) in self.g:
                log.warn("Distribution not found in dataset %s", resource_uri)
                continue

            # fix the CKAN resource's url set by the dcat extension
            resource_dict['url'] = (
                self._object_value(distribution, DCAT.downloadURL)
                or self._object_value(distribution, DCAT.accessURL))

            # URI 0..1
            for predicate, key, base_uri in (
                (DCT['format'], 'format', FORMAT_BASE_URI),  # Format
            ):
                valueRef = self._object_value(distribution, predicate)
                if valueRef:
                    value = self._strip_uri(valueRef, base_uri)
                    resource_dict[key] = value
                else:
                    log.warn('No %s found for resource "%s"::"%s"', predicate,
                             dataset_dict.get('title', '---'),
                             resource_dict.get('name', '---'))

            # License
            license = self._object(distribution, DCT.license)
            if license:

                license_uri = unicode(license)
                license_dct = self._object_value(license, DCT.type)
                license_names = self.g.objects(
                    license, FOAF.name)  # may be either the title or the id
                license_version = self._object_value(license, FOAF.versionInfo)

                # names carrying a language tag are indexed by language;
                # an untagged name is used as the preferred name
                names = {}
                prefname = None
                for l in license_names:
                    if l.language:
                        names[l.language] = unicode(l)
                    else:
                        prefname = unicode(l)

                license_type = interfaces.get_license_from_dcat(
                    license_uri, license_dct, prefname, **names)
                if license_version and unicode(
                        license_version) != license_type.version:
                    # report the version read from the graph; this used to
                    # reference an undefined name and raised NameError
                    log.warn("License version mismatch between %s and %s",
                             license_version, license_type.version)

                resource_dict['license_type'] = license_type.uri
                # pick the license name: 'it' first, then 'en', then any
                # available language, and finally the license default name
                try:
                    license_name = names['it']
                except KeyError:
                    try:
                        license_name = names['en']
                    except KeyError:
                        license_name = names.values(
                        )[0] if names else license_type.default_name

                log.info("Setting lincense %s %s %s", license_type.uri,
                         license_name, license_type.document_uri)

                licenses.append((license_type.uri, license_name,
                                 license_type.document_uri))
            else:
                log.warn('No license found for resource "%s"::"%s"',
                         dataset_dict.get('title', '---'),
                         resource_dict.get('name', '---'))

            # Multilang
            loc_dict = {}

            for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
            ):
                self._collect_multilang_strings(resource_dict, key,
                                                distribution, predicate,
                                                loc_dict)

            if len(loc_dict) > 0:
                log.debug('Found multilang metadata in resource %s',
                          resource_dict['name'])
                resources_loc_dict[resource_uri] = loc_dict

        if len(resources_loc_dict) > 0:
            log.debug('Found multilang metadata in resources')
            dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict

        # postprocess licenses
        # a set comprehension does not work in python 2.6, hence the loop
        license_ids = set()
        for _lic_uri, lic_name, _doc_uri in licenses:
            license_ids.add(lic_name)

        # the dataset gets a license only when all resources agree on one
        if len(license_ids) == 1:
            dataset_dict['license_id'] = license_ids.pop()
            # TODO Map to internally defined licenses
        else:
            log.warn('%d licenses found for dataset "%s"', len(license_ids),
                     dataset_dict.get('title', '---'))
            dataset_dict['license_id'] = 'notspecified'

        return dataset_dict
示例#4
0
def test_format():
    """Check that ``helpers.format`` renders '14-11-2011' as '2011-11-14'."""
    expected = '2011-11-14'
    formatted = helpers.format('14-11-2011', '%Y-%m-%d', 'date')
    eq_(formatted, expected)
示例#5
0
    def parse_dataset(self, dataset_dict, dataset_ref):
        """Enrich ``dataset_dict`` with the DCATAP-IT metadata of ``dataset_ref``.

        If ``dataset_ref`` is not typed as ``DCATAPIT.Dataset`` in ``self.g``
        the dict is returned untouched.  Otherwise values read from the graph
        are written into ``dataset_dict`` (dates, identifiers, conformsTo,
        temporal start/end, frequency, language, theme, spatial identifiers,
        multilang strings, agents) and into each of its resources (format and
        license info).  The dataset ``license_id`` is derived from the
        licenses collected on the resources.

        :param dataset_dict: dataset dict being built; it is mutated in place.
        :param dataset_ref: graph node identifying the dataset in ``self.g``.
        :returns: the same ``dataset_dict`` object.
        """

        # check the dataset type
        if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
            # not a DCATAPIT dataset
            return dataset_dict

        # date info
        for predicate, key, logf in (
            (DCT.issued, 'issued', log.debug),
            (DCT.modified, 'modified', log.warn),
        ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)

                value = helpers.format(value, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..1 predicates
        for predicate, key, logf in ((DCT.identifier, 'identifier',
                                      log.warn), ):
            value = self._object_value(dataset_ref, predicate)
            if value:
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # 0..n predicates list (stored as a comma-separated string)
        for predicate, key, logf in (
            (ADMS.identifier, 'alternate_identifier', log.debug),
            (DCT.isVersionOf, 'is_version_of', log.debug),
        ):
            valueList = self._object_value_list(dataset_ref, predicate)
            if valueList:
                self._remove_from_extra(dataset_dict, key)
                value = ','.join(valueList)
                dataset_dict[key] = value
            else:
                logf('No %s found for dataset "%s"', predicate,
                     dataset_dict.get('title', '---'))

        # conformsTo
        self._remove_from_extra(dataset_dict, 'conforms_to')
        conform_list = []
        for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
            # NOTE(review): the join below needs strings; presumably
            # _object_value never returns None here — confirm
            conform_list.append(self._object_value(conforms_to,
                                                   DCT.identifier))
        if conform_list:
            value = ','.join(conform_list)
            dataset_dict['conforms_to'] = value
        else:
            log.debug('No DCT.conformsTo found for dataset "%s"',
                      dataset_dict.get('title', '---'))

        # Temporal
        start, end = self._time_interval(dataset_ref, DCT.temporal)
        # NOTE(review): logf is unpacked but never used in this loop; a
        # missing date is always reported through log.warn — confirm the
        # intended log level
        for v, key, logf in (
            (start, 'temporal_start', log.debug),
            (end, 'temporal_end', log.debug),
        ):
            if v:
                self._remove_from_extra(dataset_dict, key)

                value = helpers.format(v, '%Y-%m-%d', 'date')
                dataset_dict[key] = value
            else:
                log.warn('No %s Date found for dataset "%s"', key,
                         dataset_dict.get('title', '---'))

        # URI 0..1
        for predicate, key, base_uri in ((DCT.accrualPeriodicity, 'frequency',
                                          FREQ_BASE_URI), ):
            valueRef = self._object_value(dataset_ref, predicate)
            if valueRef:
                self._remove_from_extra(dataset_dict, key)
                value = self._strip_uri(valueRef, base_uri)
                dataset_dict[key] = value
            else:
                log.warn('No %s found for dataset "%s"', predicate,
                         dataset_dict.get('title', '---'))

        # URI lists: the key is always written; several values are wrapped
        # in braces, e.g. "{a,b}"
        for predicate, key, base_uri in (
            (DCT.language, 'language', LANG_BASE_URI),
            (DCAT.theme, 'theme', THEME_BASE_URI),
        ):
            self._remove_from_extra(dataset_dict, key)
            valueRefList = self._object_value_list(dataset_ref, predicate)
            valueList = [
                self._strip_uri(valueRef, base_uri)
                for valueRef in valueRefList
            ]
            value = ','.join(valueList)
            if len(valueList) > 1:
                value = '{' + value + '}'
            dataset_dict[key] = value

        # Spatial
        spatial_tags = []
        geonames_url = None

        for spatial in self.g.objects(dataset_ref, DCT.spatial):
            for spatial_literal in self.g.objects(
                    spatial, DCATAPIT.geographicalIdentifier):
                spatial_value = spatial_literal.value
                if GEO_BASE_URI in spatial_value:
                    spatial_tags.append(
                        self._strip_uri(spatial_value, GEO_BASE_URI))
                else:
                    # any other identifier is taken as the GeoNames URL;
                    # only the first one encountered is kept
                    if geonames_url:
                        log.warn(
                            "GeoName URL is already set to %s, value %s will not be imported",
                            geonames_url, spatial_value)
                    else:
                        geonames_url = spatial_value

        if len(spatial_tags) > 0:
            value = ','.join(spatial_tags)
            if len(spatial_tags) > 1:
                value = '{' + value + '}'
            dataset_dict['geographical_name'] = value

        if geonames_url:
            dataset_dict['geographical_geonames_url'] = geonames_url

        ### Collect strings from multilang fields

        # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...}
        localized_dict = {}

        for key, predicate in (
            ('title', DCT.title),
            ('notes', DCT.description),
        ):
            self._collect_multilang_strings(dataset_dict, key, dataset_ref,
                                            predicate, localized_dict)

        # Agents
        for predicate, basekey in (
            (DCT.publisher, 'publisher'),
            (DCT.rightsHolder, 'holder'),
            (DCT.creator, 'creator'),
        ):
            agent_dict, agent_loc_dict = self._parse_agent(
                dataset_ref, predicate, basekey)
            for key, v in agent_dict.iteritems():
                self._remove_from_extra(dataset_dict, key)
                dataset_dict[key] = v
            localized_dict.update(agent_loc_dict)

        # when all localized data have been parsed, check if there really any and add it to the dict
        if len(localized_dict) > 0:
            log.debug('Found multilang metadata')
            dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict

        ### Resources

        resources_loc_dict = {}

        # In ckan, the license is a dataset property, not resource's
        # We'll collect all of the resources' licenses, then we will postprocess them
        licenses = []  #  contains tuples (url, name)

        for resource_dict in dataset_dict.get('resources', []):
            resource_uri = resource_dict['uri']
            if not resource_uri:
                log.warn("URI not defined for resource %s",
                         resource_dict['name'])
                continue

            # skip resources that are not a distribution of this dataset
            distribution = URIRef(resource_uri)
            if not (dataset_ref, DCAT.distribution, distribution) in self.g:
                log.warn("Distribution not found in dataset %s", resource_uri)
                continue

            # URI 0..1
            for predicate, key, base_uri in (
                (DCT['format'], 'format', FORMAT_BASE_URI),  # Format
            ):
                valueRef = self._object_value(distribution, predicate)
                if valueRef:
                    value = self._strip_uri(valueRef, base_uri)
                    resource_dict[key] = value
                else:
                    log.warn('No %s found for resource "%s"::"%s"', predicate,
                             dataset_dict.get('title', '---'),
                             resource_dict.get('name', '---'))

            # License
            license = self._object(distribution, DCT.license)
            if license:
                # just add this info in the resource extras
                # NOTE(review): on Python 2, str() of a non-ASCII unicode
                # value raises UnicodeEncodeError — confirm license URIs
                # are always ASCII
                resource_dict['license_url'] = str(license)
                license_name = self._object_value(
                    license, FOAF.name)  # may be either the title or the id
                if (license_name):
                    # just add this info in the resource extras
                    resource_dict['license_name'] = license_name
                else:
                    # placeholder used only for the dataset-level license
                    # selection below; it is not stored on the resource
                    license_name = "unknown"
                licenses.append((str(license), license_name))
            else:
                log.warn('No license found for resource "%s"::"%s"',
                         dataset_dict.get('title', '---'),
                         resource_dict.get('name', '---'))

            # Multilang
            loc_dict = {}

            for key, predicate in (
                ('name', DCT.title),
                ('description', DCT.description),
            ):
                self._collect_multilang_strings(resource_dict, key,
                                                distribution, predicate,
                                                loc_dict)

            if len(loc_dict) > 0:
                log.debug('Found multilang metadata in resource %s',
                          resource_dict['name'])
                resources_loc_dict[resource_uri] = loc_dict

        if len(resources_loc_dict) > 0:
            log.debug('Found multilang metadata in resources')
            dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict

        # postprocess licenses
        # license_ids = {id for url,id in licenses}  # does not work in python 2.6
        license_ids = set()
        for url, id in licenses:
            license_ids.add(id)

        # 'license_id' is left untouched when no resource carried a license
        if license_ids:
            if len(license_ids) > 1:
                log.warn('More than one license found for dataset "%s"',
                         dataset_dict.get('title', '---'))
            dataset_dict['license_id'] = license_ids.pop()  # take a random one

        return dataset_dict