def test_format():
    value = helpers.format('14-11-2011', '%Y-%m-%d', 'date')
    eq_(value, '2011-11-14')

    # invalid value, but valid format
    # Python on linux has different behavior with year less than 1000
    value = helpers.format('14-11-0011', '%2Y-%m-%d', 'date')
    eq_(value, '11-11-14')
def test_format():
    value = helpers.format('14-11-2011', '%Y-%m-%d', 'date')
    eq_(value, '2011-11-14')

    # invalid value, but valid format
    value = helpers.format('14-11-0011', '%Y-%m-%d', 'date')
    eq_(value, '14-11-0011')
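# The two test variants above pin down helpers.format's observable behaviour:
# a 'dd-mm-YYYY' date string is re-rendered with the requested strftime
# format, and a value that cannot be converted is returned unchanged.
# A minimal sketch of a helper with that contract follows; the function name
# format_date_sketch and the assumed input format '%d-%m-%Y' are illustrative
# assumptions, not the actual ckanext-dcatapit implementation.
from datetime import datetime


def format_date_sketch(value, str_format='%Y-%m-%d', _type='date'):
    if _type != 'date':
        return value
    try:
        parsed = datetime.strptime(value, '%d-%m-%Y')
        return parsed.strftime(str_format)
    except ValueError:
        # unparseable or unformattable value: hand back the original string
        return value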
def parse_dataset(self, dataset_dict, dataset_ref):

    # check the dataset type
    if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
        # not a DCATAPIT dataset
        return dataset_dict

    # date info
    for predicate, key, logf in (
        (DCT.issued, 'issued', log.debug),
        (DCT.modified, 'modified', log.warn),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            self._remove_from_extra(dataset_dict, key)
            value = helpers.format(value, '%Y-%m-%d', 'date')
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # 0..1 predicates
    for predicate, key, logf in (
        (DCT.identifier, 'identifier', log.warn),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            self._remove_from_extra(dataset_dict, key)
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # 0..n predicates list
    for predicate, key, logf in (
        (DCT.isVersionOf, 'is_version_of', log.debug),
    ):
        valueList = self._object_value_list(dataset_ref, predicate)
        if valueList:
            self._remove_from_extra(dataset_dict, key)
            value = ','.join(valueList)
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # alternate identifiers
    alternate_identifiers = self.g.objects(dataset_ref, ADMS.identifier)
    alt_ids = []
    for alt_id in alternate_identifiers:
        alternate_id = self._alternate_id(dataset_ref, alt_id)
        if alternate_id:
            alt_ids.append(alternate_id)
    dataset_dict['alternate_identifier'] = json.dumps(alt_ids)

    # conformsTo
    self._remove_from_extra(dataset_dict, 'conforms_to')
    conform_list = []
    for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
        conform_list.append(self._conforms_to(conforms_to))
    if conform_list:
        dataset_dict['conforms_to'] = json.dumps(conform_list)
    else:
        log.debug('No DCT.conformsTo found for dataset "%s"',
                  dataset_dict.get('title', '---'))

    # Temporal
    temporal_coverage = self._get_temporal_coverage(dataset_ref)
    if temporal_coverage:
        dataset_dict['temporal_coverage'] = json.dumps(temporal_coverage)
    # start, end = self._time_interval(dataset_ref, DCT.temporal)

    # URI 0..1
    for predicate, key, base_uri in (
        (DCT.accrualPeriodicity, 'frequency', FREQ_BASE_URI),
    ):
        valueRef = self._object_value(dataset_ref, predicate)
        if valueRef:
            self._remove_from_extra(dataset_dict, key)
            value = self._strip_uri(valueRef, base_uri)
            dataset_dict[key] = value
        else:
            log.warn('No %s found for dataset "%s"',
                     predicate, dataset_dict.get('title', '---'))

    # URI lists
    for predicate, key, base_uri in (
        (DCT.language, 'language', LANG_BASE_URI),
    ):
        self._remove_from_extra(dataset_dict, key)
        valueRefList = self._object_value_list(dataset_ref, predicate)
        valueList = [self._strip_uri(valueRef, base_uri)
                     for valueRef in valueRefList]
        value = ','.join(valueList)
        if len(valueList) > 1:
            value = '{' + value + '}'
        dataset_dict[key] = value

    self._parse_themes(dataset_dict, dataset_ref)

    # Spatial
    spatial_tags = []
    geonames_url = None

    for spatial in self.g.objects(dataset_ref, DCT.spatial):
        for spatial_literal in self.g.objects(
                spatial, DCATAPIT.geographicalIdentifier):
            spatial_value = spatial_literal.value
            if GEO_BASE_URI in spatial_value:
                spatial_tags.append(
                    self._strip_uri(spatial_value, GEO_BASE_URI))
            else:
                if geonames_url:
                    log.warn("GeoName URL is already set to %s, "
                             "value %s will not be imported",
                             geonames_url, spatial_value)
                else:
                    geonames_url = spatial_value

    if len(spatial_tags) > 0:
        value = ','.join(spatial_tags)
        if len(spatial_tags) > 1:
            value = '{' + value + '}'
        dataset_dict['geographical_name'] = value

    if geonames_url:
        dataset_dict['geographical_geonames_url'] = geonames_url

    ### Collect strings from multilang fields
    # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...}
    localized_dict = {}

    for key, predicate in (
        ('title', DCT.title),
        ('notes', DCT.description),
    ):
        self._collect_multilang_strings(dataset_dict, key, dataset_ref,
                                        predicate, localized_dict)

    # Agents
    for predicate, basekey in (
        (DCT.publisher, 'publisher'),
        (DCT.rightsHolder, 'holder'),
        # for backward compatibility only,
        # new format is handled with self._parse_creators() below
        (DCT.creator, 'creator'),
    ):
        agent_dict, agent_loc_dict = self._parse_agent(
            dataset_ref, predicate, basekey)
        for key, v in agent_dict.iteritems():
            self._remove_from_extra(dataset_dict, key)
            dataset_dict[key] = v
        localized_dict.update(agent_loc_dict)

    creators = self._parse_creators(dataset_ref)

    # use data from old method to populate new format
    from_old = {}
    if dataset_dict.get('creator_name'):
        from_old['creator_name'] = {DEFAULT_LANG: dataset_dict['creator_name']}
    if dataset_dict.get('creator_identifier'):
        from_old['creator_identifier'] = dataset_dict['creator_identifier']

    # do not add old format if the same identifier is in new data
    # this will avoid duplicates in re-harvesting
    from_old_add = False
    if from_old:
        from_old_add = True
        if from_old.get('creator_identifier'):
            for cr in creators:
                cid = cr.get('creator_identifier')
                if cid is None:
                    continue
                if cid == from_old['creator_identifier']:
                    from_old_add = False
                    break
    if from_old_add:
        creators.append(from_old)
    dataset_dict['creator'] = json.dumps(creators)

    # when all localized data have been parsed, check if there really is any
    # and add it to the dict
    if len(localized_dict) > 0:
        log.debug('Found multilang metadata')
        dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict

    ### Resources

    resources_loc_dict = {}

    # In CKAN, the license is a dataset property, not a resource's one.
    # We'll collect all of the resources' licenses, then postprocess them.
    licenses = []  # contains tuples (uri, name, document_uri)

    for resource_dict in dataset_dict.get('resources', []):
        resource_uri = resource_dict['uri']
        if not resource_uri:
            log.warn("URI not defined for resource %s", resource_dict['name'])
            continue

        distribution = URIRef(resource_uri)
        if not (dataset_ref, DCAT.distribution, distribution) in self.g:
            log.warn("Distribution not found in dataset %s", resource_uri)
            continue

        # fix the CKAN resource's url set by the dcat extension
        resource_dict['url'] = (
            self._object_value(distribution, DCAT.downloadURL) or
            self._object_value(distribution, DCAT.accessURL))

        # URI 0..1
        for predicate, key, base_uri in (
            (DCT['format'], 'format', FORMAT_BASE_URI),  # Format
        ):
            valueRef = self._object_value(distribution, predicate)
            if valueRef:
                value = self._strip_uri(valueRef, base_uri)
                resource_dict[key] = value
            else:
                log.warn('No %s found for resource "%s"::"%s"',
                         predicate,
                         dataset_dict.get('title', '---'),
                         resource_dict.get('name', '---'))

        # License
        license = self._object(distribution, DCT.license)
        if license:
            license_uri = unicode(license)
            license_dct = self._object_value(license, DCT.type)
            # may be either the title or the id
            license_names = self.g.objects(license, FOAF.name)
            license_version = self._object_value(license, FOAF.versionInfo)

            names = {}
            prefname = None
            for l in license_names:
                if l.language:
                    names[l.language] = unicode(l)
                else:
                    prefname = unicode(l)

            license_type = interfaces.get_license_from_dcat(
                license_uri, license_dct, prefname, **names)
            if license_version and unicode(license_version) != license_type.version:
                log.warn("License version mismatch between %s and %s",
                         license_version, license_type.version)

            resource_dict['license_type'] = license_type.uri

            try:
                license_name = names['it']
            except KeyError:
                try:
                    license_name = names['en']
                except KeyError:
                    license_name = names.values()[0] if names else license_type.default_name

            log.info("Setting license %s %s %s",
                     license_type.uri, license_name, license_type.document_uri)
            licenses.append((license_type.uri, license_name,
                             license_type.document_uri))
        else:
            log.warn('No license found for resource "%s"::"%s"',
                     dataset_dict.get('title', '---'),
                     resource_dict.get('name', '---'))

        # Multilang
        loc_dict = {}
        for key, predicate in (
            ('name', DCT.title),
            ('description', DCT.description),
        ):
            self._collect_multilang_strings(resource_dict, key, distribution,
                                            predicate, loc_dict)

        if len(loc_dict) > 0:
            log.debug('Found multilang metadata in resource %s',
                      resource_dict['name'])
            resources_loc_dict[resource_uri] = loc_dict

    if len(resources_loc_dict) > 0:
        log.debug('Found multilang metadata in resources')
        dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict

    # postprocess licenses
    # license_ids = {id for url, id in licenses}  # does not work in python 2.6
    license_ids = set()
    for lic_uri, id, doc_uri in licenses:
        license_ids.add(id)

    if len(license_ids) == 1:
        dataset_dict['license_id'] = license_ids.pop()
        # TODO: map to internally defined licenses
    else:
        log.warn('%d licenses found for dataset "%s"',
                 len(license_ids), dataset_dict.get('title', '---'))
        dataset_dict['license_id'] = 'notspecified'

    return dataset_dict
def test_format():
    value = helpers.format('14-11-2011', '%Y-%m-%d', 'date')
    eq_(value, '2011-11-14')
def parse_dataset(self, dataset_dict, dataset_ref):

    # check the dataset type
    if (dataset_ref, RDF.type, DCATAPIT.Dataset) not in self.g:
        # not a DCATAPIT dataset
        return dataset_dict

    # date info
    for predicate, key, logf in (
        (DCT.issued, 'issued', log.debug),
        (DCT.modified, 'modified', log.warn),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            self._remove_from_extra(dataset_dict, key)
            value = helpers.format(value, '%Y-%m-%d', 'date')
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # 0..1 predicates
    for predicate, key, logf in (
        (DCT.identifier, 'identifier', log.warn),
    ):
        value = self._object_value(dataset_ref, predicate)
        if value:
            self._remove_from_extra(dataset_dict, key)
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # 0..n predicates list
    for predicate, key, logf in (
        (ADMS.identifier, 'alternate_identifier', log.debug),
        (DCT.isVersionOf, 'is_version_of', log.debug),
    ):
        valueList = self._object_value_list(dataset_ref, predicate)
        if valueList:
            self._remove_from_extra(dataset_dict, key)
            value = ','.join(valueList)
            dataset_dict[key] = value
        else:
            logf('No %s found for dataset "%s"',
                 predicate, dataset_dict.get('title', '---'))

    # conformsTo
    self._remove_from_extra(dataset_dict, 'conforms_to')
    conform_list = []
    for conforms_to in self.g.objects(dataset_ref, DCT.conformsTo):
        conform_list.append(self._object_value(conforms_to, DCT.identifier))
    if conform_list:
        value = ','.join(conform_list)
        dataset_dict['conforms_to'] = value
    else:
        log.debug('No DCT.conformsTo found for dataset "%s"',
                  dataset_dict.get('title', '---'))

    # Temporal
    start, end = self._time_interval(dataset_ref, DCT.temporal)
    for v, key, logf in (
        (start, 'temporal_start', log.debug),
        (end, 'temporal_end', log.debug),
    ):
        if v:
            self._remove_from_extra(dataset_dict, key)
            value = helpers.format(v, '%Y-%m-%d', 'date')
            dataset_dict[key] = value
        else:
            log.warn('No %s Date found for dataset "%s"',
                     key, dataset_dict.get('title', '---'))

    # URI 0..1
    for predicate, key, base_uri in (
        (DCT.accrualPeriodicity, 'frequency', FREQ_BASE_URI),
    ):
        valueRef = self._object_value(dataset_ref, predicate)
        if valueRef:
            self._remove_from_extra(dataset_dict, key)
            value = self._strip_uri(valueRef, base_uri)
            dataset_dict[key] = value
        else:
            log.warn('No %s found for dataset "%s"',
                     predicate, dataset_dict.get('title', '---'))

    # URI lists
    for predicate, key, base_uri in (
        (DCT.language, 'language', LANG_BASE_URI),
        (DCAT.theme, 'theme', THEME_BASE_URI),
    ):
        self._remove_from_extra(dataset_dict, key)
        valueRefList = self._object_value_list(dataset_ref, predicate)
        valueList = [self._strip_uri(valueRef, base_uri)
                     for valueRef in valueRefList]
        value = ','.join(valueList)
        if len(valueList) > 1:
            value = '{' + value + '}'
        dataset_dict[key] = value

    # Spatial
    spatial_tags = []
    geonames_url = None

    for spatial in self.g.objects(dataset_ref, DCT.spatial):
        for spatial_literal in self.g.objects(
                spatial, DCATAPIT.geographicalIdentifier):
            spatial_value = spatial_literal.value
            if GEO_BASE_URI in spatial_value:
                spatial_tags.append(
                    self._strip_uri(spatial_value, GEO_BASE_URI))
            else:
                if geonames_url:
                    log.warn("GeoName URL is already set to %s, "
                             "value %s will not be imported",
                             geonames_url, spatial_value)
                else:
                    geonames_url = spatial_value

    if len(spatial_tags) > 0:
        value = ','.join(spatial_tags)
        if len(spatial_tags) > 1:
            value = '{' + value + '}'
        dataset_dict['geographical_name'] = value

    if geonames_url:
dataset_dict['geographical_geonames_url'] = geonames_url ### Collect strings from multilang fields # { 'field_name': {'it': 'italian loc', 'de': 'german loc', ...}, ...} localized_dict = {} for key, predicate in ( ('title', DCT.title), ('notes', DCT.description), ): self._collect_multilang_strings(dataset_dict, key, dataset_ref, predicate, localized_dict) # Agents for predicate, basekey in ( (DCT.publisher, 'publisher'), (DCT.rightsHolder, 'holder'), (DCT.creator, 'creator'), ): agent_dict, agent_loc_dict = self._parse_agent( dataset_ref, predicate, basekey) for key, v in agent_dict.iteritems(): self._remove_from_extra(dataset_dict, key) dataset_dict[key] = v localized_dict.update(agent_loc_dict) # when all localized data have been parsed, check if there really any and add it to the dict if len(localized_dict) > 0: log.debug('Found multilang metadata') dataset_dict[LOCALISED_DICT_NAME_BASE] = localized_dict ### Resources resources_loc_dict = {} # In ckan, the license is a dataset property, not resource's # We'll collect all of the resources' licenses, then we will postprocess them licenses = [] # contains tuples (url, name) for resource_dict in dataset_dict.get('resources', []): resource_uri = resource_dict['uri'] if not resource_uri: log.warn("URI not defined for resource %s", resource_dict['name']) continue distribution = URIRef(resource_uri) if not (dataset_ref, DCAT.distribution, distribution) in self.g: log.warn("Distribution not found in dataset %s", resource_uri) continue # URI 0..1 for predicate, key, base_uri in ( (DCT['format'], 'format', FORMAT_BASE_URI), # Format ): valueRef = self._object_value(distribution, predicate) if valueRef: value = self._strip_uri(valueRef, base_uri) resource_dict[key] = value else: log.warn('No %s found for resource "%s"::"%s"', predicate, dataset_dict.get('title', '---'), resource_dict.get('name', '---')) # License license = self._object(distribution, DCT.license) if license: # just add this info in the resource extras resource_dict['license_url'] = str(license) license_name = self._object_value( license, FOAF.name) # may be either the title or the id if (license_name): # just add this info in the resource extras resource_dict['license_name'] = license_name else: license_name = "unknown" licenses.append((str(license), license_name)) else: log.warn('No license found for resource "%s"::"%s"', dataset_dict.get('title', '---'), resource_dict.get('name', '---')) # Multilang loc_dict = {} for key, predicate in ( ('name', DCT.title), ('description', DCT.description), ): self._collect_multilang_strings(resource_dict, key, distribution, predicate, loc_dict) if len(loc_dict) > 0: log.debug('Found multilang metadata in resource %s', resource_dict['name']) resources_loc_dict[resource_uri] = loc_dict if len(resources_loc_dict) > 0: log.debug('Found multilang metadata in resources') dataset_dict[LOCALISED_DICT_NAME_RESOURCES] = resources_loc_dict # postprocess licenses # license_ids = {id for url,id in licenses} # does not work in python 2.6 license_ids = set() for url, id in licenses: license_ids.add(id) if license_ids: if len(license_ids) > 1: log.warn('More than one license found for dataset "%s"', dataset_dict.get('title', '---')) dataset_dict['license_id'] = license_ids.pop() # take a random one return dataset_dict