def pre_validate(self, form):
    '''
    Check the field data as a URL before the regular validators run.

    Falsy data is accepted as-is. Otherwise the data is passed to
    ``uris.validate`` and a ``uris.ValidationError`` is turned into a
    ``validators.ValidationError`` carrying the translated
    ``'Invalid URL'`` message.

    :param form: the owning form (not used here)
    :returns: ``True`` when no error was raised
    '''
    candidate = self.data
    if not candidate:
        return True
    try:
        uris.validate(candidate)
    except uris.ValidationError:
        raise validators.ValidationError(_('Invalid URL'))
    return True
def pre_validate(self, form):
    '''
    Reject field data that ``uris.validate`` does not accept.

    Nothing is checked when the field data is falsy. A
    ``uris.ValidationError`` is re-raised as a
    ``validators.ValidationError`` with the translated ``'Invalid URL'``
    message.

    :param form: the owning form (not used here)
    :returns: ``True`` when no error was raised
    '''
    submitted = self.data
    if submitted:
        try:
            uris.validate(submitted)
        except uris.ValidationError:
            raise validators.ValidationError(_('Invalid URL'))
    return True
def dataset_from_rdf(graph, dataset=None, node=None):
    '''
    Create or update a dataset from a RDF/DCAT graph

    :param graph: the graph holding the dataset description
    :param dataset: an existing dataset to update; a new ``Dataset`` is
        created when it is falsy
    :param node: the dataset node; when ``None`` the first node typed
        ``DCAT.Dataset`` in the graph is used
    :returns: the created or updated dataset
    '''
    if not dataset:
        dataset = Dataset()
    if node is None:
        # Assume first match is the only match
        node = graph.value(predicate=RDF.type, object=DCAT.Dataset)

    d = graph.resource(node)

    dataset.title = rdf_value(d, DCT.title)
    # Support dct:abstract if dct:description is missing
    # (sometimes used instead)
    raw_description = d.value(DCT.description) or d.value(DCT.abstract)
    dataset.description = sanitize_html(raw_description)
    dataset.frequency = frequency_from_rdf(d.value(DCT.accrualPeriodicity))
    dataset.created_at = rdf_value(d, DCT.issued, dataset.created_at)
    dataset.last_modified = rdf_value(d, DCT.modified, dataset.last_modified)

    alt_label = rdf_value(d, SKOS.altLabel)
    if alt_label:
        dataset.acronym = alt_label

    # Keywords first, then the themes that are not RDF resources
    labels = []
    for keyword in d.objects(DCAT.keyword):
        labels.append(keyword.toPython())
    for theme in d.objects(DCAT.theme):
        if not isinstance(theme, RdfResource):
            labels.append(theme.toPython())
    dataset.tags = list(set(labels))

    dct_identifier = rdf_value(d, DCT.identifier)
    if dct_identifier:
        dataset.extras['dct:identifier'] = dct_identifier

    if isinstance(d.identifier, URIRef):
        dataset.extras['uri'] = d.identifier.toPython()

    landing_page = url_from_rdf(d, DCAT.landingPage)
    if landing_page:
        try:
            uris.validate(landing_page)
        except uris.ValidationError:
            # An invalid landing page is ignored on purpose
            pass
        else:
            dataset.extras['remote_url'] = landing_page

    dataset.temporal_coverage = temporal_from_rdf(d.value(DCT.temporal))

    # Collect the license hints carried by the distributions
    found_licenses = set()
    for distribution in d.objects(DCAT.distribution | DCAT.distributions):
        resource_from_rdf(distribution, dataset)
        for predicate in (DCT.license, DCT.rights):
            hint = distribution.value(predicate)
            if isinstance(hint, (URIRef, Literal)):
                found_licenses.add(hint.toPython())
            elif isinstance(hint, RdfResource):
                found_licenses.add(hint.identifier.toPython())

    fallback_license = dataset.license or License.default()
    declared_license = rdf_value(d, DCT.license)
    dataset.license = License.guess(
        declared_license, *found_licenses, default=fallback_license
    )

    return dataset
def validate(self, value):
    '''
    Validate ``value`` with the parent field, then as a URL.

    Only the ``private``, ``local``, ``schemes`` and ``tlds`` attributes
    that are not ``None`` are forwarded to ``uris.validate``. A
    ``uris.ValidationError`` is reported through ``self.error``.

    :param value: the URL to validate
    '''
    super(URLField, self).validate(value)
    kwargs = {
        a: getattr(self, a)
        for a in ('private', 'local', 'schemes', 'tlds')
        if getattr(self, a) is not None
    }
    try:
        uris.validate(value, **kwargs)
    except uris.ValidationError as e:
        # Exceptions have no ``message`` attribute on Python 3, so reading
        # it unconditionally would hide the validation error behind an
        # AttributeError. Use it when present, else the string form.
        self.error(getattr(e, 'message', None) or str(e))
def validate(self, value):
    '''
    Run the parent validation, then check ``value`` with ``uris.validate``.

    The ``private``, ``local``, ``schemes`` and ``tlds`` attributes are
    passed as keyword arguments when they are not ``None``. Any
    ``uris.ValidationError`` is handed to ``self.error``.

    :param value: the URL to validate
    '''
    super(URLField, self).validate(value)
    kwargs = {
        a: getattr(self, a)
        for a in ('private', 'local', 'schemes', 'tlds')
        if getattr(self, a) is not None
    }
    try:
        uris.validate(value, **kwargs)
    except uris.ValidationError as e:
        # ``e.message`` is not available on Python 3 exceptions; fall back
        # to ``str(e)`` so the real validation message is reported.
        self.error(getattr(e, 'message', None) or str(e))
def converter(value):
    '''
    Normalize and validate a URL value.

    ``None`` is returned untouched. When the value has no ``://`` and a
    ``default_scheme`` is set, the stripped value is prefixed with that
    scheme before validation.

    :returns: the result of ``uris.validate``
    :raises Invalid: when ``uris.validate`` rejects the value
    '''
    if value is None:
        return value
    if '://' not in value and default_scheme:
        parts = (default_scheme, value.strip())
        value = '://'.join(parts)
    try:
        validated = uris.validate(value)
    except uris.ValidationError as e:
        raise Invalid(str(e))
    return validated
def converter(value):
    '''
    Normalize and validate a URL value.

    ``None`` is returned untouched. When the value has no ``://`` and a
    ``default_scheme`` is set, the stripped value is prefixed with that
    scheme before validation.

    :returns: the result of ``uris.validate``
    :raises Invalid: when ``uris.validate`` rejects the value
    '''
    if value is None:
        return value
    if '://' not in value and default_scheme:
        value = '://'.join((default_scheme, value.strip()))
    try:
        return uris.validate(value)
    except uris.ValidationError as e:
        # Exceptions have no ``message`` attribute on Python 3; fall back
        # to the string form so ``Invalid`` always gets the real message.
        raise Invalid(getattr(e, 'message', None) or str(e))
def frequency_from_rdf(term):
    '''
    Resolve an RDF frequency term.

    A string that ``uris.validate`` accepts is turned into a ``URIRef``
    and a ``RdfResource`` is replaced by its identifier. For a ``URIRef``
    containing ``EUFREQ`` the value is looked up in
    ``EU_RDF_REQUENCIES``; for any other ``URIRef`` the last element
    returned by ``namespace_manager.compute_qname`` is used.

    :returns: the resolved frequency, or ``None`` when ``term`` does not
        end up being a ``URIRef``
    '''
    if isinstance(term, basestring):
        try:
            validated = uris.validate(term)
        except uris.ValidationError:
            # Not a valid URL: keep the term untouched
            pass
        else:
            term = URIRef(validated)

    if isinstance(term, RdfResource):
        term = term.identifier

    if not isinstance(term, URIRef):
        return None

    if EUFREQ in term:
        return EU_RDF_REQUENCIES.get(term)

    _prefix, _namespace, freq = namespace_manager.compute_qname(term)
    return freq
def frequency_from_rdf(term):
    '''
    Resolve an RDF frequency term.

    A string that ``uris.validate`` accepts is turned into a ``URIRef``
    and a ``RdfResource`` is replaced by its identifier. For a ``URIRef``
    containing ``EUFREQ`` the value is looked up in
    ``EU_RDF_REQUENCIES``; for any other ``URIRef`` the last element
    returned by ``namespace_manager.compute_qname`` is used.

    :returns: the resolved frequency, or ``None`` when ``term`` does not
        end up being a ``URIRef``
    '''
    if isinstance(term, str):
        try:
            validated = uris.validate(term)
        except uris.ValidationError:
            # Not a valid URL: keep the term untouched
            pass
        else:
            term = URIRef(validated)

    if isinstance(term, RdfResource):
        term = term.identifier

    if not isinstance(term, URIRef):
        return None

    if EUFREQ in term:
        return EU_RDF_REQUENCIES.get(term)

    _prefix, _namespace, freq = namespace_manager.compute_qname(term)
    return freq
def test_local_should_not_validate_private_urls(url):
    '''``uris.validate(url, local=True)`` must raise ``uris.ValidationError``.'''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, local=True)
def test_private_should_validate_public_and_private_urls(url):
    '''``uris.validate(url, private=True)`` must return a value equal to ``url``.'''
    validated = uris.validate(url, private=True)
    assert validated == url
def test_default_should_not_validate_local_hosts(url):
    '''``uris.validate(url)`` with default options must raise ``uris.ValidationError``.'''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
def test_default_should_not_validate_unknown_tlds(tld):
    '''A ``http://somewhere.<tld>`` URL must be rejected with default options.'''
    template = 'http://somewhere.{0}'
    url = template.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
def test_default_should_validate_default_schemes(scheme):
    '''A ``<scheme>://somewhere.com`` URL must validate to itself with default options.'''
    template = '{0}://somewhere.com'
    url = template.format(scheme)
    validated = uris.validate(url)
    assert validated == url
def test_with_credentials(url):
    '''``uris.validate(url)`` must return a value equal to ``url``.'''
    validated = uris.validate(url)
    assert validated == url
def test_private_should_validate_public_and_private_urls(url):
    '''Validating ``url`` with ``private=True`` must give back an equal value.'''
    result = uris.validate(url, private=True)
    assert result == url
def test_should_not_validate_multicast_urls(url):
    '''``uris.validate(url)`` must raise ``uris.ValidationError``.'''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
def test_default_should_not_validate_local_hosts(url):
    '''Validating ``url`` with default options must raise ``uris.ValidationError``.'''
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url)
def test_default_should_not_validate_private_urls(url):
    '''``uris.validate(url)`` with default options must raise ``uris.ValidationError``.'''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
def test_default_should_not_validate_unknown_tlds(tld):
    '''``http://somewhere.<tld>`` must raise ``uris.ValidationError`` by default.'''
    pattern = 'http://somewhere.{0}'
    url = pattern.format(tld)
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url)
def test_default_should_not_validate_non_default_schemes(scheme):
    '''A ``<scheme>://somewhere.com`` URL must be rejected with default options.'''
    template = '{0}://somewhere.com'
    url = template.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url)
def test_default_should_validate_default_schemes(scheme):
    '''``<scheme>://somewhere.com`` must validate to an equal value by default.'''
    pattern = '{0}://somewhere.com'
    url = pattern.format(scheme)
    result = uris.validate(url)
    assert result == url
def test_custom_schemes(scheme):
    '''A ``<scheme>://somewhere.com`` URL must validate when ``schemes=CUSTOM_SCHEMES``.'''
    template = '{0}://somewhere.com'
    url = template.format(scheme)
    validated = uris.validate(url, schemes=CUSTOM_SCHEMES)
    assert validated == url
def test_custom_tlds(tld):
    '''A ``http://somewhere.<tld>`` URL must validate when ``tlds=CUSTOM_TLDS``.'''
    template = 'http://somewhere.{0}'
    url = template.format(tld)
    validated = uris.validate(url, tlds=CUSTOM_TLDS)
    assert validated == url
def test_local_should_validate_public_and_local_urls(url):
    '''``uris.validate(url, local=True)`` must return a value equal to ``url``.'''
    validated = uris.validate(url, local=True)
    assert validated == url
def process(self, item):
    '''
    Build or update the local dataset matching a harvested CKAN package.

    The package is fetched with the ``package_show`` action using
    ``item.remote_id`` and validated against ``self.schema`` before being
    mapped onto the dataset returned by ``self.get_dataset``.

    :param item: the harvest item; its ``remote_id`` is replaced by the
        ``id`` found in the fetched package
    :returns: the populated dataset
    :raises HarvestSkipException: when the package has no resources
    '''
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    # The validated payload may come back wrapped in a list
    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']

    # Skip if no resource (a missing, empty or null list all count)
    if not data.get('resources'):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect Org: reuse the organization having this acronym or create it
    # NOTE(review): the organization is saved even when ``self.dryrun``
    # is set - confirm this is intended.
    organization_acronym = data['organization']['name']
    organization = Organization.objects(
        acronym=organization_acronym
    ).first()
    if not organization:
        organization = Organization()
        organization.acronym = organization_acronym
        organization.name = data['organization']['title']
        organization.description = data['organization']['description']
        organization.save()
    dataset.organization = organization

    # Detect license
    default_license = self.harvest_config.get('license', License.default())
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    # The hostname of the harvested source is added as a tag
    dataset.tags.append(urlparse(self.source.url).hostname)

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.frequency = 'unknown'
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        key, value = extra['key'], extra['value']
        if key == 'spatial':
            # GeoJSON representation (Polygon or Point)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            log.debug('spatial-text value not handled')
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            log.debug('spatial-uri value not handled')
        elif key == 'frequency':
            # Update frequency: only traced (was a Python 2 ``print``)
            log.debug('frequency %s', value)
        elif key == 'temporal_start':
            # Temporal coverage start (not copied into the extras)
            temporal_start = daterange_start(value)
            continue
        elif key == 'temporal_end':
            # Temporal coverage end (not copied into the extras)
            temporal_end = daterange_end(value)
            continue
        dataset.extras[key] = value

    # We don't want spatial to be added on harvester
    geozones = self.harvest_config.get('geozones', False)
    if geozones:
        dataset.spatial = SpatialCoverage()
        dataset.spatial.zones = [
            GeoZone.objects.get(id=zone) for zone in geozones
        ]
    #
    # if spatial_geom:
    #     dataset.spatial = SpatialCoverage()
    #     if spatial_geom['type'] == 'Polygon':
    #         coordinates = [spatial_geom['coordinates']]
    #     elif spatial_geom['type'] == 'MultiPolygon':
    #         coordinates = spatial_geom['coordinates']
    #     else:
    #         HarvestException('Unsupported spatial geometry')
    #     dataset.spatial.geom = {
    #         'type': 'MultiPolygon',
    #         'coordinates': coordinates
    #     }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            # Fall back on the dataset URL and keep the raw value aside
            dataset.extras['remote_url'] = self.dataset_url(data['name'])
            dataset.extras['ckan:source'] = data['url']
        else:
            dataset.extras['remote_url'] = url

    dataset.extras['harvest:name'] = self.source.name

    current_resources = [str(resource.id) for resource in dataset.resources]
    fetched_resources = set()

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue

        # Ignore invalid resources. Validation is only a filter here:
        # the URL stored below is the raw one from the source.
        try:
            uris.validate(res['url'])
        except uris.ValidationError:
            continue

        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue

        fetched_resources.add(str(res['id']))
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created

    # Clean up old resources removed from source
    for resource_id in current_resources:
        if resource_id in fetched_resources:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(resource_id))
        except Exception:
            log.error('Unable to parse resource ID %s', resource_id)
            continue
        if resource and not self.dryrun:
            dataset.resources.remove(resource)

    return dataset
def test_local_should_not_validate_private_urls(url):
    '''Validating ``url`` with ``local=True`` must raise ``uris.ValidationError``.'''
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url, local=True)
def test_default_should_validate_public_ips(url):
    '''``uris.validate(url)`` must return a value equal to ``url``.'''
    validated = uris.validate(url)
    assert validated == url
def test_private_local_should_validate_any_valid_urls(url):
    '''With ``local=True`` and ``private=True``, ``url`` must validate to an equal value.'''
    validated = uris.validate(url, local=True, private=True)
    assert validated == url
def test_default_should_not_validate_non_default_schemes(scheme):
    '''``<scheme>://somewhere.com`` must raise ``uris.ValidationError`` by default.'''
    pattern = '{0}://somewhere.com'
    url = pattern.format(scheme)
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url)
def test_custom_schemes(scheme):
    '''``<scheme>://somewhere.com`` must validate to an equal value with ``CUSTOM_SCHEMES``.'''
    pattern = '{0}://somewhere.com'
    url = pattern.format(scheme)
    result = uris.validate(url, schemes=CUSTOM_SCHEMES)
    assert result == url
def test_default_should_not_validate_private_urls(url):
    '''Validating ``url`` with default options must raise ``uris.ValidationError``.'''
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url)
def test_custom_schemes_should_not_validate_defaults(scheme):
    '''A ``<scheme>://somewhere.com`` URL must be rejected when ``schemes=CUSTOM_SCHEMES``.'''
    template = '{0}://somewhere.com'
    url = template.format(scheme)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, schemes=CUSTOM_SCHEMES)
def test_should_not_validate_multicast_urls(url):
    '''Validating ``url`` must raise ``uris.ValidationError``.'''
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url)
def test_custom_tlds(tld):
    '''``http://somewhere.<tld>`` must validate to an equal value with ``CUSTOM_TLDS``.'''
    pattern = 'http://somewhere.{0}'
    url = pattern.format(tld)
    result = uris.validate(url, tlds=CUSTOM_TLDS)
    assert result == url
def test_local_should_validate_public_and_local_urls(url):
    '''Validating ``url`` with ``local=True`` must give back an equal value.'''
    result = uris.validate(url, local=True)
    assert result == url
def test_custom_tlds_should_not_validate_defaults(tld):
    '''A ``http://somewhere.<tld>`` URL must be rejected when ``tlds=CUSTOM_TLDS``.'''
    template = 'http://somewhere.{0}'
    url = template.format(tld)
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, tlds=CUSTOM_TLDS)
def test_private_local_should_validate_any_valid_urls(url):
    '''``uris.validate(url, local=True, private=True)`` must return a value equal to ``url``.'''
    result = uris.validate(url, local=True, private=True)
    assert result == url
def test_with_credentials(url):
    '''Validating ``url`` with default options must give back an equal value.'''
    result = uris.validate(url)
    assert result == url
def test_custom_schemes_should_not_validate_defaults(scheme):
    '''``<scheme>://somewhere.com`` must raise ``uris.ValidationError`` with ``CUSTOM_SCHEMES``.'''
    pattern = '{0}://somewhere.com'
    url = pattern.format(scheme)
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url, schemes=CUSTOM_SCHEMES)
def test_validate_strip_url():
    '''``uris.validate`` must return the URL without its surrounding spaces.'''
    padded = ' http://somewhere.com '
    validated = uris.validate(padded)
    assert validated == 'http://somewhere.com'
def test_custom_tlds_should_not_validate_defaults(tld):
    '''``http://somewhere.<tld>`` must raise ``uris.ValidationError`` with ``CUSTOM_TLDS``.'''
    pattern = 'http://somewhere.{0}'
    url = pattern.format(tld)
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url, tlds=CUSTOM_TLDS)
def process(self, item):
    '''
    Build or update the local dataset matching a harvested CKAN package.

    The package is fetched with the ``package_show`` action using
    ``item.remote_id`` and validated against ``self.schema`` before being
    mapped onto the dataset returned by ``self.get_dataset``.

    :param item: the harvest item; its ``remote_id`` is replaced by the
        ``id`` found in the fetched package
    :returns: the populated dataset
    :raises HarvestSkipException: when the package has no resources
    :raises HarvestException: when the ``spatial`` extra holds a geometry
        that is neither a ``Polygon`` nor a ``MultiPolygon``
    '''
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    # When the validated payload is a list, only its first entry is used
    if type(data) == list:
        data = data[0]

    # Fix the remote_id: use real ID instead of not stable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    # An already set slug is kept as-is
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect license
    # The current dataset license (if any) is the fallback of the guess
    default_license = dataset.license or License.default()
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    # Tags with an empty name are dropped
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom, spatial_zone = None, None

    # Known extras are mapped to dataset attributes, unhandled values are
    # kept under a ``ckan:`` prefixed key and any other extra is copied
    for extra in data['extras']:
        key = extra['key']
        value = extra['value']
        if value is None or (isinstance(value, str) and not value.strip()):
            # Skip empty extras
            continue
        elif key == 'spatial':
            # GeoJSON representation (Polygon or Point)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            # It is only used when exactly one currently valid zone
            # matches it by name or slug
            qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
            qs = qs.valid_at(datetime.now())
            if qs.count() == 1:
                spatial_zone = qs.first()
            else:
                dataset.extras['ckan:spatial-text'] = value
                log.debug('spatial-text value not handled: %s', value)
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            dataset.extras['ckan:spatial-uri'] = value
            log.debug('spatial-uri value not handled: %s', value)
        elif key == 'frequency':
            # Update frequency
            # Try the RDF term first, then the known frequency identifiers
            freq = frequency_from_rdf(value)
            if freq:
                dataset.frequency = freq
            elif value in UPDATE_FREQUENCIES:
                dataset.frequency = value
            else:
                dataset.extras['ckan:frequency'] = value
                log.debug('frequency value not handled: %s', value)
        # Temporal coverage start
        elif key == 'temporal_start':
            temporal_start = daterange_start(value)
        # Temporal coverage end
        elif key == 'temporal_end':
            temporal_end = daterange_end(value)
        else:
            dataset.extras[extra['key']] = value

    # The spatial coverage is only (re)created when there is something
    # to store in it
    if spatial_geom or spatial_zone:
        dataset.spatial = SpatialCoverage()

    if spatial_zone:
        dataset.spatial.zones = [spatial_zone]

    if spatial_geom:
        # The geometry is always stored as a MultiPolygon: a single
        # Polygon is wrapped into a one-item list
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    # The temporal coverage requires both bounds
    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    # Defaults to the dataset URL built from its name
    dataset.extras['remote_url'] = self.dataset_url(data['name'])
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            # An invalid declared URL is only kept for reference
            dataset.extras['ckan:source'] = data['url']
        else:
            # use declared `url` as `remote_url` if any
            dataset.extras['remote_url'] = url

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        # Unknown resources are created and attached to the dataset
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        # An already set publication date is kept
        resource.published = resource.published or resource.created

    return dataset
def test_with_credentials_disabled(url):
    '''``uris.validate(url, credentials=False)`` must raise ``uris.ValidationError``.'''
    expected_error = uris.ValidationError
    with pytest.raises(expected_error):
        uris.validate(url, credentials=False)
def test_validate_strip_url():
    '''Surrounding spaces must be removed from the value ``uris.validate`` returns.'''
    raw_url = ' http://somewhere.com '
    result = uris.validate(raw_url)
    assert result == 'http://somewhere.com'
def test_default_should_validate_public_urls_with_utf8_tld(url):
    '''``uris.validate(url)`` must return a value equal to ``url``.'''
    validated = uris.validate(url)
    assert validated == url
def test_default_should_validate_public_ips(url):
    '''Validating ``url`` with default options must give back an equal value.'''
    result = uris.validate(url)
    assert result == url
def test_with_credentials_disabled(url):
    '''Validating ``url`` with ``credentials=False`` must raise ``uris.ValidationError``.'''
    rejection = uris.ValidationError
    with pytest.raises(rejection):
        uris.validate(url, credentials=False)
def test_default_should_validate_public_urls_with_utf8_tld(url):
    '''Validating ``url`` with default options must give back an equal value.'''
    result = uris.validate(url)
    assert result == url