def process(self, item):
    """Populate the local dataset matching a harvested ODS item.

    The raw remote description is read from ``item.kwargs['dataset']``.
    Its title, description, publisher, tags, license, resources and a
    few ``ods:*`` extras are copied onto the dataset returned by
    ``self.get_dataset(item.remote_id)``.

    :param item: harvest item exposing ``kwargs['dataset']`` (the
        remote payload) and ``remote_id``.
    :returns: the updated dataset.
    :raises HarvestSkipException: when the remote dataset has no
        record, or carries INSPIRE metadata while the ``inspire``
        feature is not enabled on this harvester.
    """
    payload = item.kwargs['dataset']
    dataset_id = payload['datasetid']
    metas = payload['metas']
    interop_metas = payload.get('interop_metas', {})

    # Guard clauses: nothing to harvest for these datasets.
    if not payload.get('has_records'):
        raise HarvestSkipException(
            'Dataset {datasetid} has no record'.format(**payload))

    if 'inspire' in interop_metas and not self.has_feature('inspire'):
        raise HarvestSkipException(
            'Dataset {datasetid} has INSPIRE metadata'.format(**payload))

    dataset = self.get_dataset(item.remote_id)

    dataset.title = metas['title']
    dataset.frequency = 'unknown'
    dataset.description = parse_html(metas.get('description', '').strip())
    dataset.private = False

    # Attach the publishing organization, looked up by acronym and
    # created (and saved) on the fly when no match exists.
    if 'publisher' in metas:
        acronym = metas['publisher']
        publisher = Organization.objects(acronym=acronym).first()
        if not publisher:
            # Only the acronym is known here, so it fills every field.
            publisher = Organization()
            publisher.acronym = acronym
            publisher.name = acronym
            publisher.description = acronym
            publisher.save()
        dataset.organization = publisher

    # Tags are the union of keywords (kept verbatim) and themes
    # (comma-split, stripped and lower-cased).
    tags = set()
    if 'keyword' in metas:
        keywords = metas['keyword']
        if isinstance(keywords, list):
            tags.update(keywords)
        else:
            tags.add(keywords)

    if 'theme' in metas:
        themes = metas['theme']
        if not isinstance(themes, list):
            themes = [themes]
        for theme in themes:
            tags.update(part.strip().lower() for part in theme.split(','))

    dataset.tags = list(tags)
    # The source hostname is always added as a tag.
    dataset.tags.append(urlparse(self.source.url).hostname)

    # License: guess from the remote identifier, falling back to the
    # license already set on the dataset, then to the default one.
    fallback_license = dataset.license or License.default()
    license_id = metas.get('license')
    dataset.license = License.guess(
        license_id,
        self.LICENSES.get(license_id),
        default=fallback_license,
    )

    self.process_resources(dataset, payload, ('csv', 'json'))

    if 'geo' in payload['features']:
        geo_exports = ['geojson']
        # The shapefile export is only added up to the record limit.
        if metas['records_count'] <= self.SHAPEFILE_RECORDS_LIMIT:
            geo_exports.append('shp')
        self.process_resources(dataset, payload, geo_exports)

    for file_kind in ('alternative_export', 'attachment'):
        self.process_extra_files(dataset, payload, file_kind)

    dataset.extras['ods:url'] = self.explore_url(dataset_id)
    dataset.extras['harvest:name'] = self.source.name
    if 'references' in metas:
        dataset.extras['ods:references'] = metas['references']
    dataset.extras['ods:has_records'] = payload['has_records']
    dataset.extras['ods:geo'] = 'geo' in payload['features']

    return dataset
def process(self, item):
    """Populate the local dataset matching a harvested CKAN package.

    The package is fetched with the ``package_show`` action and
    validated against ``self.schema``. Its core attributes,
    organization, license, tags, extras, temporal coverage and
    resources are then copied onto the dataset returned by
    ``self.get_dataset``. Local resources that are no longer part of
    the remote package are removed, unless ``self.dryrun`` is set.

    :param item: harvest item; ``item.remote_id`` is used for the
        lookup and is then overwritten with the package ``id``.
    :returns: the updated dataset.
    :raises HarvestSkipException: when the package has no resource.
    """
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)
    if isinstance(data, list):
        data = data[0]

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not data.get('resources'):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Organization: looked up by acronym, created (and saved) on the
    # fly when no match exists.
    remote_org = data['organization']
    acronym = remote_org['name']
    organization = Organization.objects(acronym=acronym).first()
    if not organization:
        organization = Organization()
        organization.acronym = acronym
        organization.name = remote_org['title']
        organization.description = remote_org['description']
        organization.save()
    dataset.organization = organization

    # License: guess from the remote values, falling back to the one
    # configured on the harvester, then to the default one.
    default_license = self.harvest_config.get('license', License.default())
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)

    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    # The source hostname is always added as a tag.
    dataset.tags.append(urlparse(self.source.url).hostname)

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']
    dataset.frequency = 'unknown'

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None

    for extra in data['extras']:
        key, value = extra['key'], extra['value']
        # GeoJSON representation (Polygon or Point)
        if key == 'spatial':
            # The parsed geometry is not applied to the dataset (spatial
            # coverage only comes from the configured geozones below).
            # The call is kept so malformed JSON still raises as before.
            json.loads(value)
        # Textual representation of the extent / location
        elif key == 'spatial-text':
            log.debug('spatial-text value not handled')
        # Linked Data URI representing the place name
        elif key == 'spatial-uri':
            log.debug('spatial-uri value not handled')
        # Update frequency
        elif key == 'frequency':
            # Logged instead of printed: harvesters must not write to
            # stdout, and the print statement is Python 2 only.
            log.debug('frequency value not handled: %s', value)
        # Temporal coverage start (not copied into the extras)
        elif key == 'temporal_start':
            temporal_start = daterange_start(value)
            continue
        # Temporal coverage end (not copied into the extras)
        elif key == 'temporal_end':
            temporal_end = daterange_end(value)
            continue
        dataset.extras[key] = value

    # We don't want the remote spatial extent to be added by the
    # harvester: zones only come from the harvester configuration.
    geozones = self.harvest_config.get('geozones', False)
    if geozones:
        dataset.spatial = SpatialCoverage()
        dataset.spatial.zones = [
            GeoZone.objects.get(id=zone) for zone in geozones
        ]

    # A temporal coverage needs both bounds.
    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            # Invalid URL: link to the CKAN dataset page instead and
            # keep the raw value for reference.
            dataset.extras['remote_url'] = self.dataset_url(data['name'])
            dataset.extras['ckan:source'] = data['url']
        else:
            dataset.extras['remote_url'] = url

    dataset.extras['harvest:name'] = self.source.name

    # Snapshot of the resource IDs known before this run; used below
    # to detect resources removed from the source.
    current_resources = [
        str(resource.id) for resource in dataset.resources
    ]
    fetched_resources = set()

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue

        # Ignore invalid resources
        # NOTE(review): the validated value is discarded and the raw URL
        # is stored below -- confirm whether the validated URL should be
        # stored instead.
        try:
            uris.validate(res['url'])
        except uris.ValidationError:
            continue

        try:
            resource_uuid = UUID(res['id'])
            resource = get_by(dataset.resources, 'id', resource_uuid)
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue

        # Store the canonical UUID form so the comparison with
        # ``str(resource.id)`` in the cleanup pass cannot fail because
        # of a different spelling (case, braces, missing hyphens).
        fetched_resources.add(str(resource_uuid))

        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        # Keep an existing publication date, else use the creation date.
        resource.published = resource.published or resource.created

    # Clean up old resources removed from source
    for resource_id in current_resources:
        if resource_id in fetched_resources:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(resource_id))
        except Exception:
            log.error('Unable to parse resource ID %s', resource_id)
            continue
        # Nothing is removed during a dry run.
        if resource and not self.dryrun:
            dataset.resources.remove(resource)

    return dataset