def test_multiplicity_warning():
    # The fixture is missing its Metadata Date element; parsing it must
    # only log a warning, never raise.
    fixture_xml = open_xml_fixture('FCSConservancyPolygons.xml')
    doc = GeminiDocument(fixture_xml)
    values = doc.read_values()
    assert_equal(values['guid'], 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
def test_multiplicity_warning():
    # The fixture is missing its Metadata Date element; parsing it must
    # only log a warning, never raise.
    fixture_xml = open_xml_fixture('FCSConservancyPolygons.xml')
    doc = GeminiDocument(fixture_xml)
    values = doc.read_values()
    assert_equal(values['guid'], 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
def test_simple():
    # Smoke test: a well-formed GEMINI dataset parses and exposes the
    # expected guid and metadata date.
    fixture_xml = open_xml_fixture('gemini_dataset.xml')
    doc = GeminiDocument(fixture_xml)
    values = doc.read_values()
    assert_equal(values['guid'], 'test-dataset-1')
    assert_equal(values['metadata-date'], '2011-09-23T10:06:08')
def write_package_from_gemini_string(self, content):
    '''Create or update a Package based on some content that has
    come from a URL.

    :param content: GEMINI XML document as a string.
    :returns: the package_dict of the result, or None when the remote
        record is unchanged / not more recent than a deleted package.
    :raises Exception: if the metadata date cannot be parsed, if more
        than one current HarvestObject exists for the GUID, if the
        content changed without a metadata-date bump, or if a unique
        package name cannot be generated.
    '''
    log = logging.getLogger(__name__ + '.import')
    package = None

    gemini_document = GeminiDocument(content)
    gemini_values = gemini_document.read_values()
    gemini_guid = gemini_values['guid']

    # Save the metadata reference date in the Harvest Object.
    # The date may be date-only or a full ISO timestamp; try both.
    try:
        metadata_modified_date = datetime.strptime(
            gemini_values['metadata-date'], '%Y-%m-%d')
    except ValueError:
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Narrowed to Exception.
        except Exception:
            raise Exception('Could not extract reference date for GUID %s (%s)' \
                            % (gemini_guid, gemini_values['metadata-date']))

    self.obj.metadata_modified_date = metadata_modified_date
    self.obj.save()

    last_harvested_object = Session.query(HarvestObject) \
        .filter(HarvestObject.guid==gemini_guid) \
        .filter(HarvestObject.current==True) \
        .all()

    if len(last_harvested_object) == 1:
        last_harvested_object = last_harvested_object[0]
    elif len(last_harvested_object) > 1:
        raise Exception(
            'Application Error: more than one current record for GUID %s' %
            gemini_guid)

    reactivate_package = False
    if last_harvested_object:
        # We've previously harvested this (i.e. it's an update)

        # Use metadata modified date instead of content to determine if the package
        # needs to be updated
        if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date
                    and last_harvested_object.source.active is False):

            if self.force_import:
                log.info('Import forced for object %s with GUID %s' %
                         (self.obj.id, gemini_guid))
            else:
                log.info(
                    'Package for object with GUID %s needs to be created or updated'
                    % gemini_guid)

            package = last_harvested_object.package

            # If the package has a deleted state, we will only update it and reactivate it if the
            # new document has a more recent modified date
            if package.state == u'deleted':
                if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                    log.info(
                        'Package for object with GUID %s will be re-activated'
                        % gemini_guid)
                    reactivate_package = True
                else:
                    log.info(
                        'Remote record with GUID %s is not more recent than a deleted package, skipping... '
                        % gemini_guid)
                    return None
        else:
            # Same (or older) metadata date: either an inconsistency
            # (content changed without a date bump) or nothing to do.
            if last_harvested_object.content != self.obj.content and \
                    last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                diff_generator = difflib.unified_diff(
                    last_harvested_object.content.split('\n'),
                    self.obj.content.split('\n'))
                # FIX: join the generator directly; the list
                # comprehension wrapper was redundant.
                diff = '\n'.join(diff_generator)
                raise Exception(
                    'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                    % (gemini_guid, diff))
            else:
                # The content hasn't changed, no need to update the package
                log.info('Document with GUID %s unchanged, skipping...'
                         % (gemini_guid))
            return None
    else:
        log.info('No package with GEMINI guid %s found, let\'s create one'
                 % gemini_guid)

    extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

    # Just add some of the metadata as extras, not the whole lot
    for name in [
        # Essentials
        'spatial-reference-system',
        'guid',
        # Usefuls
        'dataset-reference-date',
        'metadata-language',  # Language
        'metadata-date',  # Released
        'coupled-resource',
        'contact-email',
        'frequency-of-update',
        'spatial-data-service-type',
    ]:
        extras[name] = gemini_values[name]

    if len(gemini_values.get('progress', [])):
        extras['progress'] = gemini_values['progress'][0]
    else:
        extras['progress'] = ''

    extras['resource-type'] = gemini_values['resource-type'][0]

    # Use-constraints can contain values which are:
    #  * free text
    #  * licence URL
    # Store all values in extra['licence'] and if there is a
    # URL in there, store that in extra['licence-url']
    extras['licence'] = gemini_values.get('use-constraints', '')
    if len(extras['licence']):
        licence_url_extracted = self._extract_first_licence_url(
            extras['licence'])
        if licence_url_extracted:
            extras['licence_url'] = licence_url_extracted

    extras['access_constraints'] = gemini_values.get(
        'limitations-on-public-access', '')
    if 'temporal-extent-begin' in gemini_values:
        #gemini_values['temporal-extent-begin'].sort()
        extras['temporal_coverage-from'] = gemini_values[
            'temporal-extent-begin']
    if 'temporal-extent-end' in gemini_values:
        #gemini_values['temporal-extent-end'].sort()
        extras['temporal_coverage-to'] = gemini_values[
            'temporal-extent-end']

    # Save responsible organization roles
    provider, responsible_parties = self._process_responsible_organisation(
        gemini_values['responsible-organisation'])
    extras['provider'] = provider
    extras['responsible-party'] = '; '.join(responsible_parties)

    if len(gemini_values['bbox']) > 0:
        extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
        extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
        extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
        extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

        # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
        extent_string = self.extent_template.substitute(
            xmin=extras['bbox-east-long'],
            ymin=extras['bbox-south-lat'],
            xmax=extras['bbox-west-long'],
            ymax=extras['bbox-north-lat'])
        extras['spatial'] = extent_string.strip()

    tags = []
    for tag in gemini_values['tags']:
        # CKAN tag names are limited in length; truncate to 50 chars.
        tag = tag[:50] if len(tag) > 50 else tag
        tags.append({'name': tag})

    package_dict = {
        'title': gemini_values['title'],
        'notes': gemini_values['abstract'],
        'tags': tags,
        'resources': []
    }

    if self.obj.source.publisher_id:
        package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

    if reactivate_package:
        package_dict['state'] = u'active'

    if package is None or package.title != gemini_values['title']:
        name = self.gen_new_name(gemini_values['title'])
        if not name:
            name = self.gen_new_name(six.text_type(gemini_guid))
        if not name:
            raise Exception(
                'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
            )
        package_dict['name'] = name
    else:
        package_dict['name'] = package.name

    resource_locators = gemini_values.get('resource-locator', [])

    if len(resource_locators):
        for resource_locator in resource_locators:
            url = resource_locator.get('url', '')
            if url:
                resource_format = ''
                resource = {}
                if extras['resource-type'] == 'service':
                    # Check if the service is a view service
                    test_url = url.split('?')[0] if '?' in url else url
                    if self._is_wms(test_url):
                        resource['verified'] = True
                        resource['verified_date'] = datetime.now().isoformat()
                        resource_format = 'WMS'
                resource.update({
                    'url': url,
                    'name': resource_locator.get('name', ''),
                    'description': resource_locator.get('description') if resource_locator.get('description') else 'Resource locator',
                    'format': resource_format or None,
                    'resource_locator_protocol': resource_locator.get('protocol', ''),
                    'resource_locator_function': resource_locator.get('function', '')
                })
                package_dict['resources'].append(resource)

        # Guess the best view service to use in WMS preview
        verified_view_resources = [
            r for r in package_dict['resources']
            if 'verified' in r and r['format'] == 'WMS'
        ]
        if len(verified_view_resources):
            verified_view_resources[0][
                'ckan_recommended_wms_preview'] = True
        else:
            view_resources = [
                r for r in package_dict['resources']
                if r['format'] == 'WMS'
            ]
            if len(view_resources):
                view_resources[0]['ckan_recommended_wms_preview'] = True

    extras_as_dict = []
    for key, value in extras.items():
        # Strings and numbers go in as-is; anything structured is
        # JSON-serialised so CKAN extras stay flat key/value pairs.
        if isinstance(value, six.string_types + (Number, )):
            extras_as_dict.append({'key': key, 'value': value})
        else:
            extras_as_dict.append({'key': key, 'value': json.dumps(value)})

    package_dict['extras'] = extras_as_dict

    # FIX: was `package == None`; identity comparison with None should
    # use `is None`.
    if package is None:
        # Create new package from data.
        package = self._create_package_from_data(package_dict)
        log.info('Created new package ID %s with GEMINI guid %s',
                 package['id'], gemini_guid)
    else:
        package = self._create_package_from_data(package_dict,
                                                 package=package)
        log.info(
            'Updated existing package ID %s with existing GEMINI guid %s',
            package['id'], gemini_guid)

    # Flag the other objects of this source as not current anymore
    from ckanext.harvest.model import harvest_object_table
    u = update(harvest_object_table) \
        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
        .values(current=False)
    Session.execute(u, params={'b_package_id': package['id']})
    Session.commit()

    # Refresh current object from session, otherwise the
    # import paster command fails
    Session.remove()
    Session.add(self.obj)
    Session.refresh(self.obj)

    # Set reference to package in the HarvestObject and flag it as
    # the current one
    if not self.obj.package_id:
        self.obj.package_id = package['id']
        self.obj.current = True
        self.obj.save()

    return package
def test_simple():
    # Smoke test: a well-formed GEMINI dataset parses and exposes the
    # expected guid and metadata date.
    fixture_xml = open_xml_fixture('gemini_dataset.xml')
    doc = GeminiDocument(fixture_xml)
    values = doc.read_values()
    assert_equal(values['guid'], 'test-dataset-1')
    assert_equal(values['metadata-date'], '2011-09-23T10:06:08')