def _csw_resource_data_dict(self, dataset_name):
    '''Return an example open data dataset as expected as input to
    get_package_dict(): parses the named XML fixture through ISODocument
    and runs it through a fresh SpatialHarvester against a saved
    HarvestObject.'''
    fixture_xml = self._open_xml_fixture(dataset_name)
    parsed_values = ISODocument(fixture_xml).read_values()

    harvester = SpatialHarvester()
    harvest_obj = HarvestObject(source=self._create_source())
    harvest_obj.save()

    return {
        'package_dict': harvester.get_package_dict(parsed_values, harvest_obj),
        'iso_values': parsed_values,
    }
def get_package_dict(self, iso_values, harvest_object):
    '''Extend the base spatial package dict with publisher extras.

    Copies ``publisher`` and ``responsible-organisation`` from the parsed
    ISO values into ``package_dict['extras']``; list/dict values are
    JSON-encoded since CKAN extras hold strings.

    :param iso_values: dict of values parsed from the ISO metadata document
    :param harvest_object: the HarvestObject being imported
    :returns: the augmented package dict
    '''
    package_dict = SpatialHarvester.get_package_dict(self, iso_values, harvest_object)

    extras = {}
    # Fetch each value once instead of the original's redundant
    # double iso_values.get() calls.
    publisher = iso_values.get('publisher')
    if publisher:
        extras['publisher'] = publisher
    responsible_org = iso_values.get('responsible-organisation')
    if responsible_org:
        log.info("Checking for responsible-organisation")
        extras['responsible-organisation'] = responsible_org

    # items() instead of iteritems(): identical iteration here, works on
    # both Python 2 and 3, and matches the sibling harvester in this file.
    extras_as_dict = []
    for key, value in extras.items():
        if isinstance(value, (list, dict)):
            extras_as_dict.append({'key': key, 'value': json.dumps(value)})
        else:
            extras_as_dict.append({'key': key, 'value': value})

    package_dict['extras'] = package_dict['extras'] + extras_as_dict
    return package_dict
def get_package_dict(self, iso_values, harvest_object):
    '''Augment the base spatial package dict with publisher-related extras
    (``publisher``, ``responsible-organisation``) taken from the parsed ISO
    values; structured values are JSON-encoded for storage as CKAN extras.'''
    package_dict = SpatialHarvester.get_package_dict(
        self, iso_values, harvest_object)

    extras = {}
    if iso_values.get('publisher', None):
        extras['publisher'] = iso_values.get('publisher', [])
    if iso_values.get('responsible-organisation'):
        log.info("Checking for responsible-organisation")
        extras['responsible-organisation'] = iso_values.get(
            'responsible-organisation', [])

    # Serialize structured values so every extra is a plain string.
    extras_as_dict = [
        {'key': key,
         'value': json.dumps(value) if isinstance(value, (list, dict)) else value}
        for key, value in extras.iteritems()
    ]

    package_dict['extras'] = package_dict['extras'] + extras_as_dict
    return package_dict
def test_clean_tags(self):
    '''Harvested tags are truncated (and optionally cleaned of invalid
    characters) depending on the ``clean_tags`` source-config flag.'''
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single',
        'owner_org': 'test-org',
        # BUG FIX: the original '%YYYY-%MM-%DD %HH:%MM:%s' mixed literal
        # letters into the directives (e.g. %YYYY == %Y + "YYY") and used
        # the platform-dependent '%s'; use the standard directives.
        'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('test-org')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='test-org')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    # BUG FIX: use the user resolved above instead of a placeholder
    # literal, so package_create runs as that user (user_name was
    # otherwise computed and never used here).
    context = {'user': user_name}
    package_schema = default_update_package_schema()
    context['schema'] = package_schema
    package_dict = {'frequency': 'manual',
                    'publisher_name': 'dummy',
                    'extras': [{'key': 'theme',
                                'value': ['non-mappable', 'thememap1']}],
                    'groups': [],
                    'title': 'fakename',
                    'holder_name': 'dummy',
                    'holder_identifier': 'dummy',
                    'name': 'fakename',
                    'notes': 'dummy',
                    'owner_org': 'test-org',
                    'modified': datetime.now(),
                    'publisher_identifier': 'dummy',
                    'metadata_created': datetime.now(),
                    'metadata_modified': datetime.now(),
                    'guid': unicode(uuid4()),
                    'identifier': 'dummy'}
    package_data = call_action('package_create', context=context,
                               **package_dict)

    package = Package.get('fakename')
    source, job = self._create_source_and_job(source_fixture)
    job.package = package
    job.guid = uuid4()
    harvester = SpatialHarvester()
    with open(os.path.join('..', 'data', 'dataset.json')) as f:
        dataset = json.load(f)

    # long tags are invalid in all cases
    TAG_LONG_INVALID = 'abcdefghij' * 20
    # if clean_tags is not set to true, tags will be truncated to 50 chars
    TAG_LONG_VALID = TAG_LONG_INVALID[:50]
    # default truncate to 100
    TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

    assert len(TAG_LONG_VALID) == 50
    assert TAG_LONG_VALID[-1] == 'j'
    TAG_CHARS_INVALID = '[email protected]!'
    TAG_CHARS_VALID = 'pretty-invlidtag'

    dataset['tags'].append(TAG_LONG_INVALID)
    dataset['tags'].append(TAG_CHARS_INVALID)
    harvester.source_config = {'clean_tags': False}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']

    # no clean tags, so invalid chars are in
    # but tags are truncated to 50 chars
    assert {'name': TAG_CHARS_VALID} not in tags
    assert {'name': TAG_CHARS_INVALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
    assert {'name': TAG_LONG_INVALID} not in tags

    harvester.source_config = {'clean_tags': True}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']
    assert {'name': TAG_CHARS_VALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
def get_package_dict(self, iso_values, harvest_object):
    '''Build the CKAN package dict from parsed ISO values.

    Extends the base spatial package dict with pass-through extras for a
    fixed set of ISO keys, keywords grouped by type, publisher /
    browse-graphic / edition / responsible-party extras, and the harvest
    object's WAF location and id, then post-processes the resource list.

    :param iso_values: dict of values parsed from the ISO metadata document
    :param harvest_object: the HarvestObject being imported
    :returns: the completed package dict
    '''
    package_dict = SpatialHarvester.get_package_dict(
        self, iso_values, harvest_object)
    # ISO keys copied into extras verbatim when present.
    simple_keys = {
        'publisher_info',
        'resource-provider',
        'distributor-info',
        'aggregation-info',
        'distributor-formats',
        'additional-information-source',
        'purpose',
        # Constraints
        'use-constraints',
        'access-constraints',
        'fees',
        # lineage
        'lineage',
        'lineage-process-steps',
    }
    extras = {k: iso_values.get(k) for k in simple_keys if k in iso_values}
    # Bucket keywords by declared type; untyped ones fall under 'keywords'.
    keywords = defaultdict(list)
    for keyword in iso_values['keywords']:
        keyword_type = keyword['type'] or 'keywords'
        keywords[keyword_type].append(keyword)
    # Grouped as [titleized type, keyword list] pairs in a fixed order.
    extras['grouped_keywords'] = []
    for keyword_type in [
            'theme', 'dataCenter', 'platform', 'instrument', 'place',
            'project', 'dataResolution', 'stratum', 'otherRestrictions',
            'keywords'
    ]:
        if keyword_type in keywords:
            extras['grouped_keywords'].append(
                [titleize(keyword_type), keywords[keyword_type]])
    if iso_values.get('publisher', None):
        extras['publisher'] = iso_values.get('publisher', [])
    if iso_values.get('browse-graphic', None):
        # Only the first browse graphic's file reference is kept.
        browse_graphic = iso_values['browse-graphic'][0]['file']
        extras['browse-graphic'] = browse_graphic
    if iso_values.get('dataset-edition'):
        extras['dataset-edition'] = iso_values['dataset-edition']
        # First edition entry doubles as the CKAN package version.
        package_dict["version"] = iso_values['dataset-edition'][0]
    if iso_values.get('presentation-form'):
        extras['presentation-form'] = iso_values['presentation-form'][0]
    if iso_values.get('responsible-organisation'):
        log.info("Checking for responsible-organisation")
        extras['responsible-organisation'] = iso_values.get(
            'responsible-organisation', [])
    if iso_values.get('responsible-parties'):
        # NOTE(review): guarded on 'responsible-parties' but reads
        # 'responsible-organisation' -- same pattern as the sibling
        # harvesters in this file; confirm this is intentional.
        extras['responsible-parties'] = self.unique_responsible_parties(
            iso_values.get('responsible-organisation', []))
    # Record where this object was harvested from (WAF), if known;
    # the for/else sets None when no matching extra exists.
    for item in harvest_object.extras:
        key = item.key
        value = item.value
        if key == u'waf_location':
            extras['waf_location'] = value
            break
    else:
        extras['waf_location'] = None
    extras['object_reference'] = harvest_object.id
    # CKAN extras must be strings: JSON-encode structured values.
    extras_kv = [{
        'key': k,
        'value': json.dumps(v) if isinstance(v, (list, dict)) else v
    } for k, v in extras.iteritems()]
    package_dict['extras'] = package_dict['extras'] + extras_kv
    # Resource post-processing: de-duplicate, reorder, then update.
    package_dict['resources'] = self.filter_duplicate_resources(
        package_dict)
    package_dict['resources'] = self.reorder_resources(package_dict)
    package_dict = self.update_resources(package_dict)
    return package_dict
def get_package_dict(self, iso_values, harvest_object):
    '''Build the CKAN package dict from parsed ISO values.

    Extends the base spatial package dict with:
    - pass-through extras for a fixed set of simple ISO keys
    - keywords grouped by type, plus CF standard names and GCMD keywords
      extracted by matching thesaurus titles
    - publisher / browse-graphic / edition / responsible-party extras
    - the harvest object's WAF location and id
    then post-processes the resource list and, best-effort, reads extra
    metadata from ERDDAP endpoints.

    :param iso_values: dict of values parsed from the ISO metadata document
    :param harvest_object: the HarvestObject being imported
    :returns: the completed package dict
    '''
    package_dict = SpatialHarvester.get_package_dict(
        self, iso_values, harvest_object)

    # ISO keys copied into extras verbatim when present.
    simple_keys = {
        'publisher_info',
        'resource-provider',
        'distributor-info',
        'aggregation-info',
        'distributor-formats',
        'additional-information-source',
        'purpose',
        # Constraints
        'use-constraints',
        'access-constraints',
        'use-limitations',
        'fees',
        # lineage
        'lineage',
        'lineage-process-steps',
    }
    extras = {k: iso_values.get(k) for k in simple_keys if k in iso_values}

    # Bucket keywords by declared type; untyped ones fall under 'keywords'.
    keywords = defaultdict(list)
    for keyword in iso_values['keywords']:
        keyword_type = keyword['type'] or 'keywords'
        keywords[keyword_type].append(keyword)

    extras['grouped_keywords'] = []

    # Pull CF standard names and GCMD keywords out of matching thesauri;
    # data_filter normalizes each raw keyword string.
    for extra_name, matches, data_filter in (
            ('cf_standard_names', ('cf', 'climate and forecast'),
             lambda s: s.strip().split(' ', 1)[0]),
            ('gcmd_keywords', ('gcmd', 'global change'),
             lambda s: s.strip()),
    ):
        try:
            match_raw = next(
                (d['keywords'] for d in iso_values['keywords']
                 if d['thesaurus'] and any(
                     v in d['thesaurus']['title'].lower()
                     for v in matches)), None)
            if match_raw is None:
                continue
            elif hasattr(match_raw, '__iter__'):
                match_result = sorted(set(map(data_filter, match_raw)))
            else:
                match_result = data_filter(match_raw)
        # BUG FIX: was a bare 'except:', which also swallowed
        # SystemExit/KeyboardInterrupt; catch Exception instead
        # (log message typo "Execption" also fixed).
        except Exception:
            match_result = None
            log.exception(
                "Exception raised when trying to extract {}".format(
                    extra_name))
        if match_result is not None:
            extras[extra_name] = match_result

    # Grouped as [titleized type, keyword list] pairs in a fixed order.
    for keyword_type in [
            'theme', 'dataCenter', 'platform', 'instrument', 'place',
            'project', 'dataResolution', 'stratum', 'otherRestrictions',
            'keywords'
    ]:
        if keyword_type in keywords:
            extras['grouped_keywords'].append(
                [titleize(keyword_type), keywords[keyword_type]])

    if iso_values.get('publisher', None):
        extras['publisher'] = iso_values.get('publisher', [])

    if iso_values.get('browse-graphic', None):
        # Only the first browse graphic's file reference is kept.
        browse_graphic = iso_values['browse-graphic'][0]['file']
        extras['browse-graphic'] = browse_graphic

    if iso_values.get('dataset-edition'):
        extras['dataset-edition'] = iso_values['dataset-edition']
        # First edition entry doubles as the CKAN package version.
        package_dict["version"] = iso_values['dataset-edition'][0]

    if iso_values.get('presentation-form'):
        extras['presentation-form'] = iso_values['presentation-form'][0]

    if iso_values.get('responsible-organisation'):
        log.info("Checking for responsible-organisation")
        extras['responsible-organisation'] = iso_values.get(
            'responsible-organisation', [])

    if iso_values.get('responsible-parties'):
        # NOTE(review): guarded on 'responsible-parties' but reads
        # 'responsible-organisation' -- same pattern as the sibling
        # harvesters in this file; confirm this is intentional.
        extras['responsible-parties'] = self.unique_responsible_parties(
            iso_values.get('responsible-organisation', []))

    # Record where this object was harvested from (WAF), if known;
    # the for/else sets None when no matching extra exists.
    for item in harvest_object.extras:
        key = item.key
        value = item.value
        if key == 'waf_location':
            extras['waf_location'] = value
            break
    else:
        extras['waf_location'] = None
    extras['object_reference'] = harvest_object.id

    # CKAN extras must be strings: JSON-encode structured values.
    extras_kv = [{
        'key': k,
        'value': json.dumps(v) if isinstance(v, (list, dict)) else v
    } for k, v in extras.items()]
    package_dict['extras'] = package_dict['extras'] + extras_kv

    package_dict['resources'] = self.filter_duplicate_resources(
        package_dict)
    package_dict['resources'] = self.reorder_resources(package_dict)
    package_dict = self.update_resources(package_dict)

    # Best-effort: derive vertical extent / IOOS attributes from ERDDAP
    # services by rewriting the dataset URL into its info-CSV endpoint.
    for resource in package_dict["resources"]:
        if resource["format"] in {
                "ERDDAP", "ERDDAP-TableDAP", "ERDDAP-GridDAP"
        }:
            try:
                info_url = re.sub(
                    r"^(https?://.+/erddap/)(?:grid|table)dap(/[^.]+)\.(\w+)$",
                    r"\1info\2/index.csv", resource["url"])
                ds = ErddapCSVMetadataReader(info_url)
                self.get_vertical_extent(ds, package_dict)
                self.get_ioos_nc_attributes(ds, package_dict)
            # BUG FIX: was bare 'except: pass' (the '# TODO: try/catch
            # here' above it); still best-effort, but failures are now
            # logged and only Exception is caught.
            except Exception:
                log.exception(
                    "Failed to read ERDDAP metadata from %s",
                    resource.get("url"))
    return package_dict
def get_package_dict(self, iso_values, harvest_object):
    '''Build the CKAN package dict, enriching the base spatial dict with
    pass-through ISO extras, keywords grouped by type, publisher /
    edition / responsible-party details, and the harvest object's WAF
    location, then post-process the resource list.'''
    package_dict = SpatialHarvester.get_package_dict(
        self, iso_values, harvest_object)

    # ISO keys copied into extras verbatim when present.
    simple_keys = {
        'publisher_info', 'resource-provider', 'distributor-info',
        'aggregation-info', 'distributor-formats',
        'additional-information-source', 'purpose',
        # Constraints
        'use-constraints', 'access-constraints', 'fees',
        # lineage
        'lineage', 'lineage-process-steps',
    }
    extras = {name: iso_values.get(name)
              for name in simple_keys if name in iso_values}

    # Bucket keywords by declared type; untyped ones go under 'keywords'.
    buckets = defaultdict(list)
    for kw in iso_values['keywords']:
        buckets[kw['type'] or 'keywords'].append(kw)

    ordered_types = ('theme', 'dataCenter', 'platform', 'instrument',
                     'place', 'project', 'dataResolution', 'stratum',
                     'otherRestrictions', 'keywords')
    extras['grouped_keywords'] = [
        [titleize(t), buckets[t]] for t in ordered_types if t in buckets
    ]

    if iso_values.get('publisher', None):
        extras['publisher'] = iso_values.get('publisher', [])

    if iso_values.get('browse-graphic', None):
        extras['browse-graphic'] = iso_values['browse-graphic'][0]['file']

    if iso_values.get('dataset-edition'):
        extras['dataset-edition'] = iso_values['dataset-edition']
        # First edition entry doubles as the CKAN package version.
        package_dict["version"] = iso_values['dataset-edition'][0]

    if iso_values.get('presentation-form'):
        extras['presentation-form'] = iso_values['presentation-form'][0]

    if iso_values.get('responsible-organisation'):
        log.info("Checking for responsible-organisation")
        extras['responsible-organisation'] = iso_values.get(
            'responsible-organisation', [])

    if iso_values.get('responsible-parties'):
        # NOTE(review): guarded on 'responsible-parties' but reads
        # 'responsible-organisation' -- confirm this is intentional.
        extras['responsible-parties'] = self.unique_responsible_parties(
            iso_values.get('responsible-organisation', []))

    # First matching harvest-object extra wins; None when absent.
    extras['waf_location'] = next(
        (item.value for item in harvest_object.extras
         if item.key == u'waf_location'), None)
    extras['object_reference'] = harvest_object.id

    # CKAN extras must be strings: JSON-encode structured values.
    extras_kv = [{'key': k,
                  'value': json.dumps(v) if isinstance(v, (list, dict)) else v}
                 for k, v in extras.iteritems()]
    package_dict['extras'] = package_dict['extras'] + extras_kv

    package_dict['resources'] = self.filter_duplicate_resources(package_dict)
    package_dict['resources'] = self.reorder_resources(package_dict)
    package_dict = self.update_resources(package_dict)
    return package_dict