def _csw_resource_data_dict(self, dataset_name):
        '''Return an example open data dataset as expected as input
           to get_package_dict().'''

        xml_string = self._open_xml_fixture(dataset_name)
        iso_document = ISODocument(xml_string)
        iso_values = iso_document.read_values()
        base_harvester = SpatialHarvester()
        source = self._create_source()
        obj = HarvestObject(
            source=source,
        )
        obj.save()
        package_dict = base_harvester.get_package_dict(iso_values, obj)

        data_dict = {
            'package_dict': package_dict ,
            'iso_values': iso_values
        }
        return data_dict
    def get_package_dict(self, iso_values, harvest_object):
        '''Build the CKAN package dict for a harvested ISO record.

        Extends SpatialHarvester.get_package_dict() by copying the
        ``publisher`` and ``responsible-organisation`` ISO values into the
        package extras; list/dict values are JSON-encoded.

        :param iso_values: parsed ISO metadata values (dict-like)
        :param harvest_object: the HarvestObject being imported
        :returns: package dict ready for the package action API
        '''
        package_dict = SpatialHarvester.get_package_dict(
            self, iso_values, harvest_object)

        # Extras this harvester adds on top of the base harvester's.
        extras = {}
        if iso_values.get('publisher', None):
            extras['publisher'] = iso_values.get('publisher', [])
        if iso_values.get('responsible-organisation'):
            log.info("Checking for responsible-organisation")
            extras['responsible-organisation'] = iso_values.get(
                'responsible-organisation', [])

        # CKAN expects extras as a list of {'key': ..., 'value': ...} dicts;
        # complex values are serialised to JSON.  .items() (rather than the
        # Python-2-only .iteritems()) works on both Python 2 and 3.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, (list, dict)):
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})
            else:
                extras_as_dict.append({'key': key, 'value': value})

        package_dict['extras'] = package_dict['extras'] + extras_as_dict

        return package_dict
    def get_package_dict(self, iso_values, harvest_object):
        '''Build the CKAN package dict for a harvested ISO record.

        Extends SpatialHarvester.get_package_dict() by copying the
        ``publisher`` and ``responsible-organisation`` ISO values into the
        package extras; list/dict values are JSON-encoded.

        :param iso_values: parsed ISO metadata values (dict-like)
        :param harvest_object: the HarvestObject being imported
        :returns: package dict ready for the package action API
        '''
        package_dict = SpatialHarvester.get_package_dict(
            self, iso_values, harvest_object)

        # Extras this harvester adds on top of the base harvester's.
        extras = {}
        if iso_values.get('publisher', None):
            extras['publisher'] = iso_values.get('publisher', [])
        if iso_values.get('responsible-organisation'):
            log.info("Checking for responsible-organisation")
            extras['responsible-organisation'] = iso_values.get(
                'responsible-organisation', [])

        # CKAN expects extras as a list of {'key': ..., 'value': ...} dicts;
        # complex values are serialised to JSON.  .items() (rather than the
        # Python-2-only .iteritems()) works on both Python 2 and 3.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, (list, dict)):
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})
            else:
                extras_as_dict.append({'key': key, 'value': value})

        package_dict['extras'] = package_dict['extras'] + extras_as_dict

        return package_dict
# ---- Example #4 ----
    def test_clean_tags(self):
        '''Harvested tags are truncated when clean_tags is off, and have
        invalid characters stripped when clean_tags is on.'''
        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single',
            'owner_org': 'test-org',
            # Fixed: '%YYYY-%MM-%DD %HH:%MM:%s' mixed literal text into the
            # format and produced garbage like '2019YYY-...'; use the real
            # strftime codes instead.
            'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Ensure the dummy user exists.  call_action returns a dict while
        # User.get returns a model object, hence the two access styles.
        user = User.get('dummy')
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name

        # Ensure the owning organisation and a pre-existing group are present.
        org = Group.by_name('test-org')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='test-org')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******'}
        package_schema = default_update_package_schema()
        context['schema'] = package_schema
        package_dict = {'frequency': 'manual',
                        'publisher_name': 'dummy',
                        'extras': [{'key': 'theme',
                                    'value': ['non-mappable', 'thememap1']}],
                        'groups': [],
                        'title': 'fakename',
                        'holder_name': 'dummy',
                        'holder_identifier': 'dummy',
                        'name': 'fakename',
                        'notes': 'dummy',
                        'owner_org': 'test-org',
                        'modified': datetime.now(),
                        'publisher_identifier': 'dummy',
                        'metadata_created': datetime.now(),
                        'metadata_modified': datetime.now(),
                        'guid': unicode(uuid4()),
                        'identifier': 'dummy'}

        package_data = call_action('package_create', context=context,
                                   **package_dict)

        package = Package.get('fakename')
        source, job = self._create_source_and_job(source_fixture)
        job.package = package
        job.guid = uuid4()
        harvester = SpatialHarvester()
        with open(os.path.join('..', 'data', 'dataset.json')) as f:
            dataset = json.load(f)

        # long tags are invalid in all cases
        TAG_LONG_INVALID = 'abcdefghij' * 20
        # if clean_tags is not set to true, tags will be truncated to 50 chars
        TAG_LONG_VALID = TAG_LONG_INVALID[:50]
        # default truncate to 100
        TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

        assert len(TAG_LONG_VALID) == 50
        assert TAG_LONG_VALID[-1] == 'j'
        TAG_CHARS_INVALID = '[email protected]!'
        TAG_CHARS_VALID = 'pretty-invlidtag'

        dataset['tags'].append(TAG_LONG_INVALID)
        dataset['tags'].append(TAG_CHARS_INVALID)

        harvester.source_config = {'clean_tags': False}
        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']

        # no clean tags, so invalid chars are in
        # but tags are truncated to 50 chars
        assert {'name': TAG_CHARS_VALID} not in tags
        assert {'name': TAG_CHARS_INVALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
        assert {'name': TAG_LONG_INVALID} not in tags

        harvester.source_config = {'clean_tags': True}

        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']
        assert {'name': TAG_CHARS_VALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
# ---- Example #5 ----
    def test_clean_tags(self):
        '''Harvested tags are truncated when clean_tags is off, and have
        invalid characters stripped when clean_tags is on.'''
        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single',
            'owner_org': 'test-org',
            # Fixed: '%YYYY-%MM-%DD %HH:%MM:%s' mixed literal text into the
            # format and produced garbage like '2019YYY-...'; use the real
            # strftime codes instead.
            'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Ensure the dummy user exists.  call_action returns a dict while
        # User.get returns a model object, hence the two access styles.
        user = User.get('dummy')
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name

        # Ensure the owning organisation and a pre-existing group are present.
        org = Group.by_name('test-org')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='test-org')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******'}
        package_schema = default_update_package_schema()
        context['schema'] = package_schema
        package_dict = {'frequency': 'manual',
                        'publisher_name': 'dummy',
                        'extras': [{'key': 'theme',
                                    'value': ['non-mappable', 'thememap1']}],
                        'groups': [],
                        'title': 'fakename',
                        'holder_name': 'dummy',
                        'holder_identifier': 'dummy',
                        'name': 'fakename',
                        'notes': 'dummy',
                        'owner_org': 'test-org',
                        'modified': datetime.now(),
                        'publisher_identifier': 'dummy',
                        'metadata_created': datetime.now(),
                        'metadata_modified': datetime.now(),
                        'guid': unicode(uuid4()),
                        'identifier': 'dummy'}

        package_data = call_action('package_create', context=context,
                                   **package_dict)

        package = Package.get('fakename')
        source, job = self._create_source_and_job(source_fixture)
        job.package = package
        job.guid = uuid4()
        harvester = SpatialHarvester()
        with open(os.path.join('..', 'data', 'dataset.json')) as f:
            dataset = json.load(f)

        # long tags are invalid in all cases
        TAG_LONG_INVALID = 'abcdefghij' * 20
        # if clean_tags is not set to true, tags will be truncated to 50 chars
        TAG_LONG_VALID = TAG_LONG_INVALID[:50]
        # default truncate to 100
        TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

        assert len(TAG_LONG_VALID) == 50
        assert TAG_LONG_VALID[-1] == 'j'
        TAG_CHARS_INVALID = '[email protected]!'
        TAG_CHARS_VALID = 'pretty-invlidtag'

        dataset['tags'].append(TAG_LONG_INVALID)
        dataset['tags'].append(TAG_CHARS_INVALID)

        harvester.source_config = {'clean_tags': False}
        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']

        # no clean tags, so invalid chars are in
        # but tags are truncated to 50 chars
        assert {'name': TAG_CHARS_VALID} not in tags
        assert {'name': TAG_CHARS_INVALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
        assert {'name': TAG_LONG_INVALID} not in tags

        harvester.source_config = {'clean_tags': True}

        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']
        assert {'name': TAG_CHARS_VALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
# ---- Example #6 ----
    def get_package_dict(self, iso_values, harvest_object):
        '''Build the CKAN package dict for a harvested ISO record.

        Extends SpatialHarvester.get_package_dict() with additional extras
        (constraints, lineage, grouped keywords, publisher details, the WAF
        location of the source document, ...) and post-processes the
        resource list.

        :param iso_values: parsed ISO metadata values (dict-like)
        :param harvest_object: the HarvestObject being imported
        :returns: package dict ready for the package action API
        '''
        package_dict = SpatialHarvester.get_package_dict(
            self, iso_values, harvest_object)

        # ISO values copied verbatim into extras when present.
        simple_keys = {
            'publisher_info',
            'resource-provider',
            'distributor-info',
            'aggregation-info',
            'distributor-formats',
            'additional-information-source',
            'purpose',
            # Constraints
            'use-constraints',
            'access-constraints',
            'fees',
            # lineage
            'lineage',
            'lineage-process-steps',
        }
        extras = {k: iso_values.get(k) for k in simple_keys if k in iso_values}

        # Bucket keywords by their type; untyped keywords fall into the
        # generic 'keywords' bucket.
        keywords = defaultdict(list)
        for keyword in iso_values['keywords']:
            keyword_type = keyword['type'] or 'keywords'
            keywords[keyword_type].append(keyword)

        # Keep a fixed display order for the grouped keyword sections.
        extras['grouped_keywords'] = []
        for keyword_type in [
                'theme', 'dataCenter', 'platform', 'instrument', 'place',
                'project', 'dataResolution', 'stratum', 'otherRestrictions',
                'keywords'
        ]:
            if keyword_type in keywords:
                extras['grouped_keywords'].append(
                    [titleize(keyword_type), keywords[keyword_type]])

        if iso_values.get('publisher', None):
            extras['publisher'] = iso_values.get('publisher', [])
        if iso_values.get('browse-graphic', None):
            # Only the first browse graphic's file reference is kept.
            browse_graphic = iso_values['browse-graphic'][0]['file']
            extras['browse-graphic'] = browse_graphic
        if iso_values.get('dataset-edition'):
            extras['dataset-edition'] = iso_values['dataset-edition']
            package_dict["version"] = iso_values['dataset-edition'][0]
        if iso_values.get('presentation-form'):
            extras['presentation-form'] = iso_values['presentation-form'][0]
        if iso_values.get('responsible-organisation'):
            log.info("Checking for responsible-organisation")
            extras['responsible-organisation'] = iso_values.get(
                'responsible-organisation', [])
        if iso_values.get('responsible-parties'):
            # NOTE(review): this tests 'responsible-parties' but reads
            # 'responsible-organisation' — confirm against the ISO parser
            # before changing; preserved as-is here.
            extras['responsible-parties'] = self.unique_responsible_parties(
                iso_values.get('responsible-organisation', []))

        # Record where the source document lives in the WAF, if known.
        for item in harvest_object.extras:
            key = item.key
            value = item.value
            if key == u'waf_location':
                extras['waf_location'] = value
                break
        else:
            # for/else: loop finished without finding a waf_location extra.
            extras['waf_location'] = None

        extras['object_reference'] = harvest_object.id

        # CKAN expects extras as a list of key/value dicts; complex values
        # are JSON-encoded.  .items() (rather than the Python-2-only
        # .iteritems()) works on both Python 2 and 3.
        extras_kv = [{
            'key': k,
            'value': json.dumps(v) if isinstance(v, (list, dict)) else v
        } for k, v in extras.items()]

        package_dict['extras'] = package_dict['extras'] + extras_kv
        package_dict['resources'] = self.filter_duplicate_resources(
            package_dict)
        package_dict['resources'] = self.reorder_resources(package_dict)
        package_dict = self.update_resources(package_dict)

        return package_dict
# ---- Example #7 ----
    def get_package_dict(self, iso_values, harvest_object):
        '''Build the CKAN package dict for a harvested ISO record.

        Extends SpatialHarvester.get_package_dict() with additional extras
        (constraints, lineage, grouped keywords, CF standard names, GCMD
        keywords, publisher details, the WAF location of the source
        document, ...), post-processes the resource list, and enriches
        ERDDAP resources with metadata read from their info URL.

        :param iso_values: parsed ISO metadata values (dict-like)
        :param harvest_object: the HarvestObject being imported
        :returns: package dict ready for the package action API
        '''
        package_dict = SpatialHarvester.get_package_dict(
            self, iso_values, harvest_object)

        # ISO values copied verbatim into extras when present.
        simple_keys = {
            'publisher_info',
            'resource-provider',
            'distributor-info',
            'aggregation-info',
            'distributor-formats',
            'additional-information-source',
            'purpose',
            # Constraints
            'use-constraints',
            'access-constraints',
            'use-limitations',
            'fees',
            # lineage
            'lineage',
            'lineage-process-steps',
        }
        extras = {k: iso_values.get(k) for k in simple_keys if k in iso_values}

        # Bucket keywords by their type; untyped keywords fall into the
        # generic 'keywords' bucket.
        keywords = defaultdict(list)
        for keyword in iso_values['keywords']:
            keyword_type = keyword['type'] or 'keywords'
            keywords[keyword_type].append(keyword)

        extras['grouped_keywords'] = []

        # Pull CF standard names and GCMD keywords out of any keyword set
        # whose thesaurus title matches; data_filter normalises each entry.
        for extra_name, matches, data_filter in (
            ('cf_standard_names', ('cf', 'climate and forecast'),
             lambda s: s.strip().split(' ', 1)[0]),
            ('gcmd_keywords', ('gcmd', 'global change'), lambda s: s.strip()),
        ):
            try:
                match_raw = next(
                    (d['keywords']
                     for d in iso_values['keywords'] if d['thesaurus'] and any(
                         v in d['thesaurus']['title'].lower()
                         for v in matches)), None)
                if match_raw is None:
                    continue
                elif hasattr(match_raw, '__iter__'):
                    match_result = sorted(set(map(data_filter, match_raw)))
                else:
                    match_result = data_filter(match_raw)
            # Narrowed from a bare except: a bare clause would also swallow
            # SystemExit/KeyboardInterrupt.  Extraction stays best-effort.
            except Exception:
                match_result = None
                log.exception(
                    "Execption raised when trying to extract {}".format(
                        extra_name))
            if match_result is not None:
                extras[extra_name] = match_result

        # Keep a fixed display order for the grouped keyword sections.
        for keyword_type in [
                'theme', 'dataCenter', 'platform', 'instrument', 'place',
                'project', 'dataResolution', 'stratum', 'otherRestrictions',
                'keywords'
        ]:
            if keyword_type in keywords:
                extras['grouped_keywords'].append(
                    [titleize(keyword_type), keywords[keyword_type]])

        if iso_values.get('publisher', None):
            extras['publisher'] = iso_values.get('publisher', [])
        if iso_values.get('browse-graphic', None):
            # Only the first browse graphic's file reference is kept.
            browse_graphic = iso_values['browse-graphic'][0]['file']
            extras['browse-graphic'] = browse_graphic
        if iso_values.get('dataset-edition'):
            extras['dataset-edition'] = iso_values['dataset-edition']
            package_dict["version"] = iso_values['dataset-edition'][0]
        if iso_values.get('presentation-form'):
            extras['presentation-form'] = iso_values['presentation-form'][0]
        if iso_values.get('responsible-organisation'):
            log.info("Checking for responsible-organisation")
            extras['responsible-organisation'] = iso_values.get(
                'responsible-organisation', [])
        if iso_values.get('responsible-parties'):
            # NOTE(review): this tests 'responsible-parties' but reads
            # 'responsible-organisation' — confirm against the ISO parser
            # before changing; preserved as-is here.
            extras['responsible-parties'] = self.unique_responsible_parties(
                iso_values.get('responsible-organisation', []))

        # Record where the source document lives in the WAF, if known.
        for item in harvest_object.extras:
            key = item.key
            value = item.value
            if key == 'waf_location':
                extras['waf_location'] = value
                break
        else:
            # for/else: loop finished without finding a waf_location extra.
            extras['waf_location'] = None

        extras['object_reference'] = harvest_object.id

        # CKAN expects extras as a list of key/value dicts; complex values
        # are JSON-encoded.
        extras_kv = [{
            'key': k,
            'value': json.dumps(v) if isinstance(v, (list, dict)) else v
        } for k, v in extras.items()]

        package_dict['extras'] = package_dict['extras'] + extras_kv
        package_dict['resources'] = self.filter_duplicate_resources(
            package_dict)
        package_dict['resources'] = self.reorder_resources(package_dict)
        package_dict = self.update_resources(package_dict)

        # Best-effort ERDDAP enrichment: derive the dataset's info CSV URL
        # and harvest vertical extent / IOOS attributes from it.
        for resource in package_dict["resources"]:
            if resource["format"] in {
                    "ERDDAP", "ERDDAP-TableDAP", "ERDDAP-GridDAP"
            }:
                try:
                    info_url = re.sub(
                        r"^(https?://.+/erddap/)(?:grid|table)dap(/[^.]+)\.(\w+)$",
                        r"\1info\2/index.csv", resource["url"])
                    ds = ErddapCSVMetadataReader(info_url)
                    self.get_vertical_extent(ds, package_dict)
                    self.get_ioos_nc_attributes(ds, package_dict)
                # Narrowed from a bare except and logged instead of silently
                # passing; the import still continues on failure.
                except Exception:
                    log.exception("ERDDAP metadata enrichment failed for %s",
                                  resource.get("url"))

        return package_dict
# ---- Example #8 ----
    def get_package_dict(self, iso_values, harvest_object):
        '''Build the CKAN package dict for a harvested ISO record.

        Extends SpatialHarvester.get_package_dict() with additional extras
        (constraints, lineage, grouped keywords, publisher details, the WAF
        location of the source document, ...) and post-processes the
        resource list.

        :param iso_values: parsed ISO metadata values (dict-like)
        :param harvest_object: the HarvestObject being imported
        :returns: package dict ready for the package action API
        '''
        package_dict = SpatialHarvester.get_package_dict(
            self, iso_values, harvest_object)

        # ISO values copied verbatim into extras when present.
        simple_keys = {
            'publisher_info',
            'resource-provider',
            'distributor-info',
            'aggregation-info',
            'distributor-formats',
            'additional-information-source',
            'purpose',
            # Constraints
            'use-constraints',
            'access-constraints',
            'fees',
            # lineage
            'lineage',
            'lineage-process-steps',
        }
        extras = {k: iso_values.get(k) for k in simple_keys if k in iso_values}

        # Bucket keywords by their type; untyped keywords fall into the
        # generic 'keywords' bucket.
        keywords = defaultdict(list)
        for keyword in iso_values['keywords']:
            keyword_type = keyword['type'] or 'keywords'
            keywords[keyword_type].append(keyword)

        # Keep a fixed display order for the grouped keyword sections.
        extras['grouped_keywords'] = []
        for keyword_type in [
                'theme', 'dataCenter', 'platform', 'instrument', 'place',
                'project', 'dataResolution', 'stratum', 'otherRestrictions',
                'keywords'
        ]:
            if keyword_type in keywords:
                extras['grouped_keywords'].append(
                    [titleize(keyword_type), keywords[keyword_type]])

        if iso_values.get('publisher', None):
            extras['publisher'] = iso_values.get('publisher', [])
        if iso_values.get('browse-graphic', None):
            # Only the first browse graphic's file reference is kept.
            browse_graphic = iso_values['browse-graphic'][0]['file']
            extras['browse-graphic'] = browse_graphic
        if iso_values.get('dataset-edition'):
            extras['dataset-edition'] = iso_values['dataset-edition']
            package_dict["version"] = iso_values['dataset-edition'][0]
        if iso_values.get('presentation-form'):
            extras['presentation-form'] = iso_values['presentation-form'][0]
        if iso_values.get('responsible-organisation'):
            log.info("Checking for responsible-organisation")
            extras['responsible-organisation'] = iso_values.get(
                'responsible-organisation', [])
        if iso_values.get('responsible-parties'):
            # NOTE(review): this tests 'responsible-parties' but reads
            # 'responsible-organisation' — confirm against the ISO parser
            # before changing; preserved as-is here.
            extras['responsible-parties'] = self.unique_responsible_parties(
                iso_values.get('responsible-organisation', []))

        # Record where the source document lives in the WAF, if known.
        for item in harvest_object.extras:
            key = item.key
            value = item.value
            if key == u'waf_location':
                extras['waf_location'] = value
                break
        else:
            # for/else: loop finished without finding a waf_location extra.
            extras['waf_location'] = None

        extras['object_reference'] = harvest_object.id

        # CKAN expects extras as a list of key/value dicts; complex values
        # are JSON-encoded.  .items() (rather than the Python-2-only
        # .iteritems()) works on both Python 2 and 3.
        extras_kv = [{
            'key': k,
            'value': json.dumps(v) if isinstance(v, (list, dict)) else v
        } for k, v in extras.items()]

        package_dict['extras'] = package_dict['extras'] + extras_kv
        package_dict['resources'] = self.filter_duplicate_resources(
            package_dict)
        package_dict['resources'] = self.reorder_resources(package_dict)
        package_dict = self.update_resources(package_dict)

        return package_dict