Пример #1
0
    def import_stage(self, harvest_object):
        """Rewrite the harvested dataset into the OGD Austria metadata layout.

        Loads the remote package dict from ``harvest_object.content``,
        normalises identifier, license, name, modification timestamp,
        publisher and schema extras, forces ``resources``/``tags`` to be
        lists, then stores the result back on the harvest object and
        delegates to the parent importer.
        """
        # Two independent deep copies of the content: ``new_content`` is
        # mutated in place, while ``old`` keeps the original values so the
        # resource/tag lists can still be read after they are reset below.
        # (The original also had an unused ``omit_tags`` local, a dead
        # ``new_content = {}`` assignment and a stray debug ``print``.)
        old = json.loads(harvest_object.content)
        new_content = json.loads(harvest_object.content)

        new_content['id'] = old.get('extras').get('metadata_identifier') or old.get('id')
        new_content['license_id'] = helper.map_license(old.get('license'), 'cc-by')
        new_content['name'] = self._gen_new_name(old.get('title'))
        new_content['metadata_modified'] = old.get('extras').get('metadata_modified') or old.get('metadata_modified') or ''
        new_content['extras']['publisher'] = u'Land Oberösterreich'
        new_content['extras']['schema_language'] = old.get('extras').get('schema_language') or 'ger'
        new_content['extras']['schema_name'] = old.get('extras').get('schema_name') or 'OGD Austria Metadata 2.1'
        new_content['extras']['schema_characterset'] = old.get('extras').get('schema_characterset') or 'utf8'

        # Normalise resources and tags to lists (single values get wrapped).
        new_content['resources'] = []
        if isinstance(old.get('resources'), list):
            new_content['resources'] = old.get('resources')
        else:
            new_content['resources'].append(old.get('resources'))
        new_content['tags'] = []
        if isinstance(old.get('tags'), list):
            new_content['tags'] = old.get('tags')
        else:
            new_content['tags'].append(old.get('tags'))

        harvest_object.content = json.dumps(new_content)
        super(DataOoeGvAtHarvester, self).import_stage(harvest_object)
Пример #2
0
 def test_0_check_setup(self):
     """Sanity-check fixtures: sub-app sees 2 packages, main app sees none."""
     url = '/api/rest/package'
     body_main = self.app.get(url).body
     body_sub = self.sub_app_get(url)
     packages_sub = json.loads(body_sub or '[]')
     packages_main = json.loads(body_main or '[]')
     assert len(packages_sub) == 2
     assert len(packages_main) == 0
    def gather_stage(self,harvest_job):
        """Gather the ids of remote packages to harvest.

        When a previous successful harvest job exists (and 'force_all' is
        not configured), only packages touched by revisions since that job
        are collected; otherwise everything is re-fetched
        (``get_all_packages``).

        NOTE(review): this snippet appears truncated -- the branch that
        consumes ``get_all_packages`` / returns the gathered ids is not
        visible here.
        """
        log.debug('In CKANHarvester gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True
        package_ids = []

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        # Get source URL
        base_url = harvest_job.source.url.rstrip('/')
        base_rest_url = base_url + self._get_rest_api_offset()
        base_search_url = base_url + self._get_search_api_offset()

        # Only harvest incrementally when the previous job succeeded and
        # actually produced objects; otherwise fall through to a full fetch.
        if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0):
            if not self.config.get('force_all',False):
                get_all_packages = False

                # Request only the packages modified since last harvest job
                last_time = previous_job.gather_finished.isoformat()
                url = base_search_url + '/revision?since_time=%s' % last_time

                try:
                    content = self._get_content(url)

                    revision_ids = json.loads(content)
                    if len(revision_ids):
                        for revision_id in revision_ids:
                            url = base_rest_url + '/revision/%s' % revision_id
                            try:
                                content = self._get_content(url)
                            except ContentFetchError,e:
                                self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
                                continue

                            # Collect every package touched by this
                            # revision, de-duplicating as we go.
                            revision = json.loads(content)
                            for package_id in revision['packages']:
                                if not package_id in package_ids:
                                    package_ids.append(package_id)
                    else:
                        log.info('No packages have been updated on the remote CKAN instance since the last harvest job')
                        return None

                except urllib2.HTTPError,e:
                    # A 400 means the remote CKAN is too old to support the
                    # revision API; fall back to fetching all packages.
                    if e.getcode() == 400:
                        log.info('CKAN instance %s does not suport revision filtering' % base_url)
                        get_all_packages = True
                    else:
                        self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
                        return None
    def import_stage(self, harvest_object):
        """Import one harvested dataset, then sync relationships and uploads.

        Applies local filtering (white/black lists, date fixes, license) or
        marks the package deleted, delegates the actual import to the parent
        class, then updates package relationships and re-uploads 'upload'
        resources by fetching each file and calling resource_create.

        NOTE(review): this snippet appears truncated -- no final return
        after the resource loop is visible here.
        """
        package_dict = json.loads(harvest_object.content)
        
        if not self._should_import_local(package_dict):
            package_dict['state'] = 'deleted'
        else:
            package_dict = self._apply_package_extras_white_list(package_dict)
            package_dict = self._apply_package_resource_extras_black_list(package_dict)
            package_dict = self._fix_date_in_fields(package_dict)
            package_dict = self._set_license(package_dict)
        
        # Resources that must be uploaded are popped off the dict and
        # handled separately after the parent import succeeds.
        package_dict = self._pop_black_list_resources_by_type(package_dict)
        harvest_object.content = json.dumps(package_dict)
        upload_resources = self._pop_upload_resources(package_dict)
        
        import_stage_result = super(GuiaHarvesterPlugin, self).import_stage(harvest_object)

        if import_stage_result:
            package_dict = json.loads(harvest_object.content)
            harvested_rels = package_dict.get('relationships', [])
            try:
                this_package = model.Package.get(package_dict['name'])
                if not this_package: raise logic.NotFound()
            except logic.NotFound as nf:
                log.info('import_stage(): could not find package "{0}"; relationships not updated: {1}'.format(package_dict['name'], nf))
                return import_stage_result

            existing_rels = this_package.get_relationships()
            self._update_relationships(existing_rels, harvested_rels)

            # Re-import each 'upload' resource: download the file from its
            # original URL and attach it via resource_create.
            for resource_dict in upload_resources:
                resource_url = resource_dict['url']
                resource_filename = resource_url.split('/')[-1]

                try:
                    response = requests.get(resource_url)
                    resource_file = StringIO(response.content)
                except Exception,e:
                    self._save_object_error('Resource not harvested for package "{0}". Unable to fetch resource from "{1}": {2}'.format(package_dict['name'], resource_url, e), harvest_object, 'Import')
                    continue

                # Wrap the downloaded bytes in a FieldStorage so CKAN's
                # upload machinery treats it like a form-file upload.
                cfs = FieldStorage()
                cfs.file = resource_file
                cfs.filename = resource_filename
                resource_dict['upload'] = cfs
                # Server-managed fields must not be sent back on create.
                if 'created' in resource_dict: del resource_dict['created']
                if 'last_modified' in resource_dict: del resource_dict['last_modified']
                if 'api' in resource_dict: del resource_dict['api']

                try:
                    the_resource = toolkit.get_action('resource_create')(data_dict=resource_dict)
                except Exception,e:
                    self._save_object_error('Resource not harvested for package "{0}". Unable to import the resource originally from "{1}": {2}'.format(package_dict['name'], resource_url, e), harvest_object, 'Import')
                    continue
Пример #5
0
 def get_pkg_ids_for_organizations(orgs):
     """Return the set of dataset ids belonging to the given organizations.

     Pages through the search API until every dataset counted by the
     server has been collected for each organization.
     """
     pkg_ids = set()
     for organization in orgs:
         url = base_search_url + '/dataset?organization=%s' % organization
         content = self._get_content(url)
         content_json = json.loads(content)
         result_count = int(content_json['count'])
         pkg_ids |= set(content_json['results'])
         # Keep paging while the server reports more results than we hold
         # AND the last page was non-empty.  The original condition
         # (`... or not content_json['results']`) spun forever as soon as
         # an empty results page came back.
         while len(pkg_ids) < result_count and content_json['results']:
             url = base_search_url + '/dataset?organization=%s&offset=%s' % (organization, len(pkg_ids))
             content = self._get_content(url)
             content_json = json.loads(content)
             pkg_ids |= set(content_json['results'])
     return pkg_ids
    def fetch_stage(self,harvest_object):

        '''
        Fetch a single dataset's metadata for this harvest object.

        Looks up the job configuration in MongoDB (client.odm.jobs) by the
        catalogue URL, builds the per-dataset fetch URL from the configured
        template, and tries several strategies to decode the response as
        JSON.

        NOTE(review): this snippet appears truncated -- the code that uses
        ``dataset``/``features``/``metadata_mappings`` afterwards is not
        visible here.  The body mixes tabs and spaces; do not re-indent.
        '''
        log.debug('In CustomHarvester fetch_stage')

        self._set_config(harvest_object.job.source.config)
        # Per-catalogue job settings come from MongoDB, keyed by source URL.
        db=client.odm
        db_jobs=db.jobs
        config=db_jobs.find_one({"cat_url":harvest_object.source.url})
        api_key=config['apikey']
        dataset_url=config['dataset_url']
        metadata_mappings=json.loads(config['metadata_mappings'])
        if "data.norge.no" in harvest_object.source.url.rstrip('/'):
        	many_datasets_list=['/api/dcat/data.json?page=1','/api/dcat/data.json?page=2','/api/dcat/data.json?page=3','/api/dcat/data.json?page=4']
        else:
        	# NOTE(review): neither many_datasets_list nor datasets_list_url
        	# is defined in this snippet; this branch raises NameError
        	# unless they exist at module level -- verify.
        	many_datasets_list.append(datasets_list_url) 
        
        
        # Build the fetch URL from the configured template, substituting
        # the API key and this harvest object's GUID.
        if dataset_url!="":
		  fetch_url=harvest_object.source.url.rstrip('/')+dataset_url.replace("{api}",api_key).replace("{id}", harvest_object.guid)
		  #print(fetch_url)
        else:
		  fetch_url=""
        
        dataset={}
        features=[]
        
        # Decode attempts, in order: plain JSON, JSON with an explicit
        # Accept header, then a JSONP-style 'null(...)' payload.
        # SECURITY NOTE(review): the last fallback exec()s remote content.
        if fetch_url!="":  
		  result=urllib2.urlopen(fetch_url)
		  try:
			try:
			  dataset=json.load(result)
			except:
			  try:
				headers = {'Accept':'application/json'}
				r=urllib2.Request(fetch_url,headers=headers)
				dataset=json.loads(urllib2.urlopen(r).read())
			  except:
				result=urllib2.urlopen(fetch_url)
				read=result.read()
				read=read.replace("null(","dataset=").rstrip(')')
				exec(read)
			#print(dataset)
		  except Exception, e:
			  log.exception('Could not load ' + fetch_url)
			  self._save_gather_error('%r'%e.message,harvest_object)
    def validate_config(self,config):
        """Validate a harvest-source configuration JSON string.

        Empty configs are returned unchanged.  Otherwise the string must be
        valid JSON and known keys are checked:
        - api_version: must be an integer
        - default_tags / default_groups: must be lists; groups must exist
        - default_extras: must be a dictionary
        - from / until: ISO timestamps 'yyyy-mm-ddThh:mm:ssZ'
        - user: must exist

        Raises ValueError on any violation.

        NOTE(review): this snippet appears truncated -- the final
        ``return config`` and any except-clause tail are not visible here.
        """
        print 'VALIDATE CONFIG'
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'api_version' in config_obj:
                try:
                    int(config_obj['api_version'])
                except ValueError:
                    raise ValueError('api_version must be an integer')

            if 'default_tags' in config_obj:
                if not isinstance(config_obj['default_tags'],list):
                    raise ValueError('default_tags must be a list')

            if 'default_groups' in config_obj:
                if not isinstance(config_obj['default_groups'],list):
                    raise ValueError('default_groups must be a list')

                # Check if default groups exist
                context = {'model':model,'user':c.user}
                for group_name in config_obj['default_groups']:
                    try:
                        group = get_action('group_show')(context,{'id':group_name})
                    except NotFound,e:
                        raise ValueError('Default group not found')

            if 'default_extras' in config_obj:
                if not isinstance(config_obj['default_extras'],dict):
                    raise ValueError('default_extras must be a dictionary')

            if 'from' in config_obj:
                try:
                    datetime.strptime(config_obj['from'], '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError("Incorrect date format, should be yyyy-mm-ddThh:mm:ssZ ")
                #    int(config_obj['from'])

            if 'until' in config_obj:
                try:
                    datetime.strptime(config_obj['until'], '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    raise ValueError("Incorrect date format, should be yyyy-mm-ddThh:mm:ssZ ")

            #if 'vocabulary' in config_obj:
            #    if config_obj['vocabulary'] != 'metashare' and config_obj['vocabulary'] != 'olac' and config_obj['vocabulary'] !='cmdi':
            #        raise ValueError("Incorrect vocabulary, please choose between metashare, olac and cmdi")
            #else:
            #    raise ValueError("Please provide a vocabulary, you can choose between metashare, olac and cmdi")

            if 'user' in config_obj:
                # Check if user exists
                context = {'model':model,'user':c.user}
                try:
                    user = get_action('user_show')(context,{'id':config_obj.get('user')})
                except NotFound,e:
                    raise ValueError('User not found')
Пример #8
0
    def test_revisions__since_revision_id__latest(self):
        """Revisions feed anchored at the penultimate revision exposes the latest dataset."""
        newest_rev, since_rev = self._get_last_and_penultimate_revisions()
        response = self.app.get("/api/util/revisions?since-revision-id=%s" % since_rev.id, status=[200])
        payload = json.loads(response.body)
        assert isinstance(payload, dict), payload
        assert set(payload.keys()) >= set(("since_timestamp", "datasets")), payload.keys()
        assert_equal(payload["since_revision_id"], since_rev.id)
        assert_equal(payload["newest_revision_id"], newest_rev.id)
        assert_equal(payload["number_of_revisions"], 2)
        assert_equal(payload["results_limited"], False)
        dataset = payload["datasets"][0]
        assert_equal(dataset["name"], "latest")
        assert_equal(dataset["notes"].strip(), "Latest dataset.")
        assert dataset["publisher_title"] in ("National Health Service", "Department of Health"), dataset["publisher_title"]
        expected_keys = set(("title", "dataset_link", "notes", "publisher_title", "publisher_link"))
        assert set(dataset.keys()) >= expected_keys, dataset.keys()

        # try dataset_link
        if model.engine_is_sqlite():
            raise SkipTest("Link tests need postgres")
        link_res = self.app.get(dataset["dataset_link"], status=[200])
        assert "latest" in link_res.body

        # try publisher_link
        link_res = self.app.get(dataset["publisher_link"], status=[200])
        assert "National Health Service" in link_res.body, link_res
Пример #9
0
    def import_stage(self,harvest_object):
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
            return False

        try:
            package_dict = json.loads(harvest_object.content)
            package_dict['id'] = harvest_object.guid
            package_dict['name'] = self._gen_new_name(package_dict['title'])

            # Common extras
            package_dict['extras']['harvest_catalogue_name'] = u'Dati Piemonte'
            package_dict['extras']['harvest_catalogue_url'] = u'http://dati.piemonte.it/'
            package_dict['extras']['eu_country'] = u'IT'
            package_dict['extras']['eu_nuts1'] = u'ITC'
            package_dict['extras']['eu_nuts2'] = u'ITC1'

            return self._create_or_update_package(package_dict, harvest_object)
        except Exception, e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')
Пример #10
0
def group_to_api2(group, context):
    """Dictize *group* in API-v2 form: JSON-decoded extras and a sorted list of package ids."""
    api_dict = group_dictize(group, context)
    api_dict["extras"] = {extra["key"]: json.loads(extra["value"])
                          for extra in api_dict["extras"]}
    api_dict["packages"] = sorted(package["id"] for package in api_dict["packages"])
    return api_dict
 def _get_group(self, base_url, group_name):
     """Fetch a single group dict from the remote CKAN REST API.

     The original wrapped the calls in ``except Exception, e: raise e``,
     which was a no-op that only truncated the traceback in Python 2, so
     the wrapper was dropped; any error still propagates to the caller.
     """
     url = base_url + self._get_rest_api_offset() + '/group/' + group_name
     content = self._get_content(url)
     return json.loads(content)
Пример #12
0
 def _set_config(self,config_str):
     """Parse the harvest-source JSON config string onto the instance.

     Sets self.config (and self.api_version from its 'api_version' key);
     an empty/None config string yields an empty dict.
     """
     if not config_str:
         self.config = {}
         return
     self.config = json.loads(config_str)
     self.api_version = int(self.config['api_version'])
     log.debug('Using config: %r', self.config)
Пример #13
0
    def fetch_stage(self, harvest_object):
        log.debug('In SwisstopoHarvester fetch_stage')

        # Get the URL
        log.debug(json.loads(harvest_object.content))
        name = json.loads(harvest_object.content)['name']
        log.debug(harvest_object.content)

        # Get contents
        try:
            harvest_object.save()
            log.debug('successfully processed ' + name)
            return True
        except Exception, e:
            log.exception(e)
            raise
    def validate_config(self, config):
        """Validate the harvest-source config string.

        An empty/None config is allowed and returned unchanged; otherwise
        the string must parse as JSON (json.loads raises ValueError if it
        does not).  Returns the config string unchanged.
        """
        if not config:
            return config

        # Parse purely for validation; the decoded value is not needed
        # (the original bound it to an unused local).
        json.loads(config)

        return config
Пример #15
0
    def validate_config(self,config):
        """Validate a harvest-source config JSON string.

        Empty configs are returned unchanged.  Otherwise type-checks the
        known keys (default_tags/default_groups must be lists,
        default_extras a dict) and verifies that referenced groups and the
        configured user exist, raising ValueError on any violation.

        NOTE(review): this snippet appears truncated -- the closing
        ``return config`` / except handling is not visible here.
        """
        if not config:
            return config

        try:
            config_obj = json.loads(config)

            if 'default_tags' in config_obj:
                if not isinstance(config_obj['default_tags'],list):
                    raise ValueError('default_tags must be a list')

            if 'default_groups' in config_obj:
                if not isinstance(config_obj['default_groups'],list):
                    raise ValueError('default_groups must be a list')

                # Check if default groups exist
                context = {'model':model,'user':c.user}
                for group_name in config_obj['default_groups']:
                    try:
                        group = get_action('group_show')(context,{'id':group_name})
                    except NotFound,e:
                        raise ValueError('Default group not found')

            if 'default_extras' in config_obj:
                if not isinstance(config_obj['default_extras'],dict):
                    raise ValueError('default_extras must be a dictionary')

            if 'user' in config_obj:
                # Check if user exists
                context = {'model':model,'user':c.user}
                try:
                    user = get_action('user_show')(context,{'id':config_obj.get('user')})
                except NotFound,e:
                    raise ValueError('User not found')
Пример #16
0
    def import_stage(self,harvest_object):
        """Import a harvested SRDA dataset into CKAN.

        Stamps the package extras with the SRDA catalogue name/URL,
        flattens any non-string extras to JSON strings, then
        creates/updates the package.  Returns True on success, False for
        missing objects/content.

        NOTE(review): the body mixes tabs and spaces exactly as in the
        original; do not re-indent without converting tabs first.  The
        snippet also appears truncated after the ValidationError handler.
        """
        log.debug('In SRDAHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                    harvest_object, 'Import')
            return False

        #self._set_config(harvest_object.job.source.config)
        try:
            package_dict = json.loads(harvest_object.content)
	    package_dict["id"] = harvest_object.guid
	    package_dict["extras"][u"資料庫名稱"] = u'SRDA'
	    package_dict["extras"][u"資料庫網址"] = u'http://srda.sinica.edu.tw/'

	    #print package_dict

	    for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            result = self._create_or_update_package(package_dict,harvest_object)

            return True
        except ValidationError,e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                    harvest_object, 'Import')
Пример #17
0
 def fetch_stage(self, harvest_object):
     """Run the scraper over the gathered record and store the result back on the harvest object."""
     log.debug('In fetch_stage  ' + self.info()['title'])
     record = json.loads(harvest_object.content)
     record = self.scraper.fetch(record)
     harvest_object.content = json.dumps(record)
     harvest_object.save()
     return True
Пример #18
0
    def test_revisions__since_revision_id__latest(self):
        '''Revisions feed since the penultimate revision lists the latest dataset with links.'''
        last_rev, since_rev = self._get_last_and_penultimate_revisions()
        url = '/api/util/revisions?since-revision-id=%s' % since_rev.id
        body = self.app.get(url, status=[200]).body
        res = json.loads(body)
        assert isinstance(res, dict), res
        assert set(res.keys()) >= set(('since_timestamp', 'datasets')), res.keys()
        for key, expected in (('since_revision_id', since_rev.id),
                              ('newest_revision_id', last_rev.id),
                              ('number_of_revisions', 2),
                              ('results_limited', False)):
            assert_equal(res[key], expected)
        pkg = res['datasets'][0]
        assert_equal(pkg['name'], 'latest')
        assert_equal(pkg['notes'].strip(), 'Latest dataset.')
        assert pkg['publisher_title'] in ('National Health Service', 'Department of Health'), pkg['publisher_title']
        assert set(pkg.keys()) >= set(('title', 'dataset_link', 'notes', 'publisher_title', 'publisher_link')), pkg.keys()

        # try dataset_link
        if model.engine_is_sqlite():
            raise SkipTest("Link tests need postgres")
        assert 'latest' in self.app.get(pkg['dataset_link'], status=[200]).body

        # try publisher_link
        pub_res = self.app.get(pkg['publisher_link'], status=[200])
        assert 'National Health Service' in pub_res.body, pub_res
Пример #19
0
    def test_create_package(self):
        '''POSTing a package fixture creates it and echoes the right fields back.'''
        test_pkg = self.get_package_fixture('test1')
        post_body = '%s=1' % json.dumps(test_pkg)
        result = self.app.post('/api/rest/package', post_body, status=[201], extra_environ=self.extra_environ_sysadmin)

        # check returned dict is correct
        created = json.loads(result.body)
        assert_equal(created['name'], test_pkg['name'])
        assert created['id']
        assert_equal(created['title'], test_pkg['title'])
        assert_equal(created['license_id'], test_pkg['license_id'])
        assert_equal(created['extras'].get('temporal_coverage-to'), test_pkg['extras']['temporal_coverage-to'])
        assert_equal(created['resources'][0].get('description'), test_pkg['resources'][0]['description'])
        assert_equal(set(created['tags']), set(test_pkg['tags']))

        # check package was created ok
        pkg = model.Package.by_name(test_pkg['name'])
        pkg_dict = get_action('package_show')(self.context, {'id': test_pkg['name']})
        assert_equal(pkg.name, test_pkg['name'])
        assert_equal(pkg.title, test_pkg['title'])

        assert_equal(pkg.extras.get('temporal_coverage-to'), test_pkg['extras']['temporal_coverage-to'])
        assert_equal(pkg.resources[0].description, test_pkg['resources'][0]['description'])
        assert_equal(set(tag['name'] for tag in pkg_dict['tags']), set(test_pkg['tags']))
Пример #20
0
    def test_get_package(self):
        '''GET /api/rest/package/<name> returns the full package as JSON.'''
        result = self.app.get('/api/rest/package/%s' % self.pkg_name, status=[200])
        content_type = result.header_dict['Content-Type']
        assert 'application/json' in content_type, content_type
        pkg = json.loads(result.body)

        assert_equal(pkg['name'], self.pkg_name)
        assert_equal(pkg['id'], self.pkg_id)
        assert_equal(pkg['notes'], u'Ratings for all articles on the Directgov website.  One data file is available per day. Sets of files are organised by month on the download page')
        assert_equal(pkg['license_id'], 'uk-ogl')
        assert_equal(pkg['license'], u'UK Open Government Licence (OGL)')
        assert_equal(set(pkg['tags']), set(["article", "cota", "directgov", "information", "ranking", "rating"]))
        assert self._is_member_of_org(pkg, "national-health-service")

        # Spot-check the extras the fixture is expected to carry.
        extras = pkg['extras']
        expected_extra_keys = set((
            'access_constraints', 'contact-email', 'contact-name', 'contact-phone',
            'foi-email', 'foi-name', 'foi-phone', 'foi-web',
            'geographic_coverage', 'mandate', 'temporal_coverage-to',
            'temporal_coverage-from', 'temporal_granularity'))
        assert set(extras.keys()) >= expected_extra_keys, set(extras.keys()) - expected_extra_keys
        assert_equal(extras.get('temporal_coverage-from'), '2010-01-01')

        assert_equal(len(pkg['resources']), 1)
        resource = pkg['resources'][0]
        assert_equal(resource['description'], "Directgov Article Ratings")
        assert_equal(resource['url'], "http://innovate-apps.direct.gov.uk/cota/")
        assert_equal(resource['format'], "HTML")
Пример #21
0
    def test_new(self):
        """Creating a dataset with a 'spatial' extra stores a PackageExtent geometry."""
        name = "test-spatial-dataset-1"

        # Submit the new-dataset form with a GeoJSON point in extras.
        form_page = self.app.get(url_for(controller="package", action="new"), extra_environ=self.extra_environ)
        assert "Add - Datasets" in form_page
        form = form_page.forms["dataset-edit"]
        form["name"] = name
        form["extras__0__key"] = u"spatial"
        form["extras__0__value"] = self.geojson_examples["point"]

        save_res = form.submit("save", extra_environ=self.extra_environ)
        assert not "Error" in save_res, save_res

        package = Package.get(name)

        # Check that a PackageExtent object has been created
        package_extent = Session.query(PackageExtent).filter(PackageExtent.package_id == package.id).first()
        assert package_extent
        assert package_extent.package_id == package.id

        # The stored geometry must match the submitted GeoJSON point.
        point = json.loads(self.geojson_examples["point"])
        assert Session.scalar(package_extent.the_geom.x) == point["coordinates"][0]
        assert Session.scalar(package_extent.the_geom.y) == point["coordinates"][1]
        assert Session.scalar(package_extent.the_geom.srid) == self.db_srid
Пример #22
0
def package_to_api1(pkg, context):
    """Dictize *pkg* in the legacy API version 1 format.

    :param pkg: a Package model object
    :param context: dict, must contain at least the 'model' key
    :returns: a dict representation of the package suitable for API v1
    """
    dictized = package_dictize(pkg, context)

    dictized.pop("revision_timestamp")

    # API v1 flattens groups/tags to lists of names and extras to a plain
    # dict; tags belonging to a vocabulary are excluded.
    dictized["groups"] = [group["name"] for group in dictized["groups"]]
    dictized["tags"] = [tag["name"] for tag in dictized["tags"] if not tag.get("vocabulary_id")]
    dictized["extras"] = dict((extra["key"], json.loads(extra["value"])) for extra in dictized["extras"])
    dictized["notes_rendered"] = ckan.misc.MarkdownFormat().to_html(pkg.notes)

    for resource in dictized["resources"]:
        resource_dict_to_api(resource, pkg.id, context)

    if pkg.resources:
        dictized["download_url"] = pkg.resources[0].url

    dictized["license"] = pkg.license.title if pkg.license else None

    dictized["ratings_average"] = pkg.get_average_rating()
    dictized["ratings_count"] = len(pkg.ratings)
    site_url = config.get("ckan.site_url", None)
    if site_url:
        dictized["ckan_url"] = "%s/dataset/%s" % (site_url, pkg.name)
    metadata_modified = pkg.metadata_modified
    dictized["metadata_modified"] = metadata_modified.isoformat() if metadata_modified else None
    metadata_created = pkg.metadata_created
    dictized["metadata_created"] = metadata_created.isoformat() if metadata_created else None

    subjects = dictized.pop("relationships_as_subject")
    objects = dictized.pop("relationships_as_object")

    # Hoisted out of the loops (was recomputed every iteration); also avoid
    # shadowing the builtin 'type' with the swapped relationship type.
    model = context["model"]
    swap_types = model.PackageRelationship.forward_to_reverse_type

    relationships = []
    for relationship in objects:
        # Relationships where this package is the object are reported from
        # its point of view: subject/object swapped and the type inverted.
        relationships.append(
            {
                "subject": pkg.get(relationship["object_package_id"]).name,
                "type": swap_types(relationship["type"]),
                "object": pkg.get(relationship["subject_package_id"]).name,
                "comment": relationship["comment"],
            }
        )
    for relationship in subjects:
        relationships.append(
            {
                "subject": pkg.get(relationship["subject_package_id"]).name,
                "type": relationship["type"],
                "object": pkg.get(relationship["object_package_id"]).name,
                "comment": relationship["comment"],
            }
        )

    dictized["relationships"] = relationships
    return dictized
Пример #23
0
    def check_spatial_extra(self,package):
        '''
        For a given package, looks at the spatial extent (as given in the
        extra "spatial" in GeoJSON format) and records it in PostGIS.
        '''
        if not package.id:
            log.warning('Couldn\'t store spatial extent because no id was provided for the package')
            return

        # TODO: deleted extra
        for extra in package.extras_list:
            if extra.key == 'spatial':
                if extra.state == 'active' and extra.value:
                    try:
                        log.debug('Received: %r' % extra.value)
                        geometry = json.loads(extra.value)
                    except ValueError,e:
                        error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                        raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                    except TypeError,e:
                        error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                        raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))

                    try:
                        save_package_extent(package.id,geometry)

                    except ValueError,e:
                        error_dict = {'spatial':[u'Error creating geometry: %s' % str(e)]}
                        raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                    except Exception, e:
                        raise
                        if bool(os.getenv('DEBUG')):
                            raise
                        error_dict = {'spatial':[u'Error: %s' % str(e)]}
                        raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
Пример #24
0
    def test_edit_package(self):
        """PUT an edited package dict to the REST API and verify that both
        the API response and the stored model reflect the change."""
        # Create the fixture package that will be edited.
        fixture_name = 'test4'
        fixture = self.get_package_fixture(fixture_name)
        CreateTestData.create_arbitrary(fixture)

        # Submit the edit (only the title changes).
        update = copy.deepcopy(fixture)
        update['title'] = 'Edited title'
        api_offset = '/api/rest/package/%s' % fixture_name
        body = '%s=1' % json.dumps(update)
        response = self.app.put(api_offset, body, status=[200],
                                extra_environ=self.extra_environ_sysadmin)

        # The returned dict must reflect the edit.
        returned = json.loads(response.body)
        assert_equal(returned['name'], fixture['name'])
        assert returned['id']
        assert_equal(returned['title'], 'Edited title')
        assert_equal(returned['license_id'], fixture['license_id'])
        assert returned['organization']['name'] == fixture['groups'][0]

        assert_equal(returned['extras'].get('temporal_coverage-to'),
                     fixture['extras']['temporal_coverage-to'])
        assert_equal(returned['resources'][0].get('description'),
                     fixture['resources'][0]['description'])
        assert_equal(set(returned['tags']), set(fixture['tags']))

        # The stored package must have been updated as well.
        pkg = model.Package.by_name(fixture['name'])
        pkg_dict = get_action('package_show')(self.context, {'id': fixture['name']})
        assert_equal(pkg.name, fixture['name'])
        assert_equal(pkg.title, 'Edited title')
        assert pkg.get_organization().name == fixture['groups'][0]

        assert_equal(pkg.extras.get('temporal_coverage-to'),
                     fixture['extras']['temporal_coverage-to'])
        assert_equal(pkg.resources[0].description,
                     fixture['resources'][0]['description'])
        assert_equal(set([tag['name'] for tag in pkg_dict['tags']]),
                     set(fixture['tags']))
Пример #25
0
    def _parse_recline_state(self, params):
        '''Extract and sanitise a recline view state from the request params.

        NOTE(review): despite the ``params`` argument, this reads the global
        ``request.params`` directly — preserved from the original behaviour.

        :returns: the state dict, or None for unsupported state versions.
        '''
        state_version = int(request.params.get('state_version', '1'))
        if state_version != 1:
            return None

        recline_state = {}
        for k, v in request.params.items():
            try:
                # Values may be JSON-encoded; fall back to the raw string.
                v = json.loads(v)
            except ValueError:
                pass
            recline_state[k] = v

        recline_state.pop('width', None)
        recline_state.pop('height', None)
        recline_state['readOnly'] = True

        # previous versions of recline setup used elasticsearch_url attribute
        # for data api url - see http://trac.ckan.org/ticket/2639
        # fix by relocating this to url attribute which is the default location
        if 'dataset' in recline_state and 'elasticsearch_url' in recline_state['dataset']:
            recline_state['dataset']['url'] = recline_state['dataset']['elasticsearch_url']

        # Ensure only the currentView is available
        # default to grid view if none specified
        if not recline_state.get('currentView', None):
            recline_state['currentView'] = 'grid'
        # Iterate over a snapshot of the keys: popping while iterating the
        # live keys() view raises RuntimeError on Python 3.
        for k in list(recline_state.keys()):
            if k.startswith('view-') and \
                    not k.endswith(recline_state['currentView']):
                recline_state.pop(k)
        return recline_state
Пример #26
0
    def check_spatial_extra(self,package):
        '''Record the package's spatial extent (from the "spatial" extra,
        expected to be GeoJSON) in PostGIS via save_package_extent.

        Raises a ValidationError if the extra cannot be decoded as JSON or
        the geometry cannot be saved.
        '''
        if not package.id:
            # Without a package id there is nothing to attach the extent to.
            log.warning('Couldn\'t store spatial extent because no id was provided for the package')
            return

        # TODO: deleted extra
        for extra in package.extras_list:
            if extra.key == 'spatial':
                if extra.state == 'active':
                    try:
                        log.debug('Received: %r' % extra.value)
                        geometry = json.loads(extra.value)
                    except ValueError,e:
                        error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                        raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                    except TypeError,e:
                        # e.g. extra.value is None rather than a string.
                        error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                        raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))

                    try:
                        save_package_extent(package.id,geometry)

                    except ValueError,e:
                        error_dict = {'spatial':[u'Error creating geometry: %s' % str(e)]}
                        raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                    except Exception, e:
                        # Any other failure while saving the extent is also
                        # surfaced to the user as a validation error.
                        error_dict = {'spatial':[u'Error: %s' % str(e)]}
                        raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
Пример #27
0
    def _parse_recline_state(self, params):
        '''Extract and sanitise a recline view state from the request params.

        NOTE(review): despite the ``params`` argument, this reads the global
        ``request.params`` directly — preserved from the original behaviour.

        :returns: the state dict, or None for unsupported state versions.
        '''
        state_version = int(request.params.get('state_version', '1'))
        if state_version != 1:
            return None

        recline_state = {}
        for k, v in request.params.items():
            try:
                # Values may be JSON-encoded; fall back to the raw string.
                v = json.loads(v)
            except ValueError:
                pass
            recline_state[k] = v

        recline_state.pop('width', None)
        recline_state.pop('height', None)
        recline_state['readOnly'] = True

        # Ensure only the currentView is available
        # default to grid view if none specified
        if not recline_state.get('currentView', None):
            recline_state['currentView'] = 'grid'
        # Iterate over a snapshot of the keys: popping while iterating the
        # live keys() view raises RuntimeError on Python 3.
        for k in list(recline_state.keys()):
            if k.startswith('view-') and \
                    not k.endswith(recline_state['currentView']):
                recline_state.pop(k)
        return recline_state
Пример #28
0
    def _parse_recline_state(self, params):
        """Extract and sanitise a recline view state from the request params.

        NOTE(review): despite the ``params`` argument, this reads the global
        ``request.params`` directly — preserved from the original behaviour.

        :returns: the state dict, or None for unsupported state versions.
        """
        state_version = int(request.params.get("state_version", "1"))
        if state_version != 1:
            return None

        recline_state = {}
        for k, v in request.params.items():
            try:
                # Values may be JSON-encoded; fall back to the raw string.
                v = json.loads(v)
            except ValueError:
                pass
            recline_state[k] = v

        recline_state.pop("width", None)
        recline_state.pop("height", None)
        recline_state["readOnly"] = True

        # previous versions of recline setup used elasticsearch_url attribute
        # for data api url - see http://trac.ckan.org/ticket/2639
        # fix by relocating this to url attribute which is the default location
        if "dataset" in recline_state and "elasticsearch_url" in recline_state["dataset"]:
            recline_state["dataset"]["url"] = recline_state["dataset"]["elasticsearch_url"]

        # Ensure only the currentView is available
        # default to grid view if none specified
        if not recline_state.get("currentView", None):
            recline_state["currentView"] = "grid"
        # Iterate over a snapshot of the keys: popping while iterating the
        # live keys() view raises RuntimeError on Python 3.
        for k in list(recline_state.keys()):
            if k.startswith("view-") and not k.endswith(recline_state["currentView"]):
                recline_state.pop(k)
        return recline_state
Пример #29
0
    def test_create_extent(self):
        """Creating a PackageExtent from a GeoJSON point stores the correct
        coordinates and SRID for the package."""
        dataset = factories.Dataset()
        point = json.loads(self.geojson_examples['point'])

        geom = asShape(point)
        extent = PackageExtent(package_id=dataset['id'],
                               the_geom=WKTElement(geom.wkt, self.db_srid))
        extent.save()

        assert_equals(extent.package_id, dataset['id'])
        if legacy_geoalchemy:
            # Old GeoAlchemy exposes coordinates through Session.scalar.
            assert_equals(Session.scalar(extent.the_geom.x),
                          point['coordinates'][0])
            assert_equals(Session.scalar(extent.the_geom.y),
                          point['coordinates'][1])
            assert_equals(Session.scalar(extent.the_geom.srid),
                          self.db_srid)
        else:
            # GeoAlchemy2: read the coordinates back via PostGIS functions.
            from sqlalchemy import func
            stored_x = Session.query(func.ST_X(extent.the_geom)).first()[0]
            stored_y = Session.query(func.ST_Y(extent.the_geom)).first()[0]
            assert_equals(stored_x, point['coordinates'][0])
            assert_equals(stored_y, point['coordinates'][1])
            assert_equals(extent.the_geom.srid, self.db_srid)
Пример #30
0
    def before_index(self, pkg_dict):
        '''Add the fulltext of a package to the dict that
        will be given to solr for indexing.

        @param pkg_dict: flattened dict (except for multi-valued fields such as tags)
                         containing all the terms which will be sent to the indexer
        @return: modified package dict
        '''
        # Drop the auto-generated extras copy so the fulltext is not indexed
        # twice. ('in' replaces dict.has_key, which was removed in Python 3.)
        if pkg_dict and 'extras_full_text_search' in pkg_dict:
            del pkg_dict['extras_full_text_search']

        data_dict = json.loads(pkg_dict['data_dict'])
        fulltext = [x for x in data_dict['extras'] if 'full_text_search' in x['key']]

        if fulltext:
            # Move the fulltext out of the extras into its own solr field.
            extras = [x for x in data_dict['extras'] if not 'full_text_search' in x['key']]
            data_dict['extras'] = extras
            pkg_dict['fulltext'] = fulltext[0]['value']

        else:
            # Fall back to the fulltext stored in the database, if any.
            fulltext_dict = _get_fulltext(pkg_dict['id'])
            if fulltext_dict:
                pkg_dict['fulltext'] = fulltext_dict.text

        pkg_dict['data_dict'] = json.dumps(data_dict)

        return pkg_dict
Пример #31
0
    def import_stage(self, harvest_object):
        log.debug('In DSPCKANHarvester import_stage')

        context = {
            'model': model,
            'session': Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s' %
                                          group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                context, {
                    'id': harvest_object.source.id
                })
            local_org = source_dataset.get('owner_org')

            remote_orgs = self.config.get('remote_orgs', None)

            if not remote_orgs in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                if not 'owner_org' in package_dict:
                    package_dict['owner_org'] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict['owner_org']

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(context,
                                                              data_dict)
                        validated_org = org['id']
                    except NotFound, e:
                        log.info('Organization %s is not available' %
                                 remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes organizations as groups
                                    # this especially targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in [
                                        'packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name', 'type'
                                ]:
                                    org.pop(key, None)
                                get_action('organization_create')(context, org)
                                log.info(
                                    'Organization %s has been newly created' %
                                    remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s' %
                                          remote_org)

                package_dict['owner_org'] = validated_org or local_org
Пример #32
0
    def gather_stage(self, harvest_job):
        log.debug('In DSP\'s CKANHarvester gather_stage (%s)' %
                  harvest_job.source.url)
        get_all_packages = True
        package_ids = []

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        # Get source URL
        base_url = harvest_job.source.url.rstrip('/')
        base_rest_url = base_url + self._get_rest_api_offset()
        base_search_url = base_url + self._get_search_api_offset()

        if (previous_job and not previous_job.gather_errors
                and not len(previous_job.objects) == 0):
            if not self.config.get('force_all', False):
                get_all_packages = False

                # Request only the packages modified since last harvest job
                last_time = previous_job.gather_finished.isoformat()
                url = base_search_url + '/revision?since_time=%s' % last_time

                try:
                    content = self._get_content(url)

                    revision_ids = json.loads(content)
                    if len(revision_ids):
                        for revision_id in revision_ids:
                            url = base_rest_url + '/revision/%s' % revision_id
                            try:
                                content = self._get_content(url)
                            except ContentFetchError, e:
                                self._save_gather_error(
                                    'Unable to get content for URL: %s: %s' %
                                    (url, str(e)), harvest_job)
                                continue

                            revision = json.loads(content)
                            for package_id in revision['packages']:
                                if not package_id in package_ids:
                                    package_ids.append(package_id)
                    else:
                        log.info(
                            'No packages have been updated on the remote CKAN instance since the last harvest job'
                        )
                        return None

                except urllib2.HTTPError, e:
                    if e.getcode() == 400:
                        log.info(
                            'CKAN instance %s does not suport revision filtering'
                            % base_url)
                        get_all_packages = True
                    else:
                        self._save_gather_error(
                            'Unable to get content for URL: %s: %s' %
                            (url, str(e)), harvest_job)
                        return None
Пример #33
0
                            'Unable to get content for URL: %s: %s' %
                            (url, str(e)), harvest_job)
                        return None

        if get_all_packages:
            # Request all remote packages
            url = base_rest_url + '/package'
            try:
                content = self._get_content(url)
            except ContentFetchError, e:
                self._save_gather_error(
                    'Unable to get content for URL: %s: %s' % (url, str(e)),
                    harvest_job)
                return None

            package_ids = json.loads(content)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error(
                    'No packages received for URL: %s' % url, harvest_job)
Пример #34
0
    def import_stage(self, harvest_object):
        log.debug('In HRIHarvester import_stage')

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available', group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s',
                                          group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created',
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Find if remote org exists locally, otherwise don't import dataset
            if 'owner_org' not in package_dict:
                package_dict['owner_org'] = None

            remote_org = None
            if package_dict.get('organization'):
                remote_org = package_dict['organization']['name']

            if remote_org:
                try:
                    data_dict = {'id': remote_org}
                    org = get_action('organization_show')(context, data_dict)
                    package_dict['owner_org'] = org['id']
                except NotFound:
                    log.info('No organization exist, not importing dataset')
                    return "unchanged"
            else:
                log.info('No organization in harvested dataset')
                return "unchanged"

                # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})

            def get_extra(key, package_dict):
                for extra in package_dict.get('extras', []):
                    if extra['key'] == key:
                        return extra

            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if 'extras' not in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    existing_extra = get_extra(key, package_dict)
                    if existing_extra and not override_extras:
                        continue  # no need for the default
                    if existing_extra:
                        package_dict['extras'].remove(existing_extra)
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.url.
                            strip('/'),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    package_dict['extras'].append({'key': key, 'value': value})

            for resource in package_dict.get('resources', []):
                # Clear remote url_type for resources (eg datastore, upload) as
                # we are only creating normal resources with links to the
                # remote ones
                resource.pop('url_type', None)

                # Clear revision_id as the revision won't exist on this CKAN
                # and saving it will cause an IntegrityError with the foreign
                # key.
                resource.pop('revision_id', None)

            result = self._create_or_update_package(
                package_dict, harvest_object, package_dict_form='package_show')

            return result
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g.
              create, update or delete a DataNorge package).
              Note: if this stage creates or updates a package, a reference
              to the package should be added to the HarvestObject.
            - setting the HarvestObject.package (if there is one)
            - setting the HarvestObject.current for this harvest:
               - True if successfully created/updated
               - False if successfully deleted
            - setting HarvestObject.current to False for previous harvest
              objects of this harvest source if the action was successful.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - creating the HarvestObject - Package relation (if necessary)
            - returning True if the action was done, "unchanged" if the object
              didn't need harvesting after all or False if there were errors.

        NB You can run this stage repeatedly using 'paster harvest import'.

        :param harvest_object: HarvestObject object
        :returns: True if the action was done, "unchanged" if the object didn't
                  need harvesting after all or False if there were errors.
        '''
        log.debug('In DataNorgeHarvester import_stage')

        # Base context for get_action calls; copied per call because CKAN
        # actions may mutate the context dict they receive.
        base_context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        # Guard clauses: nothing to import without an object and its content.
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)
            # Never re-import another harvest source as a regular dataset.
            if package_dict.get('type', '') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Derive the owning organization name from the remote publisher
            # entry (slugified via _gen_new_name).
            organization_name = package_dict['publisher'].get('name')
            package_dict['owner_org'] = self._gen_new_name(organization_name)

            if not 'tags' in package_dict:
                package_dict['tags'] = []

            # TODO: CKAN tags don't accept commas, while keywords from datanorge
            # do contain them. A solution for this may be to create groups from
            # the keywords, since they're not really seen as 'tags' in
            # datanorge. The tags in datanorge are not accessable via their API.

            default_tags = self.config.get('default_tags', False)
            if default_tags:
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # Sets a description to the dataset. Descriptions arrive as a
            # list of language-tagged values; 'nb' is Norwegian Bokmal.
            # The last 'nb' entry wins if several are present.
            descriptions = package_dict.pop('description')
            notes = None
            for item in descriptions:
                if item.get('language') == 'nb':
                    notes = item.get('value')
            if notes:
                package_dict['notes'] = notes

            if not 'resources' in package_dict:
                package_dict['resources'] = []

            # Map DCAT-style 'distribution' entries onto CKAN resources,
            # preferring the 'nb' description as the resource name.
            distribution = package_dict.get('distribution')
            if distribution:
                for resource in distribution:
                    items = resource.get('description')
                    name = 'Name'
                    if items:
                        for item in items:
                            if item.get('language') == 'nb':
                                name = item.get('value')
                    package_dict['resources'].append({
                        'url':
                        resource.get('accessURL'),
                        'name':
                        name,
                        'format':
                        resource.get('format')
                    })

            # NOTE(review): this package_show call is duplicated by the
            # identical call just below; one of the two could be removed.
            source_dataset = \
                get_action('package_show')(base_context.copy(),
                                           {'id': harvest_object.source.id})

            # Local harvest source organization
            source_dataset = \
                get_action('package_show')(base_context.copy(),
                                           {'id': harvest_object.source.id})
            local_org = source_dataset.get('owner_org')

            create_orgs = self.config.get('create_orgs', True)

            if not create_orgs:
                # Assign dataset to the source
                package_dict['owner_org'] = local_org
            else:
                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict.get('owner_org', None)

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(
                            base_context.copy(), data_dict)
                        # Re-activate a previously deleted organization
                        # instead of failing the import.
                        if org.get('state') == 'deleted':
                            patch_org = {
                                'id': org.get('id'),
                                'state': 'active'
                            }
                            get_action('organization_patch')(
                                base_context.copy(), patch_org)
                        validated_org = org['id']
                    except NotFound, e:  # Python 2 'except X, e' syntax
                        log.info('Organization %s is not available',
                                 remote_org)
                        if create_orgs:
                            try:
                                new_org = {
                                    'name': package_dict.get('owner_org'),
                                    'title': organization_name
                                }

                                # Best-effort: scrape the publisher's logo
                                # from its landing page for the new org.
                                try:
                                    html_source = \
                                    BeautifulSoup(
                                        urllib.urlopen(
                                            package_dict.get('url')
                                        ).read()
                                    )
                                    img_source = \
                                        html_source.body.find(
                                            'div',
                                            attrs={'class': 'logo'}
                                        ).img.get('src')
                                except AttributeError, e:
                                    img_source = None
                                    log.debug('No logo was found for remote '
                                              'org %s.' % remote_org)

                                if img_source:
                                    new_org['image_url'] = img_source

                                org = get_action('organization_create')(
                                    base_context.copy(), new_org)

                                log.info(
                                    'Organization %s has been newly '
                                    'created', remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s' %
                                          remote_org)
        # NOTE(review): snippet is truncated here -- the 'except' clause for
        # the outer 'try' (and the rest of the import) is missing.
Пример #36
0
    def _import_package(self, harvest_object):
        '''Create or update a local CKAN package from a harvested object.

        The package dict is taken from ``harvest_object.content`` (JSON).
        Resources are regenerated from the dataset folder, merged with any
        harvested per-resource metadata, and diffed against the existing
        package. When any resource changed and the source is configured with
        ``update_date_last_modified``, the dataset's ``dateLastUpdated`` is
        patched to today's date.

        :param harvest_object: HarvestObject with the package JSON content.
        :returns: False if the final date patch fails validation; otherwise
                  None (matching the original behaviour).
        '''
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = harvest_object.guid
        package_dict['name'] = munge_title_to_name(package_dict[u'datasetID'])
        context = self._create_new_context()

        # Check if the package already exists locally.
        existing_package = self._get_existing_package(package_dict)

        # Get metadata for resources and apply it to the freshly generated
        # resource list (matched by resource name).
        resource_metadata = package_dict.pop('resource_metadata', {})
        new_resources = self._generate_resources_from_folder(
            package_dict['datasetFolder']
        )
        for resource in new_resources:
            if resource['name'] in resource_metadata:
                resource.update(resource_metadata[resource['name']])

        # Update existing resources, delete old ones, create new ones.
        actions, resources_changed = self._resources_actions(
            existing_package,
            new_resources
        )

        # Keep the locally stored resources; _resources_actions is in charge
        # of applying the computed changes.
        if existing_package and 'resources' in existing_package:
            package_dict['resources'] = existing_package['resources']

        self._find_or_create_organization(package_dict, context.copy())

        # Import the package if it does not yet exist (it's a new package),
        # otherwise update it in place.
        if not existing_package:
            dataset_id = self._create_package(package_dict, harvest_object)
            self._create_notification_for_new_dataset(package_dict)
            log.debug('Dataset `%s` has been added', package_dict['id'])
        else:
            # Don't change the dataset name even if the title has changed.
            package_dict['name'] = existing_package['name']
            package_dict['id'] = existing_package['id']
            dataset_id = self._update_package(package_dict, harvest_object)
            log.debug('Dataset `%s` has been updated', package_dict['id'])

        # Create diffs if there is a previous package.
        if existing_package:
            self._create_diffs(package_dict)

        # Set dateLastUpdated if any resource changed.
        if self.config['update_date_last_modified'] and resources_changed:
            theme_plugin = StadtzhThemePlugin()
            package_schema = theme_plugin.update_package_schema()
            schema_context = self._create_new_context()
            schema_context['ignore_auth'] = True
            schema_context['schema'] = package_schema
            today = datetime.datetime.now().strftime('%d.%m.%Y')
            try:
                get_action('package_patch')(
                    schema_context,
                    {'id': dataset_id, 'dateLastUpdated': today}
                )
            # 'except ... as e' is valid on Python 2.6+ and 3.x; the original
            # 'except X, e' form is a syntax error on Python 3.
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Update validation Error: %s' % str(e.error_summary),
                    harvest_object,
                    'Import'
                )
                return False
            log.info('Updated dateLastUpdated to %s', today)
Пример #37
0
        c.editors = c.group.members_of_type(model.User, 'editor')
        if c.user:
            c.is_sysadmin = Authorizer().is_sysadmin(unicode(c.user))
            c.can_admin = c.is_sysadmin or c.userobj in c.administrators
            c.can_edit = c.can_admin or c.userobj in c.editors

        c.restricted_to_publisher = 'publisher' in request.params
        parent_groups = c.group.get_groups('publisher')
        c.parent_publisher = parent_groups[0] if len(parent_groups) > 0 else None

        c.group_extras = []
        for extra in sorted(c.group_dict.get('extras',[]), key=lambda x:x['key']):
            if extra.get('state') == 'deleted':
                continue
            k, v = extra['key'], extra['value']
            v = json.loads(v)
            c.group_extras.append((k, v))
        c.group_extras = dict(c.group_extras)

        return render('publisher/read.html')


    def report_users_not_assigned_to_groups(self):
        """Report users that are not assigned to any group.

        Only users allowed to create groups may see this page; everyone
        else receives a 401.
        """
        context = {'model': model, 'session': model.Session,
                   'user': c.user or c.author}
        try:
            check_access('group_create', context)
        except NotAuthorized:
            abort(401, _('Not authorized to see this page'))

        # NOTE(review): the SQL below is truncated in this copy of the
        # source; the full query body is missing.
        query = """SELECT * FROM public.user WHERE id NOT IN
Пример #38
0
    def import_stage(self, harvest_object):
        '''Import one harvested remote CKAN package into this instance.

        Applies the source configuration (default tags/groups/extras,
        remote-group handling, read-only permissions) to the package dict
        stored on ``harvest_object`` and then creates or updates the local
        package.

        :param harvest_object: HarvestObject whose ``content`` is the remote
            package serialized as JSON.
        :returns: True on success, False on bad input or when the remote
            dataset is itself a harvest source.
        '''
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Never import another harvest source as a regular dataset.
            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return False

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                # NOTE(review): the 'user' value below looks redacted or
                # mangled in this copy of the source -- verify against the
                # original file.
                context = {
                    'model': model,
                    'session': Session,
                    'user': '******'
                }

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        # API v1 identifies groups by name, later versions
                        # by id.
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:  # Python 2 'except X, e' syntax
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except:
                                log.error('Could not get remote group %s' %
                                          group_name)
                                continue

                            # Strip fields that would fail validation on
                            # group_create.
                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Ignore remote orgs for the time being
            package_dict.pop('owner_org', None)

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
            for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(
                            package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(
                                harvest_source_id=harvest_object.job.source.id,
                                harvest_source_url=harvest_object.job.source.
                                url.strip('/'),
                                harvest_source_title=harvest_object.job.source.
                                title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            # In read-only mode only the harvest user may administer the
            # imported package; every other user can only read it.
            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            return True
        # NOTE(review): truncated here -- the 'except' clause that closes the
        # outer 'try' is missing from this snippet.
Пример #39
0
                            'Unable to get content for URL: %s: %s' %
                            (url, str(e)), harvest_job)
                        return None

        if get_all_packages:
            # Request all remote packages
            url = base_rest_url + '/package'
            try:
                content = self._get_content(url)
            except Exception, e:
                self._save_gather_error(
                    'Unable to get content for URL: %s: %s' % (url, str(e)),
                    harvest_job)
                return None

            package_ids = json.loads(content)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error(
                    'No packages received for URL: %s' % url, harvest_job)
Пример #40
0
    def gather_stage(self, harvest_job):
        '''Gather the ids of remote packages to harvest.

        Queries the remote CKAN REST/search APIs and collects package ids,
        honouring the optional ``organizations_filter_include`` /
        ``organizations_filter_exclude`` source configuration. Returns None
        on fetch/decode errors; the remainder of the function (applying the
        filters and creating HarvestObjects) is truncated in this snippet.
        '''
        # NOTE(review): log.error is used below for purely informational
        # messages; log.debug/log.info would be more appropriate.
        log.error('In SpodHarvester gather_stage (%s)' %
                  harvest_job.source.url)
        get_all_packages = True
        package_ids = []

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        # Get source URL
        base_url = harvest_job.source.url.rstrip('/')
        base_rest_url = base_url + self._get_rest_api_offset()
        base_search_url = base_url + self._get_search_api_offset()

        # Filter in/out datasets from particular organizations
        org_filter_include = self.config.get('organizations_filter_include',
                                             [])
        org_filter_exclude = self.config.get('organizations_filter_exclude',
                                             [])

        def get_pkg_ids_for_organizations(orgs):
            # Page through the search API until all package ids for the
            # given organizations have been collected.
            pkg_ids = set()
            for organization in orgs:
                url = base_search_url + '/dataset?organization=%s' % organization
                content = self._get_content(url)
                content_json = json.loads(content)
                result_count = int(content_json['count'])
                pkg_ids |= set(content_json['results'])
                # NOTE(review): if 'results' ever comes back empty while
                # fewer than result_count ids were gathered, this loop never
                # terminates -- verify against the remote API behaviour.
                while len(
                        pkg_ids) < result_count or not content_json['results']:
                    url = base_search_url + '/dataset?organization=%s&offset=%s' % (
                        organization, len(pkg_ids))
                    content = self._get_content(url)
                    content_json = json.loads(content)
                    pkg_ids |= set(content_json['results'])
            return pkg_ids

        include_pkg_ids = get_pkg_ids_for_organizations(org_filter_include)
        exclude_pkg_ids = get_pkg_ids_for_organizations(org_filter_exclude)

        # NOTE(review): this branch is a no-op -- get_all_packages is already
        # True and is never set to False, so 'force_all' has no effect here.
        if (previous_job and not previous_job.gather_errors
                and not len(previous_job.objects) == 0):
            if not self.config.get('force_all', False):
                get_all_packages = True

        if get_all_packages:
            # Request all remote packages
            log.error("Request all remote packages")
            url = base_rest_url + '/package'
            log.error(url)
            try:
                content = self._get_content(url)
                package_ids = json.loads(content)
            except ContentFetchError, e:  # Python 2 'except X, e' syntax
                log.error("Unable to get content for URL")
                self._save_gather_error(
                    'Unable to get content for URL: %s: %s' % (url, str(e)),
                    harvest_job)
                return None
            except JSONDecodeError, e:
                log.error("Unable to decode content for URL")
                self._save_gather_error(
                    'Unable to decode content for URL: %s: %s' % (url, str(e)),
                    harvest_job)
                return None
        # NOTE(review): snippet appears truncated -- include/exclude ids are
        # computed but never applied, and no HarvestObjects are created
        # before the function ends.
Пример #41
0
    def before_index(self, pkg_dict):
        """Prepare the dataset's spatial extent for search indexing.

        For the 'solr' backend only 5-point bounding-box polygons are
        accepted; their extent and area are written into the index dict.
        For 'solr-spatial-field' a WKT representation is produced instead
        (degenerate bboxes collapse to a POINT, rings are forced
        counter-clockwise, anything else is converted via shapely).
        Returns ``pkg_dict`` unchanged when there is no spatial extra, the
        backend is unsupported, or the geometry is invalid.
        """
        import shapely
        import shapely.geometry

        spatial = pkg_dict.get('extras_spatial', None)
        if not spatial or self.search_backend not in ('solr',
                                                      'solr-spatial-field'):
            return pkg_dict

        try:
            geom = json.loads(spatial)
        except ValueError as e:
            log.error('Geometry not valid GeoJSON, not indexing')
            return pkg_dict

        # A "bbox" here is a single-ring Polygon with exactly 5 points
        # (closed rectangle).
        is_bbox = (geom['type'] == 'Polygon'
                   and len(geom['coordinates']) == 1
                   and len(geom['coordinates'][0]) == 5)

        if self.search_backend == 'solr':
            # Only bbox supported for this backend
            if not is_bbox:
                log.error(
                    'Solr backend only supports bboxes (Polygons with 5 points), ignoring geometry {0}'
                    .format(pkg_dict['extras_spatial']))
                return pkg_dict

            # Opposite corners of the rectangle are points 0 and 2.
            ring = geom['coordinates'][0]
            corner_a, corner_b = ring[0], ring[2]
            pkg_dict['maxy'] = max(corner_b[1], corner_a[1])
            pkg_dict['miny'] = min(corner_b[1], corner_a[1])
            pkg_dict['maxx'] = max(corner_b[0], corner_a[0])
            pkg_dict['minx'] = min(corner_b[0], corner_a[0])
            pkg_dict['bbox_area'] = (pkg_dict['maxx'] - pkg_dict['minx']) * \
                                    (pkg_dict['maxy'] - pkg_dict['miny'])

        elif self.search_backend == 'solr-spatial-field':
            wkt = None

            if is_bbox:
                xs = [point[0] for point in geom['coordinates'][0]]
                ys = [point[1] for point in geom['coordinates'][0]]

                if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
                    # All five points coincide: a degenerate bbox, index it
                    # as a single point.
                    wkt = 'POINT({x} {y})'.format(x=xs[0], y=ys[0])
                else:
                    # Solr requires counter-clockwise rings; reverse the
                    # ring when it is defined clockwise.
                    ring = shapely.geometry.polygon.LinearRing(
                        geom['coordinates'][0])
                    if not ring.is_ccw:
                        ring.coords = list(ring.coords)[::-1]
                    wkt = shapely.geometry.polygon.Polygon(ring).wkt

            if not wkt:
                # Arbitrary geometry: validate via shapely, then take its WKT.
                shape = shapely.geometry.asShape(geom)
                if not shape.is_valid:
                    log.error(
                        'Wrong geometry, not indexing package {0}'.format(
                            pkg_dict.get('name')))

                    return pkg_dict
                wkt = shape.wkt

            pkg_dict['spatial_geom'] = wkt

        return pkg_dict
Пример #42
0
    def import_stage(self, harvest_object):
        '''Filter and fix up a harvested package, delegate its import to the
        parent harvester, then sync relationships and re-upload file
        resources for the imported package.
        '''
        package_dict = json.loads(harvest_object.content)

        # Datasets that should not be imported locally are marked deleted
        # (rather than skipped) so a previous import gets removed.
        if not self._should_import_local(package_dict):
            package_dict['state'] = 'deleted'
        else:
            package_dict = self._apply_package_extras_white_list(package_dict)
            package_dict = self._apply_package_resource_extras_black_list(
                package_dict)
            package_dict = self._fix_date_in_fields(package_dict)
            package_dict = self._set_license(package_dict)

        package_dict = self._pop_black_list_resources_by_type(package_dict)
        harvest_object.content = json.dumps(package_dict)
        # Upload-type resources are removed here and imported separately
        # below, after the parent class has created/updated the package.
        upload_resources = self._pop_upload_resources(package_dict)

        import_stage_result = super(GuiaHarvesterPlugin,
                                    self).import_stage(harvest_object)

        if import_stage_result:
            package_dict = json.loads(harvest_object.content)
            harvested_rels = package_dict.get('relationships', [])
            try:
                this_package = model.Package.get(package_dict['name'])
                if not this_package: raise logic.NotFound()
            except logic.NotFound as nf:
                log.info(
                    'import_stage(): could not find package "{0}"; relationships not updated: {1}'
                    .format(package_dict['name'], nf))
                return import_stage_result

            existing_rels = this_package.get_relationships()
            self._update_relationships(existing_rels, harvested_rels)

            # Re-upload file resources: fetch each remote file and attach it
            # as a FieldStorage object, which resource_create treats as an
            # upload.
            for resource_dict in upload_resources:
                resource_url = resource_dict['url']
                resource_filename = resource_url.split('/')[-1]

                try:
                    response = requests.get(resource_url)
                    resource_file = StringIO(response.content)
                except Exception, e:  # Python 2 'except X, e' syntax
                    self._save_object_error(
                        'Resource not harvested for package "{0}". Unable to fetch resource from "{1}": {2}'
                        .format(package_dict['name'], resource_url,
                                e), harvest_object, 'Import')
                    continue

                cfs = FieldStorage()
                cfs.file = resource_file
                cfs.filename = resource_filename
                resource_dict['upload'] = cfs
                # Server-managed fields must not be sent on create.
                if 'created' in resource_dict: del resource_dict['created']
                if 'last_modified' in resource_dict:
                    del resource_dict['last_modified']
                if 'api' in resource_dict: del resource_dict['api']

                try:
                    the_resource = toolkit.get_action('resource_create')(
                        data_dict=resource_dict)
                except Exception, e:
                    self._save_object_error(
                        'Resource not harvested for package "{0}". Unable to import the resource originally from "{1}": {2}'
                        .format(package_dict['name'], resource_url,
                                e), harvest_object, 'Import')
                    continue
        # NOTE(review): truncated -- the final 'return import_stage_result'
        # (or equivalent) is missing from this snippet.
Пример #43
0
 def loads(self, chars):
     """Parse *chars* as JSON and return the resulting object.

     :param chars: a JSON document as a string.
     :raises Exception: wrapping the original ValueError, with the
         offending payload included in the message for easier debugging.
     """
     try:
         return json.loads(chars)
     except ValueError as inst:
         # 'except ... as' and the call-form raise are valid on both
         # Python 2.6+ and 3.x; the original comma forms are Python-2-only.
         raise Exception("Couldn't loads string '%s': %s" % (chars, inst))
Пример #44
0
    def import_stage(self, harvest_object):
        log.debug('In HRIHarvester import_stage')

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default translations
            lang = ckan_config['ckan.locale_default']

            def translated_field(name):
                translated = package_dict.get('%s_translated' % name, {})
                translated[lang] = translated.get(lang, package_dict[name])
                # Process translations added as extras
                translated.update((e['key'].split('_', 2)[1], e['value'])
                                  for e in package_dict.get('extras', [])
                                  if e['key'].startswith('%s_' % name))
                return translated

            def translated_extra_list(name):
                translated = {lang: []}
                for x in package_dict.get('extras', []):
                    if x['key'] == name and len(x['value']) > 2:
                        translated[lang] = [x['value']]

                package_dict['extras'] = [
                    x for x in package_dict.get('extras', [])
                    if x['key'] != name
                ]
                return translated

            package_dict['title_translated'] = translated_field('title')
            package_dict['notes_translated'] = translated_field('notes')
            package_dict['update_frequency'] = translated_extra_list(
                'update_frequency')

            # Set default values for required fields
            default_values = {
                'maintainer':
                package_dict.get('author') or '(not set)',
                'maintainer_email':
                package_dict.get('author_email') or '(not set)',
            }
            missing_values = ((k, v) for k, v in default_values.iteritems()
                              if not package_dict.get(k))
            package_dict.update(missing_values)

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            keywords = package_dict.get('keywords', {})
            keywords[lang] = keywords.get(
                lang, [x['name'] for x in package_dict['tags']])
            package_dict['keywords'] = keywords

            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available', group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s',
                                          group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created',
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Find if remote org exists locally, otherwise don't import dataset
            if 'owner_org' not in package_dict:
                package_dict['owner_org'] = None

            remote_org = None
            if package_dict.get('organization'):
                remote_org = package_dict['organization']['name']

            if remote_org:
                try:
                    data_dict = {'id': remote_org}
                    org = get_action('organization_show')(context, data_dict)
                    package_dict['owner_org'] = org['id']
                except NotFound:
                    log.info('No organization exist, not importing dataset')
                    return "unchanged"
            else:
                log.info('No organization in harvested dataset')
                return "unchanged"

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

            # Map fields
            fields_to_map = [('url', 'maintainer_website')]
            for key_from, key_to in fields_to_map:
                if key_to not in package_dict and key_from in package_dict:
                    package_dict[key_to] = package_dict[key_from]

            # Rename extras
            extras_to_rename_keys = {
                'geographic_coverage': 'geographical_coverage',
                'temporal_coverage-from': 'valid_from',
                'temporal_coverage-to': 'valid_till',
                'source': 'owner'
            }

            def map_extra(e):
                result = {}
                result.update(e)
                result['key'] = extras_to_rename_keys.get(e['key'], e['key'])
                return result

            package_dict['extras'] = [
                map_extra(extra) for extra in package_dict.get('extras', [])
            ]

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})

            def get_extra(key, package_dict):
                for extra in package_dict.get('extras', []):
                    if extra['key'] == key:
                        return extra

            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if 'extras' not in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    existing_extra = get_extra(key, package_dict)
                    if existing_extra and not override_extras:
                        continue  # no need for the default
                    if existing_extra:
                        package_dict['extras'].remove(existing_extra)
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.url.
                            strip('/'),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    package_dict['extras'].append({'key': key, 'value': value})

            # Convert extras from strings to datetimes
            extras_to_datetimes = ['valid_from', 'valid_till']

            def map_extra_to_date(e):
                if e['key'] not in extras_to_datetimes:
                    return e
                result = {}
                result.update(e)
                result['value'] = self._parse_datetime(e['value'])
                return result

            package_dict['extras'] = [
                map_extra_to_date(extra)
                for extra in package_dict.get('extras', [])
            ]

            # Move extras to fields
            extras_to_fields_keys = [
                'collection_type', 'geographical_coverage', 'valid_from',
                'valid_till', 'owner'
            ]
            extras_to_fields = [
                x for x in package_dict.get('extras', [])
                if x['key'] in extras_to_fields_keys
                and x['key'] not in package_dict
            ]

            for x in extras_to_fields:
                package_dict[x['key']] = x['value']

            package_dict['extras'] = [
                x for x in package_dict.get('extras', [])
                if x['key'] not in extras_to_fields_keys
            ]

            for resource in package_dict.get('resources', []):
                # Clear remote url_type for resources (eg datastore, upload) as
                # we are only creating normal resources with links to the
                # remote ones
                resource.pop('url_type', None)

                # Clear revision_id as the revision won't exist on this CKAN
                # and saving it will cause an IntegrityError with the foreign
                # key.
                resource.pop('revision_id', None)

            # Ensure imported tags are valid
            tag_string_fields = ['geographical_coverage']
            for field in tag_string_fields:
                package_dict[field] = [
                    t for t in self._parse_tag_string(
                        package_dict.get(field, '')) if t
                ]

            # Create or update package
            result = self._create_or_update_package(
                package_dict, harvest_object, package_dict_form='package_show')

            return result
Example #45
0
    def check_spatial_extra(self, package):
        '''
        For a given package, looks at the spatial extent (as given in the
        extra "spatial" in GeoJSON format) and records it in PostGIS.

        Failures (invalid JSON, non-string value, bad geometry) are raised
        as a ``ValidationError`` keyed on ``'spatial'``.  An emptied or
        deleted ``spatial`` extra removes the stored extent instead.
        '''
        from ckanext.spatial.lib import save_package_extent

        if not package.id:
            log.warning(
                'Couldn\'t store spatial extent because no id was provided for the package'
            )
            return

        def raise_spatial_error(message):
            # Every failure mode surfaces the same way: a ValidationError
            # on the 'spatial' key, with a matching error summary.
            error_dict = {'spatial': [message]}
            raise p.toolkit.ValidationError(
                error_dict,
                error_summary=package_error_summary(error_dict))

        # TODO: deleted extra
        for extra in package.extras_list:
            if extra.key != 'spatial':
                continue
            if extra.state == 'active' and extra.value:
                try:
                    log.debug('Received: %r' % extra.value)
                    geometry = json.loads(extra.value)
                except (ValueError, TypeError) as e:
                    # ValueError: malformed JSON string; TypeError: value is
                    # not a string at all.  Both were handled identically.
                    raise_spatial_error(
                        u'Error decoding JSON object: %s' % six.text_type(e))

                try:
                    save_package_extent(package.id, geometry)
                except ValueError as e:
                    raise_spatial_error(
                        u'Error creating geometry: %s' % six.text_type(e))
                except Exception as e:
                    # In DEBUG mode let unexpected errors propagate so they
                    # can be diagnosed; otherwise report them on the form.
                    if bool(os.getenv('DEBUG')):
                        raise
                    raise_spatial_error(u'Error: %s' % six.text_type(e))

            elif (extra.state == 'active'
                  and not extra.value) or extra.state == 'deleted':
                # Delete extent from table
                save_package_extent(package.id, None)

            # Only the first 'spatial' extra is considered.
            break
Example #46
0
 def _get_extent_object(self, geometry):
     """Build a PackageExtent (with dummy package id "xxx") from *geometry*.

     *geometry* may be a GeoJSON string or an already-parsed mapping.
     """
     parsed = json.loads(geometry) if isinstance(geometry, six.string_types) \
         else geometry
     geom = asShape(parsed)
     # 4326 = WGS84, the SRID used for package extents.
     return PackageExtent(package_id="xxx",
                          the_geom=WKTElement(geom.wkt, 4326))
Example #47
0
    def import_stage(self, harvest_object):
        """Import one harvested dataset into the local CKAN instance.

        Parses ``harvest_object.content`` (a JSON package dict), applies the
        source's config (default tags/groups/extras, remote group and
        organization handling), then creates or updates the local package.

        Returns the result of ``_create_or_update_package`` on success,
        ``True`` when the remote dataset is itself a harvest source (and is
        skipped), or ``False``/``None`` on error (errors are recorded via
        ``_save_object_error``).
        """
        log.debug('In CKANHarvester import_stage')

        # Context reused (copied) for every get_action call below.
        base_context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Never import a remote harvest-source dataset as a dataset.
            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # remote_groups: None/other -> drop remote groups entirely;
            # 'only_local' -> keep only groups that exist locally;
            # 'create' -> additionally create missing groups locally.
            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_ in package_dict['groups']:
                    try:
                        try:
                            # Prefer lookup by id, fall back to name below.
                            if 'id' in group_:
                                data_dict = {'id': group_['id']}
                                group = get_action('group_show')(
                                    base_context.copy(), data_dict)
                            else:
                                raise NotFound

                        except NotFound as e:
                            if 'name' in group_:
                                data_dict = {'id': group_['name']}
                                group = get_action('group_show')(
                                    base_context.copy(), data_dict)
                            else:
                                raise NotFound
                        # Found local group
                        validated_groups.append({
                            'id': group['id'],
                            'name': group['name']
                        })

                    except NotFound as e:
                        log.info('Group %s is not available', group_)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s',
                                          group_)
                                continue

                            # Strip fields that would not validate (or make
                            # no sense) when creating the group locally.
                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(base_context.copy(),
                                                       group)
                            log.info('Group %s has been newly created', group_)
                            validated_groups.append({
                                'id': group['id'],
                                'name': group['name']
                            })

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                base_context.copy(), {
                    'id': harvest_object.source.id
                })
            local_org = source_dataset.get('owner_org')

            # remote_orgs mirrors remote_groups: by default datasets are
            # assigned to the harvest source's own organization.
            remote_orgs = self.config.get('remote_orgs', None)

            if remote_orgs not in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                if 'owner_org' not in package_dict:
                    package_dict['owner_org'] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict['owner_org']

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(
                            base_context.copy(), data_dict)
                        validated_org = org['id']
                    except NotFound as e:
                        log.info('Organization %s is not available',
                                 remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes organizations as groups
                                    # this especially targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in [
                                        'packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name', 'type'
                                ]:
                                    org.pop(key, None)
                                get_action('organization_create')(
                                    base_context.copy(), org)
                                log.info(
                                    'Organization %s has been newly created',
                                    remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s',
                                          remote_org)

                # Fall back to the source's org if validation/creation failed.
                package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            # NOTE(review): this reads self.config['default_group_dicts'],
            # which validate_config is expected to have populated — confirm.
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                existing_group_ids = [g['id'] for g in package_dict['groups']]
                package_dict['groups'].extend([
                    g for g in self.config['default_group_dicts']
                    if g['id'] not in existing_group_ids
                ])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})

            def get_extra(key, package_dict):
                # Return the existing extra dict with this key, or None.
                for extra in package_dict.get('extras', []):
                    if extra['key'] == key:
                        return extra

            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if 'extras' not in package_dict:
                    package_dict['extras'] = []
                for key, value in default_extras.items():
                    existing_extra = get_extra(key, package_dict)
                    if existing_extra and not override_extras:
                        continue  # no need for the default
                    if existing_extra:
                        package_dict['extras'].remove(existing_extra)
                    # Look for replacement strings
                    if isinstance(value, six.string_types):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.url.
                            strip('/'),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    package_dict['extras'].append({'key': key, 'value': value})

            for resource in package_dict.get('resources', []):
                # Clear remote url_type for resources (eg datastore, upload) as
                # we are only creating normal resources with links to the
                # remote ones
                resource.pop('url_type', None)

                # Clear revision_id as the revision won't exist on this CKAN
                # and saving it will cause an IntegrityError with the foreign
                # key.
                resource.pop('revision_id', None)

            # Subclass hook: last chance to tweak the dict before saving.
            package_dict = self.modify_package_dict(package_dict,
                                                    harvest_object)

            result = self._create_or_update_package(
                package_dict, harvest_object, package_dict_form='package_show')

            return result
        except ValidationError as e:
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception as e:
            self._save_object_error('%s' % e, harvest_object, 'Import')
Example #48
0
    def fetch_stage(self, harvest_object):
        """Fetch the CSW record for *harvest_object* and store its XML.

        Returns True on success, False on error (recorded via
        ``_save_object_error``), or ``'unchanged'`` when the record is
        filtered out by the ``require_keywords`` / ``require_in_abstract``
        source config options (the object is then flagged for deletion).
        """

        # Check harvest object status
        status = self._get_object_extra(harvest_object, 'status')

        if status == 'delete':
            # No need to fetch anything, just pass to the import stage
            return True

        log = logging.getLogger(__name__ + '.CSW.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url
        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_object_error('Error contacting the CSW server: %s' % e,
                                    harvest_object)
            return False

        identifier = harvest_object.guid
        # NOTE: a previously read-but-unused 'esn' config option was removed
        # here as dead code.
        try:
            record = self.csw.getrecordbyid([identifier],
                                            outputschema=self.output_schema())
        except Exception as e:
            self._save_object_error(
                'Error getting the CSW record with GUID %s' % identifier,
                harvest_object)
            return False

        if record is None:
            self._save_object_error('Empty record for GUID %s' % identifier,
                                    harvest_object)
            return False

        source_config = json.loads(harvest_object.source.config
                                   ) if harvest_object.source.config else {}

        def mark_for_deletion():
            # Flag the object's 'status' extra as 'delete' so the import
            # stage removes it; returns the value fetch_stage should return.
            status_extra = self._get_extra(harvest_object, 'status')
            if status_extra is None:
                self._save_object_error(
                    'No status set for object with GUID %s' % identifier,
                    harvest_object)
                return False
            status_extra.value = 'delete'
            status_extra.save()
            # Should not be processed further
            return 'unchanged'

        require_keywords = source_config.get('require_keywords', None)
        if require_keywords:
            # Keep the record only if it carries ALL required keywords.
            record_keywords = set()
            for keyword_container in record.get('identification',
                                                {}).get('keywords', []):
                keywords = keyword_container.get('keywords', None)
                if keywords and isinstance(keywords, list):
                    record_keywords.update(keywords)

            if not set(require_keywords).issubset(record_keywords):
                return mark_for_deletion()
            else:
                log.info("Found tagged record with guid %s" % identifier)

        require_in_abstract = source_config.get('require_in_abstract', None)
        if require_in_abstract:
            # Keep the record only if its abstract contains the marker text.
            if not record.get('identification', {}).get('abstract', '') or\
                    require_in_abstract not in record.get('identification', {}).get('abstract', ""):
                return mark_for_deletion()
            else:
                log.info("Found tagged record with guid %s" % identifier)

        try:
            # Save the fetch contents in the HarvestObject
            # Contents come from csw_client already declared and encoded as utf-8
            # Remove original XML declaration
            content = re.sub(r'<\?xml(.*)\?>', '', record['xml'])

            harvest_object.content = content.strip()
            harvest_object.save()
        except Exception as e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' % \
                                    (identifier, e), harvest_object)
            return False

        log.debug('XML content saved (len %s)', len(record['xml']))
        return True
Example #49
0
    def import_stage(self, harvest_object):
        """Import one harvested dataset into the local CKAN instance.

        Legacy Python 2 variant (uses ``except NotFound, e`` syntax).

        NOTE(review): this snippet is truncated — the ``try`` opened below
        never reaches its closing ``except`` within this excerpt, so the
        error handling tail is missing here.
        """
        log.debug('In CKANHarvester import_stage')

        # Context reused (copied) for every get_action call below.
        base_context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Never import a remote harvest-source dataset as a dataset.
            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # remote_groups: anything other than 'only_local'/'create'
            # means remote groups are dropped entirely.
            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_ in package_dict['groups']:
                    try:
                        try:
                            # Prefer lookup by id, fall back to name below.
                            if 'id' in group_:
                                data_dict = {'id': group_['id']}
                                group = get_action('group_show')(
                                    base_context.copy(), data_dict)
                            else:
                                raise NotFound

                        except NotFound, e:
                            if 'name' in group_:
                                data_dict = {'id': group_['name']}
                                group = get_action('group_show')(
                                    base_context.copy(), data_dict)
                            else:
                                raise NotFound
                        # Found local group
                        validated_groups.append({
                            'id': group['id'],
                            'name': group['name']
                        })

                    except NotFound, e:
                        log.info('Group %s is not available', group_)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s',
                                          group_)
                                continue

                            # Strip fields that would not validate (or make
                            # no sense) when creating the group locally.
                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(base_context.copy(),
                                                       group)
                            log.info('Group %s has been newly created', group_)
                            validated_groups.append({
                                'id': group['id'],
                                'name': group['name']
                            })
Example #50
0
    def validate_config(self, config):
        """Validate a harvest source configuration string.

        ``config`` is a JSON string; empty/None configs are returned as-is.
        Raises ValueError (with a descriptive message) for any malformed
        option.  Returns the config string, re-serialised when validation
        adds derived data (``default_group_dicts``).

        Note: the original wrapped everything in
        ``try: ... except ValueError as e: raise e`` — a no-op re-raise —
        which has been removed; all ValueErrors still propagate unchanged.
        """
        if not config:
            return config

        config_obj = json.loads(config)

        if 'api_version' in config_obj:
            try:
                int(config_obj['api_version'])
            except ValueError:
                raise ValueError('api_version must be an integer')

        if 'default_tags' in config_obj:
            if not isinstance(config_obj['default_tags'], list):
                raise ValueError('default_tags must be a list')
            if config_obj['default_tags'] and \
                    not isinstance(config_obj['default_tags'][0], dict):
                raise ValueError('default_tags must be a list of '
                                 'dictionaries')

        if 'default_groups' in config_obj:
            if not isinstance(config_obj['default_groups'], list):
                raise ValueError('default_groups must be a *list* of group'
                                 ' names/ids')
            if config_obj['default_groups'] and \
                    not isinstance(config_obj['default_groups'][0],
                                   six.string_types):
                raise ValueError('default_groups must be a list of group '
                                 'names/ids (i.e. strings)')

            # Check if default groups exist
            context = {'model': model, 'user': toolkit.c.user}
            config_obj['default_group_dicts'] = []
            for group_name_or_id in config_obj['default_groups']:
                try:
                    group = get_action('group_show')(context, {
                        'id': group_name_or_id
                    })
                    # save the dict to the config object, as we'll need it
                    # in the import_stage of every dataset
                    config_obj['default_group_dicts'].append(group)
                except NotFound:
                    raise ValueError('Default group not found')
            config = json.dumps(config_obj)

        if 'default_extras' in config_obj:
            if not isinstance(config_obj['default_extras'], dict):
                raise ValueError('default_extras must be a dictionary')

        # The include/exclude filter pairs are mutually exclusive.
        if 'organizations_filter_include' in config_obj \
                and 'organizations_filter_exclude' in config_obj:
            raise ValueError(
                'Harvest configuration cannot contain both '
                'organizations_filter_include and organizations_filter_exclude'
            )

        if 'groups_filter_include' in config_obj \
                and 'groups_filter_exclude' in config_obj:
            raise ValueError(
                'Harvest configuration cannot contain both '
                'groups_filter_include and groups_filter_exclude')

        if 'user' in config_obj:
            # Check if user exists
            context = {'model': model, 'user': toolkit.c.user}
            try:
                get_action('user_show')(context, {
                    'id': config_obj.get('user')
                })
            except NotFound:
                raise ValueError('User not found')

        for key in ('read_only', 'force_all'):
            if key in config_obj:
                if not isinstance(config_obj[key], bool):
                    raise ValueError('%s must be boolean' % key)

        return config
    def import_stage(self, harvest_object):
        """Import a harvested dataset into this CKAN instance.

        Decodes the remote package JSON stored on ``harvest_object.content``,
        applies the harvest-source configuration (default tags and groups,
        remote group/organization handling) and then creates or updates the
        local package.

        :param harvest_object: HarvestObject whose ``content`` holds the
            remote package dict serialized as JSON.
        :returns: ``True`` on success or on a deliberate skip (remote dataset
            is itself a harvest source); ``False`` when the object is missing
            or has empty content. On import errors the problem is recorded
            via ``_save_object_error`` and the method falls through
            (implicitly returning ``None``).
        """
        log.debug("In CKANHarvester import_stage")

        context = {"model": model, "session": Session, "user": self._get_user_name()}
        if not harvest_object:
            log.error("No harvest object received")
            return False

        if harvest_object.content is None:
            self._save_object_error(
                "Empty content for object %s" % harvest_object.id,
                harvest_object,
                "Import",
            )
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get("type") == "harvest":
                # Harvest sources themselves must not be re-harvested.
                log.warning("Remote dataset is a harvest source, ignoring...")
                return True

            # Set default tags if needed
            default_tags = self.config.get("default_tags", [])
            if default_tags:
                if "tags" not in package_dict:
                    package_dict["tags"] = []
                package_dict["tags"].extend(
                    [t for t in default_tags if t not in package_dict["tags"]]
                )

            remote_groups = self.config.get("remote_groups", None)
            if remote_groups not in ("only_local", "create"):
                # Ignore remote groups
                package_dict.pop("groups", None)
            else:
                if "groups" not in package_dict:
                    package_dict["groups"] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict["groups"]:
                    try:
                        data_dict = {"id": group_name}
                        group = get_action("group_show")(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])
                    except NotFound:
                        log.info("Group %s is not available" % group_name)
                        if remote_groups == "create":
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name
                                )
                            except RemoteResourceError:
                                log.error("Could not get remote group %s" % group_name)
                                continue

                            # Strip fields that group_create would reject.
                            for key in [
                                "packages",
                                "created",
                                "users",
                                "groups",
                                "tags",
                                "extras",
                                "display_name",
                            ]:
                                group.pop(key, None)

                            get_action("group_create")(context, group)
                            log.info("Group %s has been newly created" % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group["name"])
                            else:
                                validated_groups.append(group["id"])

                package_dict["groups"] = validated_groups

            # Local harvest source organization
            source_dataset = get_action("package_show")(
                context, {"id": harvest_object.source.id}
            )
            local_org = source_dataset.get("owner_org")

            remote_orgs = self.config.get("remote_orgs", None)

            if remote_orgs not in ("only_local", "create"):
                # Assign dataset to the source organization
                package_dict["owner_org"] = local_org
            else:
                if "owner_org" not in package_dict:
                    package_dict["owner_org"] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict["owner_org"]

                if remote_org:
                    try:
                        data_dict = {"id": remote_org}
                        org = get_action("organization_show")(context, data_dict)
                        validated_org = org["id"]
                    except NotFound:
                        log.info("Organization %s is not available" % remote_org)
                        if remote_orgs == "create":
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org
                                    )
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes organizations as groups
                                    # this especially targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org
                                    )

                                # Strip fields that organization_create would reject.
                                for key in [
                                    "packages",
                                    "created",
                                    "users",
                                    "groups",
                                    "tags",
                                    "extras",
                                    "display_name",
                                    "type",
                                ]:
                                    org.pop(key, None)
                                get_action("organization_create")(context, org)
                                log.info(
                                    "Organization %s has been newly created"
                                    % remote_org
                                )
                                validated_org = org["id"]
                            except (RemoteResourceError, ValidationError):
                                log.error("Could not get remote org %s" % remote_org)

                package_dict["owner_org"] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get("default_groups", [])
            if default_groups:
                if "groups" not in package_dict:
                    package_dict["groups"] = []
                package_dict["groups"].extend(
                    [g for g in default_groups if g not in package_dict["groups"]]
                )

            # FIXME: enable only if not using ckanext-scheming dataset schemas
            # handle extras in harvested schema
            #
            # (Disabled legacy code, kept for reference:)
            #
            # # Find any extras whose values are not strings and try to convert
            # # them to strings, as non-string extras are not allowed anymore in
            # # CKAN 2.0.
            # for key in package_dict['extras'].keys():
            #     if not isinstance(package_dict['extras'][key], basestring):
            #         try:
            #             package_dict['extras'][key] = json.dumps(
            #                     package_dict['extras'][key])
            #         except TypeError:
            #             # If converting to a string fails, just delete it.
            #             del package_dict['extras'][key]
            #
            # # Set default extras if needed
            # default_extras = self.config.get('default_extras',{})
            # if default_extras:
            #     override_extras = self.config.get('override_extras',False)
            #     if not 'extras' in package_dict:
            #         package_dict['extras'] = {}
            #     for key,value in default_extras.iteritems():
            #         if not key in package_dict['extras'] or override_extras:
            #             # Look for replacement strings
            #             if isinstance(value,basestring):
            #                 value = value.format(harvest_source_id=harvest_object.job.source.id,
            #                          harvest_source_url=harvest_object.job.source.url.strip('/'),
            #                          harvest_source_title=harvest_object.job.source.title,
            #                          harvest_job_id=harvest_object.job.id,
            #                          harvest_object_id=harvest_object.id,
            #                          dataset_id=package_dict['id'])
            #
            #             package_dict['extras'][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get("resources", []):
                resource.pop("url_type", None)

            result = self._create_or_update_package(package_dict, harvest_object)

            if result and self.config.get("read_only", False):

                package = model.Package.get(package_dict["id"])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin; PackageRole registers itself
                # with the model session on construction.
                user_name = self.config.get("user", "harvest")
                user = model.User.get(user_name)
                model.PackageRole(
                    package=package, user=user, role=model.Role.ADMIN
                )

                # Other users can only read
                for user_name in ("visitor", "logged_in"):
                    user = model.User.get(user_name)
                    model.PackageRole(
                        package=package, user=user, role=model.Role.READER
                    )

            return True
        except ValidationError as e:
            self._save_object_error(
                "Invalid package with GUID %s: %r"
                % (harvest_object.guid, e.error_dict),
                harvest_object,
                "Import",
            )
        except Exception as e:
            self._save_object_error("%r" % e, harvest_object, "Import")
Пример #52
0
 def _get_search_params(cls, request_params):
     if request_params.has_key('qjson'):
         try:
             params = json.loads(request_params['qjson'], encoding='utf8')
         except ValueError, e:
             raise ValueError, gettext('Malformed qjson value') + ': %r' % e
Пример #53
0
    def import_stage(self, harvest_object):
        '''The import_stage contains lots of boiler plate, updating the
        harvest_objects correctly etc, so inherit this method and customize the
        get_package_dict method.

        * HOExtra.status should have been set to 'new_or_changed' or 'deleted'
          in the gather or fetch stages.
        * It follows that checking that the metadata date has changed should
          have been done in the gather or fetch stages
        * harvest_object.source.config can control default additions to the
          package, for extras etc
        '''

        log.debug('Import stage for harvest object: %s', harvest_object.id)

        if not harvest_object:
            # something has gone wrong with the code
            log.error('No harvest object received')
            self._save_object_error('System error')
            return False
        if harvest_object.content is None:
            # fetched object is blank - error with the harvested server
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        source_config = json.loads(harvest_object.source.config or '{}')

        def get_extra(extras, key):
            for extra in extras:
                if extra.key == key:
                    return extra.value
            return 'new'

        status = get_extra(harvest_object.extras, 'status')
        if not status in ['new', 'changed', 'new_or_changed', 'deleted']:
            log.error('Status is not set correctly: %r', status)
            self._save_object_error('System error', harvest_object, 'Import')
            return False

        # Get the last harvested object (if any)
        previous_object = \
            model.Session.query(HarvestObject) \
                 .filter(HarvestObject.guid == harvest_object.guid) \
                 .filter(HarvestObject.current == True) \
                 .first()

        # Fix the obscure cases where the last harvested object is disconnected
        # from its package
        # i.e. harvest_object where current = true and package_id is null
        if previous_object and not previous_object.package_id:
            pkg = model.Session.query(model.Package) \
                .filter_by(state='active') \
                .join(model.PackageExtra) \
                .filter_by(state='active') \
                .filter_by(key='guid') \
                .filter_by(value=harvest_object.guid) \
                .first()
            if pkg:
                previous_object.package_id = pkg.id
                log.info(
                    'Previous harvest object %s had no package_id - '
                    'have fixed with package: %s', previous_object.id,
                    pkg.name)
            else:
                log.warning(
                    'Previous harvest object %s has no package_id - '
                    'could not fix by finding GUID %r', previous_object.id,
                    harvest_object.guid)

        user = self._get_user_name()

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'extras_as_string': True
        }

        if status == 'delete':
            # Delete package
            tk.get_action('package_delete')(context.copy(), {
                'id': harvest_object.package_id
            })
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))
            previous_object.save()
            self._transfer_current(previous_object, harvest_object)
            return True

        # Set defaults for the package_dict, mainly from the source_config
        package_dict_defaults = PackageDictDefaults()
        package_id = previous_object.package_id if previous_object else None
        package_dict_defaults['id'] = package_id or unicode(uuid.uuid4())
        existing_dataset = model.Package.get(package_id)

        if existing_dataset:
            package_dict_defaults['name'] = existing_dataset.name
        if existing_dataset and existing_dataset.owner_org:
            package_dict_defaults['owner_org'] = existing_dataset.owner_org
        else:
            source_dataset = tk.get_action('package_show')(
                context.copy(), {
                    'id': harvest_object.source.id
                })
            package_dict_defaults['owner_org'] = source_dataset.get(
                'owner_org')

        package_dict_defaults['tags'] = source_config.get('default_tags', [])
        package_dict_defaults['groups'] = source_config.get(
            'default_groups', [])
        package_dict_defaults['extras'] = {
            'import_source':
            'harvest',  # to identify all harvested datasets
            'harvest_object_id':
            harvest_object.id,
            'guid':
            harvest_object.guid,
            'metadata-date':
            harvest_object.metadata_modified_date.strftime('%Y-%m-%d')
            if harvest_object.metadata_modified_date else None,
            # Add provenance for this harvest, so at least that info is saved
            # even if the harvester doesn't fill it in properly with get_provenance().
            'metadata_provenance':
            self.get_metadata_provenance(harvest_object,
                                         harvested_provenance=None),
        }
        default_extras = source_config.get('default_extras', {})
        if default_extras:
            env = dict(
                harvest_source_id=harvest_object.job.source.id,
                harvest_source_url=harvest_object.job.source.url.strip('/'),
                harvest_source_title=harvest_object.job.source.title,
                harvest_job_id=harvest_object.job.id,
                harvest_object_id=harvest_object.id,
                dataset_id=package_dict_defaults['id'])
            for key, value in default_extras.iteritems():
                # Look for replacement strings
                if isinstance(value, basestring):
                    value = value.format(env)
                package_dict_defaults['extras'][key] = value
        if existing_dataset:
            extras_kept = set(
                pylons.config.get('ckan.harvest.extras_not_overwritten',
                                  '').split(' '))
            for extra_key in extras_kept:
                if extra_key in existing_dataset.extras:
                    package_dict_defaults['extras'][extra_key] = \
                        existing_dataset.extras.get(extra_key)

        if status in ('new', 'changed', 'new_or_changed'):
            # There are 2 circumstances that the status is wrong:
            # 1. we are using 'paster import' to reimport this object, yet
            # status is still 'new' from the previous harvest, yet it needs to
            # be 'changed' so that it does a package_update().
            # 2. the first harvest excepted, so status is 'new' because the
            # harvest_object is there, but no package was created.
            # Simplest solution is to set it according to whether there is an
            # existing dataset.
            status = 'changed' if existing_dataset else 'new'
            # FIXME URGENTLY
            # harvest_object.extras
            # harvest_object.set_extra('status', status)
            harvest_object.save()

        try:
            package_dict = self.get_package_dict(harvest_object,
                                                 package_dict_defaults,
                                                 source_config,
                                                 existing_dataset)
        except PackageDictError, e:
            log.error('Harvest PackageDictError in get_package_dict %s %r', e,
                      harvest_object)
            self._save_object_error('Error converting to dataset: %s' % e,
                                    harvest_object, 'Import')
            return False
Пример #54
0
            query['tie'] = '0.1'
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = '2<-1 5<80%'
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException, e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
        try:
            data = json.loads(solr_response)
            response = data['response']
            self.count = response.get('numFound', 0)
            self.results = response.get('docs', [])

            # #1683 Filter out the last row that is sometimes out of order
            self.results = self.results[:rows_to_return]

            # get any extras and add to 'extras' dict
            for result in self.results:
                extra_keys = filter(lambda x: x.startswith('extras_'),
                                    result.keys())
                extras = {}
                for extra_key in extra_keys:
                    value = result.pop(extra_key)
                    extras[extra_key[len('extras_'):]] = value
Пример #55
0
     try:
         if request.method in ['POST', 'PUT']:
             request_data = request.body
         else:
             request_data = None
     except Exception, inst:
         msg = "Could not extract request body data: %s" % \
               (inst)
         raise ValueError(msg)
     cls.log.debug('Retrieved request body: %r' % request.body)
     if not request_data:
         msg = "No request body data"
         raise ValueError(msg)
 if request_data:
     try:
         request_data = json.loads(request_data, encoding='utf8')
     except ValueError, e:
         raise ValueError('Error decoding JSON data. '
                          'Error: %r '
                          'JSON data extracted from the request: %r' %
                          (e, request_data))
     if not isinstance(request_data, dict):
         raise ValueError('Request data JSON decoded to %r but '
                          'it needs to be a dictionary.' % request_data)
     # ensure unicode values
     for key, val in request_data.items():
         # if val is str then assume it is ascii, since json converts
         # utf8 encoded JSON to unicode
         request_data[key] = cls._make_unicode(val)
 cls.log.debug('Request data extracted: %r' % request_data)
 return request_data
Пример #56
0
        else:
            return self._finish_not_found(
                gettext('Unknown register: %s') % register)

    @classmethod
    def _get_search_params(cls, request_params):
        # Extract search parameters from an API request (legacy Python 2 /
        # Pylons code).  Three request forms are accepted:
        #   1. an explicit 'qjson' parameter whose value is a JSON-encoded
        #      dictionary,
        #   2. a single parameter whose *key* is the JSON payload, e.g.
        #      {some-json}='1' or {some-json}='' (sent by some clients),
        #   3. otherwise the raw request params are used as-is.
        # Raises ValueError when the JSON is malformed or when the result is
        # not dictionary-like.
        if request_params.has_key('qjson'):
            try:
                params = json.loads(request_params['qjson'], encoding='utf8')
            except ValueError, e:
                raise ValueError, gettext('Malformed qjson value') + ': %r' % e
        elif len(request_params) == 1 and \
                 len(request_params.values()[0]) < 2 and \
                 request_params.keys()[0].startswith('{'):
            # e.g. {some-json}='1' or {some-json}=''
            params = json.loads(request_params.keys()[0], encoding='utf8')
        else:
            params = request_params
        # Whatever branch ran, the result must behave like a dict
        # (UnicodeMultiDict is Pylons' request-parameter mapping).
        if not isinstance(params, (UnicodeMultiDict, dict)):
            raise ValueError, _(
                'Request params must be in form of a json encoded dictionary.')
        return params

    def markdown(self, ver=None):
        """Render the 'q' request parameter from Markdown to HTML and
        return it as the API response body."""
        source_text = request.params.get('q', '')
        rendered = ckan.misc.MarkdownFormat().to_html(source_text)
        return self._finish_ok(rendered)

    def tag_counts(self, ver=None):
        c.q = request.params.get('q', '')
Пример #57
0
    def import_stage(self, harvest_object):
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        try:
            row = json.loads(harvest_object.content)

            def csplit(txt):
                return [t.strip() for t in txt.split(",")]

            package_dict = {
                'title': row['TITLE'],
                'url': row['URL'],
                'notes': row['LONGDESC'],
                'author': row['AUTHOR_NAME'],
                'maintainer': row['MAINTAINER'],
                'maintainer_email': row['MAINTAINER_EMAIL'],
                'tags': csplit(row['TAGS']),
                'license_id': 'ukcrown',
                'extras': {
                    'date_released': row['RELEASE_DATE'],
                    'categories': csplit(row['CATEGORIES']),
                    'geographical_granularity': row['GEOGRAPHY'],
                    'geographical_coverage': row['EXTENT'],
                    'temporal_granularity': row['UPDATE_FREQUENCY'],
                    'temporal_coverage': row['DATE_RANGE'],
                    'license_summary': row['LICENSE_SUMMARY'],
                    'license_details': row['license_details'],
                    'spatial_reference_system': row['spatial_ref'],
                    'harvest_dataset_url': row['DATASTORE_URL'],
                    # Common extras
                    'harvest_catalogue_name': 'London Datastore',
                    'harvest_catalogue_url': 'http://data.london.gov.uk',
                    'eu_country': 'UK',
                    'eu_nuts1': 'UKI'
                },
                'resources': []
            }

            def pkg_format(prefix, mime_type):
                if row.get(prefix + "_URL"):
                    package_dict['resources'].append({
                        'url':
                        row.get(prefix + "_URL"),
                        'format':
                        mime_type,
                        'description':
                        "%s version" % prefix.lower()
                    })

            pkg_format('EXCEL', 'application/vnd.ms-excel')
            pkg_format('CSV', 'text/csv')
            pkg_format('TAB', 'text/tsv')
            pkg_format('XML', 'text/xml')
            pkg_format('GOOGLEDOCS', 'api/vnd.google-spreadsheet')
            pkg_format('JSON', 'application/json')
            pkg_format('SHP', 'application/octet-stream+esri')
            pkg_format('KML', 'application/vnd.google-earth.kml+xml')
        except Exception, e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')
Пример #58
0
    def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None):
        '''Does a dataset search on a remote CKAN and returns the results.
        Deals with paging to return all the results, not just the first page.
        '''
        base_search_url = remote_ckan_base_url + self._get_search_api_offset()
        params = {'rows': '100', 'start': '0'}
        # There is the worry that datasets will be changed whilst we are paging
        # through them.
        # * In SOLR 4.7 there is a cursor, but not using that yet
        #   because few CKANs are running that version yet.
        # * However we sort, then new names added or removed before the current
        #   page would cause existing names on the next page to be missed or
        #   double counted.
        # * Another approach might be to sort by metadata_modified and always
        #   ask for changes since (and including) the date of the last item of
        #   the day before. However if the entire page is of the exact same
        #   time, then you end up in an infinite loop asking for the same page.
        # * We choose a balanced approach of sorting by ID, which means
        #   datasets are only missed if some are removed, which is far less
        #   likely than any being added. If some are missed then it is assumed
        #   they will harvested the next time anyway. When datasets are added,
        #   we are at risk of seeing datasets twice in the paging, so we detect
        #   and remove any duplicates.
        params['sort'] = 'id asc'
        if fq_terms:
            params['fq'] = ' '.join(fq_terms)

        pkg_dicts = []
        pkg_ids = set()
        previous_content = None
        while True:
            url = base_search_url + '?' + urllib.urlencode(params)
            log.debug('Searching for CKAN datasets: %s', url)
            try:
                content = self._get_content(url)
            except ContentFetchError, e:
                raise SearchError('Error sending request to search remote '
                                  'CKAN instance %s using URL %r. Error: %s' %
                                  (remote_ckan_base_url, url, e))

            if previous_content and content == previous_content:
                raise SearchError('The paging doesn\'t seem to work. URL: %s' %
                                  url)
            try:
                response_dict = json.loads(content)
            except ValueError:
                raise SearchError(
                    'Response from remote CKAN was not JSON: %r' % content)
            try:
                pkg_dicts_page = response_dict.get('result',
                                                   {}).get('results', [])
            except ValueError:
                raise SearchError('Response JSON did not contain '
                                  'result/results: %r' % response_dict)

            # Weed out any datasets found on previous pages (should datasets be
            # changing while we page)
            ids_in_page = set(p['id'] for p in pkg_dicts_page)
            duplicate_ids = ids_in_page & pkg_ids
            if duplicate_ids:
                pkg_dicts_page = [
                    p for p in pkg_dicts_page if p['id'] not in duplicate_ids
                ]
            pkg_ids |= ids_in_page

            pkg_dicts.extend(pkg_dicts_page)

            if len(pkg_dicts_page) == 0:
                break

            params['start'] = str(int(params['start']) + int(params['rows']))
Пример #59
0
class PackageSearchQuery(SearchQuery):
    def get_all_entity_ids(self, max_results=1000):
        """Fetch the IDs of every active indexed package on this site,
        up to ``max_results`` entries.
        """
        fq = '+site_id:"%s" ' % config.get('ckan.site_id')
        fq += '+state:active '

        conn = make_connection()
        try:
            response = conn.query("*:*", fq=fq, rows=max_results, fields='id')
        finally:
            conn.close()

        return [doc.get('id') for doc in response.results]

    def run(self, query):
        '''
        Performs a dataset search using the given query.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count
        
        May raise SearchQueryError or SearchError.
        '''
        from solr import SolrException
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        query['rows'] = min(1000, int(query.get('rows', 10)))

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None: 
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if not '+site_id:' in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status       
        if not '+state:' in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')
        
        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # query field weighting: disabled for now as solr 3.* is required for 
        # the 'edismax' query parser, our current Ubuntu version only has
        # packages for 1.4
        #
        # query['defType'] = 'edismax'
        # query['tie'] = '0.5'
        # query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        
        try:
            solr_response = conn.raw_query(**query)
        except SolrException, e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
        try:
            data = json.loads(solr_response)
            response = data['response']
            self.count = response.get('numFound', 0)
            self.results = response.get('docs', [])

            # get any extras and add to 'extras' dict
            for result in self.results:
                extra_keys = filter(lambda x: x.startswith('extras_'), result.keys())
                extras = {}
                for extra_key in extra_keys:
                    value = result.pop(extra_key)
                    extras[extra_key[len('extras_'):]] = value
                if extra_keys:
                    result['extras'] = extras

            # if just fetching the id or name, return a list instead of a dict
            if query.get('fl') in ['id', 'name']:
                self.results = [r.get(query.get('fl')) for r in self.results]

            # get facets and convert facets list to a dict
            self.facets = data.get('facet_counts', {}).get('facet_fields', {})
            for field, values in self.facets.iteritems():
                self.facets[field] = dict(zip(values[0::2], values[1::2]))
        except Exception, e:
            log.exception(e)
            raise SearchError(e)
Пример #60
0
def _getjson(self):
    # Parse this response's body (a JSON string) into Python objects.
    return json.loads(self.body)