def get_user_info(self, user_id):
    """ GET to the CKAN API "user_show" action to read one user
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show

        Params:
         - user_id: value sent as the "id" query-string argument

        Returns the decoded JSON response. A response whose "success"
        value is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the HTTP status code is 400 or higher
         - the original decoding exception if the body is not valid JSON
        """
    url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
    headers = self.get_request_headers(include_api_key=True)
    # NOTE(review): headers are requested with include_api_key=True, so this
    # line presumably writes the API key to the log - confirm and redact
    logger.info(f'GET {url} headers:{headers}')

    try:
        req = requests.get(url, headers=headers)
    except Exception as e:
        error = 'ERROR getting users information: {} [{}]'.format(url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400:
        error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from users information {} [{}]'.format(
            content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def show_package(self, ckan_package_id_or_name):
    """ GET to CKAN API to show a package/dataset

        Params:
         - ckan_package_id_or_name: value sent as the "id" argument

        Returns the decoded JSON response. A response whose "success"
        value is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the HTTP status code is 400 or higher
         - the original decoding exception if the body is not valid JSON
        """
    url = '{}{}'.format(self.base_url, self.package_show_url)
    headers = self.get_request_headers(include_api_key=True)
    data = {'id': ckan_package_id_or_name}
    logger.info(f'GET {url} headers:{headers} data:{data}')

    try:
        req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400:
        error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def validate(self, validator_schema):
    """ Validate the data.json source

        We need to know which validator to use and two jsonschema
        definition files at ./validation/schemas/{validator_schema}
            /catalog.json: definition for full data.json
            /dataset.json: definition for each dataset
        Only catalog.json is used here.

        Params:
         - validator_schema: one of the keys of VALID_DATAJSON_SCHEMAS

        Returns True if the data is valid (or if there is no catalog.json
        file for this schema) and False otherwise. Every failure is
        appended to self.errors.

        Raises Exception if validator_schema is unknown.
        """
    if validator_schema not in VALID_DATAJSON_SCHEMAS:
        raise Exception(f'Unknown validator_schema {validator_schema}')
    self.schema_version = VALID_DATAJSON_SCHEMAS[validator_schema]

    # Raw data is only available on real harvests; tests may set
    # self.data_json directly from a dict, so parsing (and its
    # encoding errors) is only checked when raw data exists
    if self.raw_data_json is not None:
        try:
            self.data_json = json.loads(self.raw_data_json)
        except Exception as e:
            error = 'ERROR parsing JSON: {}. Data: {}'.format(
                e, self.raw_data_json)
            self.errors.append(error)
            logger.error(error)
            return False

    error = None
    if self.data_json is None:
        error = 'No data json available'
    elif isinstance(self.data_json, list):
        error = 'Data.json is a simple list. We expect a dict'

    if error is not None:
        self.errors.append(error)
        logger.error(error)
        return False

    # validate with json schema
    schemas_folder = os.path.join(os.path.dirname(__file__), 'validation',
                                  'schemas', validator_schema)
    catalog_schema = os.path.join(schemas_folder, 'catalog.json')
    if os.path.isfile(catalog_schema):
        # context manager: the schema file was never closed before
        with open(catalog_schema, 'r') as f:
            schema = json.load(f)
        try:
            jss.validate(instance=self.data_json, schema=schema)
        except Exception as e:
            error = "Error validating catalog: {} with schema {}".format(
                e, schema)
            self.errors.append(error)
            # logged like every other failure of this method
            logger.error(error)
            return False

    return True
def fetch(self, clean_url=True, timeout=120):
    """ Connect to the CSW source and read its information.

        The value of self.get_cleaned_url() is used as URL unless
        clean_url is false; in that case self.url is used.
        A connection failure is appended to self.errors, logged and
        re-raised. """
    if clean_url:
        csw_url = self.get_cleaned_url()
    else:
        csw_url = self.url

    try:
        self.csw = CatalogueServiceWeb(csw_url, timeout=timeout)
    except Exception as exc:
        message = f'Error connection CSW: {exc}'
        self.errors.append(message)
        logger.error(message)
        raise

    self.read_csw_info()
def clean_tags(tags):
    """ Clean a list of tag names.

        For each tag: remove every character that is not a letter, a
        digit, whitespace or one of "_-!?", strip it, cut it to
        settings.MAX_TAG_NAME_LENGTH, pad it with "_" up to
        settings.MIN_TAG_NAME_LENGTH, lowercase it and replace spaces
        with "-".
        Tags that are empty after the cleaning are dropped.

        Returns a new list with the cleaned tags. """
    ret = []
    pattern = re.compile(r'[^A-Za-z0-9\s_\-!?]+')
    for tag in tags:
        tag = pattern.sub('', tag).strip()
        if tag == '':
            # drop it here: the padding below would turn an empty tag
            # into a tag made only of underscores
            continue

        if len(tag) > settings.MAX_TAG_NAME_LENGTH:
            logger.error('tag is long, cutting: {}'.format(tag))
            tag = tag[:settings.MAX_TAG_NAME_LENGTH]
        elif len(tag) < settings.MIN_TAG_NAME_LENGTH:
            logger.error('tag is short: {}'.format(tag))
            tag += '_' * (settings.MIN_TAG_NAME_LENGTH - len(tag))

        if tag != '':
            ret.append(tag.lower().replace(' ', '-'))  # copyin CKAN behaviour
    return ret
def _is_wms(self, url):
    '''
    Checks if the provided URL actually points to a Web Map Service.
    Uses owslib WMS reader to parse the response.

    Returns True if the parsed service has a non-empty dict of
    contents. Any exception is logged and reported as False.
    '''
    try:
        capabilities_url = wms.WMSCapabilitiesReader().capabilities_url(url)
        res = requests.get(capabilities_url, timeout=10)
        xml = res.text

        s = wms.WebMapService(url, xml=xml)
        # a leftover debugging "raise" used to sit here: it made the
        # return unreachable and every check ended as False
        return isinstance(s.contents, dict) and s.contents != {}
    except Exception as e:
        logger.error('WMS check for %s failed with exception: %s' % (url, str(e)))
    return False
def show_organization(
        self,
        organization_id_or_name,
        method='POST'):  # troubles using 2.3 and 2.8 CKAN versions
    """ Ask the CKAN API to show an organization

        Params:
         - organization_id_or_name: value sent as the "id" argument
         - method (str): 'POST' sends a POST request, any other value
            sends a GET request

        Returns the decoded JSON response. A 404 status is tolerated
        (its body is decoded and returned) and a response whose
        "success" value is falsy is logged but still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the status code is 400 or higher (except 404)
         - the original decoding exception if the body is not valid JSON
        """
    url = '{}{}'.format(self.base_url, self.organization_show_url)
    headers = self.get_request_headers()
    data = {'id': organization_id_or_name}
    # log the verb really used ("POST" was hard-coded here before)
    verb = 'POST' if method == 'POST' else 'GET'
    logger.info(f'{verb} {url} headers:{headers} data:{data}')

    try:
        if method == 'POST':
            req = requests.post(url, data=data, headers=headers)
        else:
            req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        error = 'ERROR showing organization: {} [{}]'.format(url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400 and req.status_code != 404:
        error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def fetch(self, timeout=30):
    """ Download the data.json file from self.url.

        On success the body of the response is saved at
        self.raw_data_json. Every failure is appended to self.errors and
        logged before raising: a missing URL or a status code of 400 or
        higher raise Exception, a failed request re-raises the original
        exception. """

    def register(message):
        # keep the failure in the errors list and in the log
        self.errors.append(message)
        logger.error(message)

    logger.info(f'Fetching data from {self.url}')
    if self.url is None:
        no_url = "No URL defined"
        register(no_url)
        raise Exception(no_url)

    try:
        response = requests.get(self.url, timeout=timeout)
    except Exception as exc:
        register('ERROR Donwloading data: {} [{}]'.format(self.url, exc))
        raise

    logger.info(f'Data fetched status {response.status_code}')
    if response.status_code >= 400:
        http_error = '{} HTTP error: {}'.format(self.url, response.status_code)
        register(http_error)
        raise Exception(http_error)

    logger.info('Data fetched OK')
    self.raw_data_json = response.content
def delete_package(self, ckan_package_id_or_name):
    """ POST to CKAN API to delete a package/dataset
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.delete.package_delete

        Params:
         - ckan_package_id_or_name: value sent as the "id" argument

        Returns the decoded JSON response. A response whose "success"
        value is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the HTTP status code is 400 or higher
         - the original decoding exception if the body is not valid JSON
        """
    url = '{}{}'.format(self.base_url, self.package_delete_url)
    headers = self.get_request_headers(include_api_key=True)
    data = {'id': ckan_package_id_or_name}
    # a regular request is not an error: log it at info level
    logger.info(f'POST {url} headers:{headers} data:{data}')

    try:
        req = requests.post(url, data=data, headers=headers)
    except Exception as e:
        error = 'ERROR deleting CKAN package: {} [{}]'.format(url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400:
        error = 'ERROR deleting CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from delete_package: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def validate(self, validator_schema):
    """ Validate self.data (one dataset) with the jsonschema definition at
        ./validation/schemas/{validator_schema}/dataset.json

        If that file does not exist the schema validation is skipped.
        For the 'federal-v1.1' and 'federal' schemas the result of
        self.validate_bureau_code() is required too.

        Returns True if the dataset is valid and False otherwise. A
        schema validation failure is appended to self.errors and logged.
        """
    schemas_folder = os.path.join(os.path.dirname(__file__), 'validation',
                                  'schemas', validator_schema)
    dataset_schema = os.path.join(schemas_folder, 'dataset.json')
    if os.path.isfile(dataset_schema):
        # context manager: the schema file was never closed before
        with open(dataset_schema, 'r') as f:
            schema = json.load(f)
        try:
            jss.validate(self.data, schema=schema)
        except Exception as e:
            error = "Error validating dataset: {}".format(e)
            self.errors.append(error)
            logger.error(error)
            return False

    if validator_schema in ['federal-v1.1', 'federal']:
        if not self.validate_bureau_code():
            return False

    return True
def update_package(self, ckan_package):
    """ POST to CKAN API to update a package/dataset
        ckan_package is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update

        The dict is sent as a JSON body.

        Returns the decoded JSON response. A response whose "success"
        value is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the HTTP status code is 400 or higher
         - the original decoding exception if the body is not valid JSON
        """
    url = '{}{}'.format(self.base_url, self.package_update_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    payload = json.dumps(ckan_package)
    logger.info(f'POST {url} headers:{headers} data:{payload}')

    try:
        req = requests.post(url, data=payload, headers=headers)
    except Exception as e:
        # the message said "creating" and was never logged
        error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400:
        error = 'ERROR updateing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def create_organization(self, organization, check_if_exists=True):
    """ POST to CKAN API to create a new organization
        organization is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create

        Params:
         - organization: dict sent as a JSON body. Its "name" is used
            to look for an existing organization
         - check_if_exists: if true, self.show_organization() is called
            first and its response is returned (nothing is created)
            when it is successful

        Returns the decoded JSON response. A response whose "success"
        value is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - Exception if the HTTP status code is 400 or higher
         - the original decoding exception if the body is not valid JSON
        """
    logger.info(f'**** Creating Organization {organization}')
    if check_if_exists:
        logger.info(f'Exists Organization? {organization}')
        res = self.show_organization(
            organization_id_or_name=organization['name'])
        if res['success']:
            # do not create
            logger.info(f'Avoid create Organization {organization}')
            return res

    url = '{}{}'.format(self.base_url, self.organization_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    payload = json.dumps(organization)
    logger.info(f'POST {url} headers:{headers} data:{payload}')

    try:
        req = requests.post(url, data=payload, headers=headers)
    except Exception as e:
        error = 'ERROR creating [POST] organization: {} [{}]'.format(
            url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    content = req.content
    if req.status_code >= 400:
        error = ('ERROR creating [STATUS] organization: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code, content,
                                          payload))
        logger.error(error)
        raise Exception(error)

    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    return json_content
def test_create_harvest_source(self):
    """ Create a harvest source and a dataset pointing to it (through
        extras), check that both can be read and searched and finally
        delete them. Uses the CKAN instance at CKAN_BASE_URL. """
    logger.info(f'Creating harvest source from {CKAN_BASE_URL}')
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)

    # best effort cleaning: a failure here must not stop the test
    try:
        cpa.delete_all_harvest_sources(harvest_type='harvest',
                                       source_type='datajson')
    except Exception as e:
        logger.error(f'Error cleaning previous harvest soures {e}')

    title = f'Energy JSON test {random.randint(1, 999999)}'
    url = f'http://www.energy.gov/data-{random.randint(1, 999999)}.json'
    res = cpa.create_harvest_source(
        title=title,
        url=url,
        owner_org_id=CKAN_ORG_ID,
        source_type='datajson',
        notes='Some tests about local harvesting sources creation',
        frequency='WEEKLY')

    self.assertTrue(res['success'])
    harvest_source = res['result']
    logger.info(f"Created: {res['success']}")

    # read it
    res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res['success'])
    self.assertEqual(harvest_source['url'], url)
    self.assertEqual(harvest_source['title'], title)
    self.assertEqual(harvest_source['type'], 'harvest')
    self.assertEqual(harvest_source['source_type'], 'datajson')

    # search for it
    results = cpa.search_harvest_packages(rows=1000,
                                          harvest_type='harvest',
                                          source_type='datajson')
    created_ok = False
    for page in results:
        for found in page:
            if found['name'] == harvest_source['name']:
                created_ok = True
                logger.info('Found!')
            else:
                logger.info(f"Other harvest source: {found['name']}")

    assert created_ok

    # create a dataset with this harvest_soure_id
    dataset_title = f'Dataset number {random.randint(1, 999999)}'
    dataset_name = slugify(dataset_title)
    tags = [{'name': 'tag81'}, {'name': 'tag82'}]
    randval = random.randint(1, 999)
    source_id = harvest_source['id']
    source_title = harvest_source['title']
    extras = [
        {'key': 'harvest_source_id', 'value': source_id},
        {'key': 'harvest_source_title', 'value': source_title},
        # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
        {'key': 'harvest_ng_source_id', 'value': source_id},
        {'key': 'harvest_ng_source_title', 'value': source_title},
        {'key': 'try_a_extra', 'value': randval},
    ]

    package = {
        'name': dataset_name,
        'title': dataset_title,
        'owner_org': CKAN_ORG_ID,
        'tags': tags,
        'extras': extras,
    }
    res2 = cpa.create_package(ckan_package=package)
    self.assertTrue(res2['success'])
    logger.info(f"Package with harvest source: {res2['success']}")

    # read full dataset
    res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
    self.assertTrue(res3['success'])
    ckan_dataset = res3['result']
    logger.info(f'Package with harvest source readed: {ckan_dataset}')

    assert 'extras' in ckan_dataset
    saved_randvals = [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'try_a_extra'
    ]
    # the extra comes back from CKAN as a string
    assert [str(randval)] == saved_randvals

    # my custom ID (not connected to a real harvest ID)
    saved_source_ids = [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'harvest_ng_source_id'
    ]
    assert [harvest_source['id']] == saved_source_ids

    # check if this package is related to harvest source
    total_datasets_in_source = 0
    datasets_from_source = cpa.search_harvest_packages(
        harvest_source_id=harvest_source['id'])

    connected_ok = False
    for page in datasets_from_source:
        for found in page:
            total_datasets_in_source += 1
            if found['name'] == dataset_name:
                connected_ok = True
                logger.info('Found!')
            else:
                # we just expect one dataset
                error = f"{found['name']} != {dataset_name} ------ {found}"
                logger.error(error)
                # a string never equals False: this always fails the test
                assert error == False

    assert connected_ok
    assert total_datasets_in_source == 1
    logger.info(
        f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
    )

    # this fails, harvest process is more complex that just add an extra
    # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

    # delete both
    logger.info(f"Delete CKAN package: {ckan_dataset['id']}")
    res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
    self.assertTrue(res4['success'])

    logger.info(f"Delete Harvest source: {harvest_source['id']}")
    res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res5['success'])
def create_package(
        self,
        ckan_package,
        on_duplicated='RAISE',  # if name already exists 'RAISE' 'SKIP' | 'DELETE'
):
    """ POST to CKAN API to create a new package/dataset
        ckan_package is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create

        Params:
         - ckan_package: a dict with with a ready-to-save package
         - on_duplicated (str): action to take where the package
            already exists:
            + RAISE: raise an error
            + SKIP: returns show_package results
            + DELETE: remove the package and try to create again

        Returns the decoded JSON response (or the show_package response
        when a duplicate is skipped). A response whose "success" value
        is falsy is logged as an error but it is still returned.

        Raises:
         - the original exception if the request fails
         - the original decoding exception if the body is not valid JSON
         - Exception if the package is duplicated and on_duplicated is
            'RAISE', if a duplicate could not be deleted or if the HTTP
            status code is 400 or higher
        """
    url = '{}{}'.format(self.base_url, self.package_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    ckan_package_str = json.dumps(ckan_package)
    logger.info(f'POST {url} headers:{headers} data:{ckan_package}')

    try:
        req = requests.post(url, data=ckan_package_str, headers=headers)
    except Exception as e:
        error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
            url, e)
        # this message used to be built and silently dropped
        logger.error(error)
        raise

    # the body is decoded before checking the status code because the
    # 409 handling below needs the error details
    content = req.content
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise

    if req.status_code == 409:
        logger.info(f'409 json_content: {json_content}')
        # another posible [error] = {'owner_org': ['Organization does not exist']}
        # Check for duplicates
        api_error = json_content['error']
        # read the "name" errors from the "name" key (the "url" key was
        # read here: KeyError or a missed duplicate)
        name_errors = api_error['name'] if 'name' in api_error else []
        dataset_exists = any(
            "That URL is already in use" in ne for ne in name_errors)

        url_errors = api_error['url'] if 'url' in api_error else []
        harvest_exists = any(
            "There already is a Harvest Source for this URL" in ue
            for ue in url_errors)

        is_duplicated = dataset_exists or harvest_exists
        if is_duplicated:
            logger.error(f'Already exists! ACTION: {on_duplicated}')
            if on_duplicated == 'SKIP':
                # returns {'success': True, 'result': {the package}}
                res = self.show_package(
                    ckan_package_id_or_name=ckan_package['name'])
                logger.info(f'Skipped: {res}')
                return res
            elif on_duplicated == 'DELETE':
                delr = self.delete_package(
                    ckan_package_id_or_name=ckan_package['name'])
                if not delr['success']:
                    raise Exception('Failed to delete {}'.format(
                        ckan_package['name']))
                # 'RAISE' avoids an endless delete/create loop
                return self.create_package(ckan_package=ckan_package,
                                           on_duplicated='RAISE')
            elif on_duplicated == 'RAISE':
                error = ('DUPLICATED CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t content:{}'
                         '\n\t Dataset {}'.format(url, req.status_code,
                                                  content, ckan_package))
                logger.error(error)
                raise Exception(error)

    # any other 409 (or an unknown on_duplicated value) ends here
    if req.status_code >= 400:
        error = ('ERROR creating CKAN package: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code, content,
                                          ckan_package))
        logger.error(error)
        raise Exception(error)

    if not json_content['success']:
        # a failed API answer is only reported, the caller gets it anyway
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    logger.info(f'Harvest source created: {json_content}')
    return json_content
def search_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        search_params=None):
    """ Search packages. "rows" is the page size.

        This is a generator: it yields the list of results of each
        page and it stops after the first page without results.
        The results are also accumulated at self.package_list and
        counted at self.total_packages.

        Params:
         - rows: page size
         - method (str): 'POST' sends a POST request, any other value
            sends a GET request
         - search_params (dict): extra arguments for the search. They
            are added over "start" and "rows"

        Raises:
         - ValueError if the request fails, if the body is not valid
            JSON or if the "success" value of the response is falsy
         - Exception if the HTTP status code is 400 or higher
        """
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}
        # None (and not a shared mutable {}) is the default value
        if search_params is not None:
            params.update(search_params)
        logger.info(
            f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )

        headers = self.get_request_headers()
        try:
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            error = 'ERROR Donwloading package list: {} [{}]'.format(
                url, e)
            # keep the details in the log and the cause in the chain
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url)) from e

        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code, params,
                                              content))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            raise ValueError(error) from e

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            raise ValueError(error)

        result = json_content['result']
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')

        if real_results_count == 0:
            # an empty page ends the pagination
            url = None
        else:
            start += rows
            self.package_list += results
            logger.debug(f'datasets found: {results}')
            yield (results)
def search_harvest_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        harvest_source_id=None,  # just one harvest source
        harvest_type=None,  # harvest for harvest sources
        source_type=None):
    """ Search harvested packages or harvest sources.
        "rows" is the page size.
        You could search for an specific harvest_source_id

        This is a generator: it yields the list of results of each
        page and it stops after the first page without results.
        The results are also accumulated at self.package_list and
        counted at self.total_packages.

        Params:
         - rows: page size
         - method (str): 'POST' sends a POST request, any other value
            sends a GET request
         - harvest_source_id: filter by the "harvest_ng_source_id"
            extra. If defined, harvest_type and source_type are ignored
         - harvest_type: filter by dataset type
         - source_type: filter by source type (only used together with
            harvest_type)

        Raises:
         - ValueError if the request fails, if the body is not valid
            JSON or if the "success" value of the response is falsy
         - Exception if the HTTP status code is 400 or higher
        """
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}

        if harvest_source_id is not None:
            # our new extra is working
            params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'
        elif harvest_type is not None:
            # at my local instance I need this.
            # I not sure why, in another public instances is not needed
            params['fq'] = f'+dataset_type:{harvest_type}'
            if source_type is not None:
                params['q'] = f'(type:{harvest_type} source_type:{source_type})'
            else:
                params['q'] = f'(type:{harvest_type})'

        logger.info(
            f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )

        headers = self.get_request_headers()
        try:
            logger.info(f'Search harvest packages via {method}')
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            error = 'ERROR Donwloading package list: {} [{}]'.format(
                url, e)
            # keep the details in the log and the cause in the chain
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url)) from e

        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code, params,
                                              content))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            raise ValueError(error) from e

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            raise ValueError(error)

        # only "results" is needed: "count", "sort" and "facets" were read
        # into unused variables and failed with KeyError when missing
        result = json_content['result']
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')

        if real_results_count == 0:
            # an empty page ends the pagination
            url = None
        else:
            start += rows
            self.package_list += results
            yield (results)
def transform_to_ckan_resource(self):
    """ Build a CKAN resource from self.original_resource.

        Two types of original resources are handled:
         - 'resource_locator': "data" is a single resource locator dict
         - 'resource_locator_group_data_format': "data" is a pair
           (resource locator group, data format)
        The values found are added over self.get_base_ckan_resource().

        Returns the CKAN resource or None (after logging an error) if
        no resource could be built.

        Raises Exception if self.validate_origin_distribution() or
        self.validate_final_resource() report an invalid resource. """
    valid, error = self.validate_origin_distribution()
    if not valid:
        raise Exception(f'Error validating origin resource/record: {error}')

    original_resource = self.original_resource
    ckan_resource = self.get_base_ckan_resource()
    resource = None
    origin_type = original_resource['type']

    if origin_type == 'resource_locator':
        # example_data = {
        #     'resource-locator': [{
        #         'url': 'http://geonode.state.gov/geoserver/wms?layers=geonode%3ASyria_IDPSites_2015Jun11_HIU_USDoS&width=373&bbox=35.748%2C32.583%2C38.674%2C36.894&service=WMS&format=image%2Fjpeg&srs=EPSG%3A4326&request=GetMap&height=550',
        #         'function': '',
        #         'name': 'Syria_IDPSites_2015Jun11_HIU_USDoS',
        #         'description': 'Syria_IDPSites_2015Jun11_HIU_USDoS (JPEG Format)',
        #         'protocol': 'WWW:DOWNLOAD-1.0-http--download'
        #     }]
        # }
        locator = original_resource['data']
        url = locator.get('url', '').strip()
        if url:
            resource = {'format': self.guess_resource_format(url)}
            cfg = True  # TODO config.get('ckanext.spatial.harvest.validate_wms', False)
            if resource['format'] == 'wms' and cfg:
                # Check if the service is a view service
                # (the part before the first "?" is the URL to test)
                if self._is_wms(url.split('?')[0]):
                    resource['verified'] = True
                    resource['verified_date'] = datetime.now().isoformat()

            resource['url'] = url
            resource['name'] = locator.get('name') or 'Unnamed resource'
            resource['description'] = locator.get('description') or ''
            resource['resource_locator_protocol'] = locator.get('protocol') or ''
            resource['resource_locator_function'] = locator.get('function') or ''

    elif origin_type == 'resource_locator_group_data_format':
        # sample data
        # ({
        #     'resource-locator': [{
        #         'url': 'http://geonode.state.gov/geoserver/wms?layers=geonode%3ASyria_IDPSites_2015Jun11_HIU_USDoS&width=373&bbox=35.748%2C32.583%2C38.674%2C36.894&service=WMS&format=image%2Fjpeg&srs=EPSG%3A4326&request=GetMap&height=550',
        #         'function': '',
        #         'name': 'Syria_IDPSites_2015Jun11_HIU_USDoS',
        #         'description': 'Syria_IDPSites_2015Jun11_HIU_USDoS (JPEG Format)',
        #         'protocol': 'WWW:DOWNLOAD-1.0-http--download'
        #     }]
        # }, None)
        group_and_format = original_resource['data']
        locator_group = group_and_format[0]
        data_format = group_and_format[1]

        # every locator with an URL replaces the previous resource:
        # only the last one is kept
        for locator in locator_group['resource-locator']:
            url = locator.get('url')
            if url is None:
                continue

            guessed_format = self.guess_resource_format(url)
            resource = {'format': guessed_format if guessed_format else data_format}
            cfg = True  # TODO config.get('ckanext.spatial.harvest.validate_wms', False)
            if resource['format'] == 'wms' and cfg:
                # Check if the service is a view service
                # (the part before the first "?" is the URL to test)
                if self._is_wms(url.split('?')[0]):
                    resource['verified'] = True
                    resource['verified_date'] = datetime.now().isoformat()

            resource['url'] = url
            resource['name'] = locator.get('name') or 'Unnamed resource'
            resource['description'] = locator.get('description') or ''
            resource['resource_locator_protocol'] = locator.get('protocol') or ''
            resource['resource_locator_function'] = locator.get('function') or ''

    if resource is None:
        logger.error(f'Unable to parse resource: {original_resource}')
        return None

    ckan_resource.update(**resource)
    valid, error = self.validate_final_resource(ckan_resource)
    if not valid:
        raise Exception(f'Error validating final resource/distribution: {error}')

    return ckan_resource