def get_user_info(self, user_id):
    """ GET to CKAN API to read a user's information (user_show).
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show

        Params:
            user_id (str): CKAN user id or name.

        Returns:
            dict: parsed CKAN response envelope ({"success": ..., "result": ...}).

        Raises:
            Exception: on connection failure, HTTP status >= 400,
                or unparseable JSON in the response body.
        """
    url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
    headers = self.get_request_headers(include_api_key=True)
    logger.info(f'GET {url} headers:{headers}')
    try:
        req = requests.get(url, headers=headers)
    except Exception as e:
        error = 'ERROR getting users information: {} [{}]'.format(url, e)
        # fix: error message was built but never logged before re-raising
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from users information {} [{}]'.format(
            content, e)
        # fix: log the parse failure before re-raising
        logger.error(error)
        raise
    if not json_content['success']:
        # API-level failure is logged but not raised (caller inspects 'success')
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def validate_origin_dataset(self):
    """ Validate the origin dataset has the fields required to create
        a CKAN package.
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create

        Appends human-readable messages to self.errors and returns
        True when the dataset is valid, False otherwise.
        """
    # an owner organization is always mandatory
    if self.ckan_owner_org_id is None:
        self.errors.append('Owner organization ID is required')
        return False

    required_fields = []
    if self.schema == 'usmetadata':
        required_fields.extend([
            'accessLevel', 'identifier', 'contactPoint__fn',
            'programCode', 'bureauCode', 'contactPoint__hasEmail',
            'publisher', 'modified', 'keyword'])

    all_present = True
    for field_name in required_fields:
        # fields may be nested; the __ separator is handled by the reader
        value = self.identify_origin_element(raw_field=field_name)
        if value in [None, '']:
            self.errors.append(f'"{field_name}" field could not be empty at origin dataset')
            all_present = False

    if not all_present:
        logger.info(f'requires failed on {self.original_dataset}: {self.errors}')
    return all_present
def show_package(self, ckan_package_id_or_name):
    """ GET to CKAN API to show a package/dataset.

        Params:
            ckan_package_id_or_name (str): CKAN package id or name.

        Returns:
            dict: parsed CKAN response envelope ({"success": ..., "result": ...}).

        Raises:
            Exception: on connection failure, HTTP status >= 400,
                or unparseable JSON in the response body.
        """
    url = '{}{}'.format(self.base_url, self.package_show_url)
    headers = self.get_request_headers(include_api_key=True)
    data = {'id': ckan_package_id_or_name}
    logger.info(f'GET {url} headers:{headers} data:{data}')
    try:
        req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
        # fix: error message was built but never logged before re-raising
        logger.error(error)
        raise
    # fix: content was read twice (duplicated statement removed)
    content = req.content
    if req.status_code >= 400:
        error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        # API-level failure is logged but not raised (caller inspects 'success')
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def transform_to_ckan_dataset(self, existing_resources=None):
    """ Transform the original ISO dataset into a CKAN-ready dataset dict.

        Maps origin fields to CKAN fields, builds resources, applies the
        custom per-field fixes, assigns a unique name and the owner org,
        drops None values and validates the final result.

        Returns:
            dict: the transformed self.ckan_dataset.

        Raises:
            Exception: when origin or final validation fails.
        """
    valid, error = self.validate_origin_dataset()
    if not valid:
        raise Exception(f'Error validating origin dataset: {error}')

    iso_values = self.original_dataset.get('iso_values', {})
    self.ckan_dataset['tag_string'] = ','.join(clean_tags(iso_values.get('tags', [])))

    # map each origin field to its CKAN destination
    for src_field, dst_field in self.mapped_fields.items():
        logger.debug(f'Connecting fields "{src_field}", "{dst_field}"')
        value = self.identify_origin_element(raw_field=src_field)
        if value is None:
            logger.debug(f'No data in origin for "{src_field}"')
        else:
            self.set_destination_element(raw_field=dst_field, new_value=value)
            logger.debug(f'Connected OK fields "{src_field}"="{value}"')

    self.infer_resources()
    self.ckan_dataset['resources'] = self.transform_resources()

    # custom changes
    self.fix_licence_url()
    self.set_browse_graphic()
    self.set_temporal_extent()
    self.set_responsible_party()
    self.set_bbox()

    # names are unique in a CKAN instance; generate one if missing
    if 'name' not in self.ckan_dataset or self.ckan_dataset['name'] == '':
        self.ckan_dataset['name'] = self.generate_name(title=self.ckan_dataset['title'])

    # mandatory
    self.ckan_dataset['owner_org'] = self.ckan_owner_org_id

    # drop empty unused values (rebuild instead of popping while iterating)
    self.ckan_dataset = {key: val for key, val in self.ckan_dataset.items()
                         if val is not None}

    if not self.validate_final_dataset():
        raise Exception(f'Error validating final dataset: {self.errors} from {self.original_dataset}')

    logger.info('Dataset transformed {} OK'.format(self.original_dataset.get('identifier', '')))
    return self.ckan_dataset
def get_xml_tree(self):
    """ Lazily parse self.xml_str into an lxml element tree.

        The parsed tree is cached in self.xml_tree, so the XML is only
        parsed once per instance.

        Returns:
            the lxml root element for self.xml_str.
        """
    if self.xml_tree is None:
        parser = letree.XMLParser(remove_blank_text=True)
        # fix: use isinstance() instead of type comparison (also accepts
        # str subclasses, e.g. bytes coming in still get coerced below)
        if not isinstance(self.xml_str, str):
            logger.info('XML_STR is not str, is {}: {}'.format(
                type(self.xml_str), self.xml_str))
            xml_str = str(self.xml_str)
        else:
            xml_str = self.xml_str
        # logger.debug(f'Parsing ISO XML {xml_str}')
        self.xml_tree = letree.fromstring(xml_str, parser=parser)
    return self.xml_tree
def delete_all_harvest_sources(self, harvest_type='harvest', source_type='datajson'):
    """ Delete all local harvest sources of the given type.

        Params:
            harvest_type (str): dataset type to search for.
            source_type (str): harvest source type (e.g. 'datajson').

        Returns:
            list: names of the deleted harvest sources.

        Raises:
            Exception: if deleting any harvest source fails.
        """
    logger.info(f'Deleting local harvest sources from {self.base_url}')
    deleted = []
    for harvest_sources in self.search_harvest_packages(
            harvest_type=harvest_type, source_type=source_type):
        for harvest_source in harvest_sources:
            harvest_source_name = harvest_source['name']
            if harvest_source_name in deleted:  # TODO fix duplicated
                continue
            logger.info(f'Deleting local harvest {harvest_source_name}')
            res = self.delete_package(
                ckan_package_id_or_name=harvest_source_name)
            if not res['success']:
                raise Exception(f'Failed to delete {harvest_source_name}')
            else:
                logger.info(f'Deleted {harvest_source_name}')
                deleted.append(harvest_source_name)
    # BUG fix: previous code executed `deleted += 1` on this list, which
    # raised TypeError after the first delete; use len() for the count.
    logger.info(f'{len(deleted)} harvest sources deleted')
    return deleted
def create_organization(self, organization, check_if_exists=True):
    """ POST to CKAN API to create a new organization.
        organization is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create

        Params:
            organization (dict): organization fields; must include 'name'.
            check_if_exists (bool): when True, skip creation and return the
                show_organization response if the organization already exists.

        Returns:
            dict: parsed CKAN response envelope.

        Raises:
            Exception: on connection failure, HTTP status >= 400,
                or unparseable JSON in the response body.
        """
    logger.info(f'**** Creating Organization {organization}')
    if check_if_exists:
        logger.info(f'Exists Organization? {organization}')
        res = self.show_organization(
            organization_id_or_name=organization['name'])
        if res['success']:
            # already exists: do not create
            logger.info(f'Avoid create Organization {organization}')
            return res
    url = '{}{}'.format(self.base_url, self.organization_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    # NOTE: 'organization' is rebound to its JSON string from here on;
    # the error log below intentionally shows the serialized payload
    organization = json.dumps(organization)
    logger.info(f'POST {url} headers:{headers} data:{organization}')
    try:
        req = requests.post(url, data=organization, headers=headers)
    except Exception as e:
        error = 'ERROR creating [POST] organization: {} [{}]'.format(
            url, e)
        # fix: error message was built but never logged before re-raising
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = ('ERROR creating [STATUS] organization: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code,
                                          content, organization))
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        # API-level failure is logged but not raised (caller inspects 'success')
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def import_harvest_sources(
        self,
        catalog_url,
        method='GET',  # depend on CKAN version, GET for older versions
        on_duplicated='DELETE',
        harvest_type='harvest',
        source_type='datajson',
        delete_local_harvest_sources=True):
    """ Import harvest sources from another CKAN open data portal.

        Params:
            catalog_url (str): base URL of the external CKAN portal.
            method (str): HTTP method for the external search (CKAN-version dependent).
            on_duplicated (str): action when a source already exists locally.
            harvest_type (str): dataset type to search for.
            source_type (str): harvest source type (e.g. 'datajson').
            delete_local_harvest_sources (bool): drop local sources first.

        Returns:
            int: number of harvest sources imported.

        Raises:
            Exception: if creating any harvest source fails.
        """
    if delete_local_harvest_sources:
        self.delete_all_harvest_sources(source_type=source_type)

    logger.info(f'Getting external harvest sources for {catalog_url}')
    external_portal = CKANPortalAPI(base_url=catalog_url)
    total_sources = 0
    search_external = external_portal.search_harvest_packages(
        method=method,
        harvest_type=harvest_type,
        source_type=source_type)
    for external_harvest_sources in search_external:
        for external_harvest_source in external_harvest_sources:
            name = external_harvest_source['name']
            organization = external_harvest_source['organization']
            logger.info(f'**** Importing Organization {organization}')
            # copy organization locally, dropping portal-specific fields.
            # fix: use pop(..., None) — 'created'/'revision_id' are not
            # guaranteed to exist in every CKAN version, del raised KeyError
            organization.pop('id', None)  # drop original ID
            organization.pop('created', None)
            organization.pop('revision_id', None)
            self.create_organization(organization=organization)
            owner_org_id = organization['name']
            config = external_harvest_source.get('config', {})

            # res = self.delete_package(name)
            logger.info(external_harvest_source)
            res = self.create_harvest_source(
                title=external_harvest_source['title'],
                url=external_harvest_source['url'],
                owner_org_id=owner_org_id,
                name=name,
                config=config,
                notes=external_harvest_source['notes'],
                source_type=source_type,
                frequency=external_harvest_source['frequency'],
                on_duplicated=on_duplicated)

            if not res['success']:
                raise Exception(f'Failed to import harvest source {name}')
            else:
                logger.info(f'Created {name}')
                total_sources += 1

    return total_sources
def fetch(self, timeout=30):
    """ Download the data.json file and store the raw bytes in
        self.raw_data_json.

        Params:
            timeout (int): request timeout in seconds.

        Raises:
            Exception: when self.url is not set, on connection failure,
                or on an HTTP status >= 400. Every failure is also
                appended to self.errors.
        """
    logger.info(f'Fetching data from {self.url}')
    if self.url is None:
        error = "No URL defined"
        self.errors.append(error)
        logger.error(error)
        raise Exception(error)
    try:
        req = requests.get(self.url, timeout=timeout)
    except Exception as e:
        # fix: "Donwloading" typo in the error message
        error = 'ERROR Downloading data: {} [{}]'.format(self.url, e)
        self.errors.append(error)
        logger.error(error)
        raise
    logger.info(f'Data fetched status {req.status_code}')
    if req.status_code >= 400:
        error = '{} HTTP error: {}'.format(self.url, req.status_code)
        self.errors.append(error)
        logger.error(error)
        raise Exception(error)
    # plain string: no placeholders needed (was a pointless f-string)
    logger.info('Data fetched OK')
    self.raw_data_json = req.content
def show_organization(
        self,
        organization_id_or_name,
        method='POST'):  # troubles using 2.3 and 2.8 CKAN versions
    """ Call the CKAN API to show an organization.

        Params:
            organization_id_or_name (str): CKAN organization id or name.
            method (str): 'POST' or 'GET' (CKAN-version dependent).

        Returns:
            dict: parsed CKAN response envelope. A 404 (organization not
                found) does NOT raise; callers inspect 'success'.

        Raises:
            Exception: on connection failure, HTTP status >= 400 (other
                than 404), or unparseable JSON in the response body.
        """
    url = '{}{}'.format(self.base_url, self.organization_show_url)
    headers = self.get_request_headers()
    data = {'id': organization_id_or_name}
    logger.info(f'POST {url} headers:{headers} data:{data}')
    try:
        if method == 'POST':
            req = requests.post(url, data=data, headers=headers)
        else:
            req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        error = 'ERROR showing organization: {} [{}]'.format(url, e)
        # fix: error message was built but never logged before re-raising
        logger.error(error)
        raise
    content = req.content
    # 404 is an expected "not found" answer, don't treat it as fatal
    if req.status_code >= 400 and req.status_code != 404:
        error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def update_package(self, ckan_package):
    """ POST to CKAN API to update a package/dataset.
        ckan_package is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update

        Params:
            ckan_package (dict): full package fields (package_update replaces
                the package with exactly what is sent).

        Returns:
            dict: parsed CKAN response envelope.

        Raises:
            Exception: on connection failure, HTTP status >= 400,
                or unparseable JSON in the response body.
        """
    url = '{}{}'.format(self.base_url, self.package_update_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    ckan_package = json.dumps(ckan_package)
    logger.info(f'POST {url} headers:{headers} data:{ckan_package}')
    try:
        req = requests.post(url, data=ckan_package, headers=headers)
    except Exception as e:
        # fix: message said "creating" in an update method
        error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        # fix: "updateing" typo in the error message
        error = 'ERROR updating CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def transform_to_ckan_dataset(self, existing_resources=None):
    """ Transform the original data.json dataset into a CKAN-ready dataset dict.

        Maps origin fields to CKAN fields, converts the 'distribution'
        entries into resources (merging with existing_resources when given),
        derives license/publisher extras, assigns a unique name and the
        owner org, drops None values and validates the final result.

        Params:
            existing_resources: resources of an already-existing CKAN
                package to merge with the new ones, or None.

        Returns:
            dict: the transformed dataset, or None when validation fails
                (errors are accumulated in self.errors).
        """
    # check how to parse
    # https://github.com/GSA/ckanext-datajson/blob/07ca20e0b6dc1898f4ca034c1e073e0c27de2015/ckanext/datajson/parse_datajson.py#L5
    # if we are updating existing dataset we need to merge resources
    logger.info('Transforming data.json dataset {}'.format(self.original_dataset.get('identifier', '')))
    valid = self.validate_origin_dataset()
    if not valid:
        # validation errors were appended to self.errors; signal with None
        # raise Exception(f'Error validating origin dataset: {error}')
        return None

    datajson_dataset = self.original_dataset

    # CKAN expects tags as a comma-separated 'tag_string'
    tags = datajson_dataset.get('keyword', [])
    cleaned_tags = clean_tags(tags)
    self.ckan_dataset['tag_string'] = ','.join(cleaned_tags)

    # previous transformations at origin
    for old_field, field_ckan in self.mapped_fields.items():
        logger.debug(f'Connecting fields "{old_field}", "{field_ckan}"')
        # identify origin and set value to destination
        origin = self.identify_origin_element(raw_field=old_field)
        if origin is None:
            logger.debug(f'No data in origin for "{old_field}"')
        else:
            self.set_destination_element(raw_field=field_ckan, new_value=origin)
            logger.debug(f'Connected OK fields "{old_field}"="{origin}"')

    # transform distribution into resources
    distribution = datajson_dataset['distribution'] if 'distribution' in datajson_dataset else []
    # if _distribution_ is empty then we try to create them from "accessURL" or "webService" URLs
    if distribution is None or distribution == []:
        distribution = self.infer_resources()
    self.ckan_dataset['resources'] = self.transform_resources(distribution)

    # move out the resources with validation errores
    # and log the error as a dataset error
    final_resources = []
    for resource in self.ckan_dataset['resources']:
        if 'error' in resource:
            self.errors.append(resource)
        else:
            final_resources.append(resource)
    self.ckan_dataset['resources'] = final_resources

    if existing_resources is not None:
        res = self.merge_resources(existing_resources=existing_resources,
                                   new_resources=self.ckan_dataset['resources'])
        self.ckan_dataset['resources'] = res

    # add custom extras
    # add source_datajson_identifier = {"key": "source_datajson_identifier", "value": True}
    self.set_destination_element(raw_field='extras__source_datajson_identifier', new_value=True)

    # define name (are uniques in CKAN instance)
    if 'name' not in self.ckan_dataset or self.ckan_dataset['name'] == '':
        name = self.generate_name(title=self.ckan_dataset['title'])
        self.ckan_dataset['name'] = name

    # mandatory
    self.ckan_dataset['owner_org'] = self.ckan_owner_org_id

    # check for license: normalize the license URL (scheme and trailing
    # slash stripped) before looking it up in the known-licenses table
    if datajson_dataset.get('license', None) not in [None, '']:
        original_license = datajson_dataset['license']
        original_license = original_license.replace('http://', '')
        original_license = original_license.replace('https://', '')
        original_license = original_license.rstrip('/')
        license_id = ckan_settings.LICENCES.get(original_license, "other-license-specified")
        self.ckan_dataset['license_id'] = license_id

    # define publisher as extras as we expect
    publisher = datajson_dataset.get('publisher', None)
    if publisher is not None:
        publisher_name = publisher.get('name', '')
        # TODO check which place we are going to use
        self.set_extra('publisher', publisher_name)
        # self.ckan_dataset['publisher'] = publisher_name
        parent_publisher = publisher.get('subOrganizationOf', None)
        if parent_publisher is not None:
            # walk the subOrganizationOf chain, then render it root-first
            # as "root > ... > leaf"
            publisher_hierarchy = [publisher_name]
            while parent_publisher:
                parent_name = parent_publisher.get('name', '')
                parent_publisher = parent_publisher.get('subOrganizationOf', None)
                publisher_hierarchy.append(parent_name)
            publisher_hierarchy.reverse()
            publisher_hierarchy = " > ".join(publisher_hierarchy)
            self.set_extra('publisher_hierarchy', publisher_hierarchy)

    # clean all empty unused values (can't pop keys while iterating)
    ckan_dataset_copy = self.ckan_dataset.copy()
    for k, v in self.ckan_dataset.items():
        if v is None:
            ckan_dataset_copy.pop(k)
    self.ckan_dataset = ckan_dataset_copy

    valid = self.validate_final_dataset()
    # NOTE(review): this checks `is None`, but the sibling ISO transformer
    # treats validate_final_dataset() as a boolean (`if not valid: raise`).
    # If it returns False here, the invalid dataset is still returned —
    # confirm whether `if not valid` was intended.
    if valid is None:
        return None

    logger.info('Dataset transformed {} OK'.format(self.original_dataset.get('identifier', '')))
    return ckan_dataset_copy
def test_create_harvest_source(self):
    """ End-to-end test against a live CKAN instance (CKAN_BASE_URL):
        create a harvest source, verify it via show/search, create a
        dataset linked to it through extras, verify the link, and
        finally delete both objects.
        """
    logger.info(f'Creating harvest source from {CKAN_BASE_URL}')
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    # best-effort cleanup of sources left over by previous runs
    try:
        cpa.delete_all_harvest_sources(harvest_type='harvest', source_type='datajson')
    except Exception as e:
        logger.error(f'Error cleaning previous harvest soures {e}')
        pass

    # random suffixes keep names unique across test runs
    title = 'Energy JSON test {}'.format(random.randint(1, 999999))
    url = 'http://www.energy.gov/data-{}.json'.format(
        random.randint(1, 999999))

    res = cpa.create_harvest_source(
        title=title,
        url=url,
        owner_org_id=CKAN_ORG_ID,
        source_type='datajson',
        notes='Some tests about local harvesting sources creation',
        frequency='WEEKLY')

    self.assertTrue(res['success'])
    harvest_source = res['result']
    logger.info('Created: {}'.format(res['success']))

    # read it
    res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res['success'])
    self.assertEqual(harvest_source['url'], url)
    self.assertEqual(harvest_source['title'], title)
    self.assertEqual(harvest_source['type'], 'harvest')
    self.assertEqual(harvest_source['source_type'], 'datajson')

    # search for it
    results = cpa.search_harvest_packages(rows=1000,
                                          harvest_type='harvest',
                                          source_type='datajson')
    created_ok = False
    for datasets in results:
        for dataset in datasets:
            # print('FOUND: {}'.format(dataset['name']))
            if dataset['name'] == harvest_source['name']:
                created_ok = True
                logger.info('Found!')
            else:
                logger.info('Other harvest source: {}'.format(
                    dataset['name']))

    assert created_ok == True

    # create a dataset with this harvest_soure_id
    dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
    dataset_name = slugify(dataset_title)
    tags = [{'name': 'tag81'}, {'name': 'tag82'}]

    randval = random.randint(1, 999)
    extras = [
        {
            'key': 'harvest_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_source_title',
            'value': harvest_source['title']
        },
        # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
        {
            'key': 'harvest_ng_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_ng_source_title',
            'value': harvest_source['title']
        },
        # marker extra used below to verify extras round-trip
        {
            'key': 'try_a_extra',
            'value': randval
        }
    ]

    package = {
        'name': dataset_name,
        'title': dataset_title,
        'owner_org': CKAN_ORG_ID,
        'tags': tags,
        'extras': extras
    }

    res2 = cpa.create_package(ckan_package=package)
    self.assertTrue(res2['success'])
    logger.info('Package with harvest source: {}'.format(res2['success']))

    # read full dataset
    res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
    self.assertTrue(res3['success'])
    ckan_dataset = res3['result']
    logger.info(
        'Package with harvest source readed: {}'.format(ckan_dataset))

    assert 'extras' in ckan_dataset
    # CKAN stores extra values as strings, hence str(randval)
    assert [str(randval)] == [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'try_a_extra'
    ]
    # my custom ID (not connected to a real harvest ID)
    assert [harvest_source['id']] == [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'harvest_ng_source_id'
    ]

    # check if this package is related to harvest source
    total_datasets_in_source = 0
    datasets_from_source = cpa.search_harvest_packages(
        harvest_source_id=harvest_source['id'])
    connected_ok = False
    for datasets in datasets_from_source:
        for dataset in datasets:
            total_datasets_in_source += 1
            if dataset['name'] == dataset_name:
                connected_ok = True
                logger.info('Found!')
            else:
                # we just expect one dataset
                error = '{} != {} ------ {}'.format(
                    dataset['name'], dataset_name, dataset)
                logger.error(error)
                # intentional: a non-empty string is never == False, so any
                # unexpected dataset fails the test with its details logged
                assert error == False

    assert connected_ok == True
    assert total_datasets_in_source == 1
    logger.info(
        f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
    )

    # this fails, harvest process is more complex that just add an extra
    # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']

    # delete both
    logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
    res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
    self.assertTrue(res4['success'])

    logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
    res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res5['success'])
def search_harvest_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        harvest_source_id=None,  # just one harvest source
        harvest_type=None,  # harvest for harvest sources
        source_type=None):
    """ Search harvested packages or harvest sources (generator).

        "rows" is the page size; yields one list of packages per page
        until an empty page is returned. You could search for an
        specific harvest_source_id.

        Raises:
            ValueError: on connection failure, bad JSON, or an API-level
                failure in the response.
            Exception: on HTTP status >= 400.
        """
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}
        if harvest_source_id is not None:
            # our new extra is working
            params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'
        elif harvest_type is not None:
            # at my local instance I need this.
            # I not sure why, in another public instances is not needed
            params['fq'] = f'+dataset_type:{harvest_type}'
            if source_type is not None:
                params['q'] = f'(type:{harvest_type} source_type:{source_type})'
            else:
                params['q'] = f'(type:{harvest_type})'

        logger.info(
            f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )
        headers = self.get_request_headers()
        try:
            logger.info(f'Search harvest packages via {method}')
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            # fix: "Donwloading" typo; message was built but never logged
            error = 'ERROR Downloading package list: {} [{}]'.format(url, e)
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url))

        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code,
                                              params, content))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise ValueError(error)

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)
            raise ValueError(error)

        # unused 'count'/'sort'/'facets' reads removed
        result = json_content['result']
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')

        if real_results_count == 0:
            # empty page: stop paginating
            url = None
        else:
            start += rows
            self.package_list += results
            yield (results)
def create_package(
        self,
        ckan_package,
        on_duplicated='RAISE',  # if name already exists 'RAISE' 'SKIP' | 'DELETE'
):
    """ POST to CKAN API to create a new package/dataset.
        ckan_package is just a python dict
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create
        Params:
            - ckan_package: a dict with with a ready-to-save package
            - on_duplicated (str): action to take where the package already exists:
                + RAISE: raise an error
                + SKIP: returns show_package results
                + DELETE: remove the package and try to create again

        Returns:
            dict: parsed CKAN response envelope.

        Raises:
            Exception: on connection failure, HTTP status >= 400,
                unparseable JSON, or a duplicate with on_duplicated='RAISE'.
        """
    url = '{}{}'.format(self.base_url, self.package_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    ckan_package_str = json.dumps(ckan_package)
    logger.info(f'POST {url} headers:{headers} data:{ckan_package}')
    try:
        req = requests.post(url, data=ckan_package_str, headers=headers)
    except Exception as e:
        error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
            url, e)
        logger.error(error)
        raise
    content = req.content
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise

    if req.status_code == 409:
        logger.info(f'409 json_content: {json_content}')
        # another posible [error] = {'owner_org': ['Organization does not exist']}
        # Check for duplicates.
        # fix: duplicate-name messages live under the 'name' key of the
        # validation error dict; the old code checked for 'name' but then
        # read 'url', raising KeyError whenever only 'name' was present
        name_errors = json_content['error'].get('name', [])
        dataset_exists = len([
            ne for ne in name_errors if "That URL is already in use" in ne
        ]) > 0
        url_errors = json_content['error'].get('url', [])
        harvest_exists = len([
            ue for ue in url_errors
            if "There already is a Harvest Source for this URL" in ue
        ]) > 0
        is_duplicated = dataset_exists or harvest_exists
        if is_duplicated:
            logger.error(f'Already exists! ACTION: {on_duplicated}')
            if on_duplicated == 'SKIP':
                # returns {'success': True, 'result': {the package}}
                res = self.show_package(
                    ckan_package_id_or_name=ckan_package['name'])
                logger.info(f'Skipped: {res}')
                return res
            elif on_duplicated == 'DELETE':
                delr = self.delete_package(
                    ckan_package_id_or_name=ckan_package['name'])
                if not delr['success']:
                    raise Exception('Failed to delete {}'.format(
                        ckan_package['name']))
                # retry once; RAISE prevents infinite recursion
                return self.create_package(ckan_package=ckan_package,
                                           on_duplicated='RAISE')
            elif on_duplicated == 'RAISE':
                error = ('DUPLICATED CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t content:{}'
                         '\n\t Dataset {}'.format(url, req.status_code,
                                                  content, ckan_package))
                logger.error(error)
                raise Exception(error)

    if req.status_code >= 400:
        error = ('ERROR creating CKAN package: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code, content,
                                          ckan_package))
        logger.error(error)
        raise Exception(error)

    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)

    logger.info(f'Harvest source created: {json_content}')
    return json_content
def search_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        search_params={}):
    """ Search packages (generator). "rows" is the page size; yields one
        list of packages per page until an empty page is returned.

        Params:
            rows (int): page size.
            method (str): 'POST' or 'GET' (CKAN-version dependent).
            search_params (dict): extra package_search parameters merged
                into each page request. NOTE(review): mutable default is
                only read here, never mutated — safe, but fragile.

        Raises:
            ValueError: on connection failure, bad JSON, or an API-level
                failure in the response.
            Exception: on HTTP status >= 400.
        """
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}
        params.update(search_params)
        logger.info(
            f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )
        headers = self.get_request_headers()
        try:
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            # fix: "Donwloading" typo; message was built but never logged
            error = 'ERROR Downloading package list: {} [{}]'.format(url, e)
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url))

        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code,
                                              params, content))
            logger.error(error)
            raise Exception(error)

        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise ValueError(error)

        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)
            raise ValueError(error)

        result = json_content['result']
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')

        if real_results_count == 0:
            # empty page: stop paginating
            url = None
        else:
            start += rows
            self.package_list += results
            logger.debug(f'datasets found: {results}')
            yield (results)