def get_user_info(self, user_id):
    """ GET to CKAN API to get a user's information (user_show).
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.get.user_show
        Returns the parsed API response dict; raises on HTTP/parse errors. """
    url = '{}{}?id={}'.format(self.base_url, self.user_show_url, user_id)
    headers = self.get_request_headers(include_api_key=True)
    logger.info(f'GET {url} headers:{headers}')
    try:
        req = requests.get(url, headers=headers)
    except Exception as e:
        # FIX: the error message was built but never logged before re-raising
        error = 'ERROR getting users information: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = 'ERROR getting users information: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR parsing JSON data from users information {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        # API-level failure is logged but the response is still returned
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def clean_duplicated_identifiers(rows):
    """ Flag duplicated dataset identifiers on a data.json source.

        Every row is yielded back; rows whose 'identifier' was already
        seen get 'is_duplicate' set to 'True'. Only the first few
        duplicates are logged. """
    logger.info('Cleaning duplicates')
    seen_identifiers = []
    dupes = []
    ok_count = 0
    for row in rows:
        identifier = row['identifier']
        if identifier in seen_identifiers:
            dupes.append(identifier)
            row['is_duplicate'] = 'True'
            yield (row)
            # do not log all duplicates. Sometimes they are too many.
            if len(dupes) < 10:
                logger.error('Duplicated {}'.format(identifier))
            elif len(dupes) == 10:
                logger.error('... more duplicates not shown')
        else:
            seen_identifiers.append(identifier)
            yield (row)
            ok_count += 1
    logger.info('{} duplicates deleted. {} OK'.format(len(dupes), ok_count))
def show_package(self, ckan_package_id_or_name):
    """ GET to CKAN API to show a package/dataset.
        Returns the parsed API response dict; raises on HTTP/parse errors. """
    url = '{}{}'.format(self.base_url, self.package_show_url)
    headers = self.get_request_headers()
    data = {'id': ckan_package_id_or_name}
    # FIX: this request is a GET; the log previously claimed POST
    logger.info(f'GET {url} headers:{headers} data:{data}')
    try:
        req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR showing CKAN package: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = 'ERROR showing CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    # FIX: removed a duplicated `content = req.content` assignment
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_package: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def clean_duplicated_identifiers(rows):
    """ Yield only rows whose 'identifier' was not seen before;
        duplicated rows are dropped and logged. """
    seen_identifiers = []
    dupes = []
    kept = 0
    for row in rows:
        ident = row['identifier']
        if ident in seen_identifiers:
            dupes.append(ident)
            logger.error('Duplicated {}'.format(ident))
        else:
            seen_identifiers.append(ident)
            yield (row)
            kept += 1
    logger.info('{} duplicates deleted. {} OK'.format(len(dupes), kept))
def process_results(self): # analyze results actions = {} # create | delete | update validation_errors = [] action_errors = [] action_warnings = [] # print(f'Result: {self.results}') if type(self.results) != list: logger.error(f'Unexpected results: {self.results}') return False for result in self.results: # print(f'Result: {result}') comparison_results = result.get('comparison_results', None) if comparison_results is None: # this is bad. This source is broken return False action = comparison_results['action'] if action not in actions.keys(): actions[action] = {'total': 0, 'success': 0, 'fails': 0} actions[action]['total'] += 1 if action in ['create', 'update']: # delete has no new_data if len(comparison_results['new_data'].get('validation_errors', [])) > 0: validation_errors += comparison_results['new_data']['validation_errors'] action_results = comparison_results.get('action_results', {}) success = action_results.get('success', False) if success: actions[action]['success'] += 1 else: actions[action]['fails'] += 1 action_warnings += action_results.get('warnings', []) action_errors += action_results.get('errors', []) self.final_results['actions'] = actions self.final_results['validation_errors'] = validation_errors self.final_results['action_warnings'] = action_warnings self.final_results['action_errors'] = action_errors return True
def get_data_json_from_url(url):
    """ Download, load and validate a data.json source; yield each dataset
        with the source headers attached. Invalid datasets are still used
        (validation failures are logged, not fatal); a load failure sends a
        validation-error email and raises. """
    logger.info(f'Geting data.json from {url}')
    datajson = DataJSON()
    datajson.url = url
    ret, info = datajson.download_data_json(timeout=90)
    if not ret:
        error = 'Error getting data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('Downloaded OK')
    ret, info = datajson.load_data_json()
    if not ret:
        datajson.save_validation_errors(
            path=config.get_datajson_validation_errors_path())
        logger.error(datajson.validation_errors)
        # best-effort notification; never mask the original failure
        try:
            build_validation_error_email()
        except Exception as e:
            logger.error('Error sending validation email: {}'.format(e))
        raise Exception(datajson.validation_errors)
    logger.info('JSON OK')
    ret, info = datajson.validate_json()
    if not ret:
        logger.error(
            'Error validating data: {}\n----------------\n'.format(info))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(
            len(datajson.datasets)))
    else:
        logger.info('Validate OK: {} datasets'.format(len(datajson.datasets)))
    # TODO move this as a DataJson function and add it to a validate function
    # FIX: previous code referenced the undefined name `data_json['dataset']`
    # (NameError). Passing the loaded dataset list instead — TODO confirm
    # validate_data_json expects this shape.
    validate_data_json(datajson.datasets)
    logger.info('VALID JSON, {} datasets found'.format(len(datajson.datasets)))
    # save data.json
    datajson.save_data_json(path=config.get_datajson_cache_path())
    # save headers errors
    datajson.save_validation_errors(
        path=config.get_datajson_validation_errors_path())
    # the real dataset list
    if config.LIMIT_DATASETS > 0:
        datajson.datasets = datajson.datasets[:config.LIMIT_DATASETS]
    for dataset in datajson.datasets:
        # add headers (previously called catalog_values)
        dataset['headers'] = datajson.headers
        yield (dataset)
def delete_package(self, ckan_package_id_or_name):
    """ POST to CKAN API to delete a package/dataset.
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.delete.package_delete """
    url = '{}{}'.format(self.base_url, self.package_delete_url)
    headers = self.get_request_headers(include_api_key=True)
    data = {'id': ckan_package_id_or_name}
    # FIX: was logger.error for a normal request trace; siblings use info
    logger.info(f'POST {url} headers:{headers} data:{data}')
    try:
        req = requests.post(url, data=data, headers=headers)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR deleting CKAN package: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = 'ERROR deleting CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from delete_package: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def show_organization(
        self,
        organization_id_or_name,
        method='POST'):  # troubles using 2.3 and 2.8 CKAN versions
    """ GET/POST to CKAN API to show an organization.
        404 is tolerated so callers (e.g. create_organization) can
        probe for existence via the 'success' flag. """
    url = '{}{}'.format(self.base_url, self.organization_show_url)
    headers = self.get_request_headers()
    data = {'id': organization_id_or_name}
    # FIX: log the method actually used (was hardcoded as POST)
    logger.info(f'{method} {url} headers:{headers} data:{data}')
    try:
        if method == 'POST':
            req = requests.post(url, data=data, headers=headers)
        else:
            req = requests.get(url, params=data, headers=headers)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR showing organization: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400 and req.status_code != 404:
        error = 'ERROR showing organization: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data from show_organization: {} [{}]'.format(
            content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def get_data_json_from_url(url):
    """ Download, load and validate a data.json from a URL.
        Download/load failures raise; validation failures are only
        logged (invalid sources are still used). Returns the DataJSON. """
    datajson = DataJSON()
    datajson.url = url

    ret, info = datajson.download_data_json(timeout=90)
    if not ret:
        error = 'Error getting data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('Downloaded OK')

    ret, info = datajson.load_data_json()
    if not ret:
        error = 'Error loading JSON data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('JSON OK')

    ret, info = datajson.validate_json()
    if ret:
        logger.info('Validate OK: {} datasets'.format(len(datajson.datasets)))
    else:
        logger.error('Error validating data: {}\n----------------\n'.format(info))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(len(datajson.datasets)))

    # logger.debug('JSONSchema: {}'.format(json.dumps(datajson.schema.json_content, indent=4)))
    return datajson
def get_data_json_from_file(data_json_path):
    """ Load and validate a local data.json file.
        Read/load failures raise; validation failures are logged only.
        Returns the DataJSON instance. """
    datajson = DataJSON()
    ret, info = datajson.read_local_data_json(data_json_path=data_json_path)
    # FIX: this result was previously ignored and silently overwritten
    if not ret:
        error = 'Error reading local JSON data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    ret, info = datajson.load_data_json()
    if not ret:
        error = 'Error loading JSON data: {}'.format(info)
        logger.error(error)
        raise Exception(error)
    logger.info('JSON OK')
    ret, errors = datajson.validate_json()
    if not ret:
        total_errors = len(errors)
        logger.error('{} Errors validating data'.format(total_errors))
        error = errors[0]
        if len(error) > 70:  # too long and verbose errors
            error = error[:70]
        logger.error('Error 1/{} validating data:\n\t{}'.format(total_errors, error))
        # continue  # USE invalid too
        logger.info('Validate FAILED: {} datasets'.format(len(datajson.datasets)))
    else:
        logger.info('Validate OK: {} datasets'.format(len(datajson.datasets)))
    # logger.debug('JSONSchema: {}'.format(json.dumps(datajson.schema.json_content, indent=4)))
    return datajson
def process_all(self):
    """ Walk self.base_folder, process every harvest-source subfolder
        (except 'harvest_sources') and aggregate summary counters into
        self.summary_data, collecting each source's JSON in self.all_data. """
    logger.info(f'Inspecting {self.base_folder} folder')
    for subdir, dirs, files in os.walk(self.base_folder):
        for name in dirs:
            if name == 'harvest_sources':
                continue
            logger.info(f'Processing {name} folder')
            self.summary_data['harvest_sources_readed'] += 1
            hs = HarvestedSource(name=name)
            ret = hs.process_results()
            if not ret:
                self.summary_data['harvest_sources_failed'] += 1
                continue
            hs.render_template(save=True)
            data = hs.get_json_data()
            self.all_data.append(data)
            # FIX: initialize first so `datasets` is always bound even when
            # data_json is neither list nor dict; use isinstance for checks
            datasets = []
            if isinstance(data['data_json'], list):
                logger.error(f'{name}: Data JSON Source is a list. Must be a dict')
            if isinstance(data['data_json'], dict):
                datasets = data['data_json'].get('dataset', [])
            if len(datasets) == 0:
                logger.error(f'Source with 0 datasets {name}')
            self.summary_data['total_data_json_datasets'] += len(datasets)
            logger.info(' - Total datasets: {}'.format(
                self.summary_data['total_data_json_datasets']))
    harvest_sources_readed = self.summary_data['harvest_sources_readed']
    harvest_sources_failed = self.summary_data['harvest_sources_failed']
    total_data_json_datasets = self.summary_data['total_data_json_datasets']
    logger.info('''**************
        Harvest sources readed: {}
        Harvest sources failed: {}
        Total datasets: {}'''.format(harvest_sources_readed,
                                     harvest_sources_failed,
                                     total_data_json_datasets))
def create_organization(self, organization, check_if_exists=True):
    """ POST to CKAN API to create a new organization.
        organization is just a python dict.
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.organization_create
        With check_if_exists=True the creation is skipped (and the
        show result returned) when the organization already exists. """
    logger.info(f'**** Creating Organization {organization}')
    if check_if_exists:
        logger.info(f'Exists Organization? {organization}')
        res = self.show_organization(
            organization_id_or_name=organization['name'])
        if res['success']:
            # do not create
            logger.info(f'Avoid create Organization {organization}')
            return res
    url = '{}{}'.format(self.base_url, self.organization_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    # FIX: keep the dict intact; don't rebind `organization` to its JSON string
    organization_str = json.dumps(organization)
    logger.info(f'POST {url} headers:{headers} data:{organization_str}')
    try:
        req = requests.post(url, data=organization_str, headers=headers)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR creating [POST] organization: {} [{}]'.format(
            url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        error = ('ERROR creating [STATUS] organization: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code,
                                          content, organization_str))
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def update_package(self, ckan_package):
    """ POST to CKAN API to update a package/dataset.
        ckan_package is just a python dict.
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.update.package_update """
    url = '{}{}'.format(self.base_url, self.package_update_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    ckan_package_str = json.dumps(ckan_package)
    # FIX: was logger.error for a normal request trace; siblings use info
    logger.info(f'POST {url} headers:{headers} data:{ckan_package_str}')
    try:
        req = requests.post(url, data=ckan_package_str, headers=headers)
    except Exception as e:
        # FIX: said "creating" but this is the update endpoint; also log it
        error = 'ERROR updating CKAN package: {} [{}]'.format(url, e)
        logger.error(error)
        raise
    content = req.content
    if req.status_code >= 400:
        # FIX: "updateing" typo in the error message
        error = 'ERROR updating CKAN package: {} \n\t Status code: {} \n\t content:{}'.format(
            url, req.status_code, content)
        logger.error(error)
        raise Exception(error)
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def compare_resources(rows):
    """ read the previous resource (CKAN API results)
        Yield any comparison result.

        For each CKAN row, decide an action against the cached data.json
        packages on disk: 'error' (bad row), 'delete' (gone from source),
        'update'/'ignore' (by modification-date diff), and finally 'create'
        for data.json files with no matching CKAN row. Each consumed
        package file is removed from disk. """
    res_name = rows.res.name if hasattr(rows, 'res') else 'Fake res testing'
    logger.info(f'Rows from resource {res_name}')
    data_packages_path = config.get_data_packages_folder_path()
    default_tzinfo_for_naives_dates = pytz.UTC

    # Calculate minimum statistics
    total = 0
    no_extras = 0
    no_identifier_key_found = 0
    deleted = 0
    found_update = 0
    found_not_update = 0
    sample_row = None

    for row in rows:
        total += 1
        # check for identifier
        ckan_id = row['id']
        extras = row.get('extras', False)
        if not extras:
            # TODO learn why.
            logger.error(f'No extras! dataset: {ckan_id}')
            result = {'action': 'error',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'The CKAN dataset does not '
                                'have the "extras" property'}
            row.update({'comparison_results': result})
            yield row
            no_extras += 1
            continue

        identifier = None
        for extra in extras:
            if extra['key'] == 'identifier':
                identifier = extra['value']
        if identifier is None:
            logger.error('No identifier '
                         '(extras[].key.identifier not exists). '
                         'Dataset.id: {}'.format(ckan_id))
            no_identifier_key_found += 1
            result = {'action': 'error',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'The CKAN dataset does not have an "identifier"'}
            row.update({'comparison_results': result})
            yield row
            continue

        # was parent in the previous harvest
        # if extras.get('collection_metadata', None) is not None:
        encoded_identifier = encode_identifier(identifier)
        expected_filename = f'data-json-{encoded_identifier}.json'
        expected_path = os.path.join(data_packages_path, expected_filename)
        if not os.path.isfile(expected_path):
            logger.info((f'Dataset: {ckan_id} not in DATA.JSON.'
                         f'It was deleted?: {expected_path}'))
            deleted += 1
            result = {'action': 'delete',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'It no longer exists in the data.json source'}
            row.update({'comparison_results': result})
            yield row
            continue

        datajson_package = Package(expected_path)
        # TODO analyze this: https://github.com/ckan/ckanext-harvest/blob/master/ckanext/harvest/harvesters/base.py#L229
        # compare dates
        # at data.json: "modified": "2019-06-27 12:41:27",
        # at ckan results: "metadata_modified": "2019-07-02T17:20:58.334748",
        data_json = datajson_package.get_resource('inline')
        data_json_data = data_json.source
        data_json_modified = parse(
            data_json_data['modified'])  # It's a naive date
        ckan_json = row
        ckan_json_modified = parse(ckan_json['metadata_modified'])
        # un-naive datetimes
        if data_json_modified.tzinfo is None:
            data_json_modified = data_json_modified.replace(
                tzinfo=default_tzinfo_for_naives_dates)
        if ckan_json_modified.tzinfo is None:
            ckan_json_modified = ckan_json_modified.replace(
                tzinfo=default_tzinfo_for_naives_dates)

        diff_times = data_json_modified - ckan_json_modified
        seconds = diff_times.total_seconds()
        # TODO analyze this since we have a Naive date we are not sure
        if abs(seconds) > 86400:  # more than a day
            warning = '' if seconds > 0 else 'Data.json is older than CKAN'
            result = {'action': 'update',
                      'ckan_id': ckan_id,
                      'new_data': data_json_data,
                      'reason': f'Changed: ~{seconds} seconds difference. {warning}'}
            found_update += 1
        else:
            result = {'action': 'ignore',
                      'ckan_id': ckan_id,
                      'new_data': None,  # do not need this data_json_data
                      # FIX: was a plain string, so {seconds} never interpolated
                      'reason': f'Changed: ~{seconds} seconds difference'}
            found_not_update += 1
        row.update({'comparison_results': result})
        yield row

        # Delete the data.json file
        os.remove(expected_path)

    # any data.json package file left on disk has no CKAN counterpart
    news = 0
    for name in glob.glob(f'{data_packages_path}/data-json-*.json'):
        news += 1
        package = Package(name)
        data_json = package.get_resource('inline')
        data_json_data = data_json.source
        result = {'action': 'create',
                  'ckan_id': None,
                  'new_data': data_json_data,
                  'reason': 'Not found in the CKAN results'}
        # there is no real row here
        row = {'comparison_results': result}
        yield row
        # Delete the data.json file
        os.remove(name)

    found = found_not_update + found_update
    stats = f"""Total processed: {total}.
                {no_extras} fail extras.
                {no_identifier_key_found} fail identifier key.
                {deleted} deleted.
                {found} datasets found ({found_update} needs update, {found_not_update} are the same),
                {news} new datasets."""
    logger.info(stats)
def compare_resources(rows):
    """ read the previous resource (CKAN API results)
        Yield any comparison result.

        CSW variant: matches CKAN rows against cached csw-*.json package
        files. Existing matches always become 'update'; leftover files
        become 'create'; missing files become 'delete'. """
    res_name = rows.res.name if hasattr(rows, 'res') else 'Fake res testing'
    logger.info(f'Rows from resource {res_name}')
    data_packages_path = config.get_data_packages_folder_path()
    default_tzinfo_for_naives_dates = pytz.UTC

    # Calculate minimum statistics
    total = 0
    no_extras = 0
    no_identifier_key_found = 0
    deleted = 0
    found_update = 0
    found_not_update = 0
    sample_row = None

    for row in rows:
        total += 1
        # check for identifier
        ckan_id = row['id']
        extras = row.get('extras', False)
        if not extras:
            # TODO learn why.
            logger.error(f'No extras! dataset: {ckan_id}')
            result = {'action': 'error',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'The CKAN dataset does not '
                                'have the "extras" property'}
            row.update({'comparison_results': result})
            yield row
            no_extras += 1
            continue

        identifier = None
        for extra in extras:
            if extra['key'] == 'identifier':
                identifier = extra['value']
        if identifier is None:
            logger.error('No identifier '
                         '(extras[].key.identifier not exists). '
                         'Dataset.id: {}'.format(ckan_id))
            no_identifier_key_found += 1
            result = {'action': 'error',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'The CKAN dataset does not have an "identifier"'}
            row.update({'comparison_results': result})
            yield row
            continue

        # was parent in the previous harvest
        # if extras.get('collection_metadata', None) is not None:
        encoded_identifier = encode_identifier(identifier)
        expected_filename = f'csw-{encoded_identifier}.json'
        expected_path = os.path.join(data_packages_path, expected_filename)
        if not os.path.isfile(expected_path):
            logger.info((f'Dataset: {ckan_id} not in CSW Source.'
                         f'It was deleted?: {expected_path}'))
            deleted += 1
            result = {'action': 'delete',
                      'ckan_id': ckan_id,
                      'new_data': None,
                      'reason': 'It no longer exists in the CSW source'}
            row.update({'comparison_results': result})
            yield row
            continue

        # the file (and the identifier) exists
        csw_package = Package(expected_path)
        csw_json = csw_package.get_resource('inline')
        csw_json_data = csw_json.source
        result = {'action': 'update',
                  'ckan_id': ckan_id,
                  'new_data': csw_json_data,
                  # FIX: the previous reason referenced undefined names
                  # `seconds` and `warning`, raising NameError at runtime
                  'reason': 'Exists in the CSW source'}
        found_update += 1
        # FIX: the result was built but never attached to the row
        row.update({'comparison_results': result})
        yield row

        # remove so next step not detect it as new
        os.remove(expected_path)

    # any csw package file left on disk has no CKAN counterpart
    news = 0
    for name in glob.glob(f'{data_packages_path}/csw-*.json'):
        news += 1
        package = Package(name)
        csw_json = package.get_resource('inline')
        csw_json_data = csw_json.source
        result = {'action': 'create',
                  'ckan_id': None,
                  'new_data': csw_json_data,
                  'reason': 'Not found in the CKAN results'}
        # there is no real row here
        row = {'comparison_results': result}
        yield row
        # Delete the csw.json file
        os.remove(name)

    found = found_not_update + found_update
    stats = f"""Total processed: {total}.
                {no_extras} fail extras.
                {no_identifier_key_found} fail identifier key.
                {deleted} deleted.
                {found} datasets found ({found_update} needs update, {found_not_update} are the same),
                {news} new datasets."""
    logger.info(stats)
def create_package(
        self,
        ckan_package,
        on_duplicated='RAISE',  # if name already exists 'RAISE' 'SKIP' | 'DELETE'
):
    """ POST to CKAN API to create a new package/dataset.
        ckan_package is just a python dict.
        https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create
        on_duplicated: what to do when the package name is taken —
        'SKIP' returns success, 'DELETE' removes the old one and retries,
        'RAISE' fails. """
    url = '{}{}'.format(self.base_url, self.package_create_url)
    headers = self.get_request_headers(include_api_key=True)
    headers['Content-Type'] = 'application/json'
    ckan_package_str = json.dumps(ckan_package)
    logger.info(f'POST {url} headers:{headers} data:{ckan_package}')
    try:
        req = requests.post(url, data=ckan_package_str, headers=headers)
    except Exception as e:
        # FIX: log before re-raising (message was previously unused)
        error = 'ERROR creating [POST] CKAN package: {} [{}]'.format(
            url, e)
        logger.error(error)
        raise
    content = req.content
    try:
        json_content = json.loads(content)
    except Exception as e:
        error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
        logger.error(error)
        raise
    if req.status_code == 409:
        logger.info(f'409 json_content: {json_content}')
        # another posible [error] = {'owner_org': ['Organization does not exist']}
        # Check for duplicates
        if json_content['error'].get(
                'name', None) == ["That URL is already in use."]:
            logger.error(f'Package Already exists! ACTION: {on_duplicated}')
            if on_duplicated == 'SKIP':
                return {'success': True}
            elif on_duplicated == 'DELETE':
                delr = self.delete_package(
                    ckan_package_id_or_name=ckan_package['name'])
                if not delr['success']:
                    raise Exception('Failed to delete {}'.format(
                        ckan_package['name']))
                # retry once; a second duplicate is a real error
                return self.create_package(ckan_package=ckan_package,
                                           on_duplicated='RAISE')
            elif on_duplicated == 'RAISE':
                error = ('DUPLICATED CKAN package: {}'
                         '\n\t Status code: {}'
                         '\n\t content:{}'
                         '\n\t Dataset {}'.format(url, req.status_code,
                                                  content, ckan_package))
                logger.error(error)
                raise Exception(error)
    if req.status_code >= 400:
        error = ('ERROR creating CKAN package: {}'
                 '\n\t Status code: {}'
                 '\n\t content:{}'
                 '\n\t Dataset {}'.format(url, req.status_code,
                                          content, ckan_package))
        logger.error(error)
        raise Exception(error)
    if not json_content['success']:
        error = 'API response failed: {}'.format(
            json_content.get('error', None))
        logger.error(error)
    return json_content
def search_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        search_params=None):
    """ search packages. "rows" is the page size.
        Yields each page of results until an empty page is returned. """
    # FIX: mutable default argument replaced by None sentinel
    if search_params is None:
        search_params = {}
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}
        params.update(search_params)
        logger.info(
            f'Searching packages {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )
        headers = self.get_request_headers()
        try:
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            # FIX: typo, log the detail and keep the original cause chained
            error = 'ERROR Downloading package list: {} [{}]'.format(
                url, e)
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url)) from e
        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code,
                                              params, content))
            logger.error(error)
            raise Exception(error)
        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise ValueError(error) from e
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)
            raise ValueError(error)
        result = json_content['result']
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')
        if real_results_count == 0:
            url = None  # empty page: stop paginating
        else:
            start += rows
            self.package_list += results
            logger.debug(f'datasets found: {results}')
            yield (results)
# Iterate every harvest source exposed by the portal API and cache each
# one to a local JSON file, skipping URLs already processed.
colections_ids = set()  # NOTE(review): not used in this visible span — presumably filled further below; confirm
c = 0                   # count of distinct harvest sources processed
urls = []               # URLs already seen, to skip duplicates
with_configs = 0
with_config_filters = 0
with_config_defaults = 0
for results in cpa.search_harvest_packages(harvest_type='harvest',
                                           method='GET'
                                           #,source_type='datajson'
                                           ):
    for local_harvest_source in results:
        url = local_harvest_source['url']
        if url in urls:
            logger.error(
                '------------------\n ALREADY READED\n------------------')
            continue
        else:
            urls.append(url)
        c += 1
        name = local_harvest_source.get('name', 'UNNAMED')
        # cache the raw harvest source definition to disk
        hspath = config.get_harvest_sources_path(hs_name=name)
        f = open(hspath, 'w')
        f.write(json.dumps(local_harvest_source, indent=2))
        f.close()
        logger.info(f'{hspath} saved')
        # check for config.filters and config.defaults
        # (the 'config' field is a JSON string inside the package)
        config_str = local_harvest_source.get('config', '{}')
        configs = json.loads(config_str)
def test_create_harvest_source(self):
    """ End-to-end check against a live CKAN instance: create a harvest
        source, search it back, attach a dataset to it via custom extras,
        verify the relation, then delete both. """
    logger.info('Creating harvest source')
    cpa = CKANPortalAPI(base_url=CKAN_BASE_URL, api_key=CKAN_API_KEY)
    # start from a clean slate
    cpa.delete_all_harvest_sources(harvest_type='harvest', source_type='datajson')
    # random suffixes avoid collisions with leftovers from previous runs
    title = 'Energy JSON test {}'.format(random.randint(1, 999999))
    url = 'http://www.energy.gov/data-{}.json'.format(
        random.randint(1, 999999))
    res = cpa.create_harvest_source(
        title=title,
        url=url,
        owner_org_id=CKAN_ORG_ID,
        source_type='datajson',
        notes='Some tests about local harvesting sources creation',
        frequency='WEEKLY')
    self.assertTrue(res['success'])
    harvest_source = res['result']
    logger.info('Created: {}'.format(res['success']))
    # read it
    res = cpa.show_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res['success'])
    self.assertEqual(harvest_source['url'], url)
    self.assertEqual(harvest_source['title'], title)
    self.assertEqual(harvest_source['type'], 'harvest')
    self.assertEqual(harvest_source['source_type'], 'datajson')
    # search for it
    results = cpa.search_harvest_packages(rows=1000,
                                          harvest_type='harvest',
                                          source_type='datajson')
    created_ok = False
    for datasets in results:
        for dataset in datasets:
            # print('FOUND: {}'.format(dataset['name']))
            if dataset['name'] == harvest_source['name']:
                created_ok = True
                logger.info('Found!')
            else:
                logger.info('Other harvest source: {}'.format(
                    dataset['name']))
    assert created_ok == True
    # create a dataset with this harvest_soure_id
    dataset_title = 'Dataset number {}'.format(random.randint(1, 999999))
    dataset_name = slugify(dataset_title)
    tags = [{'name': 'tag81'}, {'name': 'tag82'}]
    randval = random.randint(1, 999)
    extras = [
        {
            'key': 'harvest_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_source_title',
            'value': harvest_source['title']
        },
        # {'key': 'harvest_object_id', 'value': harvest_source['id']},  # ? not sure
        {
            'key': 'harvest_ng_source_id',
            'value': harvest_source['id']
        },
        {
            'key': 'harvest_ng_source_title',
            'value': harvest_source['title']
        },
        {
            'key': 'try_a_extra',
            'value': randval
        }
    ]
    package = {
        'name': dataset_name,
        'title': dataset_title,
        'owner_org': CKAN_ORG_ID,
        'tags': tags,
        'extras': extras
    }
    res2 = cpa.create_package(ckan_package=package)
    self.assertTrue(res2['success'])
    logger.info('Package with harvest source: {}'.format(res2['success']))
    # read full dataset
    res3 = cpa.show_package(ckan_package_id_or_name=dataset_name)
    self.assertTrue(res3['success'])
    ckan_dataset = res3['result']
    logger.info(
        'Package with harvest source readed: {}'.format(ckan_dataset))
    assert 'extras' in ckan_dataset
    # extras round-trip as strings through the API
    assert [str(randval)] == [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'try_a_extra'
    ]
    # my custom ID (not connected to a real harvest ID)
    assert [harvest_source['id']] == [
        extra['value'] for extra in ckan_dataset['extras']
        if extra['key'] == 'harvest_ng_source_id'
    ]
    # check if this package is related to harvest source
    total_datasets_in_source = 0
    datasets_from_source = cpa.search_harvest_packages(
        harvest_source_id=harvest_source['id'])
    connected_ok = False
    for datasets in datasets_from_source:
        for dataset in datasets:
            total_datasets_in_source += 1
            if dataset['name'] == dataset_name:
                connected_ok = True
                logger.info('Found!')
            else:
                # we just expect one dataset
                error = '{} != {} ------ {}'.format(
                    dataset['name'], dataset_name, dataset)
                logger.error(error)
                assert error == False
    assert connected_ok == True
    assert total_datasets_in_source == 1
    logger.info(
        f' +++++++++++++ total_datasets_in_source={total_datasets_in_source}'
    )
    # this fails, harvest process is more complex that just add an extra
    # assert [harvest_source['id']] == [extra['value'] for extra in ckan_dataset['extras'] if extra['key'] == 'harvest_source_id']
    # delete both
    logger.info('Delete CKAN package: {}'.format(ckan_dataset['id']))
    res4 = cpa.delete_package(ckan_package_id_or_name=ckan_dataset['id'])
    self.assertTrue(res4['success'])
    logger.info('Delete Harvest source: {}'.format(harvest_source['id']))
    res5 = cpa.delete_package(ckan_package_id_or_name=harvest_source['id'])
    self.assertTrue(res5['success'])
def search_harvest_packages(
        self,
        rows=1000,
        method='POST',  # POST work in CKAN 2.8, fails in 2.3
        harvest_source_id=None,  # just one harvest source
        harvest_type=None,  # harvest for harvest sources
        source_type=None):  # datajson for
    """ search harvested packages or harvest sources
        "rows" is the page size.
        You could search for an specific harvest_source_id.
        Yields each page of results until an empty page is returned. """
    start = 0
    url = '{}{}'.format(self.base_url, self.package_search_url)
    page = 0
    # TODO check for a real paginated version
    while url:
        page += 1
        params = {'start': start, 'rows': rows}
        if harvest_source_id is not None:
            # you can't search by arbitrary extras
            # https://github.com/ckan/ckan/blob/30ca7aae2f2aca6a19a2e6ed29148f8428e25c86/ckan/logic/action/get.py#L1852
            # the ckanext-harvest style fq (+harvest_source_id:"...") does
            # not work here, but our custom indexed extra does:
            # https://github.com/ckan/ckanext-harvest/blob/3a72337f1e619bf9ea3221037ca86615ec22ae2f/ckanext/harvest/helpers.py#L38
            params['fq'] = f'+harvest_ng_source_id:"{harvest_source_id}"'
        elif harvest_type is not None:
            # at my local instance I need this.
            # I not sure why, in another public instances is not needed
            params['fq'] = f'+dataset_type:{harvest_type}'
            if source_type is not None:
                params['q'] = f'(type:{harvest_type} source_type:{source_type})'
            else:
                params['q'] = f'(type:{harvest_type})'
        logger.info(
            f'Searching {url} PAGE:{page} start:{start}, rows:{rows} with params: {params}'
        )
        headers = self.get_request_headers()
        try:
            if method == 'POST':  # depend on CKAN version
                req = requests.post(url, data=params, headers=headers)
            else:
                req = requests.get(url, params=params, headers=headers)
        except Exception as e:
            # FIX: typo, log the detail and keep the original cause chained
            error = 'ERROR Downloading package list: {} [{}]'.format(
                url, e)
            logger.error(error)
            raise ValueError(
                'Failed to get package list at {}'.format(url)) from e
        content = req.content
        if req.status_code >= 400:
            error = ('ERROR searching CKAN package: {}'
                     '\n\t Status code: {}'
                     '\n\t Params: {}'
                     '\n\t content:{}'.format(url, req.status_code,
                                              params, content))
            logger.error(error)
            raise Exception(error)
        try:
            json_content = json.loads(content)  # check for encoding errors
        except Exception as e:
            error = 'ERROR parsing JSON data: {} [{}]'.format(content, e)
            logger.error(error)
            raise ValueError(error) from e
        if not json_content['success']:
            error = 'API response failed: {}'.format(
                json_content.get('error', None))
            logger.error(error)
            raise ValueError(error)
        result = json_content['result']
        # FIX: dropped unused locals (sort, count/sort/facet results)
        results = result['results']
        real_results_count = len(results)
        self.total_packages += real_results_count
        logger.info(f'{real_results_count} results')
        if real_results_count == 0:
            url = None  # empty page: stop paginating
        else:
            start += rows
            self.package_list += results
            logger.debug(f'datasets found: {results}')
            yield (results)