def test_author_check(self):
    context = {'user': self.user['name']}
    dataset1 = helpers.call_action(
        'package_create',
        context=context,
        name='syndicated_dataset1',
        extras=[{'key': 'syndicate', 'value': 'true'}])
    dataset2 = helpers.call_action(
        'package_create',
        context=context,
        name='syndicated_dataset2',
        extras=[{'key': 'syndicate', 'value': 'true'}])

    with patch('ckanext.syndicate.tasks.get_target') as mock_target:
        # Mock API
        mock_target.return_value = ckanapi.TestAppCKAN(
            self._get_test_app(), apikey=self.user['apikey'])

        # Syndicate to our Test CKAN instance
        ckan = mock_target()
        mock_user_show = mock.Mock()
        mock_user_show.return_value = self.user
        ckan.action.user_show = mock_user_show

        sync_package(dataset1['id'], 'dataset/create')
        helpers.call_action(
            'package_patch',
            id=dataset1['id'],
            extras=[{'key': 'syndicate', 'value': 'true'}])
        sync_package(dataset1['id'], 'dataset/update')
        mock_user_show.assert_called_once_with(id='test_author')

        updated1 = helpers.call_action('package_show', id=dataset1['id'])
        assert_is_not_none(
            get_pkg_dict_extra(updated1, get_syndicated_id()))

        mock_user_show = mock.Mock()
        mock_user_show.return_value = {'name': 'random-name', 'id': ''}
        ckan.action.user_show = mock_user_show

        sync_package(dataset2['id'], 'dataset/create')
        helpers.call_action(
            'package_patch',
            id=dataset2['id'],
            extras=[{'key': 'syndicate', 'value': 'true'}])
        sync_package(dataset2['id'], 'dataset/update')

        updated2 = helpers.call_action('package_show', id=dataset2['id'])
        assert_false(get_pkg_dict_extra(updated2, get_syndicated_id()))

    del Session.revision
def retrieve_ueb_run_output_packages():
    source = 'uebpackage.tasks.retrieve_ueb_run_output_packages():'
    global service_host_address

    #service_request_api_url = '/api/UEBModelRunOutput'
    service_request_api_url = uebhelper.StringSettings.app_server_api_get_ueb_run_output
    connection = httplib.HTTPConnection(service_host_address)

    # get all datasets of type model-package
    model_pkg_datasets = uebhelper.get_packages_by_dataset_type('model-package')

    for dataset in model_pkg_datasets:
        pkg_run_job_id = h.get_pkg_dict_extra(dataset, 'package_run_job_id')
        if pkg_run_job_id is None:
            continue

        # to get the package_type value, which is a tag, use the get_package() of the helper module
        pkg_dict = uebhelper.get_package(dataset['id'])
        # TODO: Before using pkg_dict check that it is not None
        pkg_type = pkg_dict['package_type'][0]

        if len(pkg_run_job_id) == 0:
            continue
        if pkg_type == u'Complete':
            continue

        pkg_run_status = h.get_pkg_dict_extra(dataset, 'package_run_status')
        if pkg_run_status != uebhelper.StringSettings.app_server_job_status_success:
            continue

        dataset_id = dataset['id']
        service_request_url = service_request_api_url + '?uebRunJobID=' + pkg_run_job_id
        connection.request('GET', service_request_url)
        service_call_results = connection.getresponse()
        if service_call_results.status == httplib.OK:
            log.info(source + 'UEB model output package was received from App '
                              'server for model pkg dataset ID:%s and Run Job ID:%s'
                              % (dataset_id, pkg_run_job_id))
            _merge_ueb_output_pkg_with_input_pkg(service_call_results, dataset_id)
        else:
            log.error(source + 'HTTP status %d returned from App server when '
                               'retrieving UEB model output package for '
                               'model pkg dataset ID:%s and Run Job ID:%s'
                               % (service_call_results.status, dataset_id, pkg_run_job_id))
            ueb_run_status = 'Failed to retrieve output package'

            # update the dataset
            data_dict = {'package_run_status': ueb_run_status}
            try:
                uebhelper.update_package(dataset_id, data_dict, backgroundTask=True)
                log.info(source + 'UEB model package dataset run status was updated to %s for '
                                  'dataset ID:%s' % (ueb_run_status, dataset_id))
            except Exception as e:
                log.error(source + 'Failed to update run status for UEB model package dataset '
                                   'with dataset ID:%s\nException:%s' % (dataset_id, e))

    connection.close()
    return
def get_harvest_source_link(package_dict):
    harvest_source_id = h.get_pkg_dict_extra(package_dict, 'harvest_source_id', None)
    harvest_source_title = h.get_pkg_dict_extra(package_dict, 'harvest_source_title', None)

    if harvest_source_id and harvest_source_title:
        msg = p.toolkit._('Harvested from')
        url = h.url_for('harvest_read', id=harvest_source_id)
        link = '{msg} <a href="{url}">{title}</a>'.format(
            url=url, msg=msg, title=harvest_source_title)
        return p.toolkit.literal(link)

    return ''
def _set_context_to_user_input_model_packages(self):
    # get all datasets of type model-package
    model_pkg_datasets = uebhelper.get_packages_by_dataset_type('model-package')

    # for each resource we need only the id (used as the selection value) and the name for display
    file_resources = []
    for dataset in model_pkg_datasets:
        pkg_run_job_id = h.get_pkg_dict_extra(dataset, 'package_run_job_id')
        if pkg_run_job_id is None:
            continue

        # skip the dataset if it does not have pkg_model_name = 'UEB'
        pkg_model_name = h.get_pkg_dict_extra(dataset, 'pkg_model_name')
        if pkg_model_name.upper() != 'UEB':
            continue

        # to get the package_type value, which is a tag, use the get_package() of the helper module
        pkg_dict = uebhelper.get_package(dataset['id'])
        pkg_type = pkg_dict['package_type'][0]

        if len(pkg_run_job_id.strip()) != 0:
            continue
        if pkg_type == u'Complete':
            continue

        # check if the dataset is owned by the current user
        dataset_id = dataset['id']
        if not uebhelper.is_user_owns_package(dataset_id, tk.c.user) and \
                not uebhelper.is_user_owns_package(dataset_id, 'default'):
            continue

        # get the model package zip file resource from the dataset; we assume
        # the dataset has only one resource
        model_pkg_resource = dataset['resources'][0]
        dataset_title = dataset['title']
        max_len = 50
        if len(dataset_title) > max_len:
            dataset_title = dataset_title[:max_len] + '...'

        dataset_title = ' (' + dataset_title + ')'
        resource = {}
        resource['id'] = model_pkg_resource['id']
        resource['url'] = model_pkg_resource['url']
        resource['name'] = model_pkg_resource['name'] + dataset_title
        resource['description'] = model_pkg_resource['description']
        file_resources.append(resource)

    tk.c.ueb_input_model_packages = file_resources
def test_get_pkg_dict_extra(self):
    from ckan.lib.create_test_data import CreateTestData
    from ckan import model
    from ckan.logic import get_action

    CreateTestData.create()

    pkg_dict = get_action("package_show")({"model": model, "user": u"tester"},
                                           {"id": "annakarenina"})

    assert_equal(h.get_pkg_dict_extra(pkg_dict, "genre"), "romantic novel")
    assert_equal(h.get_pkg_dict_extra(pkg_dict, "extra_not_found"), None)
    assert_equal(h.get_pkg_dict_extra(pkg_dict, "extra_not_found", "default_value"),
                 "default_value")
def test_get_pkg_dict_extra(self):
    from ckan.lib.create_test_data import CreateTestData
    from ckan import model
    from ckan.logic import get_action

    CreateTestData.create()

    pkg_dict = get_action('package_show')({'model': model, 'user': u'tester'},
                                           {'id': 'annakarenina'})

    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'genre'), '"romantic novel"')
    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'extra_not_found'), None)
    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'extra_not_found', 'default_value'),
                 'default_value')
def _update_package(package):
    syndicated_id = get_pkg_dict_extra(package, get_syndicated_id())

    if syndicated_id is None:
        _create_package(package)
        return

    ckan = get_target()

    try:
        updated_package = dict(package)
        # Keep the existing remote ID and Name
        del updated_package['id']
        del updated_package['name']

        updated_package['extras'] = filter_extras(package['extras'])
        updated_package['resources'] = filter_resources(package['resources'])
        updated_package['owner_org'] = get_syndicated_organization()

        try:
            # TODO: No automated test
            updated_package = toolkit.get_action(
                'update_dataset_for_syndication')(
                    {}, {'dataset_dict': updated_package})
        except KeyError:
            pass

        ckan.action.package_update(id=syndicated_id, **updated_package)
    except ckanapi.NotFound:
        _create_package(package)
def _get_group_ids(dataset_dict):
    group_ids = []
    countries = get_pkg_dict_extra(dataset_dict, 'countries')
    if countries is not None:
        for country_name in countries.split(','):
            cleaned_name = country_name.strip().title()
            country = None
            try:
                country = pycountry.countries.get(name=cleaned_name)
            except KeyError:
                try:
                    country = pycountry.countries.get(common_name=cleaned_name)
                except KeyError:
                    pass
            if country is not None:
                group_ids.append({'id': country.alpha3.lower()})

    if group_ids == []:
        group_ids.append({'id': 'world'})

    return group_ids
def test_get_pkg_dict_extra():
    from ckan.lib.create_test_data import CreateTestData
    from ckan import model

    CreateTestData.create()

    pkg_dict = helpers.call_action("package_show", id="annakarenina")

    assert h.get_pkg_dict_extra(pkg_dict, "genre") == "romantic novel"
    assert h.get_pkg_dict_extra(pkg_dict, "extra_not_found") is None
    assert (h.get_pkg_dict_extra(pkg_dict, "extra_not_found", "default_value")
            == "default_value")

    model.repo.rebuild_db()
def test_get_pkg_dict_extra(self):
    from ckan.lib.create_test_data import CreateTestData
    from ckan import model
    from ckan.logic import get_action

    CreateTestData.create()

    pkg_dict = get_action('package_show')({'model': model, 'user': u'tester'},
                                           {'id': 'annakarenina'})

    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'genre'), 'romantic novel')
    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'extra_not_found'), None)
    assert_equal(h.get_pkg_dict_extra(pkg_dict, 'extra_not_found', 'default_value'),
                 'default_value')

    model.repo.rebuild_db()
def check_ueb_run_status():
    source = 'uebpackage.tasks.check_ueb_run_status():'
    global service_host_address

    service_request_api_url = uebhelper.StringSettings.app_server_api_check_ueb_run_status_url
    connection = httplib.HTTPConnection(service_host_address)

    job_status_processing = uebhelper.StringSettings.app_server_job_status_processing
    job_status_in_queue = uebhelper.StringSettings.app_server_job_status_in_queue
    model_pkg_datasets_with_run_status_processing = _get_model_pkg_datasets_by_run_status(job_status_processing)
    model_pkg_datasets_with_run_status_in_queue = _get_model_pkg_datasets_by_run_status(job_status_in_queue)
    model_pkg_datasets_need_run_status_update = model_pkg_datasets_with_run_status_processing + \
        model_pkg_datasets_with_run_status_in_queue

    if len(model_pkg_datasets_need_run_status_update) == 0:
        log.info(source + "No UEB model package dataset has a run status of %s at this time"
                 % job_status_processing)
    else:
        log.info(source + "Number of UEB model package datasets with run status of %s or %s at this time is:%s"
                 % (job_status_processing, job_status_in_queue,
                    len(model_pkg_datasets_need_run_status_update)))

    for dataset in model_pkg_datasets_need_run_status_update:
        pkg_run_job_id = h.get_pkg_dict_extra(dataset, 'package_run_job_id')
        if pkg_run_job_id is None:
            continue

        dataset_id = dataset['id']
        service_request_url = service_request_api_url + '?uebRunJobID=' + pkg_run_job_id
        connection.request('GET', service_request_url)
        service_call_results = connection.getresponse()
        if service_call_results.status == httplib.OK:
            request_processing_status = service_call_results.read()
            log.info(source + 'UEB model package run status as returned from App '
                              'server for dataset ID: %s and Run Job ID:%s is %s'
                              % (dataset_id, pkg_run_job_id, request_processing_status))
        else:
            request_processing_status = uebhelper.StringSettings.app_server_job_status_error
            log.error(source + 'HTTP status %d returned from App server when checking '
                               'run status for Run Job ID:%s and model pkg dataset ID:%s'
                               % (service_call_results.status, pkg_run_job_id, dataset_id))
        connection.close()

        # update the dataset
        data_dict = {'package_run_status': request_processing_status}
        try:
            uebhelper.update_package(dataset_id, data_dict, backgroundTask=True)
            log.info(source + 'UEB model package dataset run status was updated to %s for '
                              'dataset ID:%s' % (request_processing_status, dataset_id))
        except Exception as e:
            log.error(source + 'Failed to update run status for UEB model package dataset '
                               'with dataset ID:%s\nException:%s' % (dataset_id, e))
def test_get_pkg_dict_extra(self):
    from ckan.lib.create_test_data import CreateTestData
    from ckan import model
    from ckan.logic import get_action

    CreateTestData.create()

    pkg_dict = get_action("package_show")({"model": model, "user": u"tester"},
                                           {"id": "annakarenina"})

    assert h.get_pkg_dict_extra(pkg_dict, "genre") == "romantic novel"
    assert h.get_pkg_dict_extra(pkg_dict, "extra_not_found") is None
    assert (h.get_pkg_dict_extra(pkg_dict, "extra_not_found", "default_value")
            == "default_value")

    model.repo.rebuild_db()
def update_dataset_for_hdx_syndication(context, data_dict):
    dataset_dict = data_dict['dataset_dict']

    dataset_dict['dataset_date'] = _get_dataset_date(dataset_dict)

    dataset_dict['methodology'] = 'Other'
    methodology = get_pkg_dict_extra(dataset_dict, 'methodology')
    if methodology is None:
        dataset_dict['methodology_other'] = 'Not specified'
    else:
        dataset_dict['methodology_other'] = methodology

    dataset_dict['dataset_source'] = get_pkg_dict_extra(dataset_dict, 'datasource')
    dataset_dict['groups'] = _get_group_ids(dataset_dict)
    dataset_dict['data_update_frequency'] = '0'  # Never

    dataset_dict.pop('tags', None)
    dataset_dict.pop('extras', None)

    return dataset_dict
def format_frequency(package):
    freq = h.get_pkg_dict_extra(package, 'frequency-of-collection')
    unit = h.get_pkg_dict_extra(package, 'frequency-of-collection-units')

    # Remove the surrounding curly braces from both the strings
    freq_num = run_format_regex(freq)
    freq_float = None
    try:
        freq_int = int(freq_num)
    except ValueError:
        freq_float = float(freq_num)

    # Most values are ints, but some are floats and some of these floats are
    # just the same number as the int. This complicated and ugly logic makes
    # sure floats are used *only* when needed.
    if freq_float is not None:
        if freq_float == int(freq_float):
            freq_num = int(freq_float)
        else:
            freq_num = freq_float
    else:
        freq_num = freq_int

    unit_str = run_format_regex(unit)
    if freq_num > 0:
        unit_str = '{0}s'.format(unit_str)

    return '{0} {1}'.format(freq_num, unit_str)
def test_syndicate_existing_package(self):
    context = {
        'user': self.user['name'],
    }
    existing = helpers.call_action(
        'package_create',
        context=_get_context(context),
        name='existing-dataset',
        notes='The MapAction PowerPoint Map Pack contains a set of country level reference maps',
    )

    existing['extras'] = [
        {'key': 'syndicate', 'value': 'true'},
    ]
    helpers.call_action(
        'package_update',
        context=_get_context(context),
        **existing)

    with patch('ckanext.syndicate.tasks.get_target') as mock_target:
        mock_target.return_value = ckanapi.TestAppCKAN(
            self._get_test_app(), apikey=self.user['apikey'])

        sync_package(existing['id'], 'dataset/update')

        updated = helpers.call_action(
            'package_show',
            context=_get_context(context),
            id=existing['id'],
        )
        syndicated_id = get_pkg_dict_extra(updated, 'syndicated_id')

        syndicated = helpers.call_action(
            'package_show',
            context=_get_context(context),
            id=syndicated_id,
        )

        # Expect the notes of the syndicated package to match those of the
        # updated source package.
        assert_equal(syndicated['notes'], updated['notes'])
def _get_dataset_date(dataset_dict):
    created = get_pkg_dict_extra(dataset_dict, 'createdate')
    created_date = datetime(2003, 1, 1)
    if created is not None:
        try:
            created_date = datetime.strptime(created, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            try:
                created_date = datetime.strptime(created, '%d/%m/%Y %H:%M')
            except ValueError:
                pass
    return created_date.strftime('%m/%d/%Y')
def test_syndicate_existing_package_with_stale_syndicated_id(self):
    context = {
        'user': self.user['name'],
    }
    existing = helpers.call_action(
        'package_create',
        context=_get_context(context),
        name='existing-dataset',
        notes='The MapAction PowerPoint Map Pack contains a set of country level reference maps',
        extras=[
            {'key': 'syndicate', 'value': 'true'},
            {'key': 'syndicated_id',
             'value': '87f7a229-46d0-4171-bfb6-048c622adcdc'},
        ])

    with patch('ckanext.syndicate.tasks.get_target') as mock_target:
        mock_target.return_value = ckanapi.TestAppCKAN(
            self._get_test_app(), apikey=self.user['apikey'])

        sync_package(existing['id'], 'dataset/update')

        updated = helpers.call_action(
            'package_show',
            context=_get_context(context),
            id=existing['id'],
        )
        syndicated_id = get_pkg_dict_extra(updated, 'syndicated_id')

        syndicated = helpers.call_action(
            'package_show',
            context=_get_context(context),
            id=syndicated_id,
        )

        assert_equal(syndicated['notes'], updated['notes'])
def before_view(context, pkg_dict):
    '''
    Adds any additional data fields to the package dictionary for custom
    display
    '''
    # Attach URL "download bucket" endpoint to package
    pkg_dict['preview_url'] = helpers.get_pkg_dict_extra(
        pkg_dict, 'download-url', '')

    # Turn the stored string of credits back into a list
    credits = helpers.get_pkg_dict_extra(pkg_dict, 'credits', '')
    if credits:
        pkg_dict['credits'] = ast.literal_eval(credits)

    # If temporal extent, format the dates.
    temporal_start = helpers.get_pkg_dict_extra(pkg_dict, 'temporal-extent-begin')
    temporal_end = helpers.get_pkg_dict_extra(pkg_dict, 'temporal-extent-end')
    if temporal_start and temporal_end:
        # Unfortunately, the datetime library won't handle years before 1900,
        # so a bit of manual parsing is needed here.
        start_year = temporal_start[:4]
        start_month = temporal_start[5:7]
        end_year = temporal_end[:4]
        end_month = temporal_end[5:7]
        try:
            pkg_dict['temporal_start'] = "{0} {1}".format(
                datetime.strptime(start_month, '%m').strftime('%B'), start_year)
            pkg_dict['temporal_end'] = "{0} {1}".format(
                datetime.strptime(end_month, '%m').strftime('%B'), end_year)
        except ValueError:
            # Swallow and ignore if the date parsing failed.
            pass

    spatial_resolution = helpers.get_pkg_dict_extra(pkg_dict, 'spatial-resolution')
    spatial_resolution_units = helpers.get_pkg_dict_extra(pkg_dict, 'spatial-resolution-units')
    if spatial_resolution and spatial_resolution_units:
        pkg_dict['spatial_resolution'] = "{0} {1}".format(
            spatial_resolution, spatial_resolution_units)

    return pkg_dict
class SearchfedPlugin(plugins.SingletonPlugin):
    plugins.implements(plugins.IConfigurer)
    plugins.implements(plugins.IPackageController, inherit=True)

    search_fed_dict = dict(zip(*[iter(toolkit.aslist(
        config.get('ckan.search_federation', [])))] * 2))
    search_fed_this_label = config.get('ckan.search_federation.label', '')
    search_fed_keys = toolkit.aslist(
        config.get('ckan.search_federation.extra_keys', 'harvest_portal'))
    search_fed_labels = search_fed_dict.keys() + [search_fed_this_label]
    use_remote_facets = toolkit.asbool(config.get(
        'ckan.search_federation.use_remote_facet_results', False))
    search_fed_label_blacklist = toolkit.aslist(config.get(
        'ckan.search_federation.label_blacklist',
        'owner_org harvest_source_id user_id'))
    search_fed_dataset_whitelist = toolkit.aslist(config.get(
        'ckan.search_federation.dataset_whitelist', 'dataset'))

    # IConfigurer

    def update_config(self, config_):
        toolkit.add_template_directory(config_, 'templates')
        toolkit.add_public_directory(config_, 'public')
        toolkit.add_resource('fanstatic', 'searchfed')

    # IPackageController

    def before_search(self, search_params):
        limit = int(config.get(
            'ckan.search_federation.min_search_results', 20))
        rows = search_params.get('rows', None)
        search_params['rows'] = rows if rows is not None else limit
        return search_params

    def after_search(self, search_results, search_params):
        limit = int(config.get(
            'ckan.search_federation.min_search_results', 20))

        def _append_remote_search(search_keys, remote_org_label,
                                  remote_org_url, fed_labels, type_whitelist):
            local_results_num = len(search_results['results'])
            # query.run increases the requested rows by 1, so reduce by 1 here
            limit = search_params.get('rows') - 1
            current_page = request.params.get('page', 1)
            try:
                current_page = int(current_page)
                if current_page < 1:
                    raise ValueError("Negative number not allowed")
            except ValueError, e:
                abort(400, ('"page" parameter must be a positive integer'))

            fq = " ".join(g for g in map(lambda sk: " ".join(e for e in map(
                lambda x: "-" + sk + ":" + str(x), fed_labels)), search_keys))
            for fq_entry in toolkit.aslist(search_params['fq'][0]):
                fq_entry = fq_entry.replace('/"', '"').replace("//", "")
                fq_split = fq_entry.split(':', 1)
                if len(fq_split) == 2:
                    fq_key = fq_split[0]
                    fq_value = fq_split[1]
                    fq_monop = ""
                    if fq_key[0] in ['+', '-']:
                        fq_monop = fq_entry[:1]
                        fq_key = fq_key[1:]
                    # Dataset whitelist check
                    if (fq_key == 'dataset_type' and fq_monop != "-"
                            and fq_value not in type_whitelist):
                        return
                    fq += " " + fq_monop + fq_key + ":" + fq_value
                else:
                    fq += fq_entry

            count_only = False
            start = search_params.get('start', 0)
            if local_results_num > start:
                remote_limit = limit - local_results_num
                if remote_limit <= 0:
                    count_only = True
                remote_start = 0
            else:
                remote_limit = limit
                if not used_controller:
                    remote_start = start - toolkit.c.local_item_count
                else:
                    if current_page == 1:
                        remote_start = 0
                    elif current_page == 2:
                        remote_start = limit - toolkit.c.local_item_count
                    else:
                        remote_start = limit - toolkit.c.local_item_count + \
                            limit * (current_page - 2)

            @beaker_cache(expire=3600, query_args=True)
            def _fetch_data(fetch_start, fetch_num):
                data = urllib.quote(json.dumps({
                    'q': search_params['q'],
                    'fq': fq,
                    'facet.field': search_params.get('facet.field', []),
                    'rows': fetch_num,
                    'start': fetch_start,
                    'sort': search_params['sort'],
                    'extras': search_params['extras']
                }))
                try:
                    req = urllib2.Request(
                        remote_org_url + '/api/3/action/package_search', data)
                    rsp = urllib2.urlopen(req)
                except urllib2.URLError, err:
                    log.warn('Unable to connect to %r: %r' % (
                        remote_org_url + '/api/3/action/package_search', err))
                    return None
                content = rsp.read()
                return json.loads(content)

            remote_results = _fetch_data(remote_start, remote_limit)

            # Only continue if the remote fetch was successful
            if remote_results is None:
                return search_results

            if count_only:
                remote_results['result']['results'] = []
            else:
                remote_results_num = len(remote_results['result']['results'])
                if remote_results_num <= remote_limit + remote_start:
                    if remote_results['result']['count'] > remote_results_num:
                        # While the result count reports all remote matches, the
                        # number of results may be limited by the CKAN install.
                        # Here our query has extended beyond the actual returned
                        # results, so we re-issue a more refined query starting
                        # and ending at precisely where we want (since we have
                        # already acquired the total count)
                        temp_results = _fetch_data(remote_start, min(
                            remote_results['result']['count'] - remote_start,
                            remote_limit))
                        if temp_results:
                            remote_results['result']['results'] = temp_results[
                                'result']['results']

                for dataset in remote_results['result']['results']:
                    extras = dataset.get('extras', [])
                    if not h.get_pkg_dict_extra(dataset, 'harvest_url'):
                        extras += [{
                            'key': 'harvest_url',
                            'value': remote_org_url + '/dataset/' + dataset['id']
                        }]
                    for k in search_keys:
                        if not h.get_pkg_dict_extra(dataset, k):
                            extras += [{'key': k, 'value': remote_org_label}]
                    if not h.get_pkg_dict_extra(dataset, 'federation_source'):
                        extras += [{'key': 'federation_source',
                                    'value': remote_org_url}]
                    dataset.update(
                        extras=extras, harvest_source_title=remote_org_label)

            search_results['count'] += remote_results['result']['count']
            if not count_only:
                if not limit or start > search_results['count']:
                    search_results['results'] = []
                elif toolkit.c.local_item_count < limit + start:
                    search_results['results'] += remote_results['result'][
                        'results']
            if ('search_facets' in remote_results['result']
                    and self.use_remote_facets):
                search_results['search_facets'] = remote_results['result'][
                    'search_facets']
def retrieve_ueb_packages():
    source = 'uebpackage.tasks.retrieve_ueb_packages():'
    global service_host_address

    service_request_api_url = uebhelper.StringSettings.app_server_api_get_ueb_package_url
    connection = httplib.HTTPConnection(service_host_address)

    job_status_complete = uebhelper.StringSettings.app_server_job_status_success
    model_config_datasets_with_status_complete = \
        _get_model_configuration_datasets_by_processing_status(job_status_complete)

    if len(model_config_datasets_with_status_complete) == 0:
        log.info(source + "No UEB model configuration dataset has a status of %s at this time"
                 % job_status_complete)
    else:
        log.info(source + "Number of UEB model configuration datasets with build status of %s "
                          "at this time is:%s"
                 % (job_status_complete, len(model_config_datasets_with_status_complete)))

    for dataset in model_config_datasets_with_status_complete:
        pkg_availability_status = h.get_pkg_dict_extra(dataset, 'package_availability')
        if pkg_availability_status == uebhelper.StringSettings.app_server_job_status_package_available:
            continue

        pkg_process_job_id = h.get_pkg_dict_extra(dataset, 'package_build_request_job_id')
        dataset_id = dataset['id']
        package_availability_status = h.get_pkg_dict_extra(dataset, 'package_availability')

        # if package is already available or error has been logged for package
        # retrieval then skip this dataset
        if package_availability_status == uebhelper.StringSettings.app_server_job_status_package_available or \
                package_availability_status == uebhelper.StringSettings.app_server_job_status_error:
            continue

        service_request_url = service_request_api_url + '?packageID=' + pkg_process_job_id
        connection.request('GET', service_request_url)
        service_call_results = connection.getresponse()
        if service_call_results.status == httplib.OK:
            log.info(source + 'UEB model package was received from App server for PackageJobID:%s'
                     % pkg_process_job_id)
            try:
                _save_ueb_package_as_dataset(service_call_results, dataset_id)
                pkg_availability_status = uebhelper.StringSettings.app_server_job_status_package_available
            except Exception as e:
                log.error(source + 'Failed to save ueb model package as a new dataset '
                                   'for model configuration dataset ID:%s\nException:%s'
                                   % (dataset_id, e))
                pkg_availability_status = uebhelper.StringSettings.app_server_job_status_error
        else:
            log.error(source + 'HTTP status %d returned from App server when retrieving '
                               'UEB model package for PackageJobID:'
                               '%s' % (service_call_results.status, pkg_process_job_id))
            pkg_availability_status = uebhelper.StringSettings.app_server_job_status_error
        connection.close()

        # update the resource processing status
        # update the related dataset
        data_dict = {'package_availability': pkg_availability_status}
        update_msg = 'system auto updated ueb package dataset'
        background_task = True
        try:
            updated_package = uebhelper.update_package(dataset_id, data_dict,
                                                       update_msg, background_task)
            log.info(source + 'UEB model configuration dataset was updated as a result of '
                              'receiving model input package for dataset:%s'
                              % updated_package['name'])
        except Exception as e:
            log.error(source + 'Failed to update UEB model configuration dataset after '
                               'receiving model input package for dataset ID:%s \n'
                               'Exception: %s' % (dataset_id, e))
            pass

    return
def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None,
                   password=None, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects, and return the ids.
    """
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url,
                             auth=HTTPBasicAuth(username, password),
                             verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(
                    self.provider, timestamp, status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(
                    self.provider, timestamp, r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(log_message.format(
                self.provider, timestamp, r.status_code,
                r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')

        # Get the URL for the next loop, or None to break the loop
        harvest_url = self._get_next_url(soup)

        # Get the entries from the results
        entries = self._get_entries_from_results(soup)

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                # We need package_show to ensure that all the conversions
                # are carried out.
                context = {"user": "******", "ignore_auth": True,
                           "model": model, "session": Session}
                pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501

                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                # E.g., a Sentinel dataset exists,
                # but doesn't have a NOA resource yet.
                elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                    log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(
                    guid=entry_guid, job=self.job,
                    extras=[HOExtra(key='status', value=status),
                            HOExtra(key='restart_date', value=entry_restart_date)])
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(
                    guid=entry_guid, job=self.job,
                    extras=[HOExtra(key='status', value='new'),
                            HOExtra(key='restart_date', value=entry_restart_date)])
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(
            self.provider, timestamp, self.job.id,
            new_counter, update_counter))  # noqa: E128, E501

    return ids
def format_data_costs(package):
    data = h.get_pkg_dict_extra(package, 'access_constraints')
    data_list = json.loads(data)
    return ', '.join(data_list)
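# For reference, a minimal sketch of the helper every snippet in this
# collection relies on. It mirrors the behaviour of CKAN's
# h.get_pkg_dict_extra (a linear scan of pkg_dict['extras']); this is an
# illustrative sketch, not the canonical CKAN source.
def get_pkg_dict_extra(pkg_dict, key, default=None):
    '''Return the value of the dataset extra named `key`, or `default`
    (None when not supplied) if no such extra exists.'''
    extras = pkg_dict.get('extras') or []
    for extra in extras:
        if extra['key'] == key:
            return extra['value']
    return default
# Usage example:
#   get_pkg_dict_extra({'extras': [{'key': 'genre', 'value': 'novel'}]}, 'genre')
# returns 'novel'; a missing key falls back to the supplied default value.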
def test_create_package(self):
    local_org = factories.Organization(user=self.user, name='local-org')
    remote_org = factories.Organization(user=self.user, name='remote-org')
    helpers.call_action(
        'member_create',
        id=local_org['id'],
        object=self.user['id'],
        object_type='user',
        capacity='editor')
    helpers.call_action(
        'member_create',
        id=remote_org['id'],
        object=self.user['id'],
        object_type='user',
        capacity='editor')

    context = {
        'user': self.user['name'],
    }
    dataset = helpers.call_action(
        'package_create',
        context=context,
        name='syndicated_dataset',
        owner_org=local_org['id'],
        extras=[
            {'key': 'syndicate', 'value': 'true'},
        ],
        resources=[{
            'upload': test_upload_file,
            'url': 'test_file.txt',
            'url_type': 'upload',
            'format': 'txt',
            'name': 'test_file.txt',
        }, {
            'upload': test_upload_file,
            'url': 'test_file1.txt',
            'url_type': 'upload',
            'format': 'txt',
            'name': 'test_file1.txt',
        }],
    )
    assert_equal(dataset['name'], 'syndicated_dataset')

    with patch('ckanext.syndicate.tasks.get_target') as mock_target:
        # Mock API
        mock_target.return_value = ckanapi.TestAppCKAN(
            self._get_test_app(), apikey=self.user['apikey'])

        # Syndicate to our Test CKAN instance
        sync_package(dataset['id'], 'dataset/create')

        # Reload our local package, to read the syndicated ID
        source = helpers.call_action(
            'package_show',
            context=context,
            id=dataset['id'],
        )

        # The source package should have a syndicated_id set pointing to the
        # new syndicated package.
        syndicated_id = get_pkg_dict_extra(source, 'syndicated_id')
        assert_is_not_none(syndicated_id)

        # Expect a new package to be created
        syndicated = helpers.call_action(
            'package_show',
            context=context,
            id=syndicated_id,
        )

        # Expect the id of the syndicated package to match the metadata
        # syndicated_id in the source package.
        assert_equal(syndicated['id'], syndicated_id)
        assert_equal(syndicated['name'], 'test-syndicated_dataset')
        assert_equal(syndicated['owner_org'], remote_org['id'])

        # Test links to resources on the source CKAN instance have been added
        resources = syndicated['resources']
        assert_equal(len(resources), 2)
        remote_resource_url = resources[0]['url']
        local_resource_url = source['resources'][0]['url']
        assert_equal(local_resource_url, remote_resource_url)

        remote_resource_url = resources[1]['url']
        local_resource_url = source['resources'][1]['url']
        assert_equal(local_resource_url, remote_resource_url)
def after_search(self, search_results, search_params):
    limit = int(config.get('ckan.search_federation.min_search_results', 20))

    def _append_remote_search(search_keys, remote_org_label, remote_org_url,
                              fed_labels, type_whitelist):
        local_results_num = len(search_results['results'])
        facet_fields = search_params.get('facet.field', [])

        fq = " ".join(g for g in map(
            lambda sk: " ".join(e for e in map(
                lambda x: "-" + sk + ":" + str(x), fed_labels)), search_keys))
        for fq_entry in toolkit.aslist(search_params['fq'][0]):
            fq_entry = fq_entry.replace('/"', '"').replace("//", "")
            fq_split = fq_entry.split(':', 1)
            if len(fq_split) == 2:
                fq_key = fq_split[0]
                fq_value = fq_split[1]
                fq_monop = ""
                if fq_key[0] in ['+', '-']:
                    fq_monop = fq_entry[:1]
                    fq_key = fq_key[1:]
                # Dataset whitelist check
                if (fq_key == 'dataset_type' and fq_monop != "-"
                        and fq_value not in type_whitelist):
                    return
                if fq_key.lower() in facet_fields:
                    fq += " " + fq_monop + fq_key + ":" + fq_value
            else:
                fq += fq_entry

        count_only = False
        start = search_params.get('start', 0)
        if local_results_num >= start:
            remote_limit = limit - local_results_num + start
            if remote_limit <= 0:
                count_only = True
            remote_start = 0
        else:
            remote_limit = limit
            remote_start = start - local_results_num

        @beaker_cache(expire=3600)
        def _fetch_data(fetch_start, fetch_num):
            data = urllib.quote(
                json.dumps({
                    'q': search_params['q'],
                    'fq': fq,
                    'facet.field': search_params.get('facet.field', []),
                    'rows': fetch_num,
                    'start': fetch_start,
                    'sort': search_params['sort'],
                    'extras': search_params['extras']
                }))
            try:
                req = urllib2.Request(
                    remote_org_url + '/api/3/action/package_search', data)
                rsp = urllib2.urlopen(req)
            except urllib2.URLError, err:
                log.warn('Unable to connect to %r: %r'
                         % (remote_org_url + '/api/3/action/package_search', err))
                return None
            content = rsp.read()
            return json.loads(content)

        remote_results = _fetch_data(0, 99999)

        # Only continue if the remote fetch was successful
        if remote_results is None:
            return search_results

        if count_only:
            remote_results['result']['results'] = []
        else:
            use_temp = False
            remote_results_num = len(remote_results['result']['results'])
            if remote_results_num <= remote_limit + remote_start:
                if remote_results['result']['count'] > remote_results_num:
                    # While the result count reports all remote matches, the
                    # number of results may be limited by the CKAN install.
                    # Here our query has extended beyond the actual returned
                    # results, so we re-issue a more refined query starting
                    # and ending at precisely where we want (since we have
                    # already acquired the total count)
                    temp_results = _fetch_data(
                        remote_start,
                        min(remote_results['result']['count'] - remote_start,
                            remote_limit))
                    if temp_results:
                        use_temp = True
                        remote_results['result']['results'] = temp_results[
                            'result']['results']
            if not use_temp:
                remote_results['result']['results'] = remote_results[
                    'result']['results'][remote_start:remote_limit + remote_start - 1]

            for dataset in remote_results['result']['results']:
                extras = dataset.get('extras', [])
                if not h.get_pkg_dict_extra(dataset, 'harvest_url'):
                    extras += [{
                        'key': 'harvest_url',
                        'value': remote_org_url + '/dataset/' + dataset['id']
                    }]
                for k in search_keys:
                    if not h.get_pkg_dict_extra(dataset, k):
                        extras += [{'key': k, 'value': remote_org_label}]
                dataset.update(extras=extras,
                               harvest_source_title=remote_org_label)

        search_results['count'] += remote_results['result']['count']
        if not count_only:
            search_results['results'] += remote_results['result']['results']
        if ('search_facets' in remote_results['result']
                and self.use_remote_facets):
            search_results['search_facets'] = remote_results['result'][
                'search_facets']