def edit(self, id, data=None, errors=None, error_summary=None):
    package_type = self._get_package_type(id)
    context = {'model': model, 'session': model.Session,
               'user': c.user, 'auth_user_obj': c.userobj,
               'save': 'save' in request.params}

    if context['save'] and not data:
        return self._save_edit(id, context, package_type=package_type)
    try:
        c.pkg_dict = get_action('package_show')(dict(context,
                                                     for_view=True),
                                                {'id': id})
        context['for_edit'] = True
        old_data = get_action('package_show')(context, {'id': id})
        # old_data is from the database; data is passed from the user if
        # there was a validation error. Use the user's data if present.
        if data:
            old_data.update(data)
        data = old_data
    except (NotFound, NotAuthorized):
        abort(404, _('Dataset not found'))
    # are we doing a multiphase add?
    if data.get('state', '').startswith('draft'):
        c.form_action = h.url_for(controller='package', action='new')
        c.form_style = 'new'
        return self.new(data=data, errors=errors,
                        error_summary=error_summary)

    c.pkg = context.get("package")
    c.resources_json = h.json.dumps(data.get('resources', []))

    try:
        check_access('package_update', context)
    except NotAuthorized:
        abort(403, _('User %r not authorized to edit %s') % (c.user, id))
    # convert tags if not supplied in data
    if data and not data.get('tag_string'):
        data['tag_string'] = ', '.join(h.dict_list_reduce(
            c.pkg_dict.get('tags', {}), 'name'))

    errors = errors or {}
    form_snippet = self._package_form(package_type=package_type)
    form_vars = {'data': data, 'errors': errors,
                 'error_summary': error_summary, 'action': 'edit',
                 'dataset_type': package_type,
                 }
    c.errors_json = h.json.dumps(errors)

    self._setup_template_variables(context, {'id': id},
                                   package_type=package_type)

    # we have already completed stage 1
    form_vars['stage'] = ['active']
    if data.get('state', '').startswith('draft'):
        form_vars['stage'] = ['active', 'complete']

    edit_template = self._edit_template(package_type)
    return render(edit_template,
                  extra_vars={'form_vars': form_vars,
                              'form_snippet': form_snippet,
                              'dataset_type': package_type})
def _save_new(self, context, package_type=None):
    # The staged "add dataset" flow reuses the new-dataset page while the
    # dataset is only partially created, so we need to know whether this
    # is actually an update or a genuinely new dataset.
    is_an_update = False
    ckan_phase = request.params.get('_ckan_phase')
    from ckan.lib.search import SearchIndexError
    try:
        data_dict = clean_dict(dict_fns.unflatten(
            tuplize_dict(parse_params(request.POST))))
        if ckan_phase:
            # prevent clearing of groups etc
            context['allow_partial_update'] = True
            # sort the tags
            if 'tag_string' in data_dict:
                data_dict['tags'] = self._tag_string_to_list(
                    data_dict['tag_string'])

            self._validate_dataset(data_dict)

            if data_dict.get('pkg_name'):
                is_an_update = True
                # This is actually an update, not a save
                data_dict['id'] = data_dict['pkg_name']
                del data_dict['pkg_name']
                # don't change the dataset state
                data_dict['state'] = 'draft'
                # this is actually an edit not a save
                pkg_dict = get_action('package_update')(context, data_dict)

                if request.params['save'] == 'go-metadata':
                    # redirect to add metadata
                    url = h.url_for(controller='package',
                                    action='new_metadata',
                                    id=pkg_dict['name'])
                elif request.params['save'] == 'save-draft':
                    url = h.url_for(controller='package',
                                    action='read',
                                    id=pkg_dict['name'])
                else:
                    # redirect to add dataset resources
                    url = h.url_for(controller='package',
                                    action='new_resource',
                                    id=pkg_dict['name'])
                redirect(url)
            # Make sure we don't index this dataset
            if request.params['save'] not in ['go-resource',
                                              'go-metadata']:
                data_dict['state'] = 'draft'
            # allow the state to be changed
            context['allow_state_change'] = True

        data_dict['type'] = package_type
        context['message'] = data_dict.get('log_message', '')
        pkg_dict = get_action('package_create')(context, data_dict)

        if ckan_phase and request.params['save'] != 'save-draft':
            url = h.url_for(controller='package',
                            action='new_resource',
                            id=pkg_dict['name'])
            redirect(url)
        elif request.params['save'] == 'save-draft':
            url = h.url_for(controller='package',
                            action='read',
                            id=pkg_dict['name'])
            redirect(url)

        self._form_save_redirect(pkg_dict['name'], 'new',
                                 package_type=package_type)
    except NotAuthorized:
        abort(401, _('Unauthorized to read package %s') % '')
    except NotFound, e:
        abort(404, _('Dataset not found'))
    except dict_fns.DataError:
        abort(400, _(u'Integrity Error'))
    except SearchIndexError, e:
        try:
            exc_str = unicode(repr(e.args))
        except Exception:  # We don't like bare excepts
            exc_str = unicode(str(e))
        abort(500, _(u'Unable to add package to search index.') + exc_str)
    except ValidationError, e:
        errors = e.error_dict
        error_summary = e.error_summary
        if is_an_update:
            # we need to get the state of the dataset to show the stage
            # we are on.
            pkg_dict = get_action('package_show')(context, data_dict)
            data_dict['state'] = pkg_dict['state']
            return self.edit(data_dict['id'], data_dict,
                             errors, error_summary)
        data_dict['state'] = 'none'
        return self.new(data_dict, errors, error_summary)
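# For orientation, a minimal sketch (not part of the controller) of the
# routing that the 'save' form parameter drives in _save_new above.
# next_action_for is a hypothetical helper, written only to summarize the
# branches; the controller itself inlines this logic.
def next_action_for(save_value, ckan_phase):
    if save_value == 'go-metadata':
        return 'new_metadata'   # stage 2: additional metadata
    if save_value == 'save-draft':
        return 'read'           # leave the dataset as a draft
    if ckan_phase:
        return 'new_resource'   # stage 3: add resources
    return None                 # fall through to _form_save_redirect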
def _save_edit(self, name_or_id, context, package_type=None):
    from ckan.lib.search import SearchIndexError
    log.debug('Package save request name: %s POST: %r',
              name_or_id, request.POST)
    try:
        data_dict = clean_dict(
            dict_fns.unflatten(tuplize_dict(parse_params(request.POST))))

        self._validate_dataset(data_dict)

        if '_ckan_phase' in data_dict:
            # we allow partial updates to not destroy existing resources
            context['allow_partial_update'] = True
            if 'tag_string' in data_dict:
                data_dict['tags'] = self._tag_string_to_list(
                    data_dict['tag_string'])
            del data_dict['_ckan_phase']
            del data_dict['save']
        context['message'] = data_dict.get('log_message', '')
        data_dict['id'] = name_or_id

        # Get the dataset's current list of extras, add to data_dict any
        # extras that are missing (they were not in request.POST), and
        # replace outdated values.
        extra_fields = get_action('package_show')(
            dict(context, for_view=True), {'id': name_or_id})['extras']
        if 'extras' not in data_dict.keys():
            data_dict['extras'] = []
        for extra_field in extra_fields:
            found_extra_field = filter(
                lambda x: x['key'] == extra_field['key'],
                data_dict['extras'])
            if len(found_extra_field) == 0:
                data_dict['extras'].append(extra_field)

        time_now = moment.now().isoformat()
        self._add_or_replace_extra(key='modified', value=time_now,
                                   extras=data_dict['extras'])

        self.__generate_spatial_extra_field(data_dict)

        pkg = get_action('package_update')(context, data_dict)

        c.pkg = context['package']
        c.pkg_dict = pkg

        self._form_save_redirect(pkg['name'], 'edit',
                                 package_type=package_type)
    except NotAuthorized:
        abort(403, _('Unauthorized to read package %s') % name_or_id)
    except NotFound, e:
        abort(404, _('Dataset not found'))
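# The _add_or_replace_extra helper is called above but not shown. A
# minimal sketch of what it presumably does, assuming extras are dicts of
# the form {'key': ..., 'value': ...}; the real implementation may differ.
def _add_or_replace_extra(self, key, value, extras):
    for extra in extras:
        if extra['key'] == key:
            extra['value'] = value  # replace a stale value in place
            return
    extras.append({'key': key, 'value': value})  # or add a new entry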
def resource_edit(self, id, resource_id, data=None, errors=None,
                  error_summary=None):
    context = {
        'model': model,
        'session': model.Session,
        'api_version': 3,
        'for_edit': True,
        'user': c.user,
        'auth_user_obj': c.userobj
    }
    data_dict = {'id': id}

    try:
        check_access('package_update', context, data_dict)
    except NotAuthorized:
        abort(403, _('User %r not authorized to edit %s') % (c.user, id))

    if request.method == 'POST' and not data:
        data = data or \
            clean_dict(dict_fns.unflatten(tuplize_dict(parse_params(
                request.POST))))
        # we don't want to include save as it is part of the form
        del data['save']

        # Preserve the issued and modified fields
        resource_data = get_action('resource_show')(context, {
            'id': resource_id
        })
        # Resources created without the extra "issued" field must default
        # to the "created" field
        issued = resource_data.get('issued', None) or \
            resource_data.get('created')
        data['issued'] = issued
        data['modified'] = moment.now().isoformat()

        data['package_id'] = id
        try:
            if resource_id:
                data['id'] = resource_id
                get_action('resource_update')(context, data)
            else:
                get_action('resource_create')(context, data)
        except ValidationError, e:
            errors = e.error_dict
            error_summary = e.error_summary
            return self.resource_edit(id, resource_id, data,
                                      errors, error_summary)
        except NotAuthorized:
            abort(401, _('Unauthorized to edit this resource'))
def search(self):
    package_type = self._guess_package_type()

    try:
        context = {'model': model, 'user': c.user or c.author,
                   'auth_user_obj': c.userobj}
        check_access('site_read', context)
    except NotAuthorized:
        abort(401, _('Not authorized to see this page'))

    q = c.q = request.params.get('q', u'')
    c.query_error = False
    page = self._get_page_number(request.params)

    limit = g.datasets_per_page

    params_nopage = [(k, v) for k, v in request.params.items()
                     if k != 'page']

    def drill_down_url(alternative_url=None, **by):
        return h.add_url_param(
            alternative_url=alternative_url,
            controller='package', action='search',
            new_params=by
        )

    c.drill_down_url = drill_down_url

    def remove_field(key, value=None, replace=None):
        return h.remove_url_param(key, value=value, replace=replace,
                                  controller='package', action='search')

    c.remove_field = remove_field

    sort_by = request.params.get('sort', None)
    params_nosort = [(k, v) for k, v in params_nopage if k != 'sort']

    def _sort_by(fields):
        params = params_nosort[:]
        if fields:
            sort_string = ', '.join('%s %s' % f for f in fields)
            params.append(('sort', sort_string))
        return search_url(params, package_type)

    c.sort_by = _sort_by
    if not sort_by:
        c.sort_by_fields = []
    else:
        c.sort_by_fields = [field.split()[0]
                            for field in sort_by.split(',')]

    def pager_url(q=None, page=None):
        params = list(params_nopage)
        params.append(('page', page))
        return search_url(params, package_type)

    c.search_url_params = urlencode(_encode_params(params_nopage))

    try:
        c.fields = []
        c.fields_grouped = {}
        search_extras = {}
        fq = ''
        for (param, value) in request.params.items():
            if param not in ['q', 'page', 'sort'] \
                    and len(value) and not param.startswith('_'):
                if not param.startswith('ext_'):
                    c.fields.append((param, value))
                    if param != 'organization':
                        fq += ' %s:"%s"' % (param, value)
                    else:
                        fq += custom_organization_filter(value)
                    if param not in c.fields_grouped:
                        c.fields_grouped[param] = [value]
                    else:
                        c.fields_grouped[param].append(value)
                else:
                    search_extras[param] = value

        context = {'model': model, 'session': model.Session,
                   'user': c.user or c.author, 'for_view': True,
                   'auth_user_obj': c.userobj}

        if package_type and package_type != 'dataset':
            fq += ' +dataset_type:{type}'.format(type=package_type)
        elif not asbool(config.get('ckan.search.show_all_types', 'False')):
            fq += ' +dataset_type:dataset'

        facets = OrderedDict()

        default_facet_titles = {
            'organization': _('Organizations'),
            'groups': _('Groups'),
            'tags': _('Tags'),
            'res_format': _('Formats'),
            'license_id': _('Licenses'),
        }

        for facet in g.facets:
            if facet in default_facet_titles:
                facets[facet] = default_facet_titles[facet]
            else:
                facets[facet] = facet

        for plugin in p.PluginImplementations(p.IFacets):
            facets = plugin.dataset_facets(facets, package_type)

        c.facet_titles = facets

        data_dict = {
            'q': q,
            'fq': fq.strip(),
            'facet.field': facets.keys(),
            'rows': limit,
            'start': (page - 1) * limit,
            'sort': sort_by,
            'extras': search_extras
        }

        query = get_action('package_search')(context, data_dict)
        c.sort_by_selected = query['sort']

        c.page = h.Page(
            collection=query['results'],
            page=page,
            url=pager_url,
            item_count=query['count'],
            items_per_page=limit
        )
        c.facets = query['facets']
        c.search_facets = query['search_facets']
        c.page.items = query['results']
    except SearchError, se:
        log.error('Dataset search error: %r', se.args)
        c.query_error = True
        c.facets = {}
        c.search_facets = {}
        c.page = h.Page(collection=[])
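# To make the filter-query construction in search() concrete, a condensed
# standalone sketch of the Solr fq string it builds for some example
# request parameters. The inner custom_organization_filter is a stub
# standing in for the real helper (which is defined elsewhere in this
# fork); 'ext_' parameters, which the controller routes into
# search_extras instead, are simply skipped here.
def build_fq(params, package_type='dataset'):
    def custom_organization_filter(value):
        # assumption: the real helper builds a more elaborate clause
        return ' organization:"%s"' % value
    fq = ''
    for param, value in params:
        if param not in ['q', 'page', 'sort'] and value \
                and not param.startswith(('_', 'ext_')):
            if param != 'organization':
                fq += ' %s:"%s"' % (param, value)
            else:
                fq += custom_organization_filter(value)
    fq += ' +dataset_type:%s' % package_type
    return fq.strip()

# build_fq([('tags', 'economy'), ('res_format', 'CSV')])
# -> 'tags:"economy" res_format:"CSV" +dataset_type:dataset'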
def import_stage(self, harvest_object):
    # The import stage actually creates the dataset.

    log.debug('In %s import_stage' % repr(self))

    if harvest_object.content is None:
        return True

    dataset = json.loads(harvest_object.content)

    schema_version = '1.2'  # default schema version
    is_collection = False
    parent_pkg_id = ''
    catalog_extras = {}
    for extra in harvest_object.extras:
        if extra.key == 'schema_version':
            schema_version = extra.value
        if extra.key == 'is_collection' and extra.value:
            is_collection = True
        if extra.key == 'collection_pkg_id' and extra.value:
            parent_pkg_id = extra.value
        if extra.key.startswith('catalog_'):
            catalog_extras[extra.key] = extra.value

    # if this dataset is part of a collection, we need to check whether
    # the parent dataset exists. we don't support any deeper hierarchy,
    # so the check does not apply to datasets that are themselves
    # collections (is_collection).
    if parent_pkg_id and not is_collection:
        parent_pkg = None
        try:
            parent_pkg = get_action('package_show')(self.context(), {
                "id": parent_pkg_id
            })
        except Exception:
            pass
        if not parent_pkg:
            parent_check_message = "isPartOf identifier '%s' not found." \
                % dataset.get('isPartOf')
            self._save_object_error(parent_check_message, harvest_object,
                                    'Import')
            return None

    # Get default values.
    dataset_defaults = self.load_config(harvest_object.source)["defaults"]

    source_config = json.loads(harvest_object.source.config or '{}')
    validator_schema = source_config.get('validator_schema')
    if schema_version == '1.2' and validator_schema != 'non-federal':
        lowercase_conversion = True
    else:
        lowercase_conversion = False

    MAPPING = {
        "title": "title",
        "identifier": "extras__identifier",
        "description": "notes",
        "keyword": "tags",
        "modified": "modified",
        "author": "author",
        "author_email": "author_email",
        "maintainer": "maintainer",
        "maintainer_email": "maintainer_email",
        "dataQuality": "extras__dataQuality",
        "license": "license_title",
        "spatial": "extras__spatial",
        "temporal": "extras__temporal",
        "superTheme": "groups",
        "primaryITInvestmentUII": "extras__primaryITInvestmentUII",
        "accrualPeriodicity": "extras__updateFrequency",
        "landingPage": "url",
        "language": "extras__language",
        "references": "extras__references",
        "issued": "extras__issued",
        "distribution": None,
    }

    # NOTE: the v1.1 and v1.2 mappings are currently identical to
    # MAPPING; they are kept separate per schema version.
    MAPPING_V1_1 = dict(MAPPING)
    MAPPING_V1_2 = dict(MAPPING)

    SKIP = ["accessURL", "webService", "format", "distribution",
            "processed_how"]

    SKIP_V1_1 = ["@type", "isPartOf", "license", "distribution",
                 "processed_how"]

    if lowercase_conversion:

        mapping_processed = {}
        for k, v in MAPPING.items():
            mapping_processed[k.lower()] = v

        skip_processed = [k.lower() for k in SKIP]

        dataset_processed = {'processed_how': ['lowercase']}
        for k, v in dataset.items():
            if k.lower() in mapping_processed.keys():
                dataset_processed[k.lower()] = v
            else:
                dataset_processed[k] = v

        if 'distribution' in dataset and dataset['distribution'] is not None:
            dataset_processed['distribution'] = []
            for d in dataset['distribution']:
                d_lower = {}
                for k, v in d.items():
                    if k.lower() in mapping_processed.keys():
                        d_lower[k.lower()] = v
                    else:
                        d_lower[k] = v
                dataset_processed['distribution'].append(d_lower)
    else:
        dataset_processed = dataset
        mapping_processed = MAPPING
        skip_processed = SKIP

    if schema_version == '1.1':
        mapping_processed = MAPPING_V1_1
        skip_processed = SKIP_V1_1

    if schema_version == '1.2':
        mapping_processed = MAPPING_V1_2
        skip_processed = SKIP_V1_1

    validate_message = self._validate_dataset(validator_schema,
                                              schema_version,
                                              dataset_processed)
    if validate_message:
        self._save_object_error(validate_message, harvest_object, 'Import')
        return None

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    owner_org = None
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        owner_org = source_dataset.owner_org

    source_config = json.loads(harvest_object.source.config or '{}')
    # group_name = source_config.get('default_groups', '')
    group_name = [{'name': theme.lower()}
                  for theme in dataset['superTheme']]

    # Assemble basic information about the dataset.

    pkg = {
        "state": "active",  # in case it was previously deleted
        "owner_org": owner_org,
        "groups": group_name,
        "resources": [],
        "extras": [
            {
                "key": "resource-type",
                "value": "Dataset",
            },
            {
                "key": "source_hash",
                "value": self.make_upstream_content_hash(
                    dataset, harvest_object.source, catalog_extras,
                    schema_version),
            },
            {
                "key": "source_datajson_identifier",
                "value": True,
            },
            {
                "key": "harvest_source_id",
                "value": harvest_object.harvest_source_id,
            },
            {
                "key": "harvest_object_id",
                "value": harvest_object.id,
            },
            {
                "key": "harvest_source_title",
                "value": harvest_object.source.title,
            },
            {
                "key": "source_schema_version",
                "value": schema_version,
            },
        ]
    }

    extras = pkg["extras"]
    unmapped = []

    for key, value in dataset_processed.iteritems():
        if key in skip_processed:
            continue
        new_key = mapping_processed.get(key)
        if not new_key:
            unmapped.append(key)
            continue

        # after schema 1.0+, we need to deal with multiple new_keys
        new_keys = []
        values = []
        if isinstance(new_key, dict):  # when schema is not 1.0
            _new_key_keys = new_key.keys()
            new_keys = new_key.values()
            values = []
            for _key in _new_key_keys:
                values.append(value.get(_key))
        else:
            new_keys.append(new_key)
            values.append(value)

        if not any(item for item in values):
            continue

        mini_dataset = dict(zip(new_keys, values))
        for mini_key, mini_value in mini_dataset.iteritems():
            if not mini_value:
                continue
            if mini_key.startswith('extras__'):
                extras.append({"key": mini_key[8:], "value": mini_value})
            else:
                pkg[mini_key] = mini_value

    # pick a fixed number of unmapped entries and put them into extras
    if unmapped:
        unmapped.sort()
        del unmapped[100:]
        for key in unmapped:
            value = dataset_processed.get(key, "")
            if value is not None:
                extras.append({"key": key, "value": value})

    # if theme is geospatial/Geospatial, we tag it in metadata_type.
    themes = self.find_extra(pkg, "theme")
    if themes and ('geospatial' in [x.lower() for x in themes]):
        extras.append({'key': 'metadata_type', 'value': 'geospatial'})

    if is_collection:
        extras.append({'key': 'collection_metadata', 'value': 'true'})
    elif parent_pkg_id:
        extras.append({
            'key': 'collection_package_id',
            'value': parent_pkg_id
        })

    for k, v in catalog_extras.iteritems():
        extras.append({'key': k, 'value': v})

    # Set specific information about the dataset.
    self.set_dataset_info(pkg, dataset_processed, dataset_defaults,
                          schema_version)

    # Try to update an existing package with the ID set in
    # harvest_object.guid. If that GUID corresponds with an existing
    # package, get its current metadata.
    try:
        existing_pkg = get_action('package_show')(self.context(), {
            "id": harvest_object.guid
        })
    except NotFound:
        existing_pkg = None

    if existing_pkg:
        # Update the existing metadata with the new information.

        # But before doing that, try to avoid replacing existing
        # resources with new resources by assigning resource IDs where
        # they match up.
        for res in pkg.get("resources", []):
            for existing_res in existing_pkg.get("resources", []):
                if res["url"] == existing_res["url"]:
                    res["id"] = existing_res["id"]
        pkg['groups'] = existing_pkg['groups']
        # preserve other fields that we're not setting, but clobber extras
        existing_pkg.update(pkg)
        pkg = existing_pkg

        log.warn('updating package %s (%s) from %s' %
                 (pkg["name"], pkg["id"], harvest_object.source.url))
        pkg = get_action('package_update')(self.context(), pkg)
    else:
        # It doesn't exist yet. Create a new one.
        pkg['name'] = self.make_package_name(dataset_processed["title"],
                                             harvest_object.guid)
        try:
            pkg = get_action('package_create')(self.context(), pkg)
            log.warn('created package %s (%s) from %s' %
                     (pkg["name"], pkg["id"], harvest_object.source.url))
        except IntegrityError:
            # sometimes one fetch worker does not see a new pkg added by
            # other workers, and gets a db error for a pkg with the same
            # title.
            model.Session.rollback()
            pkg['name'] = self.make_package_name(
                dataset_processed["title"], harvest_object.guid)
            pkg = get_action('package_create')(self.context(), pkg)
            log.warn('created package %s (%s) from %s' %
                     (pkg["name"], pkg["id"], harvest_object.source.url))
        except Exception:
            log.error('failed to create package %s from %s' %
                      (pkg["name"], harvest_object.source.url))
            raise

    # Flag the other HarvestObjects linking to this package as not
    # current anymore
    for ob in model.Session.query(HarvestObject).filter_by(
            package_id=pkg["id"]):
        ob.current = False
        ob.save()

    # Flag this HarvestObject as the current harvest object
    harvest_object.package_id = pkg['id']
    harvest_object.current = True
    harvest_object.save()
    model.Session.commit()

    # Now that the package and the harvest source are associated,
    # re-index the package so it knows it is part of the harvest source.
    # The CKAN harvester does this by creating the association before the
    # package is saved, by overriding the GUID creation on a new package.
    # That's too difficult. So here we end up indexing twice.
    PackageSearchIndex().index_package(pkg)

    return True
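# The change detection above and in gather_stage relies on
# make_upstream_content_hash, which is called but not shown. A plausible
# minimal sketch, assuming a hex digest over sorted-keys JSON; the real
# method may mix in different fields. As with the HarvestObject content,
# sort_keys keeps the serialization, and therefore the hash, stable from
# run to run. (On Python 3 the payload would need .encode('utf-8').)
import hashlib
import json

def make_upstream_content_hash(dataset, source, catalog_extras,
                               schema_version):
    payload = json.dumps(
        [dataset, catalog_extras, schema_version, source.url],
        sort_keys=True)
    return hashlib.sha1(payload).hexdigest()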
class DatasetHarvesterBase(HarvesterBase):
    """
    A Harvester for datasets.
    """
    _user_name = None

    def validate_config(self, config):
        if not config:
            return config
        # parse only to check that the config is valid YAML; invalid YAML
        # raises here
        config_obj = yaml.load(config)
        return config

    def load_config(self, harvest_source):
        # Load the harvest source's configuration data. We expect it to
        # be a YAML string. Unfortunately I went ahead of CKAN on this.
        # The stock CKAN harvester only allows JSON in the configuration
        # box. My fork is necessary for this to work:
        # https://github.com/joshdata/ckanext-harvest

        ret = {
            # map data.json field name to list of values, one of which
            # must be present
            "filters": {},
            "defaults": {},
        }

        source_config = yaml.load(harvest_source.config)

        try:
            ret["filters"].update(source_config["filters"])
        except (TypeError, KeyError):
            pass
        try:
            ret["defaults"].update(source_config["defaults"])
        except (TypeError, KeyError):
            pass

        return ret

    def _get_user_name(self):
        if not self._user_name:
            user = p.toolkit.get_action('get_site_user')({
                'model': model,
                'ignore_auth': True
            }, {})
            self._user_name = user['name']
        return self._user_name

    def context(self):
        return {"user": self._get_user_name(), "ignore_auth": True}

    # SUBCLASSES MUST IMPLEMENT
    def load_remote_catalog(self, harvest_job):
        raise Exception("Not implemented")

    def extra_schema(self):
        return {
            'validator_schema': [ignore_empty, unicode, validate_schema],
        }

    def gather_stage(self, harvest_job):
        log.debug('In %s gather_stage (%s)' %
                  (repr(self), harvest_job.source.url))
        try:
            source_datasets, catalog_values = self.load_remote_catalog(
                harvest_job)
        except ValueError as e:
            self._save_gather_error("Error loading json content: %s." % e,
                                    harvest_job)
            return []

        tmp_superThemes = ["agri", "educ", "econ", "ener", "envi", "gove",
                           "heal", "intr", "just", "regi", "soci", "tech",
                           "tran"]
        ckan_host = ''
        # Read config.ini to load the superTheme list
        if 'CKAN_CONFIG' in environ:
            if path.exists(environ['CKAN_CONFIG']):
                try:
                    tmp_ckan_config = ConfigParser()
                    tmp_ckan_config.read(environ['CKAN_CONFIG'])
                except IOError:
                    log.warn('Error loading CKAN config.ini file [%s]. '
                             'Loading default SuperThemes',
                             environ['CKAN_CONFIG'])
                except Exception:
                    log.warn('Unknown error loading CKAN config.ini file '
                             '[%s]. Loading default SuperThemes',
                             environ['CKAN_CONFIG'])
                try:
                    ckan_host = tmp_ckan_config.get('app:main',
                                                    'ckan.site_url')
                except Exception:
                    log.warn('Error loading "ckan.site_url" from CKAN '
                             'config.ini file [%s]. '
                             'Loading default SuperThemes',
                             environ['CKAN_CONFIG'])
        # Get superThemeTaxonomy
        try:
            if len(ckan_host) > 0:
                stt_url = '{site_url}/superThemeTaxonomy.json'.format(
                    site_url=ckan_host)
                superThemeTaxonomy = requests.get(stt_url)
                superThemeTaxonomy = superThemeTaxonomy.json()
                if len(superThemeTaxonomy) == 0:
                    raise Exception('SuperThemeTaxonomy JSON is empty')
                if any('id' not in theme for theme in superThemeTaxonomy):
                    raise Exception('SuperThemeTaxonomy JSON does not '
                                    'contain an "id" field')
                tmp_superThemes = [theme['id']
                                   for theme in superThemeTaxonomy]
                log.info("superThemeTaxonomy loaded!")
            else:
                raise Exception('The "site_url" field of config.ini '
                                'is empty.')
        except Exception, e:
            log.warn("Error getting \"superThemeTaxonomy.json\", "
                     "err: %s.", e)
        superThemes = tmp_superThemes

        for dataset in source_datasets:
            # Delete the @type key if it exists
            try:
                del dataset['@type']
            except Exception:
                pass
            try:
                foo = dataset['theme']
                log.info('Theme exists and its value is: {0}.'.format(foo))
            except KeyError:
                log.warn('The field "theme" does not exist; defaulting '
                         'to an empty list.')
                dataset.update({'theme': []})
            try:
                tags = dataset['keyword']
                themes = dataset['theme']
                if len(themes) > 0:
                    if type(tags) is list:
                        dataset['keyword'] = tags + themes
                    else:
                        dataset['keyword'] = [tags] + themes
            except KeyError:
                pass
            try:
                dataset.update({'author_email':
                                dataset['publisher']['mbox']})
            except KeyError:
                log.warn('The "publisher" field for "{0}" does not '
                         'contain an "mbox" field.'
                         .format(dataset['title']))
                dataset.update({'author_email': "unknown"})
            except Exception:
                log.warn('The "publisher" field for "{0}" failed. This '
                         'error is critical; the "mbox" field will be '
                         'filled in to avoid future errors.'
                         .format(dataset['title']))
                dataset.update({'author_email': "unknown"})
            try:
                dataset.update({'author': dataset['publisher']['name']})
            except KeyError:
                log.warn('The "publisher" field for "{0}" does not '
                         'contain a "name" field.'
                         .format(dataset['title']))
                dataset.update({'author': "unknown"})
            except Exception:
                log.warn('The "publisher" field for "{0}" failed. This '
                         'error is critical; the "name" field will be '
                         'filled in to avoid future errors.'
                         .format(dataset['title']))
                dataset.update({'author': "unknown"})
            try:
                del dataset['publisher']
            except Exception:
                pass
            try:
                dataset.update({'maintainer_email':
                                dataset['contactPoint']['hasEmail']})
                dataset.update({'maintainer':
                                dataset['contactPoint']['fn']})
                del dataset['contactPoint']
            except Exception:
                dataset.update({'maintainer_email': ""})
                dataset.update({'maintainer': ""})
                dataset.pop('contactPoint', None)

        DATAJSON_SCHEMA = source_datasets
        schema_version = '1.2'
        parent_identifiers = set()
        child_identifiers = set()
        catalog_extras = {}
        if isinstance(catalog_values, dict):
            schema_version = '1.2'
            for dataset in source_datasets:
                parent_identifier = dataset.get('isPartOf')
                if parent_identifier:
                    parent_identifiers.add(parent_identifier)
                    child_identifiers.add(dataset.get('identifier'))

            # get a list of needed catalog values and put into hobj
            catalog_fields = ['title', 'description']
            catalog_extras = dict(('catalog_' + k, v)
                                  for (k, v) in catalog_values.iteritems()
                                  if k in catalog_fields)

        # Loop through the packages we've already imported from this
        # source and go into their extra fields to get their
        # source_identifier, which corresponds to the remote catalog's
        # 'identifier' field. Make a mapping so we know how to update
        # existing records.
        # Added: mark all existing parent datasets.
        existing_datasets = {}
        existing_parents = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except Exception:
                # reference is broken
                continue
            sid = self.find_extra(pkg, "identifier")
            is_parent = self.find_extra(pkg, "collection_metadata")
            if sid:
                existing_datasets[sid] = pkg
            if is_parent and pkg.get("state") == "active":
                existing_parents[sid] = pkg

        # which parents have been demoted to child level?
        existing_parents_demoted = set(
            identifier for identifier in existing_parents.keys()
            if identifier not in parent_identifiers)

        # which datasets have been promoted to parent level?
        existing_datasets_promoted = set(
            identifier for identifier in existing_datasets.keys()
            if identifier in parent_identifiers
            and identifier not in existing_parents.keys())

        # if there are any new parents, we will have to harvest parents
        # first. mark the status in the harvest_source config, which
        # triggers a children harvest_job after the parents job is
        # finished.
        source = harvest_job.source
        source_config = json.loads(source.config or '{}')
        # run status: None, or parents_run, or children_run?
        run_status = source_config.get('datajson_collection')
        if parent_identifiers:
            for parent in parent_identifiers & child_identifiers:
                self._save_gather_error(
                    "Collection identifier '%s' cannot be isPartOf "
                    "another collection." % parent, harvest_job)

            new_parents = set(
                identifier for identifier in parent_identifiers
                if identifier not in existing_parents.keys())
            if new_parents:
                if not run_status:
                    # fresh start
                    run_status = 'parents_run'
                    source_config['datajson_collection'] = run_status
                    source.config = json.dumps(source_config)
                    source.save()
                elif run_status == 'children_run':
                    # it means the new parents were tried and failed.
                    # skip those which have previously been reported via
                    # parent_identifiers & child_identifiers
                    for parent in new_parents - \
                            (parent_identifiers & child_identifiers):
                        self._save_gather_error(
                            "Collection identifier '%s' not found. "
                            "Records which are part of this collection "
                            "will not be harvested." % parent,
                            harvest_job)
                else:
                    # run_status was parents_run, and did not finish.
                    # something went wrong, but we are not sure what
                    # happened. leave it as it is and let it run one
                    # more time.
                    pass
            else:
                # all parents are already in place. run it as usual.
                run_status = None
        elif run_status:
            # need to clear run_status
            run_status = None
            source_config['datajson_collection'] = run_status
            source.config = json.dumps(source_config)
            source.save()

        # Create HarvestObjects for any records in the remote catalog.

        object_ids = []
        seen_datasets = set()
        unique_datasets = set()

        filters = self.load_config(harvest_job.source)["filters"]

        for dataset in source_datasets:
            # Create a new HarvestObject for this dataset and save the
            # dataset metadata inside it for later.

            # Check the config's filters to see if we should import this
            # dataset. For each filter, check that the value specified in
            # the data.json file is among the permitted values in the
            # filter specification.
            matched_filters = True
            for k, v in filters.items():
                if dataset.get(k) not in v:
                    matched_filters = False
            if not matched_filters:
                continue

            if parent_identifiers and new_parents \
                    and dataset['identifier'] not in parent_identifiers \
                    and dataset.get('isPartOf') in new_parents:
                if run_status == 'parents_run':
                    # skip those whose parents still need to run.
                    continue
                else:
                    # which is 'children_run'.
                    # error out since the parents have issues.
                    self._save_gather_error(
                        "Record with identifier '%s': isPartOf '%s' "
                        "points to an erroneous record." %
                        (dataset['identifier'], dataset.get('isPartOf')),
                        harvest_job)
                    continue

            # Some sources contain duplicate identifiers; skip all
            # except the first one
            if dataset['identifier'] in unique_datasets:
                self._save_gather_error(
                    "Duplicate entry ignored for identifier: '%s'." %
                    (dataset['identifier']), harvest_job)
                continue
            unique_datasets.add(dataset['identifier'])

            # Get the package_id of this resource if we've already
            # imported it into our system. Otherwise, assign a brand new
            # GUID to the HarvestObject. I'm not sure what the point is
            # of that.
            if dataset['identifier'] in existing_datasets:
                pkg = existing_datasets[dataset["identifier"]]
                pkg_id = pkg["id"]
                seen_datasets.add(dataset['identifier'])

                # We store a hash of the dict associated with this
                # dataset in the package so we can avoid updating
                # datasets that don't look like they've changed.
                if pkg.get("state") == "active" \
                        and dataset['identifier'] not in existing_parents_demoted \
                        and dataset['identifier'] not in existing_datasets_promoted \
                        and self.find_extra(pkg, "source_hash") == \
                            self.make_upstream_content_hash(
                                dataset, harvest_job.source,
                                catalog_extras, schema_version):
                    continue
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the
            # dataset's metadata from the remote catalog file.
            extras = [HarvestObjectExtra(key='schema_version',
                                         value=schema_version)]
            if dataset['identifier'] in parent_identifiers:
                extras.append(HarvestObjectExtra(key='is_collection',
                                                 value=True))
            elif dataset.get('isPartOf'):
                parent_pkg_id = existing_parents[
                    dataset.get('isPartOf')]['id']
                extras.append(HarvestObjectExtra(key='collection_pkg_id',
                                                 value=parent_pkg_id))
            for k, v in catalog_extras.iteritems():
                extras.append(HarvestObjectExtra(key=k, value=v))

            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                extras=extras,
                # use sort_keys to preserve field order so hashes of this
                # string are constant from run to run
                content=json.dumps(dataset, sort_keys=True))
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the remote catalog.
        for upstreamid, pkg in existing_datasets.items():
            if upstreamid in seen_datasets:
                continue  # was just updated
            if pkg.get("state") == "deleted":
                continue  # already deleted
            pkg["state"] = "deleted"
            log.warn('deleting package %s (%s) because it is no longer '
                     'in %s' % (pkg["name"], pkg["id"],
                                harvest_job.source.url))
            get_action('package_update')(self.context(), pkg)
            obj = HarvestObject(
                # use the package's own id; pkg_id would be stale here,
                # left over from the loop above
                guid=pkg["id"],
                package_id=pkg["id"],
                job=harvest_job,
            )
            obj.save()
            object_ids.append(obj.id)

        return object_ids
def new_resource(self, id, data=None, errors=None, error_summary=None):
    '''
    FIXME: This is a temporary action to allow styling of the forms.
    '''
    if request.method == 'POST' and not data:
        save_action = request.params.get('save')
        data = data or clean_dict(
            dict_fns.unflatten(tuplize_dict(parse_params(request.POST))))
        # we don't want to include save as it is part of the form
        del data['save']
        resource_id = data['id']
        del data['id']

        self._validate_resource(data)

        context = {
            'model': model,
            'session': model.Session,
            'user': c.user,
            'auth_user_obj': c.userobj
        }

        if save_action == 'go-dataset':
            # go to first stage of add dataset
            h.redirect_to(controller='package', action='edit', id=id)

        # see if we have any data that we are trying to save
        data_provided = False
        for key, value in data.iteritems():
            if ((value or isinstance(value, cgi.FieldStorage))
                    and key not in ['resource_type', 'license_id',
                                    'attributesDescription']):
                data_provided = True
                break

        if not data_provided and save_action != "go-dataset-complete":
            if save_action == 'go-dataset':
                # go to first stage of add dataset
                h.redirect_to(controller='package', action='edit', id=id)
            try:
                data_dict = get_action('package_show')(context,
                                                       {'id': id})
            except NotAuthorized:
                abort(403, _('Unauthorized to update dataset'))
            except NotFound:
                abort(404,
                      _('The dataset {id} could not be found.').format(
                          id=id))
            if not len(data_dict['resources']):
                # no data so keep on page
                msg = _('You must add at least one data resource')
                # On new templates do not use flash message
                if asbool(config.get('ckan.legacy_templates')):
                    h.flash_error(msg)
                    h.redirect_to(controller='package',
                                  action='new_resource', id=id)
                else:
                    errors = {}
                    error_summary = {_('Error'): msg}
                    return self.new_resource(id, data, errors,
                                             error_summary)

            # XXX race condition if another user edits/deletes
            data_dict = get_action('package_show')(context, {'id': id})
            get_action('package_update')(
                dict(context, allow_state_change=True),
                dict(data_dict, state='active'))
            h.redirect_to(controller='package', action='read', id=id)

        data['package_id'] = id
        try:
            if resource_id:
                data['id'] = resource_id
                get_action('resource_update')(context, data)
            else:
                get_action('resource_create')(context, data)
        except ValidationError, e:
            errors = e.error_dict
            error_summary = e.error_summary
            return self.new_resource(id, data, errors, error_summary)
        except NotAuthorized:
            abort(403, _('Unauthorized to create a resource'))
def _save_new(self, context, package_type=None):
    # The staged "add dataset" flow reuses the new-dataset page while the
    # dataset is only partially created, so we need to know whether this
    # is actually an update or a genuinely new dataset.
    is_an_update = False
    ckan_phase = request.params.get('_ckan_phase')
    from ckan.lib.search import SearchIndexError

    def pop_groups_from_data_dict_and_get_package_name_and_group_name(
            a_data_dict):
        # pop the groups so that CKAN's validations do not fail later on
        some_group_names = [
            group['name'] for group in
            (a_data_dict['groups'] if 'groups' in a_data_dict else [])
        ]
        a_data_dict['groups'] = []
        # the Name field uniquely identifies a Dataset
        a_package_name = a_data_dict['name']
        return a_package_name, some_group_names

    def update_package_group_relation(a_package_name, group_names_to_add):
        # get the package's id from a_package_name
        package = model.Package.get(a_package_name)
        # We need to delete *all* the `Member` objects relating `Group`s
        # to `Package`s, since we are going to rewrite those relations
        # according to the `group_names_to_add` parameter
        for group in model.Session.query(model.Group):
            # using the package ID, query the Members with
            # table_id == package.id and delete them
            members_to_delete = model.Session.query(model.Member).filter(
                model.Member.group_id == group.id,
                model.Member.table_name == 'package',
                model.Member.table_id == package.id)
            for member in members_to_delete:
                model.Session.delete(member)
            model.Session.commit()  # is the commit necessary?
        # relate the dataset to the corresponding (submitted) groups
        for group_name in group_names_to_add:
            group = model.Group.get(group_name)
            group.add_package_by_name(a_package_name)
            group.save()

    try:
        data_dict = clean_dict(
            dict_fns.unflatten(tuplize_dict(parse_params(request.POST))))
        if ckan_phase:
            # prevent clearing of groups etc
            context['allow_partial_update'] = True
            # sort the tags
            if 'tag_string' in data_dict:
                data_dict['tags'] = self._tag_string_to_list(
                    data_dict['tag_string'])

        self._validate_dataset(data_dict)

        # Clean the groups out of data_dict so the dataset can be saved
        # even by a collaborator who is not a member of the group
        package_name, group_names = \
            pop_groups_from_data_dict_and_get_package_name_and_group_name(
                data_dict)

        if data_dict.get('pkg_name'):
            is_an_update = True
            # This is actually an update, not a save
            data_dict['id'] = data_dict['pkg_name']
            del data_dict['pkg_name']
            # don't change the dataset state
            data_dict['state'] = 'draft'
            # this is actually an edit not a save
            pkg_dict = get_action('package_update')(context, data_dict)

            # Restore the groups assigned to the dataset (update case)
            update_package_group_relation(package_name, group_names)

            if request.params['save'] == 'go-metadata':
                # redirect to add metadata
                url = h.url_for(controller='package',
                                action='new_metadata',
                                id=pkg_dict['name'])
            elif request.params['save'] == 'save-draft':
                url = h.url_for(controller='package',
                                action='read',
                                id=pkg_dict['name'])
            else:
                # redirect to add dataset resources
                url = h.url_for(controller='package',
                                action='new_resource',
                                id=pkg_dict['name'])
            redirect(url)
        # Make sure we don't index this dataset
        if request.params['save'] not in ['go-resource', 'go-metadata']:
            data_dict['state'] = 'draft'
        # allow the state to be changed
        context['allow_state_change'] = True

        data_dict['type'] = package_type
        context['message'] = data_dict.get('log_message', '')
        self.__generate_spatial_extra_field(data_dict)
        pkg_dict = get_action('package_create')(context, data_dict)

        # Restore the groups assigned to the dataset (insert case)
        update_package_group_relation(package_name, group_names)

        if ckan_phase and request.params['save'] != 'save-draft':
            url = h.url_for(controller='package',
                            action='new_resource',
                            id=pkg_dict['name'])
            redirect(url)
        elif request.params['save'] == 'save-draft':
            url = h.url_for(controller='package',
                            action='read',
                            id=pkg_dict['name'])
            redirect(url)

        self._form_save_redirect(pkg_dict['name'], 'new',
                                 package_type=package_type)
    except NotAuthorized:
        abort(401, _('Unauthorized to read package %s') % '')
    except NotFound, e:
        abort(404, _('Dataset not found'))
    except dict_fns.DataError:
        abort(400, _(u'Integrity Error'))
    except SearchIndexError, e:
        try:
            exc_str = unicode(repr(e.args))
        except Exception:  # We don't like bare excepts
            exc_str = unicode(str(e))
        abort(500, _(u'Unable to add package to search index.') + exc_str)
    except ValidationError, e:
        errors = e.error_dict
        error_summary = e.error_summary
        if is_an_update:
            # we need to get the state of the dataset to show the stage
            # we are on.
            pkg_dict = get_action('package_show')(context, data_dict)
            data_dict['state'] = pkg_dict['state']
            return self.edit(data_dict['id'], data_dict,
                             errors, error_summary)
        data_dict['state'] = 'none'
        return self.new(data_dict, errors, error_summary)