def before_index(self, data_dict):
    # IPackageController hook: enrich the dict about to be sent to SOLR
    # with label fields so facets can be rendered with translated text.
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    if ('res_format' in data_dict):
        # Resolve the 'format' resource field so each stored format value
        # can be mapped to its human-readable label.
        formats = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                            'format')
        # New SOLR field holding one label per indexed format value.
        data_dict['res_format_label'] = []
        for res_format in data_dict['res_format']:
            # Translate the raw value through the schema choices.
            res_format_label = sh.scheming_choices_label(
                formats['choices'], res_format)
            if res_format_label:
                # Only formats known to the schema contribute a label.
                data_dict['res_format_label'].append(res_format_label)
    if ('frequency' in data_dict):
        # 'frequency' is stored as a JSON string with 'type' and 'value'
        # keys (accrual periodicity).
        frequency = data_dict['frequency']
        if frequency:
            freq = json.loads(frequency)
            ftype = freq['type']
            fvalue = freq['value']
            # Facet key of the form "<value>-<type>".
            data_dict['frequency_id'] = '{value}-{type}'.format(
                type=ftype, value=fvalue)
            data_dict[
                'frequency_label'] = helpers.csc_dataset_display_frequency(
                    fvalue, ftype)
            #log.info('Frecuency = {f1}, frequency_id={f2}, frequency_label={f3}'.format(f1=frequency, f2=data_dict['frequency_id'], f3=data_dict['frequency_label']))
    if ('theme' in data_dict):
        # Resolve the 'theme' dataset field (multilingual choices).
        categoria = sh.scheming_field_by_name(
            dataset.get('dataset_fields'), 'theme')
        # Raw stored value; looks like a bracketed list of quoted strings
        # (e.g. '["a", "b"]') parsed below by string manipulation.
        valor_categoria = data_dict['theme']
        # Reset the indexed theme fields before re-populating them.
        data_dict['theme'] = []
        data_dict['theme_id'] = []
        data_dict['theme_es'] = []
        data_dict['theme_gl'] = []
        # Strip the surrounding brackets and split the quoted values.
        valores = valor_categoria.replace('[', '').replace(']', '')
        categorias = valores.split('", "')
        for term_categoria in list(categorias):
            clean_term = term_categoria.replace('"', '')
            data_dict['theme'].append(clean_term)
            data_dict['theme_id'].append(helpers.csc_theme_id(clean_term))
            # Look the key up in the schema choices to emit one label per
            # supported language.
            for option in categoria.get('choices'):
                if option['value'] == clean_term:
                    data_dict['theme_es'].append(option['label']['es'])
                    data_dict['theme_gl'].append(option['label']['gl'])
    return data_dict
def spc_thematic_area_list(context, data_dict):
    """Return the configured choices of the ``thematic_area_string``
    dataset field.

    Raises NotAuthorized when the caller may not list thematic areas.
    """
    tk.check_access('spc_thematic_area_list', context, data_dict)
    dataset_schema = scheming_helpers.scheming_get_dataset_schema('dataset')
    area_field = scheming_helpers.scheming_field_by_name(
        dataset_schema['dataset_fields'], 'thematic_area_string')
    return scheming_helpers.scheming_field_choices(area_field)
def _dge_harvest_list_dataset_field_labels(name_field=None, value_field=None):
    '''
    Returns the available values that the given dataset name_field may have.

    :param name_field: name of the dataset schema field whose choices are
        listed
    :param value_field: if given, only the matching choice is returned
        (as a single-entry dict)
    :returns: dict mapping each choice value to its label/description/
        dcat_ap/notation metadata; empty dict when nothing matches
    '''
    result = {}
    if name_field is not None:
        dataset = sh.scheming_get_schema('dataset', 'dataset')
        values = sh.scheming_field_by_name(dataset.get('dataset_fields'),
                                           name_field) or []
        # .get() avoids the KeyError previously raised for fields that
        # are declared without a 'choices' list.
        if values and values.get('choices'):
            for option in values['choices']:
                # .get('value') also guards malformed choice entries.
                if option and option.get('value'):
                    info = {
                        'label': option.get('label'),
                        'description': option.get('description'),
                        'dcat_ap': option.get('dcat_ap'),
                        'notation': option.get('notation')
                    }
                    if value_field:
                        if option['value'] == value_field:
                            # Early return: only the requested value.
                            return {option.get('value'): info}
                    else:
                        result[option.get('value')] = info
    return result
def has_published_date_field_in_schema(dataset_type):
    """Return True when the schema for *dataset_type* declares a
    ``published_date`` dataset field, False otherwise."""
    if not dataset_type:
        return False
    schema = h.scheming_get_schema('dataset', dataset_type)
    field = h.scheming_field_by_name(schema['dataset_fields'],
                                     "published_date")
    return bool(field)
def datawa_scheming_select_options(field_name):
    """Map choice values to labels for *field_name* of the dataset schema.

    :param field_name: name of a dataset field declaring ``choices``
    :returns: dict of ``{value: label}`` for every declared choice
    :raises TypeError/KeyError: when the field does not exist or has no
        choices.  (The previous ``except Exception as e: raise e`` added
        nothing and only obscured the traceback, so it was removed.)
    """
    schema = sh.scheming_get_dataset_schema("dataset")
    access_level_options = sh.scheming_field_by_name(
        schema["dataset_fields"], field_name)["choices"]
    return {i["value"]: i["label"] for i in access_level_options}
def get_choice_label(name, value, is_resource=False):
    """Translate *value* into its choice label for field *name* of the
    deposited-dataset schema.

    :param name: schema field name
    :param value: stored choice value
    :param is_resource: look in resource_fields instead of dataset_fields
    :returns: the matching label, or *value* unchanged when no choice
        matches or the field is missing
    """
    schema = scheming_get_dataset_schema('deposited-dataset')
    fields = schema['resource_fields'] if is_resource else schema[
        'dataset_fields']
    field = scheming_field_by_name(fields, name)
    # Guard the missing-field case: previously this crashed with
    # AttributeError on ``None.get``; mirrors the guarded variant used
    # elsewhere in the codebase.
    if not field:
        return value
    for choice in field.get('choices', []):
        if choice.get('value') == value:
            return choice.get('label')
    return value
def datawa_scheming_select_options(field_name):
    """Map choice values to labels for *field_name* of the dataset schema.

    :param field_name: name of a dataset field declaring ``choices``
    :returns: dict of ``{value: label}`` for every declared choice
    :raises TypeError/KeyError: when the field does not exist or has no
        choices.  (The previous ``except Exception as e: raise e`` added
        nothing and only obscured the traceback, so it was removed.)
    """
    schema = sh.scheming_get_dataset_schema('dataset')
    access_level_options = sh.scheming_field_by_name(
        schema['dataset_fields'], field_name)['choices']
    return {i['value']: i['label'] for i in access_level_options}
def get_field_label(name, is_resource=False):
    """Return the human label of field *name* from the deposited-dataset
    schema, falling back to *name* when the field declares no label.
    Logs a warning and returns None when the field is absent."""
    schema = scheming_get_dataset_schema('deposited-dataset')
    section = 'resource_fields' if is_resource else 'dataset_fields'
    field = scheming_field_by_name(schema[section], name)
    if not field:
        log.warning(
            'Could not get field {} from deposited-dataset schema'.format(
                name))
        return None
    return field.get('label', name)
def get_data_container_choice_label(name, value):
    """Translate *value* into its choice label for field *name* of the
    data-container organization schema.  Returns *value* unchanged when
    no choice matches, or None (with a warning) when the field itself is
    missing."""
    schema = scheming_get_organization_schema('data-container')
    field = scheming_field_by_name(schema['fields'], name)
    if not field:
        log.warning(
            'Could not get field {} from data-container schema'.format(name))
        return None
    matches = [c.get('label') for c in field.get('choices', [])
               if c.get('value') == value]
    return matches[0] if matches else value
def validator(key, data, errors, context):
    """Require this field whenever the companion field named by the
    schema option ``required_if_value_in`` has a value.

    :param key: flattened data key of the field being validated
    :returns: the (possibly missing) value when validation passes
    :raises Invalid: when this field is empty but the companion field
        has a value
    """
    value = data.get(key)
    try:
        other_value = data[(field.get('required_if_value_in'),)]
    except KeyError:
        # Companion field absent from the submitted data: treat it as
        # empty.  (Was a bare ``except:``, which also swallowed
        # KeyboardInterrupt and genuine programming errors.)
        other_value = ""
    if (not value or value is missing):
        if (not other_value or other_value is missing):
            return value
        raise Invalid(
            _('Required since "%s" is defined.') % sh.scheming_field_by_name(
                schema['dataset_fields'],
                field.get('required_if_value_in'))['label'])
    return value
def _get_classfication_field(dataset_type):
    """Return the 'classification' resource field definition from the
    schema of *dataset_type*, or None when the schema or its resource
    fields are unavailable."""
    scheme = h.scheming_get_schema('dataset', dataset_type)
    if not scheme:
        return None
    fields = scheme.get('resource_fields')
    if not fields:
        return None
    # ``or None`` preserves the original's normalization of falsy
    # lookup results to None.
    return h.scheming_field_by_name(fields, "classification") or None
def _map_gdl_to_publication(data_dict, obj):
    """Map a GDL harvest record to a CKAN 'publications' dataset dict.

    :param data_dict: decoded GDL record (id, title, authors, ...)
    :param obj: harvest object whose source URL is used to build the
        resource download link
    :returns: dataset dict ready for package creation
    """
    dataset = {
        # Deterministic id derived from the remote id so re-harvesting
        # maps to the same package.
        "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, str(data_dict['id']))),
        "type": "publications",
        "title": data_dict['title'],
        "creator": [a['name'] for a in data_dict['authors']],
        # "subject": data_dict,
        "notes": data_dict['description'],
        "publisher": data_dict.get('relatedOrganisation'),
        # "contributor": [a['name'] for a in data_dict['authors']],
        "date": data_dict.get('created'),
        "metadata_modified": data_dict.get('created'),
        # "publication_type": data_dict,
        # "format": data_dict,
        "identifier": data_dict['identifier'],
        "source": data_dict.get('source'),
        # "language": data_dict,
        # "relation": data_dict,
        # "spatial": data_dict,
        # "rights": data_dict,
        "license_id": 'notspecified',
        "member_countries": 'other',  # relatedCountry, optional
        "harvest_source": 'GDL'
    }
    thematic_area = data_dict.get('thematicArea', {}).get('area')
    if thematic_area:
        dataset["thematic_area_string"] = thematic_area_mapping.get(
            thematic_area)
    related_country = data_dict.get('relatedCountry')
    if related_country:
        # Match the remote country name against the labels of the
        # 'member_countries' choices of the publications schema.
        schema = sh.scheming_get_dataset_schema('publications')
        choices = sh.scheming_field_by_name(schema['dataset_fields'],
                                            'member_countries')['choices']
        member_country = F.first(
            F.filter(
                F.compose(F.rpartial(contains, related_country),
                          itemgetter('label')), choices))
        if member_country:
            dataset['member_countries'] = member_country['value']
            spatial = get_extent_for_country(member_country['label'])
            if spatial:
                dataset['spatial'] = spatial['value']
    if data_dict['file']:
        # Build a resource pointing at the GDL download endpoint; the
        # format is inferred from the file extension.
        res_url = _gl_url(obj.source.url, 'download') + '?id=' + str(
            data_dict['id'])
        res = {'name': data_dict['file'], 'url': res_url}
        res['format'] = splitext(res['name'])[1].lstrip('.')
        dataset['resources'] = [res]
    return dataset
def get_choice_label(name, value, is_resource=False):
    """Translate *value* into its choice label for field *name* of the
    deposited-dataset schema.  Returns *value* unchanged when no choice
    matches, or None (with a warning) when the field is missing."""
    schema = scheming_get_dataset_schema('deposited-dataset')
    section = 'resource_fields' if is_resource else 'dataset_fields'
    field = scheming_field_by_name(schema[section], name)
    if not field:
        log.warning(
            'Could not get field {} from deposited-dataset schema'.format(
                name))
        return None
    for choice in field.get('choices', []):
        if choice.get('value') == value:
            return choice.get('label')
    return value
def _csc_dcat_list_resource_field_values(name_field=None):
    '''
    Returns the available values that the given resource name_field may have.

    :param name_field: resource schema field whose declared choices are read
    :returns: list of choice values (empty when the field is missing or
        declares no choices)
    '''
    result = []
    if name_field is not None:
        dataset = sh.scheming_get_schema('dataset', 'dataset')
        values = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                           name_field) or []
        # .get() avoids the KeyError previously raised for fields that
        # are declared without a 'choices' list.
        if values and values.get('choices'):
            result = [
                option['value'] for option in values['choices']
                # .get('value') also guards malformed choice entries.
                if option and option.get('value')
            ]
    return result
def dge_list_themes(themes=None):
    '''
    Given a theme value list, get their translated labels.

    :param themes: theme values (None is treated as an empty list)
    :type themes: string list
    :rtype: (string, string) list of (theme_id, label) pairs
    '''
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    formats = sh.scheming_field_by_name(dataset.get('dataset_fields'),
                                        'theme')
    label_list = []
    # ``or []`` guards the default: iterating None raised TypeError.
    for theme in themes or []:
        label = sh.scheming_choices_label(formats['choices'], theme)
        if label:
            label_list.append((dge_theme_id(theme), label))
    return label_list
def dge_resource_format_label(res_format=None):
    '''
    Given a format value, get its label.

    :param res_format: format value
    :type res_format: string
    :rtype: string -- the label, or *res_format* unchanged when no label
        is found
    '''
    # BUG FIX: the original tested ``if format:`` which refers to the
    # *builtin* function (always truthy) instead of the parameter, so the
    # lookup ran even for empty values.  Test the parameter instead.
    if res_format:
        dataset = sh.scheming_get_schema('dataset', 'dataset')
        formats = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                            'format')
        res_format_label = sh.scheming_choices_label(formats['choices'],
                                                     res_format)
        if res_format_label:
            return res_format_label
    return res_format
def after_search(self, search_results, search_params):
    # IPackageController hook: replace facet item names with the
    # translated labels taken from the scheming schema choices.
    facets = search_results.get('search_facets')
    results = search_results.get('results')
    if not facets or not results:
        return search_results
    # NOTE(review): the schema of the first result is assumed to apply
    # to every facet -- confirm for mixed-type result pages.
    schema = scheming_helpers.scheming_get_dataset_schema(results[0]['type'])
    for facet in facets.values():
        for item in facet['items']:
            # Facet titles follow the '<field>_facet' naming convention.
            field_name = facet['title'].replace('_facet', '')
            field = scheming_helpers.scheming_field_by_name(
                schema['dataset_fields'], field_name)
            # Only translate fields that actually declare choices
            # (directly or through a choices helper).
            if field and (field.get('choices')
                          or field.get('choices_helper')):
                choices = scheming_helpers.scheming_field_choices(field)
                item['display_name'] = scheming_helpers.\
                    scheming_choices_label(choices, item['name'])
    return search_results
def _extract_additional_fields(self, content, package_dict):
    # Post-process a harvested record: copy non-conflicting keys from
    # the raw content into the package dict and normalize the result
    # into a 'publications' dataset.
    package_dict['thematic_area_string'] = self.topic
    if not package_dict.get('license_id'):
        package_dict['license_id'] = 'notspecified'
    skip_keys = {'set_spec', 'description'}
    for key, value in content.items():
        if key in package_dict or key in skip_keys:
            continue
        if key == 'type':
            # 'type' is reserved by CKAN; remap to publication_type.
            key = 'publication_type'
        package_dict[key] = value
    package_dict.pop('extras', None)
    package_dict['type'] = 'publications'
    package_dict.pop('maintainer_email', None)
    coverage = package_dict.pop('coverage', None)
    if coverage:
        # Translate coverage country names into member_countries choice
        # values; fall back to 'other' when none match.
        schema = scheming_get_dataset_schema('publications')
        field = scheming_field_by_name(schema['dataset_fields'],
                                       'member_countries')
        choices = scheming_field_choices(field)
        package_dict['member_countries'] = [
            choice['value'] for choice in choices
            if choice['label'] in coverage
        ] or ['other']
        # Pick the EEZ polygons whose GeoName mentions a covered country.
        polygons = [
            t['geometry'] for t in eez.collection
            if any(country in t['properties']['GeoName']
                   for country in coverage)
        ]
        # TODO: for now we are taking first polygon from possible
        # list because of SOLR restriction of spatial field
        # size. In future we may add additional logic here
        if polygons:
            package_dict['coverage'] = json.dumps(polygons[0])
    return package_dict
def after_search(self, search_results, search_params):
    # IPackageController hook: translate the 'theme_id' and
    # 'administration_level' facet display names into the request
    # language, falling back to the configured default locale.
    if not is_frontend():
        # Only frontend requests are translated.
        return search_results
    # Translate the unselected search facets.
    facets = search_results.get('search_facets')
    if not facets:
        return search_results
    desired_lang_code = pylons.request.environ['CKAN_LANG']
    fallback_lang_code = pylons.config.get('ckan.locale_default', 'es')
    # Build a {theme_id: translated label} map from the schema choices.
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    categoria = sh.scheming_field_by_name(dataset.get('dataset_fields'),
                                          'theme')
    dict_categoria = {}
    for option in categoria.get('choices'):
        label_option = (option.get('label')).get(desired_lang_code, None)
        if not label_option:
            label_option = (option.get('label')).get(
                fallback_lang_code, None)
        dict_categoria[helpers.dge_theme_id(
            option.get('value'))] = label_option
    facet = facets.get('theme_id', None)
    if facet:
        for item in facet.get('items', None):
            item['display_name'] = dict_categoria.get(
                item.get('name'), item.get('display_name'))
            # Expose the raw value as a CSS class hook.
            item['class'] = item.get('name')
    facet = facets.get('administration_level', None)
    if facet:
        for item in facet.get('items', None):
            item[
                'display_name'] = helpers.dge_get_translated_administration_level(
                    item.get('name'))
    return search_results
def after_search(self, search_results, search_params):
    # IPackageController hook: translate the 'theme_id' and
    # 'frequency_id' facet display names into the request language,
    # falling back to the configured default locale.
    # Translate the unselected search facets.
    facets = search_results.get('search_facets')
    if not facets:
        return search_results
    desired_lang_code = request.environ.get('CKAN_LANG')
    fallback_lang_code = config.get('ckan.locale_default', 'es')
    # Build a {theme_id: translated label} map from the schema choices.
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    categoria = sh.scheming_field_by_name(dataset.get('dataset_fields'),
                                          'theme')
    dict_categoria = {}
    for option in categoria.get('choices'):
        label_option = (option.get('label')).get(desired_lang_code, None)
        if not label_option:
            label_option = (option.get('label')).get(
                fallback_lang_code, None)
        dict_categoria[helpers.csc_theme_id(
            option.get('value'))] = label_option
    facet = facets.get('theme_id', None)
    if facet:
        for item in facet.get('items', None):
            item['display_name'] = dict_categoria.get(
                item.get('name'), item.get('display_name'))
            # Expose the raw value as a CSS class hook.
            item['class'] = item.get('name')
    facet = facets.get('frequency_id', None)
    if facet:
        for item in facet.get('items', None):
            #log.info("facet {facet}".format(facet=facet))
            # frequency_id is built as '<value>-<type>' by before_index.
            # NOTE(review): a name without '-' would raise IndexError on
            # value[1] -- confirm every id contains the separator.
            value = item.get('name', '').split('-')
            item['display_name'] = helpers.csc_dataset_display_frequency(
                value[0], value[1])
    return search_results
def dge_list_reduce_resource_format_label(resources=None, field_name='format'):
    '''
    Given a resource list, collect the translated labels of each
    resource's format value.

    :param resources: resource dict list
    :type resources: dict list
    :param field_name: resource key holding the format value
    :type field_name: string
    :rtype: string list
    '''
    format_values = h.dict_list_reduce(resources, field_name)
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    format_field = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                             'format')
    # Keep only the values for which the schema knows a label.
    return [
        label for label in
        (sh.scheming_choices_label(format_field['choices'], value)
         for value in format_values) if label
    ]
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in import stage: %s" % harvest_object.guid)
    if not harvest_object:
        logger.error('No harvest object received')
        self._save_object_error('No harvest object received')
        return False
    try:
        self._set_config(harvest_object.job.source.config)
        package_dict = json.loads(harvest_object.content)
        data_dict = {}
        data_dict['id'] = package_dict['id']
        data_dict['title'] = package_dict['title']
        data_dict['name'] = munge_title_to_name(package_dict['name'])
        data_dict['notes'] = markdown_extract(
            package_dict.get('description'))
        tags = package_dict.get('keyword', [])
        data_dict['tag_string'] = ', '.join(
            [munge_tag(tag) for tag in tags])
        data_dict['private'] = False
        # The license id is the last path segment of the license URI;
        # one specific UUID maps to the local sprep license.
        license_id = package_dict.get('license',
                                      'cc-by').strip('/').split('/')[-1]
        if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
            license_id = 'sprep-public-license'
        data_dict['license_id'] = license_id or 'notspecified'
        data_dict['created'] = _parse_drupal_date(package_dict['issued'])
        data_dict['modified'] = _parse_drupal_date(
            package_dict['modified'])
        # Contact info is skipped when the email is the redacted
        # placeholder.
        c_point, c_email = package_dict['contactPoint'][
            'fn'], package_dict['contactPoint']['hasEmail'].split(':')[-1]
        if c_email != '*****@*****.**':
            data_dict['contact_uri'] = c_point
            data_dict['contact_email'] = c_email
        data_dict['resources'] = []
        for res in package_dict.get('distribution', []):
            # res['issued'] = _parse_drupal_date(res.pop('created'))
            # res['modified'] = _parse_drupal_date(
            #     res.pop('last_modified').replace('Date changed ', '')
            # )
            res['url'] = res.get('downloadURL') or res.get('accessURL')
            res['name'] = res['title']
            res['description'] = markdown_extract(res.get('description'))
            data_dict['resources'].append(res)
        if 'spatial' in package_dict:
            # Parse the POLYGON(...) text into GeoJSON; drop the field
            # when the geometry is invalid or unparsable.
            data_dict['spatial'] = package_dict.pop('spatial')
            try:
                geometry = {
                    "type": "Polygon",
                    "coordinates": [[[float(c) for c in pair.split()]
                                     for pair in RE_SPATIAL.match(
                                         data_dict['spatial']).group(
                                             1).split(', ')]]
                }
                shape = shapely.geometry.asShape(geometry)
                if shape.is_valid and shape.is_closed:
                    data_dict['spatial'] = json.dumps(geometry)
                else:
                    del data_dict['spatial']
            except KeyError:
                pass
            except (AttributeError, ValueError):
                del data_dict['spatial']
                # logger.warn('-' * 80)
                # logger.warn('Failed parsing of spatial field: %s', data_dict['spatial'])
        # package_dict.pop('type')
        # add owner_org
        source_dataset = get_action('package_show')(
            {
                'ignore_auth': True
            }, {
                'id': harvest_object.source.id
            })
        owner_org = source_dataset.get('owner_org')
        data_dict['owner_org'] = owner_org
        data_dict['member_countries'] = country_mapping[None]
        if 'isPartOf' in package_dict:
            # Derive the member country (and owning org) from the
            # 'isPartOf' domain prefix.
            country = package_dict['isPartOf'].split('.')[0]
            data_dict['member_countries'] = country_mapping.get(
                country, country_mapping[None])
            org = model.Session.query(
                model.Group).filter_by(name=country + '-data').first()
            if org:
                data_dict['owner_org'] = org.id
            if 'spatial' in package_dict:
                data_dict['spatial'] = package_dict['spatial']
                try:
                    data_dict['spatial'] = json.dumps({
                        "type": "Polygon",
                        "coordinates":
                        [[[float(c) for c in pair.split()]
                          for pair in RE_SPATIAL.match(
                              data_dict['spatial']).group(1).split(', ')]]
                    })
                except KeyError:
                    pass
            # package_dict.pop('type')
        else:
            # No explicit country: resolve the label of the default
            # member country and fall back to its extent.
            schema = sh.scheming_get_dataset_schema('dataset')
            choices = sh.scheming_field_by_name(
                schema['dataset_fields'], 'member_countries')['choices']
            member_country = sh.scheming_choices_label(
                choices, data_dict['member_countries'])
            if member_country:
                spatial = get_extent_for_country(member_country)
                if spatial:
                    data_dict['spatial'] = spatial['value']
        data_dict['source'] = package_dict.get('landingPage')
        # NOTE(review): duplicated assignment kept from the original.
        data_dict['theme'] = package_dict.get('theme', [])
        data_dict['theme'] = package_dict.get('theme', [])
        data_dict['thematic_area_string'] = _map_theme_to_topic(
            data_dict['theme'])
        data_dict['harvest_source'] = 'SPREP'
        self._create_or_update_package(data_dict, harvest_object,
                                       'package_show')
        Session.commit()
        # Re-read the stored package so default resource views can be
        # created for every resource.
        stored_package = get_action('package_show')({
            'ignore_auth': True
        }, {
            'id': data_dict['id']
        })
        for res in stored_package.get('resources', []):
            get_action('resource_create_default_resource_views')(
                {
                    'ignore_auth': True
                }, {
                    'package': stored_package,
                    'resource': res
                })
        logger.debug("Finished record")
    except:
        # Catch-all so a single bad record does not abort the whole
        # harvest job; the error is stored on the harvest object.
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in import stage',
                                harvest_object)
        return False
    return True
def dge_harvest_catalog_show(context, data_dict):
    '''Serialize the whole catalog to RDF or CSV and write it to disk.

    :param context: CKAN action context
    :param data_dict: may contain 'format' (RDF/CSV), 'limit' and paging
        options for the underlying dataset search
    :returns: the serialized catalog as a string, or None on error
    '''
    method_log_prefix = '[%s][dge_harvest_catalog_show]' % __name__
    output = None
    try:
        log.debug('%s Init method. Inputs context=%s, data_dict=%s' %
                  (method_log_prefix, context, data_dict))
        ini = datetime.datetime.now()
        toolkit.check_access('dge_harvest_catalog_show', context, data_dict)
        page = 1
        data_dict['page'] = page
        limit = data_dict.get('limit', -1)
        _format = data_dict.get('format')
        # Pick the output file path (and, for CSV, the column spec).
        if _format == RDF_FORMAT:
            filepath = config.get('ckanext.dge_harvest.rdf.filepath',
                                  '/tmp/catalog.rdf')
        elif _format == CSV_FORMAT:
            filepath = config.get('ckanext.dge_harvest.csv.filepath',
                                  '/tmp/catalog.csv')
            columnsfilepath = config.get(
                'ckanext.dge_harvest.csv.columns.filepath',
                '/usr/lib/ckan/default/src/ckanext-dge-harvest/ckanext/dge_harvest/commands/columns.json'
            )
        else:
            filepath = '/tmp/catalog.' + _format
        # Page through the search until every dataset is collected (or
        # the requested limit is reached).
        query = _dge_harvest_search_ckan_datasets(context, data_dict)
        dataset_dicts = query['results']
        total_datasets = query['count']
        log.debug('%s Total_datasets obtenidos en la query: %s' %
                  (method_log_prefix, total_datasets))
        if limit > -1 and limit < total_datasets:
            total_datasets = limit
        num = len(dataset_dicts)
        log.debug('%s Total_datasets a exportar: %s' %
                  (method_log_prefix, total_datasets))
        while (total_datasets > num):
            page = page + 1
            data_dict['page'] = page
            query = _dge_harvest_search_ckan_datasets(context, data_dict)
            dataset_dicts.extend(query['results'])
            total_datasets = query['count']
            num = len(dataset_dicts)
            log.debug('%s Total_datasets obtenidos en la query: %s' %
                      (method_log_prefix, total_datasets))
        log.debug('%s Total_datasets a exportar: %s' %
                  (method_log_prefix, num))
        if _format == RDF_FORMAT:
            serializer = DGERDFSerializer()
            #log.debug("%s DATASET_DICTS = %s" % (method_log_prefix,dataset_dicts))
            output = serializer.serialize_catalog(
                {},
                dataset_dicts,
                _format=data_dict.get('format'),
                pagination_info=None)
        elif _format == CSV_FORMAT and columnsfilepath:
            #log.info('%s Dataset_dicts de partida =%s' % (method_log_prefix, dataset_dicts))
            organizations = {}
            themes = dhh.dge_harvest_dict_theme_option_label()
            spatial_coverages = dhh.dge_harvest_dict_spatial_coverage_option_label(
            )
            _dataset = sh.scheming_get_schema('dataset', 'dataset')
            res_format = sh.scheming_field_by_name(
                _dataset.get('resource_fields'), 'format')
            format_values = res_format['choices']
            # Caches for format labels and organization titles.
            formats = {}
            datasets = []
            num = 0
            for dataset in dataset_dicts:
                # Flatten each dataset into the row dict consumed by the
                # losser column spec.
                ds = {}
                # Id
                #ds['id'] = _encode_value(dataset.get('id', None))
                # URL
                ds['url'] = dataset_uri(dataset)
                # Description
                descriptions = _from_dict_to_string(
                    dataset.get(dhc.DS_DESCRIPTION, None))
                ds['description'] = _encode_value(descriptions, True)
                # Title
                titles = _from_dict_to_string(
                    dataset.get(dhc.DS_TITLE_TRANSLATED, None))
                ds['title'] = _encode_value(titles, True)
                # Theme (Spanish labels)
                theme_values = dataset.get(dhc.DS_THEME, None)
                theme_labels = []
                if theme_values:
                    for value in theme_values:
                        theme = themes.get(value)
                        if theme and theme.get('label'):
                            theme_labels.append(
                                theme.get('label').get('es'))
                theme_value = _from_list_to_string(theme_labels)
                ds['theme'] = _encode_value(theme_value, True)
                # Keywords
                tags = dataset.get(dhc.DS_TAGS)
                value = None
                if tags and len(tags) > 0:
                    for tag in tags:
                        stag = tag.get('name', None)
                        if stag:
                            if value:
                                value = '%s%s%s' % (value, MAIN_SEPARATOR,
                                                    stag)
                            else:
                                value = stag
                ds['tags'] = _encode_value(value, True)
                # Identifier
                ds['identifier'] = _encode_value(
                    dataset.get('identifier', None), True)
                # Created
                ds['issued_date'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_ISSUED_DATE, None)))
                # Modified
                ds['modified_date'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_MODIFIED_DATE, None)))
                # Accrual periodicity, encoded as [TYPE]...[VALUE]...
                frequency = dataset.get(dhc.DS_FREQUENCY)
                if (frequency):
                    stype = frequency.get('type', '')
                    if stype and len(stype) > 0:
                        stype = 'http://www.w3.org/2006/time#' + stype
                    svalue = frequency.get('value', '')
                    sfrequency = '[TYPE]%s[VALUE]%s' % (stype, svalue)
                    ds['frequency'] = _encode_value(sfrequency, True)
                # Language
                languages = _from_list_to_string(
                    dataset.get(dhc.DS_LANGUAGE))
                ds['language'] = _encode_value(languages, True)
                # Publisher (organization titles are cached)
                publisher = dataset.get(dhc.DS_PUBLISHER, None)
                if publisher:
                    if publisher in organizations:
                        ds['publisher'] = _encode_value(
                            organizations.get(publisher, None), True)
                    else:
                        organization = h.get_organization(publisher, False)
                        if organization:
                            organizations[publisher] = organization.get(
                                'title',
                                organization.get('display_name', None))
                            ds['publisher'] = _encode_value(
                                organizations.get(publisher), True)
                # License
                ds['license_id'] = _encode_value(
                    dataset.get(dhc.DS_LICENSE), True)
                # Spatial (Spanish labels)
                spatial_values = dataset.get(dhc.DS_SPATIAL, None)
                spatial_labels = []
                if spatial_values:
                    for value in spatial_values:
                        spatial = spatial_coverages.get(value)
                        if spatial and spatial.get('label') and spatial.get(
                                'label').get('es'):
                            spatial_labels.append(
                                spatial.get('label').get('es'))
                spatials = _from_list_to_string(spatial_labels)
                ds['spatial'] = _encode_value(spatials, True)
                # Temporal coverage, encoded as 'from-to' ranges
                temporal_coverage = dataset.get(dhc.DS_TEMPORAL_COVERAGE)
                if temporal_coverage:
                    value = None
                    for tc in temporal_coverage.itervalues():
                        if tc:
                            tc_from = _from_iso8601_date_to_string(
                                tc.get('from', None))
                            tc_to = _from_iso8601_date_to_string(
                                tc.get('to', None))
                            if tc_from or tc_to:
                                if value:
                                    value = '%s%s%s-%s' % (
                                        value, MAIN_SEPARATOR,
                                        (tc_from or ''), (tc_to or ''))
                                else:
                                    value = '%s-%s' % ((tc_from or ''),
                                                       (tc_to or ''))
                    ds['coverage_new'] = _encode_value(value, True)
                # Valid
                ds['valid'] = _encode_value(
                    _from_iso8601_date_to_string(
                        dataset.get(dhc.DS_VALID, None)), True)
                # References
                references = _from_list_to_string(
                    dataset.get(dhc.DS_REFERENCE, None))
                ds['references'] = _encode_value(references, True)
                # Normative
                conforms_to = _from_list_to_string(
                    dataset.get(dhc.DS_NORMATIVE, None))
                ds['conforms_to'] = _encode_value(conforms_to, True)
                # Resources, each encoded as
                # name[ACCESS_URL]..[MEDIA_TYPE]..[BYTE_SIZE]..[RELATION]..
                resources = dataset.get(dhc.DS_RESOURCES)
                sresources = []
                if resources:
                    for resource in resources:
                        sresource = None
                        if resource:
                            name = _from_dict_to_string(
                                resource.get(
                                    dhc.DS_RESOURCE_NAME_TRANSLATED,
                                    None), 'TITLE_')
                            if not name:
                                name = ''
                            url = resource.get(dhc.DS_RESOURCE_ACCESS_URL,
                                               '')
                            if url:
                                url = '[ACCESS_URL]%s' % (url)
                            format_value = resource.get(
                                dhc.DS_RESOURCE_FORMAT, None)
                            format = None
                            if format_value:
                                if format_value in formats:
                                    format = formats.get(format_value, None)
                                else:
                                    formats[
                                        format_value] = sh.scheming_choices_label(
                                            format_values, format_value)
                                    format = formats.get(format_value, None)
                            if format:
                                format = '[MEDIA_TYPE]%s' % (format)
                            size = resource.get(dhc.DS_RESOURCE_BYTE_SIZE,
                                                '')
                            if size:
                                size = '[BYTE_SIZE]%s' % (size)
                            relation = _from_list_to_string(
                                resource.get(dhc.DS_RESOURCE_RELATION,
                                             None), SECONDARY_SEPARATOR)
                            relations = ''
                            if relation:
                                relations = '[RELATION]%s' % (relation)
                            sresource = '%s%s%s%s%s' % (name, url, format,
                                                        size, relations)
                        if sresource and len(sresource) > 0:
                            sresources.append(sresource)
                if len(sresources) > 0:
                    value = None
                    for item in sresources:
                        if value:
                            value = '%s%s%s' % (value, MAIN_SEPARATOR,
                                                item)
                        else:
                            value = item
                    ds['resources'] = _encode_value(value, True)
                num = num + 1
                datasets.append(ds)
            #log.debug('%s Datasets con datos a exportar=%s' % (method_log_prefix, datasets))
            log.debug('%s Numero de datasets con datos a exportar...%s' %
                      (method_log_prefix, num))
            output = losser.losser.table(datasets,
                                         columnsfilepath,
                                         csv=True,
                                         pretty=False)
        # Persist the serialized catalog to disk.
        if filepath:
            file = None
            try:
                file = open(filepath, "w")
                file.write(output)
                file.close()
            except:
                # Best effort: make sure the handle is closed on failure.
                if file and not file.closed:
                    file.close()
        end = datetime.datetime.now()
        log.debug(
            "%s Time in serialize %s catalog [%s] with %s datasets ... %s milliseconds"
            % (method_log_prefix, _format, filepath, total_datasets,
               int((end - ini).total_seconds() * 1000)))
    except Exception, e:
        log.error("%s Exception %s: %s" %
                  (method_log_prefix, type(e).__name__, e))
        output = None
def get_field_label(name, is_resource=False):
    """Return the label of field *name* from the deposited-dataset schema.

    :param name: schema field name
    :param is_resource: look in resource_fields instead of dataset_fields
    :returns: the field's label, falling back to *name* when the field
        declares no label or does not exist
    """
    schema = scheming_get_dataset_schema('deposited-dataset')
    fields = schema['resource_fields'] if is_resource else schema[
        'dataset_fields']
    # ``or {}`` guards the AttributeError previously raised when the
    # field is absent from the schema; we fall back to the raw name.
    field = scheming_field_by_name(fields, name) or {}
    return field.get('label', name)
def before_index(self, data_dict):
    # IPackageController hook: add translated label fields to the dict
    # sent to SOLR (formats, publisher, administration level, themes).
    dataset = sh.scheming_get_schema('dataset', 'dataset')
    if ('res_format' in data_dict):
        # Resolve the 'format' resource field to map values to labels.
        formats = sh.scheming_field_by_name(dataset.get('resource_fields'),
                                            'format')
        # New SOLR field holding one label per indexed format value.
        data_dict['res_format_label'] = []
        for res_format in data_dict['res_format']:
            # Translate the raw value through the schema choices.
            res_format_label = sh.scheming_choices_label(
                formats['choices'], res_format)
            if res_format_label:
                # Only formats known to the schema contribute a label.
                data_dict['res_format_label'].append(res_format_label)
    if ('publisher' in data_dict):
        organismo = data_dict['publisher']
        # Frontend requests resolve the publisher through a dedicated
        # action; backend requests use the plain organization helper.
        if is_frontend():
            publisher = toolkit.get_action('dge_organization_publisher')(
                {
                    'model': model
                }, {
                    'id': organismo
                })
        else:
            publisher = h.get_organization(organismo)
        data_dict['publisher'] = publisher.get('id')
        data_dict['publisher_display_name'] = publisher.get('display_name')
        # Normalize the administration level, falling back to the
        # default unit when unknown or untranslated.
        administration_level_code = helpers.dge_get_organization_administration_level_code(
            publisher)
        if not administration_level_code or administration_level_code not in TRANSLATED_UNITS:
            administration_level_code = DEFAULT_UNIT
        data_dict['administration_level'] = administration_level_code
        data_dict['administration_level_es'] = TRANSLATED_UNITS[
            administration_level_code]['es'] or ''
        data_dict['administration_level_en'] = TRANSLATED_UNITS[
            administration_level_code]['en'] or ''
        data_dict['administration_level_ca'] = TRANSLATED_UNITS[
            administration_level_code]['ca'] or ''
        data_dict['administration_level_eu'] = TRANSLATED_UNITS[
            administration_level_code]['eu'] or ''
        data_dict['administration_level_gl'] = TRANSLATED_UNITS[
            administration_level_code]['gl'] or ''
    if ('theme' in data_dict):
        # Resolve the 'theme' dataset field (multilingual choices).
        categoria = sh.scheming_field_by_name(
            dataset.get('dataset_fields'), 'theme')
        # Raw stored value: a bracketed list of quoted strings.
        valor_categoria = data_dict['theme']
        # Reset the indexed theme fields before re-populating them.
        data_dict['theme'] = []
        data_dict['theme_id'] = []
        data_dict['theme_es'] = []
        data_dict['theme_en'] = []
        data_dict['theme_ca'] = []
        data_dict['theme_eu'] = []
        data_dict['theme_gl'] = []
        # Strip the surrounding brackets and split the quoted values.
        valores = valor_categoria.replace('[', '').replace(']', '')
        categorias = valores.split('", "')
        for term_categoria in list(categorias):
            clean_term = term_categoria.replace('"', '')
            data_dict['theme'].append(clean_term)
            data_dict['theme_id'].append(helpers.dge_theme_id(clean_term))
            # Look the key up in the schema choices to emit one label
            # per supported language.
            for option in categoria.get('choices'):
                if option['value'] == clean_term:
                    data_dict['theme_es'].append(option['label']['es'])
                    data_dict['theme_en'].append(option['label']['en'])
                    data_dict['theme_ca'].append(option['label']['ca'])
                    data_dict['theme_eu'].append(option['label']['eu'])
                    data_dict['theme_gl'].append(option['label']['gl'])
    return data_dict
def _get_process_state_field(dataset_type):
    """Return the 'process_state' dataset field definition from the
    scheming schema of *dataset_type*, or None when the schema is
    unknown."""
    schema = h.scheming_get_schema('dataset', dataset_type)
    if schema:
        return h.scheming_field_by_name(schema['dataset_fields'],
                                        "process_state")
    return None
def import_stage(self, harvest_object):
    """Import stage of the harvester: create, update or delete the CKAN
    package for *harvest_object*.

    Behavior visible in this method:
    - status 'delete' removes the package and returns True;
    - empty content is recorded as an object error and returns False;
    - 'member_countries' labels are mapped to choice values (defaulting
      to ['other']) and the first matching EEZ polygon becomes 'coverage';
    - 'new' creates the package (with deferred FK constraints so the
      package can be indexed with the harvest object id), 'change'
      updates it, falling back to create when the update raises NotFound.
    Returns True on success, False on error.
    """
    log.debug('In PRDREngergyResourcesHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    self._set_config(harvest_object.job.source.config)

    # force_import always re-applies changes regardless of stored status.
    if self.force_import:
        status = 'change'
    else:
        status = self._get_object_extra(harvest_object, 'status')

    if status == 'delete':
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        p.toolkit.get_action('package_delete')(
            context, {'id': harvest_object.package_id})
        log.info('Deleted package {0} with guid {1}'.format(
            harvest_object.package_id, harvest_object.guid))
        return True

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    # Get the last harvested object (if any)
    previous_object = model.Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True) \
        .first()

    # Flag previous object as not current anymore
    if previous_object and not self.force_import:
        previous_object.current = False
        previous_object.add()

    package_dict = self._get_package_dict(harvest_object)
    if not package_dict:
        return False

    if not package_dict.get('name'):
        package_dict['name'] = \
            self._get_package_name(harvest_object, package_dict['title'])

    # copy across resource ids from the existing dataset, otherwise they'll
    # be recreated with new ids
    if status == 'change':
        existing_dataset = self._get_existing_dataset(harvest_object.guid)
        if existing_dataset:
            copy_across_resource_ids(existing_dataset, package_dict)

    # Allow custom harvesters to modify the package dict before creating
    # or updating the package
    package_dict = self.modify_package_dict(package_dict, harvest_object)

    # Unless already set by an extension, get the owner organization (if
    # any) from the harvest source dataset
    if not package_dict.get('owner_org'):
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org

    if not package_dict.get('license_id'):
        package_dict['license_id'] = 'notspecified'

    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()

    context = {
        'user': self._get_user_name(),
        'return_id_only': True,
        'ignore_auth': True,
    }

    # Map member-country *labels* from the harvested data to the scheming
    # choice *values*; anything unmatched collapses to ['other'].
    package_schema = scheming_get_dataset_schema('dataset')
    field = scheming_field_by_name(package_schema['dataset_fields'],
                                   'member_countries')
    choices = scheming_field_choices(field)
    mem_temp_list = [
        x for x in package_dict['member_countries'] if x is not None
    ]
    package_dict['member_countries'] = [
        choice['value'] for choice in choices
        if choice['label'] in mem_temp_list
    ] or ['other']

    # Pick EEZ polygons whose GeoName mentions any member country.
    polygons = [
        t['geometry'] for t in eez.collection
        if any(country in t['properties']['GeoName']
               for country in mem_temp_list)
    ]
    # TODO: for now we are taking first polygon from possible
    # list because of SOLR restriction of spatial field
    # size. In future we may add additional logic here
    if polygons:
        package_dict['coverage'] = json.dumps(polygons[0])

    if status == 'new':
        # context['schema'] = package_schema

        # We need to explicitly provide a package ID
        package_dict['id'] = unicode(uuid.uuid4())
        # package_schema['id'] = [unicode]

        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()

        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        model.Session.execute(
            'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()

        package_id = \
            p.toolkit.get_action('package_create')(context, package_dict)
        log.info('Created dataset with id %s', package_id)
    elif status == 'change':
        package_dict['id'] = harvest_object.package_id
        try:
            package_id = \
                p.toolkit.get_action('package_update')(context, package_dict)
            log.info('Updated dataset with id %s', package_id)
        except NotFound:
            # The stored package id no longer resolves: recreate it,
            # minting a fresh id only when the object carries none.
            log.info('Update returned NotFound, trying to create new Dataset.')
            if not harvest_object.package_id:
                package_dict['id'] = unicode(uuid.uuid4())
                harvest_object.package_id = package_dict['id']
                harvest_object.add()
            else:
                package_dict['id'] = harvest_object.package_id
            package_id = \
                p.toolkit.get_action('package_create')(context, package_dict)
            log.info('Created dataset with id %s', package_id)

    model.Session.commit()

    # Re-read the stored package and build the default views for each
    # of its resources.
    stored_package = p.toolkit.get_action('package_show')(context.copy(), {
        'id': package_id
    })
    for res in stored_package.get('resources', []):
        p.toolkit.get_action('resource_create_default_resource_views')(
            context.copy(), {
                'package': stored_package,
                'resource': res
            })

    return True
def graph_from_dataset(self, dataset_dict, dataset_ref):
    '''
    Given a CKAN dataset dict, creates an RDF graph

    The class RDFLib graph (accessible via `self.g`)
    should be updated on this method

    `dataset_dict` is a dict with the dataset metadata like the one
    returned by `package_show`. `dataset_ref` is an rdflib URIRef
    object that must be used to reference the dataset when working
    with the graph.

    Serializes titles/descriptions (translated), themes, tags,
    identifier, dates, accrual periodicity, languages, publisher,
    spatial/temporal coverage, references, conformsTo, license and one
    dcat:Distribution per resource. Any exception is caught and logged;
    nothing is re-raised.
    '''
    method_log_prefix = '[%s][graph_from_dataset]' % type(
        self).__name__
    #log.debug('%s Init method. Inputs dataset_dict=%r, dataset_ref=%r' % (method_log_prefix, dataset_dict, dataset_ref))
    #log.debug('%s Init method. Inputs, dataset_ref=%r' % (method_log_prefix, dataset_ref))
    try:
        g = self.g

        # Bind all known namespace prefixes on the graph.
        # NOTE(review): iteritems/except-comma below are Python 2 syntax.
        for prefix, namespace in namespaces.iteritems():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Title
        self._add_translated_triple_field_from_dict(
            dataset_dict, dataset_ref, DCT.title, DS_TITLE_TRANSLATED,
            None)

        # Description
        self._add_translated_triple_field_from_dict(
            dataset_dict, dataset_ref, DCT.description, DS_DESCRIPTION,
            None)

        # Theme: one dcat:theme per entry, enriched with cached label /
        # description / dcat_ap / notation data when available.
        value = self._get_dict_value(dataset_dict, DS_THEME)
        if value:
            themes = dataset_dict.get(EXPORT_AVAILABLE_THEMES, {})
            for theme in value:
                #self._add_resource_list_triple(dataset_ref, DCAT.theme, value)
                theme_values = themes.get(theme, {})
                labels = theme_values.get('label')
                descriptions = theme_values.get('description')
                dcat_ap = theme_values.get('dcat_ap')
                notation = theme_values.get('notation')
                self._add_resource_list_triple(
                    dataset_ref, DCAT.theme, theme, labels,
                    descriptions, dcat_ap, notation)

        # Tags
        for tag in dataset_dict.get('tags', []):
            self.g.add(
                (dataset_ref, DCAT.keyword, Literal(tag['name'])))

        # Identifier
        self._add_triple_from_dict(
            dataset_dict, dataset_ref, DCT.identifier, DS_IDENTIFIER,
            None, False, False)

        # Issued, Modified dates (fall back to CKAN metadata timestamps)
        self._add_date_triple(dataset_ref, DCT.issued,
                              self._get_value_from_dict(
                                  dataset_dict, DS_ISSUED_DATE,
                                  ['metadata_created']))
        self._add_date_triple(dataset_ref, DCT.modified,
                              self._get_value_from_dict(
                                  dataset_dict, DS_MODIFIED_DATE,
                                  ['metadata_modified']))
        self._add_date_triple(dataset_ref, DCT.valid,
                              self._get_value_from_dict(
                                  dataset_dict, DS_VALID, None))

        # Accrual periodicity: modeled as a dct:Frequency holding a
        # time:DurationDescription with one unit property (e.g. time:days).
        frequency = dataset_dict.get(DS_FREQUENCY)
        if frequency:
            ftypes = {'seconds': TIME.seconds, 'minutes': TIME.minutes,
                      'hours': TIME.hours, 'days': TIME.days,
                      'weeks': TIME.weeks, 'months': TIME.months,
                      'years': TIME.years}
            ftype = frequency.get('type')
            fvalue = frequency.get('value')
            if ftype and ftype in ftypes.keys() and fvalue:
                duration = BNode()
                frequency = BNode()
                g.add((frequency, RDF.type, DCT.Frequency))
                g.add((duration, RDF.type, TIME.DurationDescription))
                g.add((dataset_ref, DCT.accrualPeriodicity, frequency))
                g.add((frequency, RDF.value, duration))
                g.add((duration, ftypes.get(ftype), Literal(
                    fvalue, datatype=XSD.decimal)))

        # Languages
        self._add_triple_from_dict(
            dataset_dict, dataset_ref, DCT.language, DS_LANGUAGE, None,
            True, False)

        # Publisher: publisher is [title, dir3_uri, notation]; looked up
        # from the per-dataset cache, else from the organization extras
        # (DIR3 organic-unit id), else from the catalog-wide config value.
        pub_dir3 = False
        publishers = dataset_dict.get(
            EXPORT_AVAILABLE_PUBLISHERS, {})
        organization_id = dataset_dict.get('owner_org')
        if organization_id in publishers:
            publisher = publishers.get(organization_id)
        else:
            org = h.get_organization(organization_id, False)
            publisher = [None, None, None]
            if org:
                publisher = [org.get('title'), None, None]
                if org['extras']:
                    for extra in org.get('extras'):
                        if extra and 'key' in extra and extra['key'] == ORG_PROP_ID_UD_ORGANICA:
                            notation = extra.get('value')
                            if notation and notation != '':
                                pub_dir3 = True
                                publisher[1] = PUBLISHER_PREFIX + notation
                                publisher[2] = notation
            if pub_dir3:
                # Cache the resolved publisher for later datasets.
                publishers[organization_id] = publisher
                dataset_dict[EXPORT_AVAILABLE_PUBLISHERS] = publishers
            else:
                #publisher
                # Fallback: catalog-level publisher from configuration;
                # the last path segment is taken as the MINHAP org code.
                organizations = cdh.csc_dcat_organizations_available()
                publisher_ref = config.get('ckanext.csc_dcat.catalog.publisher', None)
                if publisher_ref and len(publisher_ref.strip()) > 0:
                    publisher_ref = publisher_ref.strip()
                    publisher = [publisher_ref, None, None]
                    s_publisher = publisher_ref.upper().split('/')
                    if s_publisher and len(s_publisher) > 0:
                        organization_minhap = s_publisher[-1]
                        org = organizations.get(organization_minhap, None)
                        if org:
                            publisher = [org[1],
                                         PUBLISHER_PREFIX + organization_minhap,
                                         organization_minhap]
        if publisher[1]:
            self._add_resource_list_triple(
                dataset_ref, DCT.publisher, publisher[1], publisher[0],
                None, None, publisher[2])
        else:
            g.add((dataset_ref, DCT.publisher, URIRef(publisher[0])))

        # Spatial Coverage
        value = self._get_dict_value(dataset_dict, DS_SPATIAL)
        if value:
            self._add_resource_list_triple(
                dataset_ref, DCT.spatial, value)

        # Temporal: one dct:PeriodOfTime node per interval with a value.
        temporal_coverage = self._get_dataset_value(
            dataset_dict, DS_TEMPORAL_COVERAGE)
        i = 1
        if temporal_coverage:
            for key, value in temporal_coverage.items():
                if (value):
                    start = end = None
                    if 'from' in value:
                        start = value.get('from')
                    if 'to' in value:
                        end = value.get('to')
                    if start or end:
                        temporal_extent = URIRef(
                            "%s/%s-%s" % (dataset_ref, 'PeriodOfTime', i))
                        g.add(
                            (temporal_extent, RDF.type, DCT.PeriodOfTime))
                        if start:
                            self._add_date_triple(
                                temporal_extent, SCHEMA.startDate, start)
                        if end:
                            self._add_date_triple(
                                temporal_extent, SCHEMA.endDate, end)
                        g.add((dataset_ref, DCT.temporal, temporal_extent))
                        i = i+1

        # References
        value = self._get_dict_value(dataset_dict, DS_REFERENCE)
        if value:
            self._add_resource_list_triple(
                dataset_ref, DCT.references, value)

        # Conforms To
        value = self._get_dict_value(dataset_dict, DS_NORMATIVE)
        if value:
            self._add_resource_list_triple(
                dataset_ref, DCT.conformsTo, value)

        # License (dataset license)
        if dataset_dict.get(DS_LICENSE):
            g.add((dataset_ref, DCT.license, URIRef(
                dataset_dict.get(DS_LICENSE))))

        # Distributions/Resources
        for resource_dict in dataset_dict.get('resources', []):
            uri_resource = '%s/resource/%s' % (
                dataset_ref, resource_dict['id'])
            distribution = URIRef(uri_resource)
            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))

            # Identifier
            self._add_triple_from_dict(
                resource_dict, distribution, DCT.identifier,
                DS_RESOURCE_IDENTIFIER, None, False, False)

            # Title
            self._add_translated_triple_field_from_dict(
                resource_dict, distribution, DCT.title,
                DS_RESOURCE_NAME_TRANSLATED, None)

            # License (dataset license)
            if dataset_dict.get(DS_LICENSE):
                g.add((distribution, DCT.license, URIRef(
                    dataset_dict.get(DS_LICENSE))))

            # Access URL
            if resource_dict.get(DS_RESOURCE_ACCESS_URL):
                g.add((distribution, DCAT.accessURL, Literal(
                    resource_dict.get(DS_RESOURCE_ACCESS_URL),
                    datatype=XSD.anyURI)))

            # Format: labels resolved through the scheming choices and
            # memoized in the dataset dict across resources.
            if resource_dict.get(DS_RESOURCE_FORMAT, None):
                imt = URIRef("%s/format" % uri_resource)
                g.add((imt, RDF.type, DCT.IMT))
                g.add((distribution, DCT['format'], imt))
                format = resource_dict.get(
                    DS_RESOURCE_FORMAT, None)
                formats = dataset_dict.get(
                    EXPORT_AVAILABLE_RESOURCE_FORMATS, {})
                label = None
                if format and format in formats:
                    label = formats.get(format, None)
                else:
                    _dataset = sh.scheming_get_schema(
                        'dataset', 'dataset')
                    res_format = sh.scheming_field_by_name(
                        _dataset.get('resource_fields'), 'format')
                    formats[format] = sh.scheming_choices_label(
                        res_format['choices'], format)
                    label = formats.get(format, None)
                    dataset_dict[EXPORT_AVAILABLE_RESOURCE_FORMATS] = formats
                if label:
                    g.add((imt, RDFS.label, Literal(label)))
                g.add((imt, RDF.value, Literal(
                    resource_dict[DS_RESOURCE_FORMAT])))

            # Size: decimal literal when parseable, raw value otherwise.
            if resource_dict.get(DS_RESOURCE_BYTE_SIZE):
                try:
                    g.add((distribution, DCAT.byteSize,
                           Literal(float(resource_dict[DS_RESOURCE_BYTE_SIZE]),
                                   datatype=XSD.decimal)))
                except (ValueError, TypeError):
                    g.add((distribution, DCAT.byteSize,
                           Literal(resource_dict[DS_RESOURCE_BYTE_SIZE])))

            # Relation
            value = self._get_dict_value(
                dataset_dict, DS_NORMATIVE)
            if value:
                self._add_resource_list_triple(
                    distribution, DCT.relation, value)
    except Exception, e:
        log.error("%s [dataset_ref: %s]. Unexpected Error %s: %s" % (
            method_log_prefix, dataset_ref, type(e).__name__, e))