def get_package_dict(self, iso_values, harvest_object):
    """Map ISO/CSW harvested metadata onto a CKAN package dict with DCAT-AP_IT extras.

    Extends the base CSW harvester mapping by appending the DCAT-AP_IT
    mandatory fields (identifier, theme, publisher, modified, frequency,
    rights holder) and several optional ones (issued, geographical name,
    language, temporal coverage, conforms_to, creator, license) to
    ``package_dict['extras']``.

    :param iso_values: parsed ISO metadata (dict-like); this method reads
        keys such as ``guid``, ``keywords``, ``cited-responsible-party``,
        ``date-updated``, ``frequency-of-update`` etc. — assumed to be
        populated by the base harvester (TODO confirm against base class).
    :param harvest_object: the harvest object being processed (passed
        through to the superclass).
    :returns: the enriched package dict.
    """
    package_dict = super(DCATAPITCSWHarvester, self).get_package_dict(iso_values, harvest_object)

    # Per-source configuration with module-level fallbacks from ``utils``.
    mapping_frequencies_to_mdr_vocabulary = self.source_config.get('mapping_frequencies_to_mdr_vocabulary', utils._mapping_frequencies_to_mdr_vocabulary)
    mapping_languages_to_mdr_vocabulary = self.source_config.get('mapping_languages_to_mdr_vocabulary', utils._mapping_languages_to_mdr_vocabulary)
    self._ckan_locales_mapping = self.source_config.get('ckan_locales_mapping') or utils._ckan_locales_mapping
    default_values = self.source_config.get('default_values') or {}
    dcatapit_config = self.source_config.get('dcatapit_config', self._dcatapit_config)

    # if dcatapit_config and not all(name in dcatapit_config for name in self._dcatapit_config):
    #     dcatapit_config = self._dcatapit_config
    #     log.warning('Some keys are missing in dcatapit_config configuration property, \
    #         keyes to use are: dataset_theme, dataset_language, agent_code, frequency, \
    #         agent_code_regex, org_name_regex and dcatapit_skos_theme_id. Using defaults')
    # elif not dcatapit_config:
    #     dcatapit_config = self._dcatapit_config

    controlled_vocabularies = dcatapit_config.get('controlled_vocabularies', self._dcatapit_config.get('controlled_vocabularies'))
    agents = dcatapit_config.get('agents', self._dcatapit_config.get('agents'))

    # ------------------------------#
    #    MANDATORY FOR DCAT-AP_IT   #
    # ------------------------------#

    # -- identifier -- #
    identifier = iso_values['guid']
    package_dict['extras'].append({'key': 'identifier', 'value': identifier})

    # Fallback agent code derived from a 'prefix:rest' identifier; None otherwise.
    default_agent_code = identifier.split(':')[0] if ':' in identifier else None

    # -- theme -- #
    dataset_themes = []
    if iso_values['keywords']:
        default_vocab_id = self._dcatapit_config.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
        dataset_themes = utils.get_controlled_vocabulary_values('eu_themes', controlled_vocabularies.get('dcatapit_skos_theme_id', default_vocab_id), iso_values['keywords'])

    if dataset_themes:
        # De-duplicate and normalize to the aggregate theme structure.
        dataset_themes = list(set(dataset_themes))
        dataset_themes = [{'theme': str(l), 'subthemes': []} for l in dataset_themes]
    else:
        # No matching keywords: fall back to the configured default, which
        # may be a '{A,B}'-style string that needs splitting.
        dataset_themes = default_values.get('dataset_theme')
        if isinstance(dataset_themes, str):
            dataset_themes = [{'theme': dt} for dt in dataset_themes.strip('{}').split(',')]

    log.info('Medatata harvested dataset themes: %r', dataset_themes)
    package_dict['extras'].append({'key': FIELD_THEMES_AGGREGATE, 'value': json.dumps(dataset_themes)})

    # -- publisher -- #
    citedResponsiblePartys = iso_values['cited-responsible-party']
    agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys, agents.get('publisher', self._dcatapit_config.get('agents').get('publisher')))
    package_dict['extras'].append({'key': 'publisher_name', 'value': agent_name})
    package_dict['extras'].append({'key': 'publisher_identifier', 'value': agent_code or default_agent_code})

    # -- modified -- #
    revision_date = iso_values['date-updated'] or iso_values['date-released']
    package_dict['extras'].append({'key': 'modified', 'value': revision_date})

    # -- frequency -- #
    updateFrequency = iso_values['frequency-of-update']
    package_dict['extras'].append({'key': 'frequency', 'value': mapping_frequencies_to_mdr_vocabulary.get(updateFrequency, dcatapit_config.get('frequency', self._dcatapit_config.get('frequency')))})

    # -- rights_holder -- #
    citedResponsiblePartys = iso_values['cited-responsible-party']
    agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys, agents.get('owner', self._dcatapit_config.get('agents').get('owner')))
    package_dict['extras'].append({'key': 'holder_name', 'value': agent_name})
    package_dict['extras'].append({'key': 'holder_identifier', 'value': agent_code or default_agent_code})

    # -----------------------------------------------#
    #    OTHER FIELDS NOT MANDATORY FOR DCAT_AP-IT   #
    # -----------------------------------------------#

    # -- alternate_identifier nothing to do -- #

    # -- issued -- #
    publication_date = iso_values['date-released']
    package_dict['extras'].append({'key': 'issued', 'value': publication_date})

    # -- geographical_name -- #
    dataset_places = []
    if iso_values['keywords']:
        # NOTE(review): falls back to the *theme* vocabulary id when no
        # 'dcatapit_skos_places_id' is configured — looks intentional as a
        # last-resort default, but worth confirming.
        default_vocab_id = self._dcatapit_config.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
        dataset_places = utils.get_controlled_vocabulary_values('places', controlled_vocabularies.get('dcatapit_skos_places_id', default_vocab_id), iso_values['keywords'])

    if dataset_places and len(dataset_places) > 1:
        # Multiple places: de-duplicate and serialize as '{A,B,...}'.
        dataset_places = list(set(dataset_places))
        dataset_places = '{' + ','.join(str(l) for l in dataset_places) + '}'
    else:
        dataset_places = dataset_places[0] if dataset_places and len(dataset_places) > 0 else dcatapit_config.get('dataset_places', self._dcatapit_config.get('dataset_places'))

    if dataset_places:
        log.info('Medatata harvested dataset places: %r', dataset_places)
        package_dict['extras'].append({'key': 'geographical_name', 'value': dataset_places})

    # -- geographical_geonames_url nothing to do -- #

    # -- language -- #
    dataset_languages = iso_values['dataset-language']
    language = None

    if dataset_languages and len(dataset_languages) > 0:
        languages = []
        for language in dataset_languages:
            # Keep only languages that map onto the MDR vocabulary.
            lang = mapping_languages_to_mdr_vocabulary.get(language, None)
            if lang:
                languages.append(lang)

        if len(languages) > 1:
            language = '{' + ','.join(str(l) for l in languages) + '}'
        else:
            language = languages[0] if len(languages) > 0 else dcatapit_config.get('dataset_languages', self._dcatapit_config.get('dataset_languages'))

        log.info('Medatata harvested dataset languages: %r', language)
    else:
        language = dcatapit_config.get('dataset_language')

    package_dict['extras'].append({'key': 'language', 'value': language})

    # temporal_coverage
    ##################
    temporal_coverage = []
    temporal_start = None
    temporal_end = None

    # Each extent key holds a list; only the first value is used.
    for key in ['temporal-extent-begin', 'temporal-extent-end']:
        if len(iso_values[key]) > 0:
            temporal_extent_value = iso_values[key][0]
            if key == 'temporal-extent-begin':
                temporal_start = temporal_extent_value
            if key == 'temporal-extent-end':
                temporal_end = temporal_extent_value

    if temporal_start:
        temporal_coverage.append({'temporal_start': temporal_start, 'temporal_end': temporal_end})

    if temporal_coverage:
        package_dict['extras'].append({'key': 'temporal_coverage', 'value': json.dumps(temporal_coverage)})

    # conforms_to
    ##################
    conforms_to_identifier = iso_values['conformity-specification-title']
    # Localized title keyed by the (lowercased) mapped metadata language; 'it' default.
    conforms_to_locale = self._ckan_locales_mapping.get(iso_values['metadata-language'], 'it').lower()

    conforms_to = {'identifier': conforms_to_identifier, 'title': {conforms_to_locale: conforms_to_identifier}}

    if conforms_to:
        package_dict['extras'].append({'key': 'conforms_to', 'value': json.dumps([conforms_to])})

    # creator
    ###############
    # -- creator -- #
    citedResponsiblePartys = iso_values['cited-responsible-party']
    agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys, agents.get('author', self._dcatapit_config.get('agents').get('author')))
    agent_code = agent_code or default_agent_code
    # Only emit a creator when both name and code could be resolved.
    if (agent_name and agent_code):
        creator = {}
        creator_lang = self._ckan_locales_mapping.get(iso_values['metadata-language'], 'it').lower()
        creator['creator_name'] = {creator_lang: agent_name}
        creator['creator_identifier'] = agent_code
        package_dict['extras'].append({'key': 'creator', 'value': json.dumps([creator])})

    # ckan_license
    ##################
    ckan_license = None
    use_constraints = iso_values.get('use-constraints')
    if use_constraints:
        # Only the first use-constraint entry is matched against known licenses.
        use_constraints = use_constraints[0]
        import ckan.logic.action.get as _license
        license_list = _license.license_list({'model': model, 'session': Session, 'user': '******'}, {})
        for license in license_list:
            # Match by exact id, exact url, or id substring of the constraint.
            if use_constraints == str(license.get('id')) or use_constraints == str(license.get('url')) or (str(license.get('id')) in use_constraints.lower()):
                ckan_license = license
                break

    if ckan_license:
        package_dict['license_id'] = ckan_license.get('id')
    else:
        # No match: use the source-configured default license, if any.
        default_license = self.source_config.get('default_license')
        if default_license:
            package_dict['license_id'] = default_license

    # -- license handling -- #
    interfaces.populate_resource_license(package_dict)

    # End of processing, return the modified package
    return package_dict
def do_migrate_data(limit=None, offset=None, skip_orgs=False):
    """Migrate organizations and packages to the current dcatapit schema.

    NOTE(review): this definition is shadowed by a later redefinition of
    ``do_migrate_data`` in this module, so it is effectively dead code.
    It is kept behaviorally unchanged but ported from Python 2-only
    syntax (``print`` statements, ``except E, err``) to Python 3 so the
    module can be parsed at all; the ``print(u'...').encode('utf-8')``
    pattern, which would raise ``AttributeError`` on ``None`` in
    Python 3, is replaced by plain ``print`` calls.

    :param limit: optional maximum number of packages to process.
    :param offset: optional offset into the package list.
    :param skip_orgs: when True, skip the organization-identifier pass.
    """
    user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': user['name'], 'ignore_auth': True, 'use_cache': False}
    pshow = toolkit.get_action('package_show')
    pupdate = toolkit.get_action('package_update')
    pcreate = toolkit.get_action('package_create')  # looked up but not used below
    oshow = toolkit.get_action('organization_show')
    oupdate = toolkit.get_action('organization_patch')  # only referenced by commented-out code
    pupdate_schema = DCATAPITPackagePlugin().update_package_schema()
    # Legacy tag names may not pass the default validator; drop it for migration.
    pupdate_schema['tags']['name'].remove(tag_name_validator)
    org_list = get_organization_list()
    ocount = org_list.count()
    oidx = 0
    if not skip_orgs:
        print(u'processing {} organizations'.format(ocount))
        for oidx, oname in enumerate(org_list):
            odata = oshow(
                context, {
                    'id': oname,
                    'include_extras': True,
                    'include_tags': False,
                    'include_users': False,
                })
            oidentifier = odata.get('identifier')
            print(u'processing {}/{} organization: {}'.format(
                oidx + 1, ocount, odata['name']))
            # we require identifier for org now.
            if not oidentifier:
                odata.pop('identifier', None)
                tmp_identifier = get_temp_org_identifier()
                print(u"org: [{}] {} : setting temporal identifier: {}".format(
                    odata['name'], odata['title'], tmp_identifier))
                ocontext = context.copy()
                ocontext['allow_partial_update'] = True
                # oupdate(ocontext, {'id': odata['id'],
                #                    'identifier': tmp_identifier})
                update_organization_identifier(odata['id'], tmp_identifier)
    else:
        print(u'Skipping organizations processing')

    pcontext = context.copy()
    pkg_list = get_package_list()
    pcount = pkg_list.count()
    print(u'processing {} packages'.format(pcount))
    errored = []
    if offset:
        pkg_list = pkg_list.offset(offset)
    if limit:
        pkg_list = pkg_list.limit(limit)
    # pidx may be not initialized for empty slice, need separate counter
    # to count actually processed datasets
    pidx_count = 0
    for pidx, pname in enumerate(pkg_list):
        pcontext['schema'] = pupdate_schema
        pname = pname[0]
        print(u'processing {}/{} package: {}'.format(pidx + 1, pcount, pname))
        pdata = pshow(context, {'name_or_id': pname})  # , 'use_default_schema': True})
        # remove empty conforms_to to avoid silly validation errors
        if not pdata.get('conforms_to'):
            pdata.pop('conforms_to', None)
        # ... the same for alternate_identifier
        if not pdata.get('alternate_identifier'):
            pdata.pop('alternate_identifier', None)
        update_creator(pdata)
        update_temporal_coverage(pdata)
        update_theme(pdata)
        update_identifier(pdata)
        update_modified(pdata)
        update_frequency(pdata)
        update_conforms_to(pdata)
        update_holder_info(pdata)
        interfaces.populate_resource_license(pdata)
        # Clear metadata_modified so the update refreshes it.
        pdata['metadata_modified'] = None
        print('updating', pdata['id'], pdata['name'])
        try:
            out = pupdate(pcontext, pdata)
            pidx_count += 1
        except ValidationError as err:
            print(u'Cannot update due to validation error {}'.format(
                pdata['name']))
            print(err)
            print(pdata)
            print()
            errored.append((
                pidx,
                pdata['name'],
                err,
            ))
            continue
        except Exception as err:
            print(u'Cannot update due to general error {}'.format(
                pdata['name']))
            print(err)
            print(pdata)
            print()
            errored.append((
                pidx,
                pdata['name'],
                err,
            ))
            continue
def do_migrate_data(limit=None, offset=None, skip_orgs=False, pkg_uuid: list = None):
    """Run the dcatapit 1.0.0 -> 1.1.0 data migration.

    Ensures every organization has an ``identifier`` (assigning a
    temporary one where missing), then re-saves each package through the
    dcatapit update schema after normalizing creator, temporal coverage,
    theme, identifier, modified, frequency, conforms_to and holder data.
    Failures are collected and summarized instead of aborting the run.

    :param limit: optional maximum number of packages to process.
    :param offset: optional offset into the package list.
    :param skip_orgs: when True, skip the organization-identifier pass.
    :param pkg_uuid: optional list of package ids passed through to
        ``get_package_list`` to restrict the migration (presumably
        package uuids — verify against ``get_package_list``).
    :returns: number of packages successfully updated.
    """
    # Data migrations from 1.0.0 to 1.1.0
    # ref: https://github.com/geosolutions-it/ckanext-dcatapit/issues/188
    from ckanext.dcatapit.plugin import DCATAPITPackagePlugin

    user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': user['name'], 'ignore_auth': True, 'use_cache': False}
    pshow = toolkit.get_action('package_show')
    pupdate = toolkit.get_action('package_update')
    pcreate = toolkit.get_action('package_create')  # looked up but not used below
    oshow = toolkit.get_action('organization_show')
    oupdate = toolkit.get_action('organization_patch')  # only referenced by commented-out code
    pupdate_schema = DCATAPITPackagePlugin().update_package_schema()
    # Legacy tag names may not pass the default validator; drop it for migration.
    pupdate_schema['tags']['name'].remove(tag_name_validator)
    org_list = get_organization_list()
    ocount = org_list.count()
    oidx = 0
    if not skip_orgs:
        log.info(f'processing {ocount} organizations')
        for oidx, oname in enumerate(org_list):
            odata = oshow(context, {'id': oname,
                                    'include_extras': True,
                                    'include_tags': False,
                                    'include_users': False,
                                    })
            oidentifier = odata.get('identifier')
            log.info('processing {}/{} organization: {}'.format(oidx + 1, ocount, odata['name']))
            # we require identifier for org now.
            if not oidentifier:
                odata.pop('identifier', None)
                tmp_identifier = get_temp_org_identifier()
                log.info(
                    f"org: [{odata['name']}] {odata['title']}: "
                    f'setting temporal identifier: {tmp_identifier}'
                )
                ocontext = context.copy()
                ocontext['allow_partial_update'] = True
                # oupdate(ocontext, {'id': odata['id'],
                #                    'identifier': tmp_identifier})
                update_organization_identifier(odata['id'], tmp_identifier)
    else:
        log.info(u'Skipping organizations processing')

    pcontext = context.copy()
    pkg_list = get_package_list(pkg_uuid)
    pcount = pkg_list.count()
    log.info(f'processing {pcount} packages')
    errored = []
    if offset:
        pkg_list = pkg_list.offset(offset)
    if limit:
        pkg_list = pkg_list.limit(limit)
    # pidx may be not initialized for empty slice, need separate counter
    # to count actually processed datasets
    pidx_count = 0
    for pidx, pname in enumerate(pkg_list):
        pcontext['schema'] = pupdate_schema
        pname = pname[0]
        log.info(f'processing {pidx + 1}/{pcount} package: {pname}')
        pdata = pshow(context, {'name_or_id': pname})  # , 'use_default_schema': True})
        # remove empty conforms_to to avoid silly validation errors
        if not pdata.get('conforms_to'):
            pdata.pop('conforms_to', None)
        # ... the same for alternate_identifier
        if not pdata.get('alternate_identifier'):
            pdata.pop('alternate_identifier', None)
        update_creator(pdata)
        update_temporal_coverage(pdata)
        update_theme(pdata)
        update_identifier(pdata)
        update_modified(pdata)
        update_frequency(pdata)
        update_conforms_to(pdata)
        update_holder_info(pdata)
        interfaces.populate_resource_license(pdata)
        # Clear metadata_modified so the update refreshes it.
        pdata['metadata_modified'] = None
        log.info(f"updating {pdata['id']} {pdata['name']}")
        try:
            out = pupdate(pcontext, pdata)
            pidx_count += 1
        except ValidationError as err:
            log.error(
                f"Cannot update due to validation error {pdata['name']}",
                exc_info=True
            )
            errored.append((pidx, pdata['name'], err,))
            continue
        except Exception as err:
            log.error(
                f"Cannot update due to general error {pdata['name']}",
                exc_info=True
            )
            errored.append((pidx, pdata['name'], err,))
            continue
        # Per-package separator on the success path (except branches continue).
        log.debug('-' * 9)

    if not skip_orgs:
        log.info(f'processed {oidx} out of {ocount} organizations')
    log.info(f'processed {pidx_count} out of {pcount} packages in total')
    if errored:
        log.info(f'Following {len(errored)} datasets failed:')
        for position, ptitile, err in errored:
            err_summary = getattr(err, 'error', None) or err
            # this is a hack on dumb override in __str__() in some exception subclasses
            # stringified exception raises itself otherwise.
            try:
                log.info(
                    f' {ptitile} at position {position}: {err.__class__}{err_summary}'
                )
            except Exception as err:
                err_summary = err
                log.error(
                    f' {ptitile} at position {position}: {err.__class__}{err_summary}'
                )
    return pidx_count