Example #1
    def get_package_dict(self, iso_values, harvest_object):
        package_dict = super(DCATAPITCSWHarvester, self).get_package_dict(iso_values, harvest_object)

        mapping_frequencies_to_mdr_vocabulary = self.source_config.get('mapping_frequencies_to_mdr_vocabulary',
                                                                       utils._mapping_frequencies_to_mdr_vocabulary)
        mapping_languages_to_mdr_vocabulary = self.source_config.get('mapping_languages_to_mdr_vocabulary',
                                                                     utils._mapping_languages_to_mdr_vocabulary)

        self._ckan_locales_mapping = self.source_config.get('ckan_locales_mapping') or utils._ckan_locales_mapping

        default_values = self.source_config.get('default_values') or {}

        dcatapit_config = self.source_config.get('dcatapit_config', self._dcatapit_config)

        # if dcatapit_config and not all(name in dcatapit_config for name in self._dcatapit_config):
        #    dcatapit_config = self._dcatapit_config
        #    log.warning('Some keys are missing in dcatapit_config configuration property, \
        #        keys to use are: dataset_theme, dataset_language, agent_code, frequency, \
        #        agent_code_regex, org_name_regex and dcatapit_skos_theme_id. Using defaults')
        # elif not dcatapit_config:
        #    dcatapit_config = self._dcatapit_config

        controlled_vocabularies = dcatapit_config.get('controlled_vocabularies',
                                                      self._dcatapit_config.get('controlled_vocabularies'))
        agents = dcatapit_config.get('agents', self._dcatapit_config.get('agents'))

        # ------------------------------#
        #    MANDATORY FOR DCAT-AP_IT   #
        # ------------------------------#

        #  -- identifier -- #
        identifier = iso_values['guid']
        package_dict['extras'].append({'key': 'identifier', 'value': identifier})

        default_agent_code = identifier.split(':')[0] if ':' in identifier else None

        #  -- theme -- #
        dataset_themes = []
        if iso_values['keywords']:
            default_vocab_id = self._dcatapit_config.get('controlled_vocabularies').get('dcatapit_skos_theme_id')
            dataset_themes = utils.get_controlled_vocabulary_values('eu_themes',
                                                                    controlled_vocabularies.get('dcatapit_skos_theme_id', default_vocab_id), iso_values['keywords'])

        if dataset_themes:
            dataset_themes = list(set(dataset_themes))
            dataset_themes = [{'theme': str(l), 'subthemes': []} for l in dataset_themes]

        else:
            dataset_themes = default_values.get('dataset_theme')

        if isinstance(dataset_themes, str):
            dataset_themes = [{'theme': dt} for dt in dataset_themes.strip('{}').split(',')]

        log.info('Metadata harvested dataset themes: %r', dataset_themes)
        package_dict['extras'].append({'key': FIELD_THEMES_AGGREGATE, 'value': json.dumps(dataset_themes)})

        #  -- publisher -- #
        citedResponsiblePartys = iso_values['cited-responsible-party']
        agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys, agents.get('publisher',
                                                                                                self._dcatapit_config.get('agents').get('publisher')))
        package_dict['extras'].append({'key': 'publisher_name', 'value': agent_name})
        package_dict['extras'].append({'key': 'publisher_identifier', 'value': agent_code or default_agent_code})

        #  -- modified -- #
        revision_date = iso_values['date-updated'] or iso_values['date-released']
        package_dict['extras'].append({'key': 'modified', 'value': revision_date})

        #  -- frequency -- #
        updateFrequency = iso_values['frequency-of-update']
        package_dict['extras'].append({'key': 'frequency', 'value':
                                       mapping_frequencies_to_mdr_vocabulary.get(updateFrequency,
                                                                                 dcatapit_config.get('frequency', self._dcatapit_config.get('frequency')))})

        #  -- rights_holder -- #
        citedResponsiblePartys = iso_values['cited-responsible-party']
        agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys,
                                                             agents.get('owner', self._dcatapit_config.get('agents').get('owner')))
        package_dict['extras'].append({'key': 'holder_name', 'value': agent_name})
        package_dict['extras'].append({'key': 'holder_identifier', 'value': agent_code or default_agent_code})

        # -----------------------------------------------#
        #    OTHER FIELDS NOT MANDATORY FOR DCAT_AP-IT   #
        # -----------------------------------------------#

        #  -- alternate_identifier nothing to do  -- #

        #  -- issued -- #
        publication_date = iso_values['date-released']
        package_dict['extras'].append({'key': 'issued', 'value': publication_date})

        #  -- geographical_name  -- #
        dataset_places = []
        if iso_values['keywords']:
            default_vocab_id = self._dcatapit_config.get('controlled_vocabularies').get('dcatapit_skos_places_id')
            dataset_places = utils.get_controlled_vocabulary_values('places',
                                                                    controlled_vocabularies.get('dcatapit_skos_places_id', default_vocab_id), iso_values['keywords'])

        if dataset_places and len(dataset_places) > 1:
            dataset_places = list(set(dataset_places))
            dataset_places = '{' + ','.join(str(l) for l in dataset_places) + '}'
        else:
            dataset_places = dataset_places[0] if dataset_places and len(dataset_places) > 0 else dcatapit_config.get('dataset_places',
                                                                                                                      self._dcatapit_config.get('dataset_places'))

        if dataset_places:
            log.info('Metadata harvested dataset places: %r', dataset_places)
            package_dict['extras'].append({'key': 'geographical_name', 'value': dataset_places})

        #  -- geographical_geonames_url nothing to do  -- #

        #  -- language -- #
        dataset_languages = iso_values['dataset-language']
        language = None
        if dataset_languages and len(dataset_languages) > 0:
            languages = []
            for language in dataset_languages:
                lang = mapping_languages_to_mdr_vocabulary.get(language, None)
                if lang:
                    languages.append(lang)

            if len(languages) > 1:
                language = '{' + ','.join(str(l) for l in languages) + '}'
            else:
                language = languages[0] if len(languages) > 0 else dcatapit_config.get('dataset_languages',
                                                                                       self._dcatapit_config.get('dataset_languages'))

            log.info('Metadata harvested dataset languages: %r', language)
        else:
            language = dcatapit_config.get('dataset_language')

        package_dict['extras'].append({'key': 'language', 'value': language})

        # temporal_coverage
        # ##################
        temporal_coverage = []
        temporal_start = None
        temporal_end = None

        for key in ['temporal-extent-begin', 'temporal-extent-end']:
            if len(iso_values[key]) > 0:
                temporal_extent_value = iso_values[key][0]
                if key == 'temporal-extent-begin':
                    temporal_start = temporal_extent_value
                if key == 'temporal-extent-end':
                    temporal_end = temporal_extent_value
        if temporal_start:
            temporal_coverage.append({'temporal_start': temporal_start,
                                      'temporal_end': temporal_end})
        if temporal_coverage:
            package_dict['extras'].append({'key': 'temporal_coverage', 'value': json.dumps(temporal_coverage)})

        # conforms_to
        # ##################
        conforms_to_identifier = iso_values['conformity-specification-title']
        conforms_to_locale = self._ckan_locales_mapping.get(iso_values['metadata-language'], 'it').lower()

        # only emit conforms_to when a specification title was actually harvested
        if conforms_to_identifier:
            conforms_to = {'identifier': conforms_to_identifier,
                           'title': {conforms_to_locale: conforms_to_identifier}}
            package_dict['extras'].append({'key': 'conforms_to', 'value': json.dumps([conforms_to])})

        # creator
        # ###############
        #  -- creator -- #
        citedResponsiblePartys = iso_values['cited-responsible-party']
        agent_name, agent_code = utils.get_responsible_party(citedResponsiblePartys,
                                                             agents.get('author', self._dcatapit_config.get('agents').get('author')))

        agent_code = agent_code or default_agent_code
        if (agent_name and agent_code):

            creator = {}
            creator_lang = self._ckan_locales_mapping.get(iso_values['metadata-language'], 'it').lower()
            creator['creator_name'] = {creator_lang: agent_name}
            creator['creator_identifier'] = agent_code
            package_dict['extras'].append({'key': 'creator', 'value': json.dumps([creator])})

        # ckan_license
        # ##################
        ckan_license = None
        use_constraints = iso_values.get('use-constraints')
        if use_constraints:
            use_constraints = use_constraints[0]
            import ckan.logic.action.get as _license
            license_list = _license.license_list({'model': model, 'session': Session, 'user': '******'}, {})
            for license in license_list:
                if use_constraints == str(license.get('id')) or use_constraints == str(license.get('url')) or (str(license.get('id')) in use_constraints.lower()):
                    ckan_license = license
                    break

        if ckan_license:
            package_dict['license_id'] = ckan_license.get('id')
        else:
            default_license = self.source_config.get('default_license')
            if default_license:
                package_dict['license_id'] = default_license

        #  -- license handling -- #
        interfaces.populate_resource_license(package_dict)

        # End of processing, return the modified package
        return package_dict
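
Aside: the harvester above serializes multi-valued extras in two shapes: a Postgres-array-like brace string ('{ITA,ENG}') for language and geographical_name, and JSON for structured fields such as themes and temporal_coverage. A minimal, self-contained sketch of the brace encoding, with illustrative names (encode_multivalue is not part of the extension):

import json

def encode_multivalue(values, default=None):
    """Collapse a list into the '{a,b,c}' brace form used above; de-duplicate first."""
    values = list(set(values or []))
    if len(values) > 1:
        return '{' + ','.join(str(v) for v in values) + '}'
    return str(values[0]) if values else default

assert encode_multivalue(['ITA', 'ENG', 'ITA']) in ('{ITA,ENG}', '{ENG,ITA}')
assert encode_multivalue([], default='ITA') == 'ITA'

# structured fields (themes, temporal_coverage, conforms_to, creator) are JSON-encoded
themes = [{'theme': 'ENVI', 'subthemes': []}]
extra = {'key': 'themes_aggregate', 'value': json.dumps(themes)}  # key name illustrative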
Example #2
def do_migrate_data(limit=None, offset=None, skip_orgs=False):

    user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': user['name'], 'ignore_auth': True, 'use_cache': False}
    pshow = toolkit.get_action('package_show')
    pupdate = toolkit.get_action('package_update')
    pcreate = toolkit.get_action('package_create')
    oshow = toolkit.get_action('organization_show')
    oupdate = toolkit.get_action('organization_patch')
    pupdate_schema = DCATAPITPackagePlugin().update_package_schema()
    pupdate_schema['tags']['name'].remove(tag_name_validator)
    org_list = get_organization_list()
    ocount = org_list.count()
    oidx = 0
    if not skip_orgs:
        print(u'processing {} organizations'.format(ocount)).encode('utf-8')
        for oidx, oname in enumerate(org_list):
            odata = oshow(
                context, {
                    'id': oname,
                    'include_extras': True,
                    'include_tags': False,
                    'include_users': False,
                })

            oidentifier = odata.get('identifier')
            print(u'processing {}/{} organization: {}'.format(
                oidx + 1, ocount, odata['name']))
            # we now require an identifier for the org.
            if not oidentifier:
                odata.pop('identifier', None)
                tmp_identifier = get_temp_org_identifier()
                print(u"org: [{}] {} : setting temporal identifier: {}".format(
                    odata['name'], odata['title'],
                    tmp_identifier)).encode('utf-8')
                ocontext = context.copy()

                ocontext['allow_partial_update'] = True
                #oupdate(ocontext, {'id': odata['id'],
                #                  'identifier': tmp_identifier})
                update_organization_identifier(odata['id'], tmp_identifier)
    else:
        print(u'Skipping organizations processing').encode('utf-8')
    pcontext = context.copy()
    pkg_list = get_package_list()
    pcount = pkg_list.count()
    print(u'processing {} packages'.format(pcount)).encode('utf-8')
    errored = []

    if offset:
        pkg_list = pkg_list.offset(offset)
    if limit:
        pkg_list = pkg_list.limit(limit)

    # pidx may not be initialized for an empty slice; keep a separate counter
    # for the number of actually processed datasets
    pidx_count = 0
    for pidx, pname in enumerate(pkg_list):
        pcontext['schema'] = pupdate_schema
        pname = pname[0]
        print(u'processing {}/{} package: {}'.format(pidx + 1, pcount,
                                                     pname)).encode('utf-8')
        pdata = pshow(context,
                      {'name_or_id': pname})  #, 'use_default_schema': True})

        # remove empty conforms_to to avoid silly validation errors
        if not pdata.get('conforms_to'):
            pdata.pop('conforms_to', None)
        # ... the same for alternate_identifier
        if not pdata.get('alternate_identifier'):
            pdata.pop('alternate_identifier', None)

        update_creator(pdata)
        update_temporal_coverage(pdata)
        update_theme(pdata)
        update_identifier(pdata)
        update_modified(pdata)
        update_frequency(pdata)
        update_conforms_to(pdata)
        update_holder_info(pdata)
        interfaces.populate_resource_license(pdata)
        pdata['metadata_modified'] = None
        print 'updating', pdata['id'], pdata['name']
        try:
            out = pupdate(pcontext, pdata)
            pidx_count += 1
        except ValidationError, err:
            print(u'Cannot update due to validation error {}'.format(
                pdata['name'])).encode('utf-8')
            print err
            print(pdata)
            print
            errored.append((
                pidx,
                pdata['name'],
                err,
            ))
            continue

        except Exception, err:
            print(u'Cannot update due to general error {}'.format(
                pdata['name'])).encode('utf-8')
            print err
            print(pdata)
            print
            errored.append((
                pidx,
                pdata['name'],
                err,
            ))
            continue
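
Example #2 is the pre-migration, Python 2 version of do_migrate_data (print statements, 'except E, err:' syntax); Example #3 below is its Python 3 rewrite. Both share the same collect-and-continue error handling: a failed package is recorded with its position and exception instead of aborting the whole run. A distilled, runnable sketch of that pattern in Python 3 syntax (names are illustrative, not part of the extension):

def migrate_items(items, process):
    """Apply process() to each item; collect failures as (index, name, error) tuples."""
    errored = []
    processed = 0
    for idx, name in enumerate(items):
        try:
            process(name)
            processed += 1
        except Exception as err:  # Python 2 spelled this 'except Exception, err:'
            errored.append((idx, name, err))
    return processed, errored

ok, failed = migrate_items(['a', 'b'], lambda name: None)
assert (ok, failed) == (2, [])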
Example #3
def do_migrate_data(limit=None, offset=None, skip_orgs=False, pkg_uuid: list = None):
    # Data migrations from 1.0.0 to 1.1.0
    # ref: https://github.com/geosolutions-it/ckanext-dcatapit/issues/188

    from ckanext.dcatapit.plugin import DCATAPITPackagePlugin

    user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': user['name'],
               'ignore_auth': True,
               'use_cache': False}
    pshow = toolkit.get_action('package_show')
    pupdate = toolkit.get_action('package_update')
    pcreate = toolkit.get_action('package_create')
    oshow = toolkit.get_action('organization_show')
    oupdate = toolkit.get_action('organization_patch')
    pupdate_schema = DCATAPITPackagePlugin().update_package_schema()
    pupdate_schema['tags']['name'].remove(tag_name_validator)
    org_list = get_organization_list()
    ocount = org_list.count()
    oidx = 0
    if not skip_orgs:
        log.info(f'processing {ocount} organizations')
        for oidx, oname in enumerate(org_list):
            odata = oshow(context, {'id': oname, 'include_extras': True,
                                    'include_tags': False,
                                    'include_users': False,
                                    })

            oidentifier = odata.get('identifier')
            log.info('processing {}/{} organization: {}'.format(oidx + 1, ocount, odata['name']))
            # we now require an identifier for the org.
            if not oidentifier:
                odata.pop('identifier', None)
                tmp_identifier = get_temp_org_identifier()
                log.info(
                    f"org: [{odata['name']}] {odata['title']}: "
                    f'setting temporary identifier: {tmp_identifier}'
                )
                ocontext = context.copy()

                ocontext['allow_partial_update'] = True
                # oupdate(ocontext, {'id': odata['id'],
                #                  'identifier': tmp_identifier})
                update_organization_identifier(odata['id'], tmp_identifier)
    else:
        log.info(u'Skipping organizations processing')
    pcontext = context.copy()
    pkg_list = get_package_list(pkg_uuid)
    pcount = pkg_list.count()
    log.info(f'processing {pcount} packages')
    errored = []

    if offset:
        pkg_list = pkg_list.offset(offset)
    if limit:
        pkg_list = pkg_list.limit(limit)

    # pidx may not be initialized for an empty slice; keep a separate counter
    # for the number of actually processed datasets
    pidx_count = 0
    for pidx, pname in enumerate(pkg_list):
        pcontext['schema'] = pupdate_schema
        pname = pname[0]
        log.info(f'processing {pidx + 1}/{pcount} package: {pname}')
        pdata = pshow(context, {'name_or_id': pname})  # , 'use_default_schema': True})

        # remove empty conforms_to to avoid silly validation errors
        if not pdata.get('conforms_to'):
            pdata.pop('conforms_to', None)
        # ... the same for alternate_identifier
        if not pdata.get('alternate_identifier'):
            pdata.pop('alternate_identifier', None)

        update_creator(pdata)
        update_temporal_coverage(pdata)
        update_theme(pdata)
        update_identifier(pdata)
        update_modified(pdata)
        update_frequency(pdata)
        update_conforms_to(pdata)
        update_holder_info(pdata)
        interfaces.populate_resource_license(pdata)
        pdata['metadata_modified'] = None
        log.info(f"updating {pdata['id']} {pdata['name']}")
        try:
            out = pupdate(pcontext, pdata)
            pidx_count += 1
        except ValidationError as err:
            log.error(
                f"Cannot update due to validation error {pdata['name']}",
                exc_info=True
            )
            errored.append((pidx, pdata['name'], err,))
            continue

        except Exception as err:
            log.error(
                f"Cannot update due to general error {pdata['name']}",
                exc_info=True
            )
            errored.append((pidx, pdata['name'], err,))
            continue
        log.debug('-' * 9)

    if not skip_orgs:
        log.info(f'processed {oidx} out of {ocount} organizations')
    log.info(f'processed {pidx_count} out of {pcount} packages in total')
    if errored:
        log.info(f'Following {len(errored)} datasets failed:')
        for position, ptitle, err in errored:
            err_summary = getattr(err, 'error', None) or err

            # Work around __str__() overrides in some exception subclasses:
            # stringifying the exception can itself raise.
            try:
                log.info(
                    f' {ptitle} at position {position}: {err.__class__}{err_summary}'
                )
            except Exception as err:
                err_summary = err
                log.error(
                    f' {ptitle} at position {position}: {err.__class__}{err_summary}'
                )
    return pidx_count
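
Since do_migrate_data returns the number of successfully updated packages, it is easy to drive from a CKAN CLI command. A hypothetical wrapper, assuming the function above is importable (a sketch, not the extension's actual command; the command and option names are assumptions):

import click

@click.command('dcatapit-migrate-data')
@click.option('--limit', type=int, default=None, help='Process at most N packages.')
@click.option('--offset', type=int, default=None, help='Skip the first N packages.')
@click.option('--skip-orgs', is_flag=True, help='Leave organizations untouched.')
def migrate_data_command(limit, offset, skip_orgs):
    """Hypothetical CLI entry point; register it via CKAN's IClick plugin interface."""
    processed = do_migrate_data(limit=limit, offset=offset, skip_orgs=skip_orgs)
    click.echo(f'{processed} packages migrated')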