def check_pids(key, data, errors, context):
    '''
    Check that compulsory PIDs exist and that the primary data PID of an
    existing dataset is never modified.

    :param key: flattened data key (CKAN validator signature)
    :param data: flattened data dict
    :param errors: validation errors
    :param context: context
    :raises Invalid: when PIDs are missing or the primary data PID changed
    '''
    # Empty PIDs are removed in actions, so a missing first PID id means
    # the dataset has no PIDs at all.
    if data.get((u'pids', 0, u'id'), None) is None:
        raise Invalid({'key': 'pids', 'value': _('Missing dataset PIDs')})

    # Scan the flattened keys for a primary data PID with a non-empty id.
    primary_pid = None
    for pkey in (k for k in data.keys() if k[0] == 'pids' and k[2] == 'primary'):
        pid_id = data[(pkey[0], pkey[1], 'id')]
        is_primary = asbool(data[pkey] or False)
        if is_primary and data[(pkey[0], pkey[1], 'type')] == 'data' and pid_id:
            primary_pid = pid_id

    if primary_pid is None:
        raise Invalid({'key': 'pids', 'value': _("Missing primary data PID")})

    # Check constancy of primary data PID against the stored package.
    try:
        data_dict = logic.get_action('package_show')({}, {'id': data[('id',)]})
        old_primary_pid = utils.get_pids_by_type('data', data_dict, primary=True)[0].get('id')
        if old_primary_pid and old_primary_pid != primary_pid:
            raise Invalid({'key': 'pids', 'value': _("Primary data PID can not be modified")})
    except (logic.NotFound, KeyError):
        # New dataset, all is well
        pass
def get_urn_fi_address(package):
    '''
    Return a resolvable urn.fi address for the package.

    If the package id is already a full URL it is returned as-is.
    Otherwise the primary data PID is looked up; when it is a URN it is
    formatted with the configured urn.fi address template.

    :param package: package dict
    :returns: resolver URL, or '' when no resolvable PID exists
    '''
    package_id = package.get('id', '')
    # A tuple of prefixes avoids two separate startswith() calls.
    if package_id.startswith(('http://', 'https://')):
        return package_id

    # Guard against a missing primary data PID: indexing [0] on an empty
    # result would raise IndexError.
    data_pids = get_pids_by_type('data', package, primary=True)
    if not data_pids:
        return ''

    pid = data_pids[0].get('id', None)
    if is_urn(pid):
        template = config.get('ckanext.kata.urn_address_template',
                              "http://urn.fi/%(pid)s")
        return template % {'pid': pid}
    return ''
def check_primary_pids(key, data, errors, context):
    '''
    Check that primary pids exist, if not, get them from package.id and package.name

    :param key: key
    :param data: data
    :param errors: validation errors
    :param context: context
    '''
    # Wrap the flattened pids list in a dict shape the helper understands.
    pids_wrapper = {'pids': data.get(('pids',))}
    existing = utils.get_pids_by_type('data', pids_wrapper, primary=True)
    if not existing:
        # Fall back to the package name as the primary data PID.
        fallback_pid = {'primary': u'True', 'type': 'data', 'id': data[('name',)]}
        data[('pids',)].append(fallback_pid)
def _handle_pids(context, data_dict):
    '''
    Do some PID modifications to data_dict: drop empty PIDs, optionally
    generate a version PID, and generate a primary data PID for new
    datasets.

    :param context: CKAN action context (provides model and session)
    :param data_dict: package dict, modified in place
    '''
    if 'pids' not in data_dict:
        data_dict['pids'] = []
    else:
        # Clean up empty PIDs (idiomatic filter instead of append loop)
        data_dict['pids'] = [pid for pid in data_dict['pids'] if pid.get('id')]

    if data_dict.get('generate_version_pid') == 'on':
        data_dict['pids'].append({'id': utils.generate_pid(),
                                  'type': 'version',
                                  'provider': 'Etsin',
                                  })

    # If no primary data PID, generate one if this is a new dataset
    if not utils.get_pids_by_type('data', data_dict, primary=True):
        model = context["model"]
        session = context["session"]
        if data_dict.get('id'):
            # NOTE: 'id' contains the package *name* here
            query = session.query(model.Package.id).filter_by(name=data_dict['id'])
            if query.first():
                return  # Existing dataset, don't generate new data PID
        data_dict['pids'].insert(0, {'id': utils.generate_pid(),
                                     'type': 'data',
                                     'primary': 'True',
                                     'provider': 'Etsin',
                                     })
def test_get_pids_by_type(self):
    '''Verify PID counts returned by utils.get_pids_by_type per type/relation.'''
    data_dict = copy.deepcopy(TEST_DATADICT)
    data_dict['id'] = 'some_package.id'

    # (pid type, extra keyword args, expected count)
    cases = [
        (u'relation', {}, 3),
        (u'primary', {}, 1),
        (u'relation', {'relation': 'isPreviousVersionOf'}, 1),
        (u'relation', {'relation': 'isPartOf'}, 1),
        (u'relation', {'relation': 'generalRelation'}, 1),
        ('some_unknown_type', {}, 0),
    ]
    for pid_type, kwargs, expected in cases:
        pids = utils.get_pids_by_type(pid_type, data_dict, **kwargs)
        assert len(pids) == expected
def graph_from_dataset(self, dataset_dict, dataset_ref):
    '''
    Serialize a CKAN dataset dict into the RDF graph ``self.g`` as a
    DCAT Dataset rooted at ``dataset_ref``.

    Adds homepage, PIDs, translated title/description, agents (rights
    holders, authors, funders), publishers, tags, dates, events, citation,
    distribution, disciplines, rights declaration, spatial and temporal
    coverage, and languages.

    :param dataset_dict: CKAN package dict (Etsin/Kata schema)
    :param dataset_ref: rdflib node used as the subject of dataset triples
    '''
    g = self.g

    # Bind namespace prefixes so serializations use readable qnames.
    # NOTE: dict.iteritems() — this module is Python 2 code.
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Etsin: homepage
    uri = url_for(controller='package', action='read', id=dataset_dict.get('name'), qualified=True)
    g.add((dataset_ref, FOAF.homepage, URIRef(uri)))

    # Etsin: primary identifiers
    data_pids = get_pids_by_type('data', dataset_dict)
    for pid in data_pids:
        g.add((dataset_ref, ADMS.identifier, URIRef(pid.get('id'))))

    version_pids = get_pids_by_type('version', dataset_dict)
    for pid in version_pids:
        g.add((dataset_ref, DCT.identifier, URIRef(pid.get('id'))))
        g.add((dataset_ref, DCT.isVersionOf, URIRef(pid.get('id'))))

    # Etsin: Title and Description, including translations
    items = [
        (DCT.title, 'langtitle', 'title'),
        (DCT.description, 'notes'),
    ]
    for item in items:
        self._add_translated_triple_from_dict(
            dataset_dict, dataset_ref, *item)

    # Etsin: Agents
    for agent in dataset_dict.get('agent', []):
        agent_role = agent.get('role')
        agent_id = agent.get('id')

        # Rights Holders
        if agent_role in ['owner', 'distributor']:
            name = agent.get('name', None)
            if agent_role == 'owner':
                # Owners given as URLs keep the plain name; otherwise the
                # organisation field is the fallback when name is missing.
                if not get_if_url(agent.get('name')):
                    name = agent.get('name', agent.get('organisation', ''))
                nodetype = DCT.rightsHolder
            if agent_role == 'distributor':
                nodetype = DCT.publisher
            agent_node_ref = BNode()
            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, nodetype, agent_node_ref))
            g.add((agent_node_ref, FOAF.name, Literal(name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

        # Authors
        if agent_role in ['author', 'contributor']:
            if agent_role == 'author':
                nodetype = DCT.creator
            if agent_role == 'contributor':
                nodetype = DCT.contributor
            organization_ref = BNode()
            agent_ref = BNode()
            memberof_ref = BNode()
            creator_ref = BNode()
            g.add((organization_ref, FOAF.name, Literal(
                agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((agent_ref, ORG.memberOf, memberof_ref))
            g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
            g.add((creator_ref, FOAF.Agent, agent_ref))
            g.add((dataset_ref, nodetype, creator_ref))
            if agent_id:
                g.add((agent_ref, DCT.identifier, Literal(agent_id)))

        # Funders
        if agent.get('role') == 'funder':
            organization_ref = BNode()
            memberof_ref = BNode()
            project_ref = BNode()
            isoutputof_ref = BNode()
            agent_url = agent.get('URL')
            if agent_url:
                g.add((project_ref, FOAF.homepage, Literal(agent_url)))
            funding_id = agent.get('fundingid')
            if funding_id:
                g.add((project_ref, RDFS.comment, Literal(funding_id)))
            g.add((organization_ref, FOAF.name, Literal(
                agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((project_ref, ORG.memberOf, memberof_ref))
            agent_name = agent.get('name', None)
            g.add((project_ref, FOAF.name, Literal(agent_name)))
            if agent_id:
                g.add((project_ref, DCT.identifier, Literal(agent_id)))
            g.add((isoutputof_ref, FOAF.Project, project_ref))
            g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

    # Etsin: Publishers
    # NOTE(review): .get('contact') has no default — raises TypeError if
    # 'contact' is absent; presumably the schema guarantees it. Verify.
    for contact in dataset_dict.get('contact'):
        agent_node_ref = BNode()
        agent_id = contact.get('id')
        g.add((agent_node_ref, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, agent_node_ref))
        contact_name = contact.get('name', None)
        g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
        if agent_id:
            g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))
        contact_email = contact.get('email')
        # 'hidden' is a sentinel value for suppressed e-mail addresses
        if contact_email and contact_email != 'hidden':
            g.add((agent_node_ref, FOAF.mbox, URIRef("mailto:" + contact_email)))
        contact_url = contact.get('URL')
        if contact_url:
            g.add((agent_node_ref, FOAF.homepage, URIRef(contact_url)))
        contact_phone = contact.get('phone')
        if contact_phone:
            g.add((agent_node_ref, FOAF.phone, URIRef("tel:" + contact_phone)))

    # Etsin: Organization
    organization_name = resolve_org_name(dataset_dict.get('owner_org'))
    publisher_ref = BNode()
    g.add((dataset_ref, DCT.publisher, publisher_ref))
    g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

    # Etsin: Tags - can be URLs or user inputted keywords
    # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
    # RDFlib.
    for tag in dataset_dict.get('tags', []):
        display_name = tag.get('display_name')
        g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
        tag_name = tag.get('name')
        if is_url(tag_name):
            g.add((dataset_ref, DCAT.theme, URIRef(tag_name)))

    # Etsin: Dates
    # Peter: Issued-field is new. This used to be inside CatalogRecord.
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Etsin: Events
    for event in dataset_dict.get('event', []):
        event_ref = BNode()
        g.add((dataset_ref, DCT.event, event_ref))
        g.add((event_ref, DCT.type, Literal(event.get('type'))))
        g.add((event_ref, DCT.creator, Literal(event.get('who'))))
        g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
        g.add((event_ref, DCT.description, Literal(event.get('descr'))))

    # Etsin: Citation
    citation = dataset_dict.get('citation')
    if citation:
        g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

    # Etsin: Distribution
    availability_list = ['access_application', 'access_request',
                         'through_provider']
    checksum_ref = BNode()
    checksum_parent_ref = BNode()
    distribution_ref = BNode()
    dist_parent_ref = BNode()
    if dataset_dict.get('availability') == 'direct_download':
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))
        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))
    if dataset_dict.get('availability') in availability_list:
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))
    mimetype = dataset_dict.get('mimetype')
    if mimetype:
        g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))
    dist_format = dataset_dict.get('format')
    if dist_format:
        g.add((distribution_ref, DCT['format'], Literal(dist_format)))
    g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
    g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

    # Etsin: Disciplines
    disciplines = dataset_dict.get('discipline', '')
    for discipline in split_disciplines(disciplines):
        if is_url(discipline):
            disc = URIRef(discipline)
        else:
            disc = Literal(discipline)
        g.add((dataset_ref, DCT.subject, disc))

    # Etsin: Rights Declaration
    # Peter: There's no way to add an xmlns attribute under
    # the parent <DCT:rights> in rdflib
    category, declarations = get_rightscategory(dataset_dict)
    declaration_strings = ''
    for declaration in declarations:
        declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
            .format(declaration)
    xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" \
xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
        .format(category, declaration_strings)
    license_url = dataset_dict.get('license_URL')
    rights_ref = BNode()
    g.add((dataset_ref, DCT.rights, rights_ref))
    g.add((rights_ref, DCT.RightsStatement, Literal(
        xml_string, datatype=RDF.XMLLiteral)))
    g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

    # Etsin: Spatial
    coverage = dataset_dict.get('geographic_coverage')
    if coverage:
        spatial_ref = BNode()
        location_ref = BNode()
        g.add((location_ref, RDFS.label, Literal(coverage)))
        g.add((spatial_ref, DCT.Location, location_ref))
        # NOTE(review): DCT.spatial_ref looks odd — the standard DCT term
        # is 'spatial'; confirm this predicate is intentional.
        g.add((dataset_ref, DCT.spatial_ref, spatial_ref))

    # Etsin: Temporal
    # Peter: hasBeginning and hasEnd left out
    temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
    temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
    if temporal_coverage_begin or temporal_coverage_end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if temporal_coverage_begin:
            self._add_date_triple(
                temporal_extent, SCHEMA.startDate, temporal_coverage_begin)
        if temporal_coverage_end:
            self._add_date_triple(
                temporal_extent, SCHEMA.endDate, temporal_coverage_end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Etsin: language field needs to be stripped from spaces
    langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
    for lang in langs:
        params = (dataset_ref, DCAT.language, Literal(lang))
        self.g.add(params)
def test_get_pids_by_type(self):
    '''Verify PID counts per type and flag combination, including package-id fallback.'''
    data_dict = copy.deepcopy(TEST_DATADICT)
    data_dict['id'] = 'some_package.id'
    data_dict['name'] = 'some_package.name'

    # (pid type, extra keyword args, expected count)
    cases = [
        ('data', {}, 2),
        ('data', {'primary': True}, 1),
        ('data', {'primary': True, 'use_package_id': True}, 1),
        ('data', {'primary': False}, 1),
        ('metadata', {}, 1),
        ('metadata', {'primary': True}, 0),
        ('metadata', {'primary': True, 'use_package_id': True}, 1),
        ('metadata', {'use_package_id': True}, 2),
        ('version', {}, 1),
        ('version', {'primary': True}, 0),
        ('version', {'primary': True, 'use_package_id': True}, 0),
        ('some_unknown_type', {}, 0),
    ]
    for pid_type, kwargs, expected in cases:
        pids = utils.get_pids_by_type(pid_type, data_dict, **kwargs)
        assert len(pids) == expected, (pid_type, kwargs)
def graph_from_dataset(self, dataset_dict, dataset_ref):
    '''
    Serialize a CKAN dataset dict into the RDF graph ``self.g`` as a
    DCAT Dataset rooted at ``dataset_ref``.

    Datasets without a primary PID are skipped entirely. URI values are
    passed through remove_trailing_spaces() before becoming URIRefs.

    :param dataset_dict: CKAN package dict (Etsin/Kata schema)
    :param dataset_ref: rdflib node used as the subject of dataset triples
    '''
    # Without a primary PID the dataset cannot be identified — emit nothing.
    primary_pid = get_primary_pid(dataset_dict)
    if not primary_pid:
        return

    g = self.g

    # Bind namespace prefixes so serializations use readable qnames.
    # NOTE: dict.iteritems() — this module is Python 2 code.
    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Etsin: homepage
    uri = url_for(controller='package', action='read',
                  id=dataset_dict.get('name'), qualified=True)
    g.add(
        (dataset_ref, FOAF.homepage, URIRef(remove_trailing_spaces(uri))))

    # Etsin: primary identifier
    g.add((dataset_ref, ADMS.identifier,
           URIRef(remove_trailing_spaces(primary_pid))))

    # Etsin: Relation identifiers
    # Known relation kinds map to specific DCT predicates; anything else
    # becomes a plain DCT.identifier.
    relation_pids = get_pids_by_type('relation', dataset_dict)
    for rpid in relation_pids:
        if rpid.get('relation') == 'isNewVersionOf' or rpid.get(
                'relation') == 'isPreviousVersionOf':
            g.add((dataset_ref, DCT.isVersionOf,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        elif rpid.get('relation') == 'hasPart':
            g.add((dataset_ref, DCT.hasPart,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        elif rpid.get('relation') == 'isPartOf':
            g.add((dataset_ref, DCT.isPartOf,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        else:
            g.add((dataset_ref, DCT.identifier,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))

    # Etsin: Title and Description, including translations
    items = [
        (DCT.title, 'langtitle', 'title'),
        (DCT.description, 'notes'),
    ]
    for item in items:
        self._add_translated_triple_from_dict(dataset_dict, dataset_ref,
                                              *item)

    # Etsin: Agents
    for agent in dataset_dict.get('agent', []):
        agent_role = agent.get('role')
        agent_id = agent.get('id')

        # Rights Holders
        if agent_role in ['owner', 'distributor']:
            name = agent.get('name', None)
            if agent_role == 'owner':
                # Owners given as URLs keep the plain name; otherwise the
                # organisation field is the fallback when name is missing.
                if not get_if_url(agent.get('name')):
                    name = agent.get('name', agent.get('organisation', ''))
                nodetype = DCT.rightsHolder
            if agent_role == 'distributor':
                nodetype = DCT.publisher
            agent_node_ref = BNode()
            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, nodetype, agent_node_ref))
            g.add((agent_node_ref, FOAF.name, Literal(name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

        # Authors
        if agent_role in ['author', 'contributor']:
            if agent_role == 'author':
                nodetype = DCT.creator
            if agent_role == 'contributor':
                nodetype = DCT.contributor
            organization_ref = BNode()
            agent_ref = BNode()
            memberof_ref = BNode()
            creator_ref = BNode()
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((agent_ref, ORG.memberOf, memberof_ref))
            g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
            g.add((creator_ref, FOAF.Agent, agent_ref))
            g.add((dataset_ref, nodetype, creator_ref))
            if agent_id:
                g.add((agent_ref, DCT.identifier, Literal(agent_id)))

        # Funders
        if agent.get('role') == 'funder':
            organization_ref = BNode()
            memberof_ref = BNode()
            project_ref = BNode()
            isoutputof_ref = BNode()
            agent_url = agent.get('URL')
            if agent_url:
                g.add((project_ref, FOAF.homepage, Literal(agent_url)))
            funding_id = agent.get('fundingid')
            if funding_id:
                g.add((project_ref, RDFS.comment, Literal(funding_id)))
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((project_ref, ORG.memberOf, memberof_ref))
            agent_name = agent.get('name', None)
            g.add((project_ref, FOAF.name, Literal(agent_name)))
            if agent_id:
                g.add((project_ref, DCT.identifier, Literal(agent_id)))
            g.add((isoutputof_ref, FOAF.Project, project_ref))
            g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

    # Etsin: Publishers
    # NOTE(review): .get('contact') has no default — raises TypeError if
    # 'contact' is absent; presumably the schema guarantees it. Verify.
    for contact in dataset_dict.get('contact'):
        agent_node_ref = BNode()
        agent_id = contact.get('id')
        g.add((agent_node_ref, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, agent_node_ref))
        contact_name = contact.get('name', None)
        g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
        if agent_id:
            g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))
        contact_email = contact.get('email')
        # 'hidden' is a sentinel value for suppressed e-mail addresses
        if contact_email and contact_email != 'hidden':
            g.add((agent_node_ref, FOAF.mbox,
                   URIRef("mailto:" + remove_trailing_spaces(contact_email))))
        contact_url = contact.get('URL')
        if contact_url:
            g.add((agent_node_ref, FOAF.homepage,
                   URIRef(remove_trailing_spaces(contact_url))))
        contact_phone = remove_all_spaces(contact.get('phone'))
        if contact_phone:
            g.add((agent_node_ref, FOAF.phone,
                   URIRef("tel:" + remove_trailing_spaces(contact_phone))))

    # Etsin: Organization
    organization_name = resolve_org_name(dataset_dict.get('owner_org'))
    publisher_ref = BNode()
    g.add((dataset_ref, DCT.publisher, publisher_ref))
    g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

    # Etsin: Tags - can be URLs or user inputted keywords
    # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
    # RDFlib.
    for tag in dataset_dict.get('tags', []):
        display_name = tag.get('display_name')
        g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
        tag_name = tag.get('name')
        if is_url(tag_name):
            g.add((dataset_ref, DCAT.theme,
                   URIRef(remove_trailing_spaces(tag_name))))

    # Etsin: Dates
    # Peter: Issued-field is new. This used to be inside CatalogRecord.
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Etsin: Events
    for event in dataset_dict.get('event', []):
        event_ref = BNode()
        g.add((dataset_ref, DCT.event, event_ref))
        g.add((event_ref, DCT.type, Literal(event.get('type'))))
        g.add((event_ref, DCT.creator, Literal(event.get('who'))))
        g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
        g.add((event_ref, DCT.description, Literal(event.get('descr'))))

    # Etsin: Citation
    citation = dataset_dict.get('citation')
    if citation:
        g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

    # Etsin: Distribution
    availability_list = [
        'access_application_rems',
        'access_application_other',
        'access_request'
    ]
    checksum_ref = BNode()
    checksum_parent_ref = BNode()
    distribution_ref = BNode()
    dist_parent_ref = BNode()
    if dataset_dict.get('availability') == 'direct_download':
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))
        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))
    if dataset_dict.get('availability') in availability_list:
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))
    mimetype = dataset_dict.get('mimetype')
    if mimetype:
        g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))
    dist_format = dataset_dict.get('format')
    if dist_format:
        g.add((distribution_ref, DCT['format'], Literal(dist_format)))
    g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
    g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

    # Etsin: Disciplines
    disciplines = dataset_dict.get('discipline', '')
    for discipline in split_disciplines(disciplines):
        if is_url(discipline):
            disc = URIRef(remove_trailing_spaces(discipline))
        else:
            disc = Literal(discipline)
        g.add((dataset_ref, DCT.subject, disc))

    # Etsin: Rights Declaration
    # Peter: There's no way to add an xmlns attribute under
    # the parent <DCT:rights> in rdflib
    category, declarations = get_rightscategory(dataset_dict)
    declaration_strings = ''
    for declaration in declarations:
        declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
            .format(declaration)
    xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" \
xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
        .format(category, declaration_strings)
    license_url = dataset_dict.get('license_URL')
    rights_ref = BNode()
    g.add((dataset_ref, DCT.rights, rights_ref))
    g.add((rights_ref, DCT.RightsStatement,
           Literal(xml_string, datatype=RDF.XMLLiteral)))
    g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

    # Etsin: Spatial
    coverage = dataset_dict.get('geographic_coverage')
    if coverage:
        spatial_ref = BNode()
        location_ref = BNode()
        g.add((location_ref, RDFS.label, Literal(coverage)))
        g.add((spatial_ref, DCT.Location, location_ref))
        # NOTE(review): DCT.spatial_ref looks odd — the standard DCT term
        # is 'spatial'; confirm this predicate is intentional.
        g.add((dataset_ref, DCT.spatial_ref, spatial_ref))

    # Etsin: Temporal
    # Peter: hasBeginning and hasEnd left out
    temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
    temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
    if temporal_coverage_begin or temporal_coverage_end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if temporal_coverage_begin:
            self._add_date_triple(temporal_extent, SCHEMA.startDate,
                                  temporal_coverage_begin)
        if temporal_coverage_end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate,
                                  temporal_coverage_end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Etsin: language field needs to be stripped from spaces
    langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
    for lang in langs:
        params = (dataset_ref, DCAT.language, Literal(lang))
        self.g.add(params)