def test_discipline_split(self):
    '''
    RDF export would look odd if the discipline splitting function broke
    '''
    disciplines = helpers.split_disciplines('Matematiikka,Fysiikka')
    assert disciplines[1] == 'Fysiikka'
    assert helpers.split_disciplines('Tiede-, taide- ja liikuntakasvatus')[0] == \
        'Tiede-, taide- ja liikuntakasvatus'
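# A minimal sketch of the splitting rule the test above relies on: split on
# commas, but keep compound discipline names such as
# 'Tiede-, taide- ja liikuntakasvatus' intact by not splitting on a comma that
# directly follows a hyphen. This is an assumption about what
# helpers.split_disciplines does, not its actual implementation.
import re

def split_disciplines_sketch(value):
    if not value:
        return []
    # '(?<!-)' is a negative lookbehind: a comma preceded by '-' is kept as-is
    return [part.strip() for part in re.split(r'(?<!-),', value)]

assert split_disciplines_sketch('Matematiikka,Fysiikka') == ['Matematiikka', 'Fysiikka']
assert split_disciplines_sketch('Tiede-, taide- ja liikuntakasvatus') == \
    ['Tiede-, taide- ja liikuntakasvatus']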
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Etsin: homepage
    uri = url_for(controller='package', action='read',
                  id=dataset_dict.get('name'), qualified=True)
    g.add((dataset_ref, FOAF.homepage, URIRef(uri)))

    # Etsin: primary identifiers
    data_pids = get_pids_by_type('data', dataset_dict)
    for pid in data_pids:
        g.add((dataset_ref, ADMS.identifier, URIRef(pid.get('id'))))
    version_pids = get_pids_by_type('version', dataset_dict)
    for pid in version_pids:
        g.add((dataset_ref, DCT.identifier, URIRef(pid.get('id'))))
        g.add((dataset_ref, DCT.isVersionOf, URIRef(pid.get('id'))))

    # Etsin: Title and Description, including translations
    items = [
        (DCT.title, 'langtitle', 'title'),
        (DCT.description, 'notes'),
    ]
    for item in items:
        self._add_translated_triple_from_dict(dataset_dict, dataset_ref, *item)

    # Etsin: Agents
    for agent in dataset_dict.get('agent', []):
        agent_role = agent.get('role')
        agent_id = agent.get('id')

        # Rights Holders
        if agent_role in ['owner', 'distributor']:
            name = agent.get('name', None)
            if agent_role == 'owner':
                if not get_if_url(agent.get('name')):
                    name = agent.get('name', agent.get('organisation', ''))
                nodetype = DCT.rightsHolder
            if agent_role == 'distributor':
                nodetype = DCT.publisher
            agent_node_ref = BNode()
            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, nodetype, agent_node_ref))
            g.add((agent_node_ref, FOAF.name, Literal(name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

        # Authors
        if agent_role in ['author', 'contributor']:
            if agent_role == 'author':
                nodetype = DCT.creator
            if agent_role == 'contributor':
                nodetype = DCT.contributor
            organization_ref = BNode()
            agent_ref = BNode()
            memberof_ref = BNode()
            creator_ref = BNode()
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((agent_ref, ORG.memberOf, memberof_ref))
            g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
            g.add((creator_ref, FOAF.Agent, agent_ref))
            g.add((dataset_ref, nodetype, creator_ref))
            if agent_id:
                g.add((agent_ref, DCT.identifier, Literal(agent_id)))

        # Funders
        if agent.get('role') == 'funder':
            organization_ref = BNode()
            memberof_ref = BNode()
            project_ref = BNode()
            isoutputof_ref = BNode()
            agent_url = agent.get('URL')
            if agent_url:
                g.add((project_ref, FOAF.homepage, Literal(agent_url)))
            funding_id = agent.get('fundingid')
            if funding_id:
                g.add((project_ref, RDFS.comment, Literal(funding_id)))
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((project_ref, ORG.memberOf, memberof_ref))
            agent_name = agent.get('name', None)
            g.add((project_ref, FOAF.name, Literal(agent_name)))
            if agent_id:
                g.add((project_ref, DCT.identifier, Literal(agent_id)))
            g.add((isoutputof_ref, FOAF.Project, project_ref))
            g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

    # Etsin: Publishers
    for contact in dataset_dict.get('contact', []):
        agent_node_ref = BNode()
        agent_id = contact.get('id')
        g.add((agent_node_ref, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, agent_node_ref))
        contact_name = contact.get('name', None)
        g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
        if agent_id:
            g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))
        contact_email = contact.get('email')
        if contact_email and contact_email != 'hidden':
            g.add((agent_node_ref, FOAF.mbox,
                   URIRef("mailto:" + contact_email)))
        contact_url = contact.get('URL')
        if contact_url:
            g.add((agent_node_ref, FOAF.homepage, URIRef(contact_url)))
        contact_phone = contact.get('phone')
        if contact_phone:
            g.add((agent_node_ref, FOAF.phone,
                   URIRef("tel:" + contact_phone)))

    # Etsin: Organization
    organization_name = resolve_org_name(dataset_dict.get('owner_org'))
    publisher_ref = BNode()
    g.add((dataset_ref, DCT.publisher, publisher_ref))
    g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

    # Etsin: Tags - can be URLs or user inputted keywords
    # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
    # RDFlib.
    for tag in dataset_dict.get('tags', []):
        display_name = tag.get('display_name')
        g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
        tag_name = tag.get('name')
        if is_url(tag_name):
            g.add((dataset_ref, DCAT.theme, URIRef(tag_name)))

    # Etsin: Dates
    # Peter: Issued-field is new. This used to be inside CatalogRecord.
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Etsin: Events
    for event in dataset_dict.get('event', []):
        event_ref = BNode()
        g.add((dataset_ref, DCT.event, event_ref))
        g.add((event_ref, DCT.type, Literal(event.get('type'))))
        g.add((event_ref, DCT.creator, Literal(event.get('who'))))
        g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
        g.add((event_ref, DCT.description, Literal(event.get('descr'))))

    # Etsin: Citation
    citation = dataset_dict.get('citation')
    if citation:
        g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

    # Etsin: Distribution
    availability_list = ['access_application', 'access_request',
                         'through_provider']
    checksum_ref = BNode()
    checksum_parent_ref = BNode()
    distribution_ref = BNode()
    dist_parent_ref = BNode()
    if dataset_dict.get('availability') == 'direct_download':
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))
        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))
    if dataset_dict.get('availability') in availability_list:
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))
    mimetype = dataset_dict.get('mimetype')
    if mimetype:
        g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))
    dist_format = dataset_dict.get('format')
    if dist_format:
        g.add((distribution_ref, DCT['format'], Literal(dist_format)))
    g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
    g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

    # Etsin: Disciplines
    disciplines = dataset_dict.get('discipline', '')
    for discipline in split_disciplines(disciplines):
        if is_url(discipline):
            disc = URIRef(discipline)
        else:
            disc = Literal(discipline)
        g.add((dataset_ref, DCT.subject, disc))

    # Etsin: Rights Declaration
    # Peter: There's no way to add an xmlns attribute under
    # the parent <DCT:rights> in rdflib
    category, declarations = get_rightscategory(dataset_dict)
    declaration_strings = ''
    for declaration in declarations:
        declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
            .format(declaration)
    xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" ' \
                 u'xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
        .format(category, declaration_strings)
    license_url = dataset_dict.get('license_URL')
    rights_ref = BNode()
    g.add((dataset_ref, DCT.rights, rights_ref))
    g.add((rights_ref, DCT.RightsStatement,
           Literal(xml_string, datatype=RDF.XMLLiteral)))
    g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

    # Etsin: Spatial
    coverage = dataset_dict.get('geographic_coverage')
    if coverage:
        spatial_ref = BNode()
        location_ref = BNode()
        g.add((location_ref, RDFS.label, Literal(coverage)))
        g.add((spatial_ref, DCT.Location, location_ref))
        g.add((dataset_ref, DCT.spatial_ref, spatial_ref))

    # Etsin: Temporal
    # Peter: hasBeginning and hasEnd left out
    temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
    temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
    if temporal_coverage_begin or temporal_coverage_end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if temporal_coverage_begin:
            self._add_date_triple(temporal_extent, SCHEMA.startDate,
                                  temporal_coverage_begin)
        if temporal_coverage_end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate,
                                  temporal_coverage_end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Etsin: language field needs to be stripped of spaces
    langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
    for lang in langs:
        params = (dataset_ref, DCAT.language, Literal(lang))
        self.g.add(params)
def before_index(self, pkg_dict):
    '''
    Modification to package dictionary before indexing it to the Solr index.
    For example, we add resource mimetype to the index, modify agents and
    hide the email address.

    :param pkg_dict: pkg_dict to modify
    :returns: the modified package dict to be indexed
    '''
    EMAIL = re.compile(r'.*contact_\d*_email')

    # Add res_mimetype to pkg_dict. Can be removed after res_mimetype is
    # added to CKAN's index function.
    data = json.loads(pkg_dict['data_dict'])

    # We do not want owner_org in the organization facets. Note that
    # owner_org.name is an id in our case and thus not human readable.
    pkg_dict['organization'] = ''
    pkg_dict['isopen'] = data.get('isopen')
    pkg_dict['res_mimetype'] = [res['mimetype'] for res in
                                data.get('resources', [])
                                if res.get('mimetype')]

    # Extract plain text from resources and add to the data dict for indexing
    for resource in data.get('resources', []):
        if resource['resource_type'] in ('file', 'file.upload'):
            try:
                text = extractor.extract_text(resource['url'],
                                              resource['format'])
            except IOError as ioe:
                log.debug(str(ioe))
                text = ""
            if text:
                all_text = pkg_dict.get('res_text_contents', '')
                all_text += (text + '\n')
                pkg_dict['res_text_contents'] = all_text

    # Separate agent roles for Solr indexing
    new_items = {}
    for key, value in pkg_dict.iteritems():
        tokens = key.split('_')
        if tokens[0] == 'agent' and tokens[2] == 'role':
            role_idx = '{role}_{id}'.format(role=value, id=tokens[1])  # Must not be unicode
            org_idx = 'organization_{id}'.format(id=tokens[1])
            agent_name = pkg_dict.get('agent_{id}_name'.format(id=tokens[1]))
            agent_org = pkg_dict.get('agent_{id}_organisation'.format(id=tokens[1]))
            agent_id = pkg_dict.get('agent_{id}_id'.format(id=tokens[1]))
            if agent_name:
                new_items[role_idx] = agent_name
                new_items['agent_name_' + tokens[1]] = agent_name
            if agent_org:
                new_items[org_idx] = agent_org
                new_items['agent_name_' + tokens[1] + '_org'] = agent_org
            if agent_id:
                new_items['agent_name_' + tokens[1] + '_id'] = agent_id

        # hide sensitive data
        if EMAIL.match(key):
            pkg_dict[key] = u''

    pkg_dict.update(new_items)

    # hide sensitive data
    for item in data.get('extras', []):
        if EMAIL.match(item['key']):
            item['value'] = u''

    # Resolve uri labels and add them to the Solr index.
    # The discipline field in pkg_dict is a comma-separated string, while
    # tags are already given as a list.
    split_disciplines = helpers.split_disciplines(pkg_dict.get('discipline'))
    pkg_dict['extras_discipline_resolved'] = \
        self._resolve_labels(split_disciplines, 'okm-tieteenala')
    pkg_dict['extras_keywords_resolved'] = \
        self._resolve_labels(pkg_dict.get('tags'), 'koko')

    # Make dates compliant with ISO 8601 used by Solr.
    # We assume here that what we get is a partial date (YYYY or YYYY-MM)
    # that is compliant with the standard, e.g. the standard always uses a
    # 4-digit year (1583-9999) and a two-digit month.
    DATE_TEMPLATES = {'temporal_coverage_begin': '2000-01-01T00:00:00Z',
                      'temporal_coverage_end': '2000-12-31T23:59:59Z'}
    for temporal_field, date_template in DATE_TEMPLATES.iteritems():
        temporal_date = pkg_dict.get(temporal_field)
        # Remove the time zone as Solr doesn't support it.
        # NOTE: The date time is not converted to UTC, the time zone is just
        # stripped. Could be converted with arrow.
        if temporal_date:
            try:
                datetime_obj = iso8601.parse_date(temporal_date)
                temporal_date = datetime_obj.replace(tzinfo=None).isoformat()
                pkg_dict[temporal_field] = \
                    temporal_date + date_template[len(temporal_date):]
            except iso8601.ParseError:
                temporal_date = ''
        if temporal_date == '':
            # Remove empty strings as they won't fit into Solr's TrieDateField
            pkg_dict.pop(temporal_field)

    self._handle_titles(pkg_dict)

    validated = json.loads(pkg_dict.get('validated_data_dict'))
    _crypt = self._crypto()
    try:
        for item in validated.get('contact'):
            for k_ in item.iterkeys():
                if k_ == u'email':
                    item[k_] = base64.b64encode(
                        _crypt.encrypt(self._pad(unicode(item[k_], "utf-8"))))
    except TypeError:
        # Harvest sources
        pass

    pkg_dict['validated_data_dict'] = json.dumps(validated)
    pkg_dict['data_dict'] = json.dumps(data)

    return pkg_dict
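# A small worked example of the date padding done in before_index above:
# a partial ISO 8601 date (YYYY or YYYY-MM) is completed by appending the
# missing tail of the template, so Solr always receives a full date-time.
# The values below are illustrative only.
date_template = '2000-01-01T00:00:00Z'  # template used for temporal_coverage_begin

for temporal_date in ('2004', '2004-07', '2004-07-15'):
    padded = temporal_date + date_template[len(temporal_date):]
    print(padded)
# -> 2004-01-01T00:00:00Z
# -> 2004-07-01T00:00:00Z
# -> 2004-07-15T00:00:00Z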
def graph_from_dataset(self, dataset_dict, dataset_ref):
    primary_pid = get_primary_pid(dataset_dict)
    if not primary_pid:
        return

    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Etsin: homepage
    uri = url_for(controller='package', action='read',
                  id=dataset_dict.get('name'), qualified=True)
    g.add((dataset_ref, FOAF.homepage, URIRef(remove_trailing_spaces(uri))))

    # Etsin: primary identifier
    g.add((dataset_ref, ADMS.identifier,
           URIRef(remove_trailing_spaces(primary_pid))))

    # Etsin: Relation identifiers
    relation_pids = get_pids_by_type('relation', dataset_dict)
    for rpid in relation_pids:
        if rpid.get('relation') == 'isNewVersionOf' or \
                rpid.get('relation') == 'isPreviousVersionOf':
            g.add((dataset_ref, DCT.isVersionOf,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        elif rpid.get('relation') == 'hasPart':
            g.add((dataset_ref, DCT.hasPart,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        elif rpid.get('relation') == 'isPartOf':
            g.add((dataset_ref, DCT.isPartOf,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))
        else:
            g.add((dataset_ref, DCT.identifier,
                   URIRef(remove_trailing_spaces(rpid.get('id')))))

    # Etsin: Title and Description, including translations
    items = [
        (DCT.title, 'langtitle', 'title'),
        (DCT.description, 'notes'),
    ]
    for item in items:
        self._add_translated_triple_from_dict(dataset_dict, dataset_ref, *item)

    # Etsin: Agents
    for agent in dataset_dict.get('agent', []):
        agent_role = agent.get('role')
        agent_id = agent.get('id')

        # Rights Holders
        if agent_role in ['owner', 'distributor']:
            name = agent.get('name', None)
            if agent_role == 'owner':
                if not get_if_url(agent.get('name')):
                    name = agent.get('name', agent.get('organisation', ''))
                nodetype = DCT.rightsHolder
            if agent_role == 'distributor':
                nodetype = DCT.publisher
            agent_node_ref = BNode()
            g.add((agent_node_ref, RDF.type, FOAF.Agent))
            g.add((dataset_ref, nodetype, agent_node_ref))
            g.add((agent_node_ref, FOAF.name, Literal(name)))
            if agent_id:
                g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))

        # Authors
        if agent_role in ['author', 'contributor']:
            if agent_role == 'author':
                nodetype = DCT.creator
            if agent_role == 'contributor':
                nodetype = DCT.contributor
            organization_ref = BNode()
            agent_ref = BNode()
            memberof_ref = BNode()
            creator_ref = BNode()
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((agent_ref, ORG.memberOf, memberof_ref))
            g.add((agent_ref, FOAF.name, Literal(agent.get('name', None))))
            g.add((creator_ref, FOAF.Agent, agent_ref))
            g.add((dataset_ref, nodetype, creator_ref))
            if agent_id:
                g.add((agent_ref, DCT.identifier, Literal(agent_id)))

        # Funders
        if agent.get('role') == 'funder':
            organization_ref = BNode()
            memberof_ref = BNode()
            project_ref = BNode()
            isoutputof_ref = BNode()
            agent_url = agent.get('URL')
            if agent_url:
                g.add((project_ref, FOAF.homepage, Literal(agent_url)))
            funding_id = agent.get('fundingid')
            if funding_id:
                g.add((project_ref, RDFS.comment, Literal(funding_id)))
            g.add((organization_ref, FOAF.name,
                   Literal(agent.get('organisation', None))))
            g.add((memberof_ref, FOAF.organization, organization_ref))
            g.add((project_ref, ORG.memberOf, memberof_ref))
            agent_name = agent.get('name', None)
            g.add((project_ref, FOAF.name, Literal(agent_name)))
            if agent_id:
                g.add((project_ref, DCT.identifier, Literal(agent_id)))
            g.add((isoutputof_ref, FOAF.Project, project_ref))
            g.add((dataset_ref, FRAPO.isOutputOf, isoutputof_ref))

    # Etsin: Publishers
    for contact in dataset_dict.get('contact', []):
        agent_node_ref = BNode()
        agent_id = contact.get('id')
        g.add((agent_node_ref, RDF.type, FOAF.Agent))
        g.add((dataset_ref, DCT.publisher, agent_node_ref))
        contact_name = contact.get('name', None)
        g.add((agent_node_ref, FOAF.name, Literal(contact_name)))
        if agent_id:
            g.add((agent_node_ref, DCT.identifier, Literal(agent_id)))
        contact_email = contact.get('email')
        if contact_email and contact_email != 'hidden':
            g.add((agent_node_ref, FOAF.mbox,
                   URIRef("mailto:" + remove_trailing_spaces(contact_email))))
        contact_url = contact.get('URL')
        if contact_url:
            g.add((agent_node_ref, FOAF.homepage,
                   URIRef(remove_trailing_spaces(contact_url))))
        contact_phone = remove_all_spaces(contact.get('phone'))
        if contact_phone:
            g.add((agent_node_ref, FOAF.phone,
                   URIRef("tel:" + remove_trailing_spaces(contact_phone))))

    # Etsin: Organization
    organization_name = resolve_org_name(dataset_dict.get('owner_org'))
    publisher_ref = BNode()
    g.add((dataset_ref, DCT.publisher, publisher_ref))
    g.add((publisher_ref, FOAF.organization, Literal(organization_name)))

    # Etsin: Tags - can be URLs or user inputted keywords
    # TODO: resolve URLs from Finto. Currently get_label_for_uri() breaks
    # RDFlib.
    for tag in dataset_dict.get('tags', []):
        display_name = tag.get('display_name')
        g.add((dataset_ref, DCAT.keyword, Literal(display_name)))
        tag_name = tag.get('name')
        if is_url(tag_name):
            g.add((dataset_ref, DCAT.theme,
                   URIRef(remove_trailing_spaces(tag_name))))

    # Etsin: Dates
    # Peter: Issued-field is new. This used to be inside CatalogRecord.
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Etsin: Events
    for event in dataset_dict.get('event', []):
        event_ref = BNode()
        g.add((dataset_ref, DCT.event, event_ref))
        g.add((event_ref, DCT.type, Literal(event.get('type'))))
        g.add((event_ref, DCT.creator, Literal(event.get('who'))))
        g.add((event_ref, DCT.date, Literal(str(event.get('when')))))
        g.add((event_ref, DCT.description, Literal(event.get('descr'))))

    # Etsin: Citation
    citation = dataset_dict.get('citation')
    if citation:
        g.add((dataset_ref, DCT.bibliographicCitation, Literal(citation)))

    # Etsin: Distribution
    availability_list = [
        'access_application_rems',
        'access_application_other',
        'access_request'
    ]
    checksum_ref = BNode()
    checksum_parent_ref = BNode()
    distribution_ref = BNode()
    dist_parent_ref = BNode()
    if dataset_dict.get('availability') == 'direct_download':
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.downloadURL, Literal(access_url)))
        checksum = dataset_dict.get('checksum')
        algorithm = dataset_dict.get('algorithm')
        if checksum and algorithm:
            g.add((checksum_ref, SPDX.checksumValue, Literal(checksum)))
            g.add((checksum_ref, SPDX.algorithm, Literal(algorithm)))
            g.add((checksum_parent_ref, SPDX.Checksum, checksum_ref))
            g.add((distribution_ref, SPDX.checksum, checksum_parent_ref))
    if dataset_dict.get('availability') in availability_list:
        access_url = get_download_url(dataset_dict)
        g.add((distribution_ref, DCAT.accessURL, Literal(access_url)))
    mimetype = dataset_dict.get('mimetype')
    if mimetype:
        g.add((distribution_ref, DCAT.mediaType, Literal(mimetype)))
    dist_format = dataset_dict.get('format')
    if dist_format:
        g.add((distribution_ref, DCT['format'], Literal(dist_format)))
    g.add((dist_parent_ref, DCAT.Distribution, distribution_ref))
    g.add((dataset_ref, DCAT.distribution, dist_parent_ref))

    # Etsin: Disciplines
    disciplines = dataset_dict.get('discipline', '')
    for discipline in split_disciplines(disciplines):
        if is_url(discipline):
            disc = URIRef(remove_trailing_spaces(discipline))
        else:
            disc = Literal(discipline)
        g.add((dataset_ref, DCT.subject, disc))

    # Etsin: Rights Declaration
    # Peter: There's no way to add an xmlns attribute under
    # the parent <DCT:rights> in rdflib
    category, declarations = get_rightscategory(dataset_dict)
    declaration_strings = ''
    for declaration in declarations:
        declaration_strings += u'<RightsDeclaration>{}</RightsDeclaration>\n'\
            .format(declaration)
    xml_string = u'<RightsDeclarationMD RIGHTSCATEGORY="{}" ' \
                 u'xmlns="http://www.loc.gov/METS/" >\n{}</RightsDeclarationMD>'\
        .format(category, declaration_strings)
    license_url = dataset_dict.get('license_URL')
    rights_ref = BNode()
    g.add((dataset_ref, DCT.rights, rights_ref))
    g.add((rights_ref, DCT.RightsStatement,
           Literal(xml_string, datatype=RDF.XMLLiteral)))
    g.add((rights_ref, DCT.RightsStatement, Literal(license_url)))

    # Etsin: Spatial
    coverage = dataset_dict.get('geographic_coverage')
    if coverage:
        spatial_ref = BNode()
        location_ref = BNode()
        g.add((location_ref, RDFS.label, Literal(coverage)))
        g.add((spatial_ref, DCT.Location, location_ref))
        g.add((dataset_ref, DCT.spatial_ref, spatial_ref))

    # Etsin: Temporal
    # Peter: hasBeginning and hasEnd left out
    temporal_coverage_begin = dataset_dict.get('temporal_coverage_begin')
    temporal_coverage_end = dataset_dict.get('temporal_coverage_end')
    if temporal_coverage_begin or temporal_coverage_end:
        temporal_extent = BNode()
        g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
        if temporal_coverage_begin:
            self._add_date_triple(temporal_extent, SCHEMA.startDate,
                                  temporal_coverage_begin)
        if temporal_coverage_end:
            self._add_date_triple(temporal_extent, SCHEMA.endDate,
                                  temporal_coverage_end)
        g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Etsin: language field needs to be stripped of spaces
    langs = self._get_dict_value(dataset_dict, 'language', '').split(', ')
    for lang in langs:
        params = (dataset_ref, DCAT.language, Literal(lang))
        self.g.add(params)
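# Why identifiers are cleaned before being wrapped in URIRef in the profile
# above: rdflib does not reject a value with stray whitespace at construction
# time, so the space would end up in the serialized RDF as an invalid IRI that
# downstream consumers may refuse. The helper below is only a sketch of what
# remove_trailing_spaces is assumed to do; the real helper lives in this
# project's utilities and may behave differently. The PID value is illustrative.
from rdflib import URIRef

def remove_trailing_spaces_sketch(value):
    # Assumed behaviour: drop surrounding whitespace, leave the rest intact
    return value.strip() if value else value

raw_pid = 'urn:nbn:fi:csc-example123 '  # illustrative PID with a stray space
print(URIRef(raw_pid))                                   # space kept as-is
print(URIRef(remove_trailing_spaces_sketch(raw_pid)))    # clean IRI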