def testOaiListRecords(self):
    """Verify the OAI-PMH ListRecords response for metadataPrefix 'erfGeoEnrichment'.

    Expects four records, indexes their oa:Annotation elements by the
    oa:hasTarget resource, and then checks two annotations in detail: one
    without an enrichment body and one carrying a hg:PlaceInTime with a label
    and a WKT geometry.
    """
    # parse=False: the raw body is requested and parsed explicitly below.
    header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', {'verb': 'ListRecords', 'metadataPrefix': 'erfGeoEnrichment'}, parse=False)
    bodyLxml = XML(body)
    self.assertEquals(4, len(xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record')))
    # Map target uri -> oa:Annotation element. Assumes both xpath results list
    # the annotations in the same order (one target per annotation).
    d = dict(zip(
        xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record/oai:metadata/rdf:RDF/oa:Annotation/oa:hasTarget/@rdf:resource'),
        xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record/oai:metadata/rdf:RDF/oa:Annotation')))
    self.assertEquals(set(['NIOD_BBWO2:niod:3366459', 'geluidVanNl:geluid_van_nederland:47954146', 'NIOD_BBWO2:niod:3441263', 'limburgs_erfgoed:oai:le:RooyNet:37']), set(d.keys()))

    # This record contains no location information to even construct an ErfGeo search API query from,
    # so its annotation has no body and no source, only an explanatory description.
    annotation = d['NIOD_BBWO2:niod:3441263']
    self.assertEquals(None, xpathFirst(annotation, 'oa:hasBody'))
    self.assertEquals('No ErfGeo search API query could be constructed from target record', xpathFirst(annotation, 'dcterms:description/text()'))
    self.assertEquals(None, xpathFirst(annotation, 'dcterms:source/@rdf:resource'))

    # This record does get an enrichment: check identity, target and the resolved place.
    annotation = d['NIOD_BBWO2:niod:3366459']
    self.assertEquals('http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=', xpathFirst(annotation, '@rdf:about'))
    # NOTE(review): the next statement looks damaged (masked text, unbalanced parentheses);
    # presumably it compared an expected URL with xpathFirst(annotation, 'dcterms:source/@rdf:resource').
    # TODO: restore from version control.
    self.assertEquals('http://*****:*****@rdf:resource'))
    self.assertEquals('NIOD_BBWO2:niod:3366459', xpathFirst(annotation, 'oa:hasTarget/@rdf:resource'))
    annotationBody = xpathFirst(annotation, 'oa:hasBody/rdf:Description')
    placeInTime = xpathFirst(annotationBody, 'dcterms:spatial/hg:PlaceInTime')
    self.assertEquals('http://erfgeo.nl/hg/geonames/2747032', xpathFirst(placeInTime, '@rdf:about'))
    self.assertEquals('Soestdijk', xpathFirst(placeInTime, 'rdfs:label/text()'))
    geometryWKT = xpathFirst(placeInTime, 'geos:hasGeometry/rdf:Description/geos:asWKT/text()')
    self.assertEquals('POINT(5.28472 52.19083)', geometryWKT)
def assertSruQuery(self, expectedHits, query, path=None, additionalHeaders=None):
    """Run an SRU query and assert on the hits it returns.

    expectedHits is either an int (compared with srw:numberOfRecords) or a
    collection of target uris (compared with the set of oa:hasTarget resources
    in the response, and its length with srw:numberOfRecords).

    Raises RuntimeError, carrying the serialized diagnostic, when the response
    contains a diag:diagnostic element.
    """
    response = self._doQuery(query=query, path=path or '/sru', additionalHeaders=additionalHeaders)

    diagnostic = xpathFirst(response, "//diag:diagnostic")
    if diagnostic is not None:
        raise RuntimeError(lxmltostring(diagnostic))

    foundTargets = set(xpath(response, "/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF/oa:Annotation/oa:hasTarget/@rdf:resource"))
    numberOfRecords = int(xpath(response, "/srw:searchRetrieveResponse/srw:numberOfRecords/text()")[0])

    # A plain int only states how many hits are expected.
    if type(expectedHits) is int:
        self.assertEquals(expectedHits, numberOfRecords)
        return

    # Otherwise the exact targets are expected, and their number must match too.
    self.assertEquals(expectedHits, foundTargets)
    self.assertEquals(len(expectedHits), numberOfRecords)
def _combine(self, erfgeoEnrichment, summary):
    """Yield one rdf:RDF document holding the children of both inputs.

    Both arguments are parsed as XML; the child elements of their rdf:RDF
    roots are serialized one by one, summary first and erfgeoEnrichment
    second, between a shared opening and closing rdf:RDF tag.
    """
    yield '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
    for source in (summary, erfgeoEnrichment):
        for element in xpath(XML(source), '/rdf:RDF/*'):
            yield lxmltostring(element)
    yield '</rdf:RDF>'
def testSruRecordData(self):
    """Check the rdf:RDF record data returned for the query 'dc:subject=Zeeoorlog'.

    The record data must hold two children (the summary and the erfGeoEnrichment
    annotation), both about the same target record.
    """
    response = self._doQuery('dc:subject=Zeeoorlog')
    rdf = xpathFirst(response, '/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF')

    childCount = len(rdf.getchildren())
    self.assertEquals(2, childCount)

    coverage = xpathFirst(rdf, 'oa:Annotation/oa:hasBody/rdf:Description/dc:coverage/text()')
    self.assertEquals('Verenigde Staten', coverage)

    placesInTime = xpath(rdf, 'oa:Annotation/oa:hasBody/rdf:Description/dcterms:spatial/hg:PlaceInTime')
    self.assertEquals(1, len(placesInTime))

    target = xpathFirst(rdf, 'oa:Annotation/oa:hasTarget/@rdf:resource')
    self.assertEquals('NIOD_BBWO2:niod:3366459', target)

    # Summary annotation first, enrichment annotation second.
    expectedAnnotationUris = [
        'http://data.digitalecollectie.nl/annotation/summary#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
        'http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
    ]
    self.assertEquals(expectedAnnotationUris, xpath(rdf, 'oa:Annotation/@rdf:about'))
def _geoCoordinatesPresent(self, summary):
    """Return coordinates already present in a summary, or None when there are none.

    Candidate nodes are examined in this order: the annotation body itself,
    every top-level resource in `summary` that a dcterms:spatial/@rdf:resource
    points to, and every element nested directly inside a dcterms:spatial.

    For the first candidate that qualifies, returns either the tuple
    (geo:lat text, geo:long text), or the first item produced by
    Geometry.parseWkt(...).pointCoordinates() for its geos:asWKT value
    (presumably also a coordinate pair -- confirm against Geometry).
    """
    annotationBody = xpathFirst(summary, 'oa:Annotation/oa:hasBody/*')
    nodes = [annotationBody]
    for uri in xpath(annotationBody, 'dcterms:spatial/@rdf:resource'):
        # The uri has to be interpolated into the expression; without it the
        # lookup matched the literal text "%s" and referenced resources were
        # never found.
        node = xpathFirst(summary, '*[@rdf:about="%s"]' % uri)
        if node is not None:
            nodes.append(node)
    nodes.extend(xpath(annotationBody, 'dcterms:spatial/*'))

    for node in nodes:
        geoLat = xpathFirst(node, 'geo:lat/text()')
        geoLong = xpathFirst(node, 'geo:long/text()')
        if geoLat and geoLong:
            return (geoLat, geoLong)
        asWkt = xpathFirst(node, 'geos:hasGeometry/*/geos:asWKT/text()')
        if asWkt is not None:
            # next() builtin instead of the Python 2 only .next() method.
            return next(Geometry.parseWkt(asWkt).pointCoordinates())
    return None
def main(): for set in ['rijksmuseum']: #'NIOD', 'zeeuwse_bibliotheek', 'limburgs_erfgoed', 'geluidVanNl']: print 'set', set from sys import stdout; stdout.flush() setValues = defaultdict(int) for i, item in enumerate(iterateOaiPmh(baseurl="http://data.digitalecollectie.nl/oai", metadataPrefix='summary', set=set)): annotationBody = xpathFirst(item.metadata, 'oa:Annotation/oa:hasBody/*') coverageValues = xpath(annotationBody, 'dc:coverage/text()') + xpath(annotationBody, 'dcterms:spatial/text()') for value in coverageValues: print '[%s %s]: %s' % (set, i, value) from sys import stdout; stdout.flush() setValues[value] += 1 print 'set', set print 'number of different values', len(setValues) print 'highest counts:' print sorted(setValues.items(), key=lambda (k, v): v, reverse=True)[:20] from sys import stdout; stdout.flush() with open("/home/natag/digitalecollectie_%s_coverage_values.json" % set, "w") as f: dump(setValues, f, indent=4, item_sort_key=lambda (k, v): -v if type(v) == int else k)
def createUploadHelix(oaiJazz, storage, erfGeoEnrichmentFromSummary):
    """Build the nested component tuple that handles uploaded records.

    The returned tuple has a Transparent() root with two branches:

    - messages filtered to 'delete': the identifier is unprefixed, the
      'summary' part goes to storage, and SummaryToErfGeoEnrichment is wired
      to oaiJazz and storage;
    - messages filtered to 'add': the identifier is unprefixed, 'setSpecs' is
      supplied from the record header, rdf:RDF is selected from the record
      metadata, and that node is (a) printed and stored as 'summary' and
      (b) handed to SummaryToErfGeoEnrichment, which is wired to
      erfGeoEnrichmentFromSummary, to storage and to oaiJazz (with the
      'erfGeoEnrichment' and 'erfGeoEnrichment+summary' prefixes).

    Components are instantiated in the same left-to-right order as they
    appear in the tree.
    """
    def setSpecsFromRecord(lxmlNode, **kwargs):
        return set(xpath(lxmlNode, "oai:header/oai:setSpec/text()"))

    def storageLink():
        # Each use gets its own AddDeleteToMultiSequential in front of storage.
        return (AddDeleteToMultiSequential(), (storage,))

    root = Transparent()

    deleteBranch = (FilterMessages(allowed=['delete']),
        (UnprefixIdentifier(prefix=DC_OAI_PMH_ID_PREFIX),
            (RewritePartname(partname='summary'),
                storageLink(),
            ),
            (SummaryToErfGeoEnrichment(),
                (oaiJazz,),
                storageLink(),
            ),
        ),
    )

    addBranch = (FilterMessages(allowed=['add']),
        (UnprefixIdentifier(prefix=DC_OAI_PMH_ID_PREFIX),
            (CallStackDict({'setSpecs': setSpecsFromRecord}),
                (XmlXPath(['/oai:record/oai:metadata/rdf:RDF'], namespaces=namespaces, fromKwarg='lxmlNode'),
                    (RewritePartname(partname='summary'),
                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                            storageLink(),
                        ),
                    ),
                    (SummaryToErfGeoEnrichment(),
                        (erfGeoEnrichmentFromSummary,),
                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                            storageLink(),
                        ),
                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(
                                metadataPrefixes=['erfGeoEnrichment', 'erfGeoEnrichment+summary']),
                            (AdoptOaiSetSpecs(),
                                (oaiJazz,),
                            ),
                        ),
                    ),
                ),
            ),
        ),
    )

    return (root, deleteBranch, addBranch)
def _sruResponseToJson(self, arguments, sruResponseLxml, sruRequest):
    """Turn a parsed SRU response into the JSON search result.

    The result always carries 'request' (the '/search?' url for `arguments`)
    and 'sruRequest'. When the response holds diagnostics, only 'errors' is
    added. Otherwise 'total' and 'items' are added, plus 'facets' when
    drilldown navigators are present and 'nextPage' when the response states a
    next record position.

    Returns whatever self._resultAsJson makes of the result dict.
    """
    def searchUrl(queryArguments):
        return '/search?' + urlencode(queryArguments, doseq=True)

    result = {'request': searchUrl(arguments), 'sruRequest': sruRequest}

    diagnostics = xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:diagnostics/diag:diagnostic')
    if diagnostics:
        result['errors'] = [
            {
                'message': xpathFirst(diagnostic, 'diag:message/text()'),
                'details': xpathFirst(diagnostic, 'diag:details/text()'),
            }
            for diagnostic in diagnostics
        ]
        return self._resultAsJson(result)

    total = int(xpathFirst(sruResponseLxml, '/srw:searchRetrieveResponse/srw:numberOfRecords/text()'))
    result['total'] = total
    records = xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF')
    result['items'] = [summaryWithEnrichmentToJsonLd(record) for record in records]

    facets = {}
    navigators = xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:extraResponseData/drilldown:drilldown/drilldown:term-drilldown/drilldown:navigator')
    for navigator in navigators:
        fieldName = xpathFirst(navigator, '@name')
        entries = []
        for term in xpath(navigator, 'drilldown:item'):
            termValue = xpathFirst(term, 'text()')
            termCount = int(xpathFirst(term, '@count'))
            entry = {'value': termValue, 'count': termCount}
            # A term matching every hit would not narrow the result, so it gets no link.
            if termCount != total:
                narrowedQuery = arguments['query'][0] + ' AND %s exact "%s"' % (fieldName, termValue)
                entry['href'] = searchUrl(dict(arguments, query=narrowedQuery))
            entries.append(entry)
        facets[fieldName] = entries
    if facets:
        result['facets'] = facets

    nextRecordPosition = xpathFirst(sruResponseLxml, '/srw:searchRetrieveResponse/srw:nextRecordPosition/text()')
    if nextRecordPosition:
        result['nextPage'] = searchUrl(dict(arguments, startRecord=nextRecordPosition))

    return self._resultAsJson(result)
def queryFromSummary(self, summary):
    """Derive a query from the place-related values of a summary.

    dcterms:spatial values are used when there are any: Dutch-language literals
    if present, otherwise all literals, extended with the skos:prefLabel of
    every resource that a dcterms:spatial refers to. Only when that yields
    nothing is dc:coverage consulted (again Dutch-language first).

    Returns the result of self._queryFromCoverageValues for the chosen values.
    """
    annotationBody = xpathFirst(summary, 'oa:Annotation/oa:hasBody/*')

    def strippedTexts(path):
        # Non-empty, whitespace-stripped text results for path on the annotation body.
        stripped = (text.strip() for text in xpath(annotationBody, path))
        return [text for text in stripped if text]

    spatialValues = strippedTexts('dcterms:spatial[@xml:lang="nl"]/text()')
    if not spatialValues:
        spatialValues = strippedTexts('dcterms:spatial/text()')

    for uri in xpath(annotationBody, 'dcterms:spatial/@rdf:resource'):
        for label in xpath(summary, '*[@rdf:about="%s"]/skos:prefLabel/text()' % uri):
            label = label.strip()
            if not label or label in spatialValues:
                continue
            spatialValues.append(label)

    if spatialValues:
        # prefer the more specific relation; helps to ignore uses of dc:coverage for time related values
        return self._queryFromCoverageValues(spatialValues)

    coverageValues = strippedTexts('dc:coverage[@xml:lang="nl"]/text()') or strippedTexts('dc:coverage/text()')
    return self._queryFromCoverageValues(coverageValues)
def testOaiSets(self):
    """Check the set specs reported by OAI-PMH GetRecord and ListSets."""
    setSpecPath = '//oai:setSpec/text()'

    # A single enrichment record belongs to the set of its collection only.
    getRecordArguments = {
        'verb': 'GetRecord',
        'identifier': 'http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
        'metadataPrefix': 'erfGeoEnrichment',
    }
    header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', getRecordArguments)
    self.assertEquals(set(['NIOD']), set(xpath(body, setSpecPath)))

    # ListSets reports every set known to the service.
    header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', {'verb': 'ListSets'})
    expectedSets = set(['NIOD', 'limburgs_erfgoed', 'geluidVanNl'])
    self.assertEquals(expectedSets, set(xpath(body, setSpecPath)))
def summaryWithEnrichmentToJsonLd(rdf):
    """Convert an rdf:RDF element with annotations into a JSON-LD style dict.

    The result has '@context' (prefix -> namespace, plus {"@type": "@id"}
    entries for relations in RESOURCE_RELATIONS), '@id' (the first
    oa:hasTarget resource) and one key per relation curie found in the
    rdf:Description bodies of the annotations. Relation values are lists,
    except for relations in SINGLE_LITERAL_VALUE_RELATIONS, which hold a
    single literal.
    """
    # Resource uris that have already been expanded inline; a uri is expanded
    # at most once, later references keep the plain uri string.
    urisResolved = set()

    def processResourceElement(d, element):
        # Fill d from a resource (node) element: '@id' from rdf:about, '@type'
        # from the element name, and the relations from its child elements.
        uri = xpathFirst(element, '@rdf:about')
        if uri:
            d['@id'] = uri
        elementCurie = tagToCurie(element.tag)
        # rdf:Description carries no type information of its own.
        if elementCurie != 'rdf:Description' and not elementCurie in TYPES_TO_IGNORE:
            d['@type'] = elementCurie
        # tag=Element presumably restricts iteration to real elements
        # (skipping comments and processing instructions) -- confirm which Element is imported.
        for child in element.iterchildren(tag=Element):
            processRelationElement(d, child)

    def processRelationElement(d, element):
        # Add the value(s) of one relation (property) element to d and register
        # the relation's prefix in the shared context.
        try:
            elementCurie = tagToCurie(element.tag)
        except KeyError:
            # tagToCurie raised KeyError for this tag; the relation is skipped.
            return
        if elementCurie == 'prov:wasDerivedFrom':
            return
        objects = []
        # 1. Literal text content.
        value = element.text
        if value and value.strip():
            value = value.strip()
            if elementCurie in DECIMAL_VALUE_RELATIONS:
                value = Decimal(value)
            if elementCurie in SINGLE_LITERAL_VALUE_RELATIONS:
                # Stored directly (not in a list); a later occurrence overwrites it.
                # Note that this path does not touch the context.
                d[elementCurie] = value
                return
            objects.append(value)
        # 2. Reference to another resource through rdf:resource.
        uri = xpathFirst(element, '@rdf:resource')
        if uri:
            if elementCurie == 'rdf:type':
                # rdf:type becomes '@type' instead of a regular relation.
                typeCurie = uriToCurie(uri)
                if not typeCurie in TYPES_TO_IGNORE:
                    d['@type'] = typeCurie
                return
            value = uri
            if not uri in urisResolved:
                # Marked as resolved before recursing, so cyclic references terminate.
                urisResolved.add(uri)
                descriptionElement = xpathFirst(rdf, '//*[@rdf:about="%s"]' % uri)
                if not descriptionElement is None:
                    value = {}
                    processResourceElement(value, descriptionElement)
            objects.append(value)
        # 3. Resources nested inside the relation element.
        for child in element.iterchildren(tag=Element):
            resourceDict = {}
            processResourceElement(resourceDict, child)
            objects.append(resourceDict)
        if objects:
            d.setdefault(elementCurie, []).extend(objects)
            prefix, _, _ = elementCurie.partition(':')
            context[prefix] = namespaces[prefix]
            if elementCurie in RESOURCE_RELATIONS:
                context[elementCurie] = {"@type": "@id"}

    # Defined after the helpers but before their first call; they update it as a closure variable.
    context = {'oa': namespaces.oa}
    d = {
        '@context': context,
        '@id': xpathFirst(rdf, 'oa:Annotation/oa:hasTarget/@rdf:resource'),
    }
    # The relations of all annotation bodies are merged into the one result dict.
    for annotationBody in xpath(rdf, 'oa:Annotation/oa:hasBody/rdf:Description'):
        for element in annotationBody.iterchildren(tag=Element):
            processRelationElement(d, element)
    return d