示例#1
0
    def testOaiListRecords(self):
        """Check the OAI-PMH ListRecords response for metadataPrefix 'erfGeoEnrichment'.

        Asserts the number of records, the set of annotation target URIs, and
        the content of two individual annotations: one for which no ErfGeo
        query could be constructed and one that carries a resolved place.
        """
        header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', {'verb': 'ListRecords', 'metadataPrefix': 'erfGeoEnrichment'}, parse=False)
        bodyLxml = XML(body)
        self.assertEquals(4, len(xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record')))
        # Index every oa:Annotation by its oa:hasTarget URI. The two XPath
        # results are paired positionally, so this presumes exactly one
        # oa:hasTarget per annotation -- TODO confirm.
        d = dict(zip(
            xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record/oai:metadata/rdf:RDF/oa:Annotation/oa:hasTarget/@rdf:resource'),
            xpath(bodyLxml, '/oai:OAI-PMH/oai:ListRecords/oai:record/oai:metadata/rdf:RDF/oa:Annotation')))
        self.assertEquals(set(['NIOD_BBWO2:niod:3366459', 'geluidVanNl:geluid_van_nederland:47954146', 'NIOD_BBWO2:niod:3441263', 'limburgs_erfgoed:oai:le:RooyNet:37']), set(d.keys()))

        # contains no location information to even construct a ErfGeo search API query from
        annotation = d['NIOD_BBWO2:niod:3441263']
        self.assertEquals(None, xpathFirst(annotation, 'oa:hasBody'))
        self.assertEquals('No ErfGeo search API query could be constructed from target record', xpathFirst(annotation, 'dcterms:description/text()'))
        self.assertEquals(None, xpathFirst(annotation, 'dcterms:source/@rdf:resource'))

        # Record for which a place was resolved: check identifier, target and
        # the hg:PlaceInTime (URI, label, WKT geometry) inside the body.
        annotation = d['NIOD_BBWO2:niod:3366459']
        self.assertEquals('http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=', xpathFirst(annotation, '@rdf:about'))
        # NOTE(review): the next line looks truncated/redacted (unbalanced
        # parenthesis, masked URL); restore the original assertion from
        # version control before running this test.
        self.assertEquals('http://*****:*****@rdf:resource'))
        self.assertEquals('NIOD_BBWO2:niod:3366459', xpathFirst(annotation, 'oa:hasTarget/@rdf:resource'))
        annotationBody = xpathFirst(annotation, 'oa:hasBody/rdf:Description')
        placeInTime = xpathFirst(annotationBody, 'dcterms:spatial/hg:PlaceInTime')
        self.assertEquals('http://erfgeo.nl/hg/geonames/2747032', xpathFirst(placeInTime, '@rdf:about'))
        self.assertEquals('Soestdijk', xpathFirst(placeInTime, 'rdfs:label/text()'))
        geometryWKT = xpathFirst(placeInTime, 'geos:hasGeometry/rdf:Description/geos:asWKT/text()')
        self.assertEquals('POINT(5.28472 52.19083)', geometryWKT)
示例#2
0
 def assertSruQuery(self, expectedHits, query, path=None, additionalHeaders=None):
     """Run an SRU query and assert on the hits it returns.

     expectedHits is either an int (only numberOfRecords is compared) or a
     collection of target URIs (compared against the set of oa:hasTarget
     URIs in the response, and its length against numberOfRecords).
     path defaults to '/sru'. Raises RuntimeError with the serialized
     diagnostic when the response contains a diag:diagnostic.
     """
     response = self._doQuery(query=query, path=path or '/sru', additionalHeaders=additionalHeaders)

     problem = xpathFirst(response, "//diag:diagnostic")
     if problem is not None:
         raise RuntimeError(lxmltostring(problem))

     foundTargets = set(xpath(response, "/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF/oa:Annotation/oa:hasTarget/@rdf:resource"))
     numberOfRecords = int(xpath(response, "/srw:searchRetrieveResponse/srw:numberOfRecords/text()")[0])

     if type(expectedHits) is int:
         # only the total matters when a plain count was given
         self.assertEquals(expectedHits, numberOfRecords)
         return
     self.assertEquals(expectedHits, foundTargets)
     self.assertEquals(len(expectedHits), numberOfRecords)
 def _combine(self, erfgeoEnrichment, summary):
     """Yield the pieces of one rdf:RDF document merging both inputs.

     Both arguments are serialized rdf:RDF documents. The generator yields
     an opening rdf:RDF tag, then every top-level child of summary followed
     by every top-level child of erfgeoEnrichment (each serialized with
     lxmltostring), and finally the closing tag.
     """
     yield '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
     # summary first, enrichment second: the output order follows this tuple
     for serialized in (summary, erfgeoEnrichment):
         parsed = XML(serialized)
         topLevelNodes = xpath(parsed, '/rdf:RDF/*')
         for topLevelNode in topLevelNodes:
             yield lxmltostring(topLevelNode)
     yield '</rdf:RDF>'
示例#4
0
 def testSruRecordData(self):
     """The recordData of an SRU hit holds two annotations in one rdf:RDF.

     Queries for dc:subject=Zeeoorlog and checks the coverage literal, the
     single hg:PlaceInTime, the target URI and the rdf:about identifiers of
     the annotations found in the first record's rdf:RDF element.
     """
     sruResponse = self._doQuery('dc:subject=Zeeoorlog')
     combinedRdf = xpathFirst(sruResponse, '/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF')
     self.assertEquals(2, len(combinedRdf.getchildren()))

     coverage = xpathFirst(combinedRdf, 'oa:Annotation/oa:hasBody/rdf:Description/dc:coverage/text()')
     self.assertEquals('Verenigde Staten', coverage)

     placesInTime = xpath(combinedRdf, 'oa:Annotation/oa:hasBody/rdf:Description/dcterms:spatial/hg:PlaceInTime')
     self.assertEquals(1, len(placesInTime))

     targetUri = xpathFirst(combinedRdf, 'oa:Annotation/oa:hasTarget/@rdf:resource')
     self.assertEquals('NIOD_BBWO2:niod:3366459', targetUri)

     # summary annotation first, erfGeoEnrichment annotation second
     expectedAnnotationUris = [
         'http://data.digitalecollectie.nl/annotation/summary#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
         'http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
     ]
     self.assertEquals(expectedAnnotationUris, xpath(combinedRdf, 'oa:Annotation/@rdf:about'))
 def _geoCoordinatesPresent(self, summary):
     """Return geo coordinates already present in the summary, or None.

     Candidate nodes are inspected in this order: the annotation body
     itself, every top-level resource of summary referenced through
     dcterms:spatial/@rdf:resource, and every element nested directly
     inside dcterms:spatial. For the first candidate that has both geo:lat
     and geo:long, the (lat, long) text values are returned as a tuple.
     Otherwise, when the candidate has a geos:asWKT geometry, the first
     item produced by Geometry.parseWkt(...).pointCoordinates() is
     returned.
     """
     annotationBody = xpathFirst(summary, 'oa:Annotation/oa:hasBody/*')
     nodes = [annotationBody]
     for uri in xpath(annotationBody, 'dcterms:spatial/@rdf:resource'):
         # The URI must be interpolated into the expression; without it the
         # literal '%s' is searched for and no referenced resource matches.
         node = xpathFirst(summary, '*[@rdf:about="%s"]' % uri)
         if node is not None:
             nodes.append(node)
     nodes.extend(xpath(annotationBody, 'dcterms:spatial/*'))
     for node in nodes:
         geoLat = xpathFirst(node, 'geo:lat/text()')
         geoLong = xpathFirst(node, 'geo:long/text()')
         if geoLat and geoLong:
             return (geoLat, geoLong)
         asWkt = xpathFirst(node, 'geos:hasGeometry/*/geos:asWKT/text()')
         if asWkt is not None:
             # builtin next() works for both Python 2 and Python 3 iterators
             return next(Geometry.parseWkt(asWkt).pointCoordinates())
     return None
def main():
    """Tally the coverage/spatial literals found in harvested summaries.

    For every OAI set in the hard-coded list, all 'summary' records are
    harvested from http://data.digitalecollectie.nl/oai. Each dc:coverage
    and dcterms:spatial text value in the annotation body is printed and
    counted. Afterwards the 20 most frequent values are printed and all
    counts are dumped as JSON to a per-set file under /home/natag.
    """
    # imported once instead of on every flush
    from sys import stdout

    # setSpec rather than 'set', so the builtin is not shadowed.
    # other sets: 'NIOD', 'zeeuwse_bibliotheek', 'limburgs_erfgoed', 'geluidVanNl'
    for setSpec in ['rijksmuseum']:
        # Single pre-formatted argument: prints the same text whether print
        # is a statement (Python 2) or a function.
        print('set %s' % setSpec)
        stdout.flush()
        setValues = defaultdict(int)
        records = iterateOaiPmh(baseurl="http://data.digitalecollectie.nl/oai", metadataPrefix='summary', set=setSpec)
        for i, item in enumerate(records):
            annotationBody = xpathFirst(item.metadata, 'oa:Annotation/oa:hasBody/*')
            coverageValues = xpath(annotationBody, 'dc:coverage/text()') + xpath(annotationBody, 'dcterms:spatial/text()')
            for value in coverageValues:
                print('[%s %s]: %s' % (setSpec, i, value))
                stdout.flush()
                setValues[value] += 1

        print('set %s' % setSpec)
        print('number of different values %s' % len(setValues))
        print('highest counts:')
        print(sorted(setValues.items(), key=lambda keyAndCount: keyAndCount[1], reverse=True)[:20])
        stdout.flush()

        # NOTE(review): output location is hard-coded to one user's home directory.
        with open("/home/natag/digitalecollectie_%s_coverage_values.json" % setSpec, "w") as f:
            # Items arrive as (key, value) pairs: integer counts sort
            # descending, anything else sorts by key.
            dump(
                setValues, f, indent=4,
                item_sort_key=lambda keyAndCount: -keyAndCount[1] if isinstance(keyAndCount[1], int) else keyAndCount[0])
示例#7
0
def createUploadHelix(oaiJazz, storage, erfGeoEnrichmentFromSummary):
    """Build the nested component tree that handles uploaded records.

    The tree has a Transparent root with two FilterMessages branches, one
    for 'delete' and one for 'add'. In both branches the identifier is
    passed through UnprefixIdentifier(prefix=DC_OAI_PMH_ID_PREFIX), after
    which there is a 'summary' sub-branch (RewritePartname) and a
    SummaryToErfGeoEnrichment sub-branch; both end in storage via
    AddDeleteToMultiSequential.

    oaiJazz, storage and erfGeoEnrichmentFromSummary are the leaf
    components placed in the tree. How messages travel through the tuples
    is determined by the framework the components belong to (presumably
    Meresco/Weightless observables -- TODO confirm).
    """
    return \
        (Transparent(),
            # --- 'delete' messages ---
            (FilterMessages(allowed=['delete']),
                (UnprefixIdentifier(prefix=DC_OAI_PMH_ID_PREFIX),
                    # summary part: forwarded to storage
                    (RewritePartname(partname='summary'),
                        (AddDeleteToMultiSequential(),
                            (storage,),
                        ),
                    ),
                    # enrichment part: forwarded to both oaiJazz and storage
                    (SummaryToErfGeoEnrichment(),
                        (oaiJazz,),
                        (AddDeleteToMultiSequential(),
                            (storage,),
                        )
                    )
                )
            ),
            # --- 'add' messages ---
            (FilterMessages(allowed=['add']),
                (UnprefixIdentifier(prefix=DC_OAI_PMH_ID_PREFIX),
                    # 'setSpecs' is computed from the oai:setSpec values in
                    # the record header of the incoming lxmlNode.
                    (CallStackDict({
                        'setSpecs':
                        lambda lxmlNode, **kwargs: \
                            set(xpath(lxmlNode, "oai:header/oai:setSpec/text()"))}),
                        # continue with only the rdf:RDF metadata of the OAI record
                        (XmlXPath(['/oai:record/oai:metadata/rdf:RDF'], namespaces=namespaces, fromKwarg='lxmlNode'),
                            # summary part: serialized (lxmlNode -> data) and stored
                            (RewritePartname(partname='summary'),
                                (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                    (AddDeleteToMultiSequential(),
                                        (storage,),
                                    )
                                )
                            ),
                            # enrichment part: has erfGeoEnrichmentFromSummary,
                            # a serialize-and-store chain and an OAI
                            # registration chain as observers
                            (SummaryToErfGeoEnrichment(),
                                (erfGeoEnrichmentFromSummary,),
                                (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                    (AddDeleteToMultiSequential(),
                                        (storage,),
                                    )
                                ),
                                (OaiAddDeleteRecordWithPrefixesAndSetSpecs(
                                        metadataPrefixes=['erfGeoEnrichment', 'erfGeoEnrichment+summary']),
                                    (AdoptOaiSetSpecs(),
                                        (oaiJazz,)
                                    )
                                ),
                            )
                        )
                    )
                )
            )
        )
    def _sruResponseToJson(self, arguments, sruResponseLxml, sruRequest):
        """Convert a parsed SRU searchRetrieveResponse into the JSON result.

        arguments is the dict of request arguments; it is url-encoded with
        doseq=True and arguments['query'][0] is used as the current query.
        sruResponseLxml is the parsed SRU response and sruRequest is copied
        into the result as-is.

        When the response contains diagnostics, the result carries only
        'request', 'sruRequest' and an 'errors' list with one
        message/details entry per diagnostic. Otherwise it carries 'total',
        'items', optionally 'facets' (per navigator a list of value/count
        entries, with an 'href' to a narrowed search when the count differs
        from the total) and optionally 'nextPage'.

        Returns whatever self._resultAsJson(result) returns.
        """
        request = '/search?' + urlencode(arguments, doseq=True)
        result = dict(
            request=request,
            sruRequest=sruRequest
        )
        errors = xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:diagnostics/diag:diagnostic')
        if errors:
            # Collect every diagnostic before returning; returning from
            # inside the loop would report only the first one.
            result['errors'] = [
                {
                    'message': xpathFirst(error, 'diag:message/text()'),
                    'details': xpathFirst(error, 'diag:details/text()')
                }
                for error in errors
            ]
            return self._resultAsJson(result)

        total = int(xpathFirst(sruResponseLxml, '/srw:searchRetrieveResponse/srw:numberOfRecords/text()'))
        result['total'] = total
        result['items'] = [summaryWithEnrichmentToJsonLd(rdf) for rdf in xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/rdf:RDF')]
        facets = {}
        for navigator in xpath(sruResponseLxml, '/srw:searchRetrieveResponse/srw:extraResponseData/drilldown:drilldown/drilldown:term-drilldown/drilldown:navigator'):
            name = xpathFirst(navigator, '@name')
            facetEntries = []
            for ddItem in xpath(navigator, 'drilldown:item'):
                value = xpathFirst(ddItem, 'text()')
                count = int(xpathFirst(ddItem, '@count'))
                facetEntry = dict(value=value, count=count)
                if count != total:
                    # a facet matching every hit would not narrow the result,
                    # so only the others get a drill-down link
                    newQuery = arguments['query'][0] + ' AND %s exact "%s"' % (name, value)
                    facetEntry['href'] = '/search?' + urlencode(dict(arguments, query=newQuery), doseq=True)
                facetEntries.append(facetEntry)
            facets[name] = facetEntries
        if facets:
            result['facets'] = facets
        nextRecordPosition = xpathFirst(sruResponseLxml, '/srw:searchRetrieveResponse/srw:nextRecordPosition/text()')
        if nextRecordPosition:
            result['nextPage'] = '/search?' + urlencode(dict(arguments, startRecord=nextRecordPosition), doseq=True)
        return self._resultAsJson(result)
 def queryFromSummary(self, summary):
     """Derive a query from the place names mentioned in a summary.

     Place names are taken from dcterms:spatial literals (Dutch-tagged ones
     when present, otherwise all), extended with the skos:prefLabel of
     every resource referenced by dcterms:spatial/@rdf:resource. Only when
     that yields nothing are dc:coverage literals used, with the same
     preference for Dutch-tagged values. The collected values are handed to
     self._queryFromCoverageValues, whose result is returned.
     """
     body = xpathFirst(summary, 'oa:Annotation/oa:hasBody/*')

     def literals(path):
         # non-empty, whitespace-stripped text values selected by path
         stripped = (text.strip() for text in xpath(body, path))
         return [text for text in stripped if text]

     places = literals('dcterms:spatial[@xml:lang="nl"]/text()') or literals('dcterms:spatial/text()')
     for uri in xpath(body, 'dcterms:spatial/@rdf:resource'):
         for label in xpath(summary, '*[@rdf:about="%s"]/skos:prefLabel/text()' % uri):
             label = label.strip()
             if label and label not in places:
                 places.append(label)

     # prefer the more specific relation; helps to ignore uses of dc:coverage for time related values
     if not places:
         places = literals('dc:coverage[@xml:lang="nl"]/text()') or literals('dc:coverage/text()')
     return self._queryFromCoverageValues(places)
示例#10
0
    def testOaiSets(self):
        """setSpecs are reported for a single record and by ListSets."""
        setSpecPath = '//oai:setSpec/text()'

        getRecordArguments = {
            'verb': 'GetRecord',
            'identifier': 'http://data.digitalecollectie.nl/annotation/erfGeoEnrichment#TklPRF9CQldPMjpuaW9kOjMzNjY0NTk=',
            'metadataPrefix': 'erfGeoEnrichment',
        }
        header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', getRecordArguments)
        recordSetSpecs = set(xpath(body, setSpecPath))
        self.assertEquals(set(['NIOD']), recordSetSpecs)

        header, body = getRequest(self.erfGeoEnrichmentPort, '/oai', {'verb': 'ListSets'})
        allSetSpecs = set(xpath(body, setSpecPath))
        self.assertEquals(set(['NIOD', 'limburgs_erfgoed', 'geluidVanNl']), allSetSpecs)
def summaryWithEnrichmentToJsonLd(rdf):
    """Convert an rdf:RDF element with annotations into a JSON-LD style dict.

    The result has '@context' (prefix -> namespace mappings collected while
    converting, seeded with 'oa'), '@id' (the oa:hasTarget URI of the
    annotation) and one entry per relation found in the
    oa:Annotation/oa:hasBody/rdf:Description elements, keyed by the
    relation's CURIE.
    """
    # URIs whose description has already been looked up; a resource is
    # expanded inline at most once, later references keep the plain URI.
    urisResolved = set()

    def processResourceElement(d, element):
        """Fill d with '@id', '@type' and the relations of a resource element."""
        uri = xpathFirst(element, '@rdf:about')
        if uri:
            d['@id'] = uri
        elementCurie = tagToCurie(element.tag)
        # a plain rdf:Description carries no type information
        if elementCurie != 'rdf:Description' and not elementCurie in TYPES_TO_IGNORE:
            d['@type'] = elementCurie
        # tag=Element presumably restricts iteration to real elements
        # (skipping comments / processing instructions) -- TODO confirm
        for child in element.iterchildren(tag=Element):
            processRelationElement(d, child)

    def processRelationElement(d, element):
        """Add the objects of one relation element to d under its CURIE."""
        try:
            elementCurie = tagToCurie(element.tag)
        except KeyError:
            # tagToCurie raised KeyError for this tag: the relation is skipped
            return
        if elementCurie == 'prov:wasDerivedFrom':
            return
        objects = []
        # 1) literal text value
        value = element.text
        if value and value.strip():
            value = value.strip()
            if elementCurie in DECIMAL_VALUE_RELATIONS:
                value = Decimal(value)
            if elementCurie in SINGLE_LITERAL_VALUE_RELATIONS:
                # stored as a scalar, replacing any earlier value
                # NOTE(review): this early return skips the context
                # registration at the end of the function -- confirm the
                # prefix of such relations does not need to be in '@context'.
                d[elementCurie] = value
                return
            objects.append(value)
        # 2) reference to another resource
        uri = xpathFirst(element, '@rdf:resource')
        if uri:
            if elementCurie == 'rdf:type':
                typeCurie = uriToCurie(uri)
                if not typeCurie in TYPES_TO_IGNORE:
                    d['@type'] = typeCurie
                return
            value = uri
            if not uri in urisResolved:
                urisResolved.add(uri)
                # look anywhere in the document for a description of the URI
                descriptionElement = xpathFirst(rdf, '//*[@rdf:about="%s"]' % uri)
                if not descriptionElement is None:
                    value = {}
                    processResourceElement(value, descriptionElement)
            objects.append(value)
        # 3) resources nested directly inside the relation element
        for child in element.iterchildren(tag=Element):
            resourceDict = {}
            processResourceElement(resourceDict, child)
            objects.append(resourceDict)
        if objects:
            d.setdefault(elementCurie, []).extend(objects)
            # register the namespace of the relation's prefix in the context
            prefix, _, _ = elementCurie.partition(':')
            context[prefix] = namespaces[prefix]
            if elementCurie in RESOURCE_RELATIONS:
                context[elementCurie] = {"@type": "@id"}

    # Defined after the closures above but before they are first called;
    # they mutate this dict, which is also the '@context' of the result.
    context = {'oa': namespaces.oa}
    d = {
        '@context': context,
        '@id': xpathFirst(rdf, 'oa:Annotation/oa:hasTarget/@rdf:resource'),
    }
    # relations of all annotation bodies end up in the same result dict
    for annotationBody in xpath(rdf, 'oa:Annotation/oa:hasBody/rdf:Description'):
        for element in annotationBody.iterchildren(tag=Element):
            processRelationElement(d, element)
    return d