示例#1
0
 def testSetIncrementalHarvestSchedule(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), incrementalHarvestSchedule=None)
     oaiDownloadProcessor._time = lambda: 10
     oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
     self.assertEquals(0, oaiDownloadProcessor._earliestNextRequestTime)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
     self.assertEquals(13, oaiDownloadProcessor._earliestNextRequestTime)
示例#2
0
    def testSignalHarvestingDone(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path='/p', metadataPrefix='p', workingDirectory=self.tempdir, incrementalHarvestSchedule=None)
        oaiDownloadProcessor.addObserver(observer)

        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
        self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'], observer.calledMethodNames())
示例#3
0
 def testBuildRequestNoneWhenNoResumptionToken(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO())
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
     self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
     self.assertEquals(None, oaiDownloadProcessor.buildRequest())
示例#4
0
 def testYieldSuspendFromAdd(self):
     observer = CallTrace()
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False)
     oaiDownloadProcessor.addObserver(observer)
     suspend = Suspend()
     observer.returnValues['add'] = (x for x in [suspend])
     yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
     self.assertEquals([suspend, None], yields)
示例#5
0
 def testRestartAfterFinish(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), restartAfterFinish=True)
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
     self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
     request = oaiDownloadProcessor.buildRequest()
     self.assertTrue(request.startswith('GET /oai?verb=ListRecords&metadataPrefix=oai_dc HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: '), request)
示例#6
0
 def testIncrementalHarvestScheduleNone(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), incrementalHarvestSchedule=None)
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
     self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
     self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
     self.assertEquals(None, oaiDownloadProcessor._earliestNextRequestTime)
示例#7
0
 def testRaiseErrorOnBadResponse(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
     badRecord = '<record>No Header</record>'
     try:
         list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % badRecord)))))
         self.fail()
     except IndexError:
         pass
示例#8
0
 def testUseResumptionToken(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
     self.assertEquals('x?y&z', oaiDownloadProcessor._resumptionToken)
     self.assertEquals('GET /oai?verb=ListRecords&resumptionToken=x%%3Fy%%26z&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
     self.assertEquals('x?y&z', oaiDownloadProcessor._resumptionToken)
示例#9
0
 def testIncrementalHarvestWithFromWithDefaultScheduleMidnight(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO())
     oaiDownloadProcessor._time = oaiDownloadProcessor._incrementalHarvestSchedule._time = lambda: 01 * 60 * 60
     oaiDownloadProcessor._incrementalHarvestSchedule._utcnow = lambda: datetime.strptime("01:00", "%H:%M")
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
     self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
     self.assertEquals(24 * 60 * 60.0, oaiDownloadProcessor._earliestNextRequestTime)
示例#10
0
 def testSetInRequest(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", set="setName", workingDirectory=self.tempdir, xWait=True)
     self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&set=setName&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", set="set-_.!~*'()", workingDirectory=self.tempdir, xWait=True)
     self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&set=set-_.%%21%%7E%%2A%%27%%28%%29&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
     resumptionToken = "u|c1286437597991025|mprefix|s|f"
     open(join(self.tempdir, 'harvester.state'), 'w').write("Resumptiontoken: %s\n" % resumptionToken)
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", set="setName", workingDirectory=self.tempdir, xWait=True)
     self.assertEquals("""GET /oai?verb=ListRecords&resumptionToken=u%%7Cc1286437597991025%%7Cmprefix%%7Cs%%7Cf&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
示例#11
0
    def testResponseDateAsFrom(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO())
        oaiDownloadProcessor.addObserver(observer)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)

        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO())
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
示例#12
0
 def testIncrementalHarvestReScheduleIfNoRecordsMatch(self):
     observer = CallTrace(emptyGeneratorMethods=['add'])
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", incrementalHarvestSchedule=Schedule(period=0), workingDirectory=self.tempdir, xWait=False, err=StringIO())
     oaiDownloadProcessor.addObserver(observer)
     consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
     self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
     consume(oaiDownloadProcessor.handle(parse(StringIO(NO_RECORDS_MATCH_RESPONSE))))
     self.assertEquals(None, oaiDownloadProcessor._errorState)
     self.assertEquals('2012-06-01T19:20:30Z', oaiDownloadProcessor._from)
示例#13
0
 def testUpdateRequest(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
     oaiDownloadProcessor.setPath('/otherOai')
     oaiDownloadProcessor.setMetadataPrefix('otherPrefix')
     oaiDownloadProcessor.setSet('aSet')
     oaiDownloadProcessor.setFrom('2014')
     self.assertEquals("""GET /otherOai?verb=ListRecords&from=2014&metadataPrefix=otherPrefix&set=aSet&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
示例#14
0
    def testPersistentIdentifier(self):
        identifierFilepath = join(self.tempdir, 'harvester.identifier')
        self.assertFalse(isfile(identifierFilepath))
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
        currentIdentifier = oaiDownloadProcessor._identifier
        self.assertTrue(isfile(identifierFilepath))
        self.assertEquals(currentIdentifier, open(identifierFilepath).read())
        self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % currentIdentifier, oaiDownloadProcessor.buildRequest())

        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
        self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % currentIdentifier, oaiDownloadProcessor.buildRequest())
示例#15
0
 def testListRecordsRequestError(self):
     resumptionToken = "u|c1286437597991025|mprefix|s|f"
     open(join(self.tempdir, 'harvester.state'), 'w').write("Resumptiontoken: %s\n" % resumptionToken)
     observer = CallTrace()
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
     oaiDownloadProcessor.addObserver(observer)
     self.assertEquals('GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]), oaiDownloadProcessor._identifier), oaiDownloadProcessor.buildRequest())
     consume(oaiDownloadProcessor.handle(parse(StringIO(ERROR_RESPONSE))))
     self.assertEquals(0, len(observer.calledMethods))
     self.assertEquals("someError: Some error occurred.\n", oaiDownloadProcessor._err.getvalue())
     self.assertEquals('GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode([('verb', 'ListRecords'), ('metadataPrefix', 'oai_dc'), ('x-wait', 'True')]), oaiDownloadProcessor._identifier), oaiDownloadProcessor.buildRequest())
示例#16
0
 def testListIdentifiersHandle(self):
     observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, verb='ListIdentifiers')
     oaiDownloadProcessor.addObserver(observer)
     list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTIDENTIFIERS_RESPONSE)))))
     self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'], [m.name for m in observer.calledMethods])
     addMethod = observer.calledMethods[1]
     self.assertEquals(0, len(addMethod.args))
     self.assertEqualsWS(ONE_HEADER, lxmltostring(addMethod.kwargs['lxmlNode']))
     self.assertEquals('2011-08-22T07:34:00Z', addMethod.kwargs['datestamp'])
     self.assertEquals('oai:identifier:1', addMethod.kwargs['identifier'])
示例#17
0
    def testHandleYieldsAtLeastOnceAfterEachRecord(self):
        def add(**kwargs):
            return
            yield
        observer = CallTrace(methods={'add': add})
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False)
        oaiDownloadProcessor.addObserver(observer)
        yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
        self.assertEquals(1, len(yields))

        secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
        yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % secondRecord)))))
        self.assertEquals(2, len(yields))
示例#18
0
 def testKeepResumptionTokenOnFailingAddCall(self):
     resumptionToken = "u|c1286437597991025|mprefix|s|f"
     open(join(self.tempdir, 'harvester.state'), 'w').write("Resumptiontoken: %s\n" % resumptionToken)
     observer = CallTrace()
     observer.exceptions={'add': Exception("Could be anything")}
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
     oaiDownloadProcessor.addObserver(observer)
     self.assertEquals('GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]), oaiDownloadProcessor._identifier), oaiDownloadProcessor.buildRequest())
     self.assertRaises(Exception, lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))))
     self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
     errorOutput = oaiDownloadProcessor._err.getvalue()
     self.assertTrue(errorOutput.startswith('Traceback'), errorOutput)
     self.assertTrue('Exception: Could be anything\nWhile processing:\n<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:1' in errorOutput, errorOutput)
     self.assertEquals('GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]), oaiDownloadProcessor._identifier), oaiDownloadProcessor.buildRequest())
示例#19
0
 def testHandleWithTwoRecords(self):
     observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True)
     oaiDownloadProcessor.addObserver(observer)
     secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
     list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN))))))
     self.assertEquals(['startOaiBatch', 'add', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
     addMethod0, addMethod1 = observer.calledMethods[1:3]
     self.assertEquals(0, len(addMethod0.args))
     self.assertEqualsWS(ONE_RECORD, lxmltostring(addMethod0.kwargs['lxmlNode']))
     self.assertEquals('2011-08-22T07:34:00Z', addMethod0.kwargs['datestamp'])
     self.assertEquals('oai:identifier:1', addMethod0.kwargs['identifier'])
     self.assertEqualsWS(secondRecord, lxmltostring(addMethod1.kwargs['lxmlNode']))
     self.assertEquals('2011-08-22T07:41:00Z', addMethod1.kwargs['datestamp'])
     self.assertEquals('oai:identifier:2', addMethod1.kwargs['identifier'])
示例#20
0
    def testIncrementalHarvestScheduleNoneOverruledWithSetIncrementalHarvestSchedule(self):
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), incrementalHarvestSchedule=None)
        oaiDownloadProcessor._time = lambda: 10
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
        self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
        self.assertEquals(None, oaiDownloadProcessor._earliestNextRequestTime)

        oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
        self.assertEquals(None, oaiDownloadProcessor.buildRequest())
        self.assertEquals(None, oaiDownloadProcessor._earliestNextRequestTime)
        oaiDownloadProcessor.scheduleNextRequest()
        self.assertNotEquals(None, oaiDownloadProcessor.buildRequest())
        self.assertEquals(0, oaiDownloadProcessor._earliestNextRequestTime)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
        self.assertEquals(13, oaiDownloadProcessor._earliestNextRequestTime)
示例#21
0
    def testIncrementalHarvestScheduleSetToNone(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", incrementalHarvestSchedule=Schedule(period=0), workingDirectory=self.tempdir, xWait=False, err=StringIO())
        oaiDownloadProcessor.addObserver(observer)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
        self.assertNotEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
        self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'], observer.calledMethodNames())

        observer.calledMethods.reset()
        oaiDownloadProcessor.setFrom(from_=None)
        oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=None)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
        self.assertEquals(None, oaiDownloadProcessor._earliestNextRequestTime)
        self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'], observer.calledMethodNames())
示例#22
0
    def testShutdownPersistsStateOnAutocommit(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, autoCommit=False)
        oaiDownloadProcessor.addObserver(observer)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
        state = oaiDownloadProcessor.getState()
        self.assertFalse(isfile(join(self.tempdir, 'harvester.state')))

        oaiDownloadProcessor.handleShutdown()
        self.assertEquals({"errorState": None, 'from': '2002-06-01T19:20:30Z', "resumptionToken": state.resumptionToken}, load(open(join(self.tempdir, 'harvester.state'))))
示例#23
0
def main(reactor,
         port,
         statePath,
         gatewayPort,
         dbConfig,
         quickCommit=False,
         **ignored):

    #TODO: Implement logging.
    # normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    dbStorageComponent = ResolverStorageComponent(dbConfig)
    verbose = True

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(
            period=.1 if quickCommit else 10
        ),  # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need <=1 second! Otherwise tests will fail!
        name='resolver',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(path='/oaix',
                                       metadataPrefix=NORMALISED_DOC_NAME,
                                       workingDirectory=join(
                                           statePath, 'harvesterstate',
                                           'gateway'),
                                       userAgentAddition='ResolverServer',
                                       xWait=True,
                                       name='resolver',
                                       autoCommit=False)


    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, dbStorageComponent),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter("/"),
                    (StringServer("Resolver Server", ContentTypePlainText), )
                )
            )
        )
    )
示例#24
0
    def testHarvesterStateWithError(self):
        resumptionToken = "u|c1286437597991025|mprefix|s|f"
        open(join(self.tempdir, 'harvester.state'), 'w').write("Resumptiontoken: %s\n" % resumptionToken)
        observer = CallTrace()
        observer.exceptions={'add': Exception("Could be anything")}
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO(), name="Name")
        oaiDownloadProcessor.addObserver(observer)
        self.assertRaises(Exception, lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))))
        state = oaiDownloadProcessor.getState()
        self.assertEquals(resumptionToken, state.resumptionToken)
        self.assertEquals(None, state.from_)
        self.assertEquals("ERROR while processing 'oai:identifier:1': Could be anything", state.errorState)
        self.assertEquals("Name", state.name)

        oaiDownloadProcessor2 = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
        state2 = oaiDownloadProcessor2.getState()
        self.assertEquals(resumptionToken, state2.resumptionToken)
        self.assertEquals("ERROR while processing 'oai:identifier:1': Could be anything", state2.errorState)
示例#25
0
def main(reactor, port, statePath, gatewayPort, quickCommit=False, **ignored):

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'),
                               strategy=strategie,
                               partsRemovedOnDelete=[
                                   NL_DIDL_NORMALISED_PREFIX,
                                   NL_DIDL_COMBINED_PREFIX, 'metadata'
                               ])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat("metadata", "http://didl.loc.nl/didl.xsd",
                                 NAMESPACEMAP.didl)
    oaiJazz.updateMetadataFormat(NL_DIDL_COMBINED_PREFIX, "",
                                 NAMESPACEMAP.gmhcombined)
    oaiJazz.updateMetadataFormat(NL_DIDL_NORMALISED_PREFIX, "",
                                 NAMESPACEMAP.gmhnorm)

    normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(
            period=.1 if quickCommit else 10
        ),  # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need <=1 second! Otherwise tests will fail!
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(path='/oaix',
                                       metadataPrefix=NORMALISED_DOC_NAME,
                                       workingDirectory=join(
                                           statePath, 'harvesterstate',
                                           'gateway'),
                                       userAgentAddition='ApiServer',
                                       xWait=True,
                                       name='api',
                                       autoCommit=False)


    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter('/oai'),
                    (OaiPmh(
                            repositoryName="Gemeenschappelijke Metadata Harvester DANS-KB",
                            adminEmail="*****@*****.**",
                            externalUrl="http://oai.gharvester.dans.knaw.nl",
                            batchSize=200,
                            supportXWait=False,
                            # preciseDatestamp=False,
                            # deleteInSets=False
                        ),
                        (oaiJazz, ),
                        (RetrieveToGetDataAdapter(),
                            (storage,),
                        ),
                        (OaiBranding(
                            url="https://www.narcis.nl/images/logos/logo-knaw-house.gif", #TODO: Link to a joint-GMH icon...
                            link="https://harvester.dans.knaw.nl",
                            title="Gemeenschappelijke Metadata Harvester (GMH) van DANS en de KB"),
                        ),
                        (OaiProvenance(
                            nsMap=NAMESPACEMAP,
                            baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                            harvestDate=('meta', '//meta:harvestdate/text()'),
                            metadataNamespace=('meta', '//meta:metadataPrefix/text()'), #TODO: Kan hardcoded in harvester mapper gezet eventueel: <metadataNamespace>urn:mpeg:mpeg21:2002:01-DII-NS</metadataNamespace>?? (storage,) #metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                            identifier=('header','//oai:identifier/text()'),
                            datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (RetrieveToGetDataAdapter(),
                                (storage,),
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (LoggerRSS( title = 'GMH DANS-KB Normalisationlog Syndication', description = 'Harvester normalisation log for: ', link = 'http://rss.gharvester.dans.knaw.nl/rss', maximumRecords = 30),
                        (normLogger,
                            (storage,)
                        )
                    )
                ),
                (PathFilter('/xls'),
                    # (LogComponent("XLS-Request:"),),
                    (XlsServer(),)
                )
            )
        )
    )
示例#26
0
def writerMain(writerReactor, statePath, luceneserverPort, gatewayPort, quickCommit=False):

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=writerReactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )
    indexCommitTimeout = 30

    defaultLuceneSettings = LuceneSettings(
            commitTimeout=indexCommitTimeout,
            readonly=False,
        )

    luceneWriter = luceneAndReaderConfig(defaultLuceneSettings, http11Request, luceneserverPort)

    periodicDownload = PeriodicDownload(
        writerReactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=1 if quickCommit else 10), # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need 1 second! Otherwise tests will fail!
        name='index',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='idx-server',
        xWait=True,
        name='index',
        autoCommit=False)

    # Post commit naar Lucene(server):
    scheduledCommitPeriodicCall = be(
        (PeriodicCall(writerReactor, message='commit', name='Scheduled commit', schedule=Schedule(period=1 if quickCommit else 300), initialSchedule=Schedule(period=1)), # WST: Flushes data from memory to disk. IntegrationTests need 1 second! Otherwise tests will fail! (API).
            (AllToDo(), # broadcast message to all components, despite of what kind of message...
                # (periodicDownload,), # WST: periodicDownload does not do anything with a 'commit' message? So why send it to it???
                (LuceneCommit(host='localhost', port=luceneserverPort,), # 'commit' message results in http post to /commit/ to Lucene server:
                    # (LogComponent("PERIODIC"),#), # [PERIODIC] httprequest1_1(*(), **{'body': None, 'host': 'localhost', 'request': '/commit/', 'port': 52501, 'method': 'POST'})
                    (http11Request,),
                    # ),
                )
            )
        )
    )


    writerServer = \
    (Observable(),
        (scheduledCommitPeriodicCall,), # Stuur periodiek een 'Commit' naar de LuceneServer...
        # (DebugPrompt(reactor=writerReactor, port=readerPort-1, globals=locals()),),
        (periodicDownload, # Ga/connect (periodiek) naar de Gateway-server...
            (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)),
                (oaiDownload, # Haal OAI spulletjes van de Gateway...
                    (UpdateAdapterFromOaiDownloadProcessor(), # Maakt van een SRU update/delete bericht (lxmlNode) een relevante message: 'delete' of 'add' message.
                        # (LogComponent("SRU harvest van GATEWAY"),), #[SRU harvest van GATEWAY] add(*(), **{'partname': 'record', 'identifier': 'meresco:record:1', 'lxmlNode': '_ElementTree(<record xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><header><identifier>meresco:record:1</identifier><datestamp>2016-07-13T15:31:10Z</datestamp></header><metadata><document xmlns="http://meres
                        (FilterMessages(allowed=['add']),
                            
                            (XmlXPath(['/oai:record/oai:metadata/document:document'], fromKwarg='lxmlNode'),   
                                # (LogComponent("NormdocToFieldsList"),),                             
                                        (NormdocToFieldsList(), # Platte lijst met veldnamen en waardes...
                                            (RecordPidToAuthNid(),),
                                            # (LogComponent("NormdocToFieldsList"),), # [DcToFieldsList] add(*(), **{'fieldslist': [('dc:identifier', 'http://meresco.com?record=1'), ('dc:description', 'This is an example program about Search with Meresco'), ('dc:title', 'Example Program 1'), ('dc:creator', 'Seecr'), ('dc:publisher', 'Seecr'), ('dc:date', '2016'), ('dc:type', 'Example'), ('dc:subject', 'Search'), ('dc:language', 'en'), ('dc:rights', 'Open Source')], 'partname': 'record', 'identifier': 'meresco:record:1'})
                                            #                                           [NormdocToFieldsList] lookupNameIds(*(set(['doi:10.1002/lno.10611', 'wos:000423029300003']),), **{})
                                            (FieldsListToLuceneDocument( # Maakt addDocument messege + creeert de facet/drilldown velden waarvan de value's tot max. 256 chars getruncated worden.
                                                    fieldRegistry=luceneWriter.settings.fieldRegistry, # o.a. drilldownfields definitie
                                                    untokenizedFieldnames=untokenizedFieldnames, # untokenized fields
                                                    indexFieldFactory=DcFields, # Creeert een "__all__", veldnaam en optioneel "untokenized.veldnaam"... 
                                                    #rewriteIdentifier=(lambda idee: idee.split(':', 1)[-1]) # meresco:record:1' => 'record:1'
                                                ),
                                                # (LogComponent("FieldsListToLuceneDocument"),), # [LUCENE_WRITER] addDocument(*(), **{'fields': [{'type': 'TextField', 'name': '__all__', 'value': 'http://meresco.com?record=1'}, {'type': 'TextField', 'name': 'dc:identifier', 'value': 'http://meresco.com?record=1'}, {'type': 'StringField', 'name': 'untokenized.dc:identifier', 'value': 'http://meresco.com?record=1'}, {'type': 'TextField', 'name': '__all__', 'value': 'This is an example program about Search with Meresco'}, {'type': 'TextField', 'name': 'dc:description', 'value': 'This is an example program about Search with Meresco'}, {'type': 'TextField', 'name': '__all__', 'value': 'Example Program 1'}, {'type': 'TextField', 'name': 'dc:title', 'value': 'Example Program 1'}, {'type': 'TextField', 'name': '__all__', 'value': 'Seecr'}, {'type': 'TextField', 'name': 'dc:creator', 'value': 'Seecr'}, {'type': 'TextField', 'name': '__all__', 'value': 'Seecr'}, {'type': 'TextField', 'name': 'dc:publisher', 'value': 'Seecr'}, {'type': 'TextField', 'name': '__all__', 'value': '2016'}, {'type': 'TextField', 'name': 'dc:date', 'value': '2016'}, {'path': ['2016'], 'type': 'FacetField', 'name': 'untokenized.dc:date'}, {'type': 'TextField', 'name': '__all__', 'value': 'Example'}, {'type': 'TextField', 'name': 'dc:type', 'value': 'Example'}, {'type': 'TextField', 'name': '__all__', 'value': 'Search'}, {'type': 'TextField', 'name': 'dc:subject', 'value': 'Search'}, {'path': ['Search'], 'type': 'FacetField', 'name': 'untokenized.dc:subject'}, {'type': 'TextField', 'name': '__all__', 'value': 'en'}, {'type': 'TextField', 'name': 'dc:language', 'value': 'en'}, {'type': 'TextField', 'name': '__all__', 'value': 'Open Source'}, {'type': 'TextField', 'name': 'dc:rights', 'value': 'Open Source'}], 'identifier': 'meresco:record:1'})
                                                    # [####LUCENE_WRITER] addDocument(*(), **{'fields': [{'type': 'TextField', 'name': '__all__', 'value': 'knaw'}, {'type': 'TextField', 'name': 'meta:id', 'value': 'knaw'}, {'type': 'TextField', 'name': '__all__', 'value': 'olddata'}, {'type': 'TextField', 'name': 'meta:set', 'value': 'olddata'}, {'type': 'TextField', 'name': '__all__', 'value': 'http://oai.knaw.nl/oai'}, {'type': 'TextField', 'name': 'meta:baseurl', 'value': 'http://oai.knaw.nl/oai'}, {'type': 'TextField', 'name': '__all__', 'value': 'knaw'}, {'type': 'TextField', 'name': 'meta:repositoryGroupId', 'value': 'knaw'}, {'type': 'TextField', 'name': '__all__', 'value': 'nl_didl'}, {'type': 'TextField', 'name': 'meta:metadataPrefix', 'value': 'nl_didl'}, {'type': 'TextField', 'name': '__all__', 'value': 'publication'}, {'type': 'TextField', 'name': 'meta_collection', 'value': 'publication'}, {'path': ['publication'], 'type': 'FacetField', 'name': 'untokenized.meta_collection'}], 'identifier': 'knaw:record:3'})
                                                (luceneWriter,),
                                                # ),
                                            )
                                        )
                                    # )
                                # )
                            )
                        ),
                        (FilterMessages(allowed=['delete']),                            
                            (luceneWriter,),
                        )
                    )
                )
            )
        )
    )
    return writerServer
示例#27
0
def main(reactor,
         port,
         statePath,
         lucenePort,
         gatewayPort,
         quickCommit=False,
         **ignored):

    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,
    )

    http11Request = be((
        HttpRequest1_1(),
        (SocketPool(reactor=reactor,
                    unusedTimeout=5,
                    limits=dict(totalSize=100, destinationSize=10)), ),
    ))

    luceneIndex = luceneAndReaderConfig(
        defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(defaultCore=DEFAULT_CORE,
                              coreConverters={
                                  DEFAULT_CORE:
                                  QueryExpressionToLuceneQueryDict(
                                      UNQUALIFIED_TERM_FIELDS,
                                      luceneSettings=luceneIndex.settings),
                              }), (
                                  MultiLucene(host='localhost',
                                              port=lucenePort,
                                              defaultCore=DEFAULT_CORE),
                                  (luceneIndex, ),
                                  (http11Request, ),
                              )))

    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {
        #         UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'),
                               strategy=strategie,
                               partsRemovedOnDelete=[
                                   HEADER_PARTNAME, META_PARTNAME,
                                   METADATA_PARTNAME, OAI_DC_PARTNAME,
                                   LONG_PARTNAME, SHORT_PARTNAME,
                                   OPENAIRE_PARTNAME
                               ])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(
        OAI_DC_PARTNAME, "http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        "http://purl.org/dc/elements/1.1/")

    oai_oa_cerifJazz = OaiJazz(join(statePath, 'oai_cerif'))
    oai_oa_cerifJazz.updateMetadataFormat(
        OPENAIRE_PARTNAME,
        "https://www.openaire.eu/schema/cris/current/openaire-cerif-profile.xsd",
        "https://www.openaire.eu/cerif-profile/1.1/")
    # All of the following OAI-PMH sets shall be recognized by the CRIS, even if not all of them are populated.
    oai_oa_cerifJazz.updateSet("openaire_cris_projects",
                               "OpenAIRE_CRIS_projects")
    oai_oa_cerifJazz.updateSet("openaire_cris_orgunits",
                               "OpenAIRE_CRIS_orgunits")
    oai_oa_cerifJazz.updateSet("openaire_cris_persons",
                               "OpenAIRE_CRIS_persons")
    oai_oa_cerifJazz.updateSet("openaire_cris_patents",
                               "OpenAIRE_CRIS_patents")
    oai_oa_cerifJazz.updateSet("openaire_cris_products",
                               "OpenAIRE_CRIS_products")
    oai_oa_cerifJazz.updateSet("openaire_cris_publications",
                               "OpenAIRE_CRIS_publications")

    oai_oa_cerifJazz.updateSet("openaire_cris_funding",
                               "OpenAIRE_CRIS_funding")
    oai_oa_cerifJazz.updateSet("openaire_cris_events", "OpenAIRE_CRIS_events")
    oai_oa_cerifJazz.updateSet("openaire_cris_equipments",
                               "OpenAIRE_CRIS_equipments")

    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite).filterAndModifier(),
    ]

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(
            period=1 if quickCommit else 10
        ),  # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need 1 second! Otherwise tests will fail!
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(path='/oaix',
                                       metadataPrefix=NORMALISED_DOC_NAME,
                                       workingDirectory=join(
                                           statePath, 'harvesterstate',
                                           'gateway'),
                                       userAgentAddition='ApiServer',
                                       xWait=True,
                                       name='api',
                                       autoCommit=False)

    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz, oai_oa_cerifJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter(["/oai"]),
                    (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**", externalUrl="http://oai.narcis.nl"),
                        (oaiJazz,),
                        (StorageAdapter(),
                            (storage,)
                        ),
                        (OaiBranding(
                            url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                            link="http://oai.narcis.nl",
                            title="Narcis - The gateway to scholarly information in The Netherlands"),
                        ),
                        (OaiProvenance(
                            nsMap=NAMESPACEMAP,
                            baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                            harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                            metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                            identifier=('header','//oai:identifier/text()'),
                            datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (storage,)
                        )
                    )
                ),
                (PathFilter(["/cerif"]),
                    (OaiPmhDans(repositoryName="OpenAIRE CERIF", adminEmail="*****@*****.**", repositoryIdentifier="services.nod.dans.knaw.nl", externalUrl="http://services.nod.dans.knaw.nl"), #TODO: pathFilter should resemble proxy path
                        (oai_oa_cerifJazz,),
                        (StorageAdapter(),
                            (storage,)
                        ),
                        (OaiOpenAIREDescription(
                            serviceid='organisation:ORG1242054',
                            acronym='services.nod.dans.knaw.nl',
                            name='NARCIS',
                            description='Compliant with the OpenAIRE Guidelines for CRIS Managers v.1.1.',
                            website='https://www.narcis.nl',
                            baseurl='http://services.nod.dans.knaw.nl/oa-cerif',
                            subjectheading='',
                            orgunitid='organisation:ORG1242054',
                            owneracronym='DANS'),
                        ),
                        # (OaiBranding(
                        #     url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                        #     link="http://oai.narcis.nl",
                        #     title="Narcis - The gateway to scholarly information in The Netherlands"),
                        # ),
                        (OaiProvenance(
                            nsMap=NAMESPACEMAP,
                            baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                            harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                            metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                            identifier=('header','//oai:identifier/text()'),
                            datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (storage,)
                        )
                    )
                ),
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False),
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (Rss(   supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            (StorageAdapter(),
                                (storage,)
                            )
                        )
                    )
                )
            )
        )
    )
示例#28
0
    def testIncrementalHarvestWithFromAfterSomePeriod(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), incrementalHarvestSchedule=Schedule(period=10))
        oaiDownloadProcessor._time = lambda: 1.0
        oaiDownloadProcessor.addObserver(observer)
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
        self.assertEquals(None, oaiDownloadProcessor._resumptionToken)
        self.assertEquals('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)

        self.assertEquals(None, oaiDownloadProcessor.buildRequest())
        oaiDownloadProcessor._time = lambda: 6.0
        self.assertEquals(None, oaiDownloadProcessor.buildRequest())
        oaiDownloadProcessor._time = lambda: 10.0
        self.assertEquals(None, oaiDownloadProcessor.buildRequest())
        oaiDownloadProcessor._time = lambda: 11.1
        request = oaiDownloadProcessor.buildRequest()
        self.assertTrue(request.startswith('GET /oai?verb=ListRecords&from=2002-06-01T19%3A20%3A30Z&metadataPrefix=oai_dc'), request)
示例#29
0
    def testHarvesterState(self):
        observer = CallTrace(emptyGeneratorMethods=['add'])
        oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
        oaiDownloadProcessor.addObserver(observer)
        state = oaiDownloadProcessor.getState()
        self.assertEquals(None, state.resumptionToken)
        self.assertEquals(None, state.from_)
        self.assertEquals(None, state.errorState)
        self.assertEquals(None, state.name)
        self.assertEquals("/oai", state.path)
        self.assertEquals("oai_dc", state.metadataPrefix)
        self.assertEquals(None, state.set)
        self.assertEquals(0, state.nextRequestTime)
        oaiDownloadProcessor.setSet('s')
        oaiDownloadProcessor.setPath('/p')
        oaiDownloadProcessor.setMetadataPrefix('pref')
        oaiDownloadProcessor.observable_setName('aName')
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
        state = oaiDownloadProcessor.getState()
        self.assertEquals("x?y&z", state.resumptionToken)
        self.assertEquals('2002-06-01T19:20:30Z', state.from_)
        self.assertEquals(None, state.errorState)
        self.assertEquals('aName', state.name)
        self.assertEquals("/p", state.path)
        self.assertEquals("pref", state.metadataPrefix)
        self.assertEquals('s', state.set)
        self.assertEquals(0, state.nextRequestTime)

        # Change state of oaiDownloadProcessor -> changes stateView.
        oaiDownloadProcessor.setSet('x')
        self.assertEquals('x', state.set)

        oaiDownloadProcessor2 = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, err=StringIO())
        state2 = oaiDownloadProcessor2.getState()
        self.assertEquals(None, state2.name)
        self.assertEquals("oai_dc", state2.metadataPrefix)
        self.assertEquals("x?y&z", state2.resumptionToken)
        self.assertEquals('2002-06-01T19:20:30Z', state2.from_)
        self.assertEquals(None, state2.errorState)
        self.assertEquals(0, state.nextRequestTime)
示例#30
0
    def testScheduleNextRequest(self):
        oaiDownloadProcessor = OaiDownloadProcessor(path='/p', metadataPrefix='p', workingDirectory=self.tempdir)
        oaiDownloadProcessor._time = lambda: 17
        consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
        self.assertTrue(oaiDownloadProcessor._earliestNextRequestTime > 17)

        oaiDownloadProcessor.scheduleNextRequest()
        self.assertEquals(0, oaiDownloadProcessor._earliestNextRequestTime)
        self.assertEquals(True, oaiDownloadProcessor._timeForNextRequest())
        self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())

        oaiDownloadProcessor.scheduleNextRequest(Schedule(period=0))
        self.assertEquals(17, oaiDownloadProcessor._earliestNextRequestTime)
        self.assertEquals(True, oaiDownloadProcessor._timeForNextRequest())
        self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())

        oaiDownloadProcessor.scheduleNextRequest(Schedule(period=120))
        self.assertEquals(137, oaiDownloadProcessor._earliestNextRequestTime)
        self.assertEquals(False, oaiDownloadProcessor._timeForNextRequest())
        self.assertEquals(None, oaiDownloadProcessor.buildRequest())
示例#31
0
 def testUpdateRequestAfterSetResumptionToken(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", set="aSet", workingDirectory=self.tempdir, xWait=False)
     oaiDownloadProcessor.setPath('/otherOai')
     oaiDownloadProcessor.setFrom('2014')
     oaiDownloadProcessor.setResumptionToken('ReSumptionToken')
     self.assertEquals("""GET /otherOai?verb=ListRecords&resumptionToken=ReSumptionToken HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
示例#32
0
 def testSetIncrementalHarvestScheduleNotAllowedInCaseOfRestartAfterFinish(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=False, err=StringIO(), restartAfterFinish=True)
     self.assertRaises(ValueError, lambda: oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3)))
示例#33
0
 def testPartitionRequest(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, partition="1/2")
     self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-partition=1%%2F2&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())
示例#34
0
 def testRequestWithAdditionalUserAgent(self):
     oaiDownloadProcessor = OaiDownloadProcessor(path="/oai", metadataPrefix="oai_dc", workingDirectory=self.tempdir, xWait=True, userAgentAddition="From a certain server")
     self.assertEquals("""GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x (From a certain server)\r\n\r\n""" % oaiDownloadProcessor._identifier, oaiDownloadProcessor.buildRequest())