def testSetIncrementalHarvestSchedule(self):
    """A schedule installed after construction is used for rescheduling.

    The processor is created with ``incrementalHarvestSchedule=None`` and
    its clock is pinned to 10. After ``setIncrementalHarvestSchedule`` with
    ``Schedule(period=3)`` the earliest next request time reads 0; once a
    ListRecords response has been handled it reads 13.
    """
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None,
    )
    # Pin the clock so the expected next request time is deterministic.
    oaiDownloadProcessor._time = lambda: 10
    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
    self.assertEqual(0, oaiDownloadProcessor._earliestNextRequestTime)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(13, oaiDownloadProcessor._earliestNextRequestTime)
def testSignalHarvestingDone(self):
    """Observers receive ``signalHarvestingDone`` after the batch calls.

    With ``incrementalHarvestSchedule=None`` the observer is expected to
    see startOaiBatch, add, stopOaiBatch and signalHarvestingDone, in that
    order, for a single handled ListRecords response.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path='/p',
        metadataPrefix='p',
        workingDirectory=self.tempdir,
        incrementalHarvestSchedule=None,
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        observer.calledMethodNames(),
    )
def testBuildRequestNoneWhenNoResumptionToken(self):
    """``buildRequest`` returns None once a response left no resumption token."""
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
def testYieldSuspendFromAdd(self):
    """A Suspend yielded by the observer's ``add`` is passed on by ``handle``.

    The composed ``handle`` generator is expected to yield the very same
    Suspend object followed by a single None.
    """
    observer = CallTrace()
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
    )
    oaiDownloadProcessor.addObserver(observer)
    suspend = Suspend()
    # 'add' answers with a generator whose only item is the Suspend.
    observer.returnValues['add'] = (x for x in [suspend])
    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
    self.assertEqual([suspend, None], yields)
def testRestartAfterFinish(self):
    """With ``restartAfterFinish=True`` a fresh ListRecords request is built.

    After a response that leaves no resumption token, ``buildRequest`` is
    expected to return a request starting with the plain
    ``verb=ListRecords&metadataPrefix=oai_dc`` query.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        restartAfterFinish=True,
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    request = oaiDownloadProcessor.buildRequest()
    self.assertTrue(
        request.startswith('GET /oai?verb=ListRecords&metadataPrefix=oai_dc HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: '),
        request,
    )
def testIncrementalHarvestScheduleNone(self):
    """Without an incremental schedule no next request time is planned.

    After handling a response, ``_from`` holds '2002-06-01T19:20:30Z' while
    both the resumption token and the earliest next request time are None.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None,
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
def testRaiseErrorOnBadResponse(self):
    """Handling a record without a header raises IndexError."""
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    badRecord = '<record>No Header</record>'
    # assertRaises replaces the manual try / self.fail() / except pattern
    # and matches how other tests in this file check for exceptions.
    self.assertRaises(
        IndexError,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % badRecord))))),
    )
def testUseResumptionToken(self):
    """A received resumption token is used in the next request and persisted.

    The token 'x?y&z' must appear URL-encoded in the built request, and a
    second processor created on the same working directory must start out
    with the same token.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    self.assertEqual('x?y&z', oaiDownloadProcessor._resumptionToken)
    self.assertEqual(
        'GET /oai?verb=ListRecords&resumptionToken=x%%3Fy%%26z&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % oaiDownloadProcessor._identifier,
        oaiDownloadProcessor.buildRequest(),
    )
    # A new instance on the same working directory picks the token up again.
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    self.assertEqual('x?y&z', oaiDownloadProcessor._resumptionToken)
def testIncrementalHarvestWithFromWithDefaultScheduleMidnight(self):
    """The default schedule plans the next request at 24 * 60 * 60.

    Both the processor's clock and the schedule's clock are pinned to
    01:00 (3600 seconds); after handling a response the earliest next
    request time is expected to be 86400.0.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    # Was written as ``01 * 60 * 60``: a leading-zero literal is a
    # SyntaxError on Python 3, while the value (1) is the same.
    oaiDownloadProcessor._time = oaiDownloadProcessor._incrementalHarvestSchedule._time = lambda: 1 * 60 * 60
    oaiDownloadProcessor._incrementalHarvestSchedule._utcnow = lambda: datetime.strptime("01:00", "%H:%M")
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual(24 * 60 * 60.0, oaiDownloadProcessor._earliestNextRequestTime)
def testSetInRequest(self):
    """The ``set`` argument ends up URL-encoded in the request.

    Three cases are checked: a plain set name, a set name with characters
    that must be percent-encoded, and a persisted resumption token, in
    which case the request carries the token and no set.
    """
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        set="setName",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    self.assertEqual(
        """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&set=setName&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier,
        oaiDownloadProcessor.buildRequest(),
    )

    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        set="set-_.!~*'()",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    self.assertEqual(
        """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&set=set-_.%%21%%7E%%2A%%27%%28%%29&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier,
        oaiDownloadProcessor.buildRequest(),
    )

    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Write the state inside a with-block so the handle is closed (and the
    # data flushed) before the next processor reads the file.
    with open(join(self.tempdir, 'harvester.state'), 'w') as stateFile:
        stateFile.write("Resumptiontoken: %s\n" % resumptionToken)
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        set="setName",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    self.assertEqual(
        """GET /oai?verb=ListRecords&resumptionToken=u%%7Cc1286437597991025%%7Cmprefix%%7Cs%%7Cf&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier,
        oaiDownloadProcessor.buildRequest(),
    )
def testResponseDateAsFrom(self):
    """``_from`` is set to '2002-06-01T19:20:30Z' after a response and persisted.

    A second processor created on the same working directory must start
    with the same ``_from`` value.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
def testIncrementalHarvestReScheduleIfNoRecordsMatch(self):
    """A NO_RECORDS_MATCH_RESPONSE is not an error and still advances ``_from``.

    After the first response ``_from`` is '2002-06-01T19:20:30Z'; after the
    no-records-match response the error state is None and ``_from`` is
    '2012-06-01T19:20:30Z'.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    consume(oaiDownloadProcessor.handle(parse(StringIO(NO_RECORDS_MATCH_RESPONSE))))
    self.assertEqual(None, oaiDownloadProcessor._errorState)
    self.assertEqual('2012-06-01T19:20:30Z', oaiDownloadProcessor._from)
def testUpdateRequest(self):
    """Values changed through the setters show up in the built request."""
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    oaiDownloadProcessor.setPath('/otherOai')
    oaiDownloadProcessor.setMetadataPrefix('otherPrefix')
    oaiDownloadProcessor.setSet('aSet')
    oaiDownloadProcessor.setFrom('2014')
    self.assertEqual(
        """GET /otherOai?verb=ListRecords&from=2014&metadataPrefix=otherPrefix&set=aSet&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % oaiDownloadProcessor._identifier,
        oaiDownloadProcessor.buildRequest(),
    )
def testPersistentIdentifier(self):
    """The client identifier is written to disk and reused by later instances.

    'harvester.identifier' must not exist beforehand, must exist after the
    first processor is created, and must contain that processor's
    ``_identifier``. A second processor on the same working directory must
    send the same identifier in its request.
    """
    identifierFilepath = join(self.tempdir, 'harvester.identifier')
    self.assertFalse(isfile(identifierFilepath))
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    currentIdentifier = oaiDownloadProcessor._identifier
    self.assertTrue(isfile(identifierFilepath))
    # Read through a with-block so the file handle is closed deterministically.
    with open(identifierFilepath) as identifierFile:
        storedIdentifier = identifierFile.read()
    self.assertEqual(currentIdentifier, storedIdentifier)
    self.assertEqual(
        """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % currentIdentifier,
        oaiDownloadProcessor.buildRequest(),
    )
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    self.assertEqual(
        """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % currentIdentifier,
        oaiDownloadProcessor.buildRequest(),
    )
def testListRecordsRequestError(self):
    """An OAI error response is reported and the resumption token dropped.

    Starting from a persisted resumption token, the first request carries
    that token. After ERROR_RESPONSE is handled no observer method has been
    called, the error text is on ``_err``, and the next request is a plain
    ListRecords request with the metadataPrefix.
    """
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Use a with-block so the state file is closed before it is read back.
    with open(join(self.tempdir, 'harvester.state'), 'w') as stateFile:
        stateFile.write("Resumptiontoken: %s\n" % resumptionToken)
    observer = CallTrace()
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    self.assertEqual(
        'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (
            urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]),
            oaiDownloadProcessor._identifier,
        ),
        oaiDownloadProcessor.buildRequest(),
    )
    consume(oaiDownloadProcessor.handle(parse(StringIO(ERROR_RESPONSE))))
    self.assertEqual(0, len(observer.calledMethods))
    self.assertEqual("someError: Some error occurred.\n", oaiDownloadProcessor._err.getvalue())
    self.assertEqual(
        'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (
            urlencode([('verb', 'ListRecords'), ('metadataPrefix', 'oai_dc'), ('x-wait', 'True')]),
            oaiDownloadProcessor._identifier,
        ),
        oaiDownloadProcessor.buildRequest(),
    )
def testListIdentifiersHandle(self):
    """With ``verb='ListIdentifiers'`` each header is passed to ``add``.

    The single ``add`` call must use keyword arguments only and carry the
    header node, its datestamp and its identifier.
    """
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        verb='ListIdentifiers',
    )
    oaiDownloadProcessor.addObserver(observer)
    list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTIDENTIFIERS_RESPONSE)))))
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        [m.name for m in observer.calledMethods],
    )
    addMethod = observer.calledMethods[1]
    self.assertEqual(0, len(addMethod.args))
    self.assertEqualsWS(ONE_HEADER, lxmltostring(addMethod.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addMethod.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addMethod.kwargs['identifier'])
def testHandleYieldsAtLeastOnceAfterEachRecord(self):
    """``handle`` yields once per record even when ``add`` yields nothing.

    One record in the response gives one yield; a response with a second
    record gives two.
    """
    def add(**kwargs):
        # The unreachable ``yield`` makes this a generator function that
        # finishes immediately without producing a value.
        return
        yield

    observer = CallTrace(methods={'add': add})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
    )
    oaiDownloadProcessor.addObserver(observer)
    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % '')))))
    self.assertEqual(1, len(yields))

    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    yields = list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % secondRecord)))))
    self.assertEqual(2, len(yields))
def testKeepResumptionTokenOnFailingAddCall(self):
    """A failing ``add`` leaves the persisted resumption token in use.

    The request built before and after the failing ``handle`` call must be
    identical (same resumption token). The observer must have seen
    startOaiBatch, add and stopOaiBatch, and ``_err`` must hold a traceback
    that names the record being processed.
    """
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Use a with-block so the state file is closed before it is read back.
    with open(join(self.tempdir, 'harvester.state'), 'w') as stateFile:
        stateFile.write("Resumptiontoken: %s\n" % resumptionToken)
    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    # Expected both before and after the failure; built once to avoid
    # repeating the same expression.
    expectedRequest = 'GET /oai?%s HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n' % (
        urlencode([('verb', 'ListRecords'), ('resumptionToken', resumptionToken), ('x-wait', 'True')]),
        oaiDownloadProcessor._identifier,
    )
    self.assertEqual(expectedRequest, oaiDownloadProcessor.buildRequest())
    self.assertRaises(
        Exception,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))),
    )
    self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], [m.name for m in observer.calledMethods])
    errorOutput = oaiDownloadProcessor._err.getvalue()
    self.assertTrue(errorOutput.startswith('Traceback'), errorOutput)
    self.assertTrue('Exception: Could be anything\nWhile processing:\n<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:1' in errorOutput, errorOutput)
    self.assertEqual(expectedRequest, oaiDownloadProcessor.buildRequest())
def testHandleWithTwoRecords(self):
    """Each record in a response results in its own ``add`` call.

    For a response with two records and a resumption token the observer
    sees startOaiBatch, add, add, stopOaiBatch; each ``add`` carries the
    record node, datestamp and identifier as keyword arguments.
    """
    observer = CallTrace(methods={'add': lambda **kwargs: (x for x in [])})
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
    )
    oaiDownloadProcessor.addObserver(observer)
    secondRecord = '<record xmlns="http://www.openarchives.org/OAI/2.0/"><header><identifier>oai:identifier:2</identifier><datestamp>2011-08-22T07:41:00Z</datestamp></header><metadata>ignored</metadata></record>'
    list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % (secondRecord + RESUMPTION_TOKEN))))))
    self.assertEqual(
        ['startOaiBatch', 'add', 'add', 'stopOaiBatch'],
        [m.name for m in observer.calledMethods],
    )
    addMethod0, addMethod1 = observer.calledMethods[1:3]

    self.assertEqual(0, len(addMethod0.args))
    self.assertEqualsWS(ONE_RECORD, lxmltostring(addMethod0.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:34:00Z', addMethod0.kwargs['datestamp'])
    self.assertEqual('oai:identifier:1', addMethod0.kwargs['identifier'])

    self.assertEqualsWS(secondRecord, lxmltostring(addMethod1.kwargs['lxmlNode']))
    self.assertEqual('2011-08-22T07:41:00Z', addMethod1.kwargs['datestamp'])
    self.assertEqual('oai:identifier:2', addMethod1.kwargs['identifier'])
def testIncrementalHarvestScheduleNoneOverruledWithSetIncrementalHarvestSchedule(self):
    """A schedule set after a finished harvest only takes effect once rescheduled.

    Sequence checked, with the clock pinned to 10:
    1. Constructed with no schedule: after a response no next request time
       is planned.
    2. After ``setIncrementalHarvestSchedule`` alone, ``buildRequest`` still
       returns None and no next request time is planned.
    3. After ``scheduleNextRequest`` a request is built and the next
       request time is 0.
    4. After another response the next request time is 13.
    """
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=None,
    )
    oaiDownloadProcessor._time = lambda: 10
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(None, oaiDownloadProcessor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)

    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
    self.assertEqual(None, oaiDownloadProcessor.buildRequest())
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)

    oaiDownloadProcessor.scheduleNextRequest()
    self.assertNotEqual(None, oaiDownloadProcessor.buildRequest())
    self.assertEqual(0, oaiDownloadProcessor._earliestNextRequestTime)

    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertEqual(13, oaiDownloadProcessor._earliestNextRequestTime)
def testIncrementalHarvestScheduleSetToNone(self):
    """Setting the schedule to None stops planning a next request.

    With ``Schedule(period=0)`` a next request time is planned after the
    first response. After ``setFrom(None)`` and
    ``setIncrementalHarvestSchedule(None)`` a second response leaves the
    next request time at None, while the observer sees the same four calls.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        incrementalHarvestSchedule=Schedule(period=0),
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertNotEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        observer.calledMethodNames(),
    )

    # Forget the recorded calls so the second round is checked on its own.
    observer.calledMethods.reset()
    oaiDownloadProcessor.setFrom(from_=None)
    oaiDownloadProcessor.setIncrementalHarvestSchedule(schedule=None)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))
    self.assertEqual('2002-06-01T19:20:30Z', oaiDownloadProcessor._from)
    self.assertEqual(None, oaiDownloadProcessor._earliestNextRequestTime)
    self.assertEqual(
        ['startOaiBatch', 'add', 'stopOaiBatch', 'signalHarvestingDone'],
        observer.calledMethodNames(),
    )
def testShutdownPersistsStateOnAutocommit(self):
    """With ``autoCommit=False`` state is only written on ``handleShutdown``.

    After handling a response no 'harvester.state' file exists yet; after
    ``handleShutdown`` the file holds the error state, ``from`` and the
    resumption token reported by ``getState``.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        autoCommit=False,
    )
    oaiDownloadProcessor.addObserver(observer)
    consume(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))
    state = oaiDownloadProcessor.getState()
    stateFilepath = join(self.tempdir, 'harvester.state')
    self.assertFalse(isfile(stateFilepath))
    oaiDownloadProcessor.handleShutdown()
    # Load inside a with-block so the file handle is not left open.
    with open(stateFilepath) as stateFile:
        persistedState = load(stateFile)
    self.assertEqual(
        {"errorState": None, 'from': '2002-06-01T19:20:30Z', "resumptionToken": state.resumptionToken},
        persistedState,
    )
def main(reactor, port, statePath, gatewayPort, dbConfig, quickCommit=False, **ignored):
    """Build the observable tree for the resolver server.

    The tree combines a download helix (periodic gateway download, OAI
    download processor and the resolver storage component) with an HTTP
    server on ``port`` that answers "/" with a plain-text string.

    :param reactor: reactor handed to the download and HTTP components.
    :param port: port for the ObservableHttpServer.
    :param statePath: base directory; harvester state goes to
        ``<statePath>/harvesterstate/gateway``.
    :param gatewayPort: port of the gateway on localhost.
    :param dbConfig: passed unchanged to ResolverStorageComponent.
    :param quickCommit: when true the download period is .1 instead of 10.
    :param ignored: extra keyword arguments are accepted and not used.
    :return: nested tuple describing the observable tree.
    """
    # TODO: Implement logging.
    # normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    dbStorageComponent = ResolverStorageComponent(dbConfig)

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        # WST: Interval in seconds before sending a new request to the GATEWAY
        # in case of an error while processing batch records (default=1).
        # IntegrationTests need <=1 second! Otherwise tests will fail!
        schedule=Schedule(period=.1 if quickCommit else 10),
        name='resolver',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ResolverServer',
        xWait=True,
        name='resolver',
        autoCommit=False)

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, dbStorageComponent),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter("/"),
                    (StringServer("Resolver Server", ContentTypePlainText), )
                )
            )
        )
    )
def testHarvesterStateWithError(self):
    """A failing ``add`` is recorded in the harvester state and persisted.

    After the failure ``getState`` keeps the original resumption token, has
    no ``from_`` and carries an error message naming the record. A second
    processor on the same working directory reports the same token and
    error state.
    """
    resumptionToken = "u|c1286437597991025|mprefix|s|f"
    # Use a with-block so the state file is closed before it is read back.
    with open(join(self.tempdir, 'harvester.state'), 'w') as stateFile:
        stateFile.write("Resumptiontoken: %s\n" % resumptionToken)
    observer = CallTrace()
    observer.exceptions = {'add': Exception("Could be anything")}
    oaiDownloadProcessor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
        name="Name",
    )
    oaiDownloadProcessor.addObserver(observer)
    self.assertRaises(
        Exception,
        lambda: list(compose(oaiDownloadProcessor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))),
    )
    state = oaiDownloadProcessor.getState()
    self.assertEqual(resumptionToken, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual("ERROR while processing 'oai:identifier:1': Could be anything", state.errorState)
    self.assertEqual("Name", state.name)

    oaiDownloadProcessor2 = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    state2 = oaiDownloadProcessor2.getState()
    self.assertEqual(resumptionToken, state2.resumptionToken)
    self.assertEqual("ERROR while processing 'oai:identifier:1': Could be anything", state2.errorState)
def main(reactor, port, statePath, gatewayPort, quickCommit=False, **ignored):
    """Build the observable tree for the API server.

    The tree combines a download helix (periodic gateway download, OAI
    download processor, storage and oaiJazz) with an HTTP server on
    ``port`` that has path filters for ``/oai`` (OaiPmh), ``/rss``
    (LoggerRSS) and ``/xls`` (XlsServer).

    Extra keyword arguments are accepted and not used.
    """
    distributeStrategy = Md5HashDistributeStrategy()
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=distributeStrategy,
        partsRemovedOnDelete=[
            NL_DIDL_NORMALISED_PREFIX,
            NL_DIDL_COMBINED_PREFIX,
            'metadata',
        ])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat("metadata", "http://didl.loc.nl/didl.xsd", NAMESPACEMAP.didl)
    oaiJazz.updateMetadataFormat(NL_DIDL_COMBINED_PREFIX, "", NAMESPACEMAP.gmhcombined)
    oaiJazz.updateMetadataFormat(NL_DIDL_NORMALISED_PREFIX, "", NAMESPACEMAP.gmhnorm)

    normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        # WST: Interval in seconds before sending a new request to the GATEWAY
        # in case of an error while processing batch records (default=1).
        # IntegrationTests need <=1 second! Otherwise tests will fail!
        schedule=Schedule(period=.1 if quickCommit else 10),
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter('/oai'),
                    (OaiPmh(
                            repositoryName="Gemeenschappelijke Metadata Harvester DANS-KB",
                            adminEmail="*****@*****.**",
                            externalUrl="http://oai.gharvester.dans.knaw.nl",
                            batchSize=200,
                            supportXWait=False,
                            # preciseDatestamp=False,
                            # deleteInSets=False
                        ),
                        (oaiJazz, ),
                        (RetrieveToGetDataAdapter(),
                            (storage,),
                        ),
                        (OaiBranding(
                            # TODO: Link to a joint-GMH icon...
                            url="https://www.narcis.nl/images/logos/logo-knaw-house.gif",
                            link="https://harvester.dans.knaw.nl",
                            title="Gemeenschappelijke Metadata Harvester (GMH) van DANS en de KB"),
                        ),
                        (OaiProvenance(
                                nsMap=NAMESPACEMAP,
                                baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                                harvestDate=('meta', '//meta:harvestdate/text()'),
                                # TODO: Could possibly be hardcoded in the harvester mapper:
                                # <metadataNamespace>urn:mpeg:mpeg21:2002:01-DII-NS</metadataNamespace>?? (storage,)
                                metadataNamespace=('meta', '//meta:metadataPrefix/text()'),
                                #metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                                identifier=('header','//oai:identifier/text()'),
                                datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (RetrieveToGetDataAdapter(),
                                (storage,),
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (LoggerRSS(
                            title = 'GMH DANS-KB Normalisationlog Syndication',
                            description = 'Harvester normalisation log for: ',
                            link = 'http://rss.gharvester.dans.knaw.nl/rss',
                            maximumRecords = 30),
                        (normLogger,
                            (storage,)
                        )
                    )
                ),
                (PathFilter('/xls'),
                    # (LogComponent("XLS-Request:"),),
                    (XlsServer(),)
                )
            )
        )
    )
def writerMain(writerReactor, statePath, luceneserverPort, gatewayPort, quickCommit=False):
    """Build the observable tree for the index writer.

    The tree has two branches under one Observable:
    - a PeriodicCall that broadcasts 'commit' to a LuceneCommit component,
      which uses the shared HTTP/1.1 request helper;
    - a PeriodicDownload from the gateway whose data is parsed, run through
      the OAI download processor and split into 'add' messages (converted
      to Lucene documents) and 'delete' messages, both ending at the
      Lucene writer.

    With ``quickCommit`` true both the download period and the commit
    period are 1 (instead of 10 and 300).
    """
    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records (default=1).
    # IntegrationTests need 1 second! Otherwise tests will fail!
    downloadPeriod = 1 if quickCommit else 10
    # WST: Flushes data from memory to disk. IntegrationTests need 1 second!
    # Otherwise tests will fail! (API).
    commitPeriod = 1 if quickCommit else 300

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(
                reactor=writerReactor,
                unusedTimeout=5,
                limits=dict(totalSize=100, destinationSize=10)),
            ),
        )
    )

    indexCommitTimeout = 30
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=indexCommitTimeout,
        readonly=False,
    )
    luceneWriter = luceneAndReaderConfig(defaultLuceneSettings, http11Request, luceneserverPort)

    periodicDownload = PeriodicDownload(
        writerReactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=downloadPeriod),
        name='index',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='idx-server',
        xWait=True,
        name='index',
        autoCommit=False)

    # Post a commit to the Lucene server:
    scheduledCommitPeriodicCall = be(
        (PeriodicCall(
                writerReactor,
                message='commit',
                name='Scheduled commit',
                schedule=Schedule(period=commitPeriod),
                initialSchedule=Schedule(period=1)),
            (AllToDo(),  # broadcast message to all components, whatever the kind of message
                # (periodicDownload,),  # WST: periodicDownload does not do anything with a 'commit' message? So why send it to it???
                (LuceneCommit(host='localhost', port=luceneserverPort,),
                    # A 'commit' message results in an HTTP POST to /commit/ on the Lucene server.
                    # (LogComponent("PERIODIC"),),  # logs e.g. httprequest1_1(body=None, host='localhost', request='/commit/', method='POST')
                    (http11Request,),
                )
            )
        )
    )

    return \
    (Observable(),
        (scheduledCommitPeriodicCall,),  # Periodically send a 'commit' to the Lucene server.
        # (DebugPrompt(reactor=writerReactor, port=readerPort-1, globals=locals()),),
        (periodicDownload,  # Periodically connect to the gateway server.
            (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)),
                (oaiDownload,  # Fetch OAI data from the gateway.
                    # Turns an SRU update/delete message (lxmlNode) into the
                    # relevant 'delete' or 'add' message.
                    (UpdateAdapterFromOaiDownloadProcessor(),
                        # (LogComponent("SRU harvest van GATEWAY"),),  # logs the add(...) call with partname, identifier and lxmlNode
                        (FilterMessages(allowed=['add']),
                            (XmlXPath(['/oai:record/oai:metadata/document:document'], fromKwarg='lxmlNode'),
                                # (LogComponent("NormdocToFieldsList"),),
                                (NormdocToFieldsList(),  # Flat list of field names and values.
                                    (RecordPidToAuthNid(),),
                                    # (LogComponent("NormdocToFieldsList"),),  # logs add(fieldslist=[(name, value), ...], partname=..., identifier=...) and lookupNameIds(...)
                                    # Creates the addDocument message and the facet/drilldown
                                    # fields, whose values are truncated to at most 256 chars.
                                    (FieldsListToLuceneDocument(
                                            fieldRegistry=luceneWriter.settings.fieldRegistry,  # among others the drilldown field definitions
                                            untokenizedFieldnames=untokenizedFieldnames,  # untokenized fields
                                            indexFieldFactory=DcFields,  # Creates an "__all__" field, the field name and optionally "untokenized.<fieldname>".
                                            #rewriteIdentifier=(lambda idee: idee.split(':', 1)[-1])  # meresco:record:1' => 'record:1'
                                        ),
                                        # (LogComponent("FieldsListToLuceneDocument"),),  # logs addDocument(fields=[{'type': ..., 'name': ..., 'value': ...}, ...], identifier=...)
                                        (luceneWriter,),
                                    )
                                )
                            )
                        ),
                        (FilterMessages(allowed=['delete']),
                            (luceneWriter,),
                        )
                    )
                )
            )
        )
    )
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    """Assemble the component tree of the API server.

    The returned nested tuple combines a periodic download from the gateway
    (localhost:gatewayPort) with an HTTP server on `port` that serves the
    '/oai', '/cerif', '/sru' and '/rss' paths.

    reactor      -- handed to SocketPool, PeriodicDownload, ObservableHttpServer
                    and createDownloadHelix.
    port         -- port for the ObservableHttpServer.
    statePath    -- base directory; 'store', 'oai', 'oai_cerif' and
                    'harvesterstate/gateway' are created below it.
    lucenePort   -- port of the Lucene server on localhost.
    gatewayPort  -- port of the gateway on localhost to download from.
    quickCommit  -- when true, the download schedule period is 1 instead of 10.
    **ignored    -- accepted and not used.
    """

    ######## START Lucene Integration ###############################################################
    luceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,
    )

    socketPool = SocketPool(
        reactor=reactor,
        unusedTimeout=5,
        limits=dict(totalSize=100, destinationSize=10))
    httpRequestHelix = be((
        HttpRequest1_1(),
        (socketPool, ),
    ))

    luceneIndex = luceneAndReaderConfig(
        luceneSettings.clone(readonly=True),
        httpRequestHelix,
        lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(
                        UNQUALIFIED_TERM_FIELDS,
                        luceneSettings=luceneIndex.settings),
                }),
            (MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex, ),
                (httpRequestHelix, ),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {
        # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        # Unknown names pass through unchanged.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Prefer the untokenized variant of the field when it is a known one.
        candidate = untokenizedFieldname(fieldname)
        chosen = candidate if candidate in untokenizedFieldnames else fieldname
        return fieldnameRewrite(chosen)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    distributeStrategy = Md5HashDistributeStrategy()
    removablePartnames = [
        HEADER_PARTNAME,
        META_PARTNAME,
        METADATA_PARTNAME,
        OAI_DC_PARTNAME,
        LONG_PARTNAME,
        SHORT_PARTNAME,
        OPENAIRE_PARTNAME,
    ]
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=distributeStrategy,
        partsRemovedOnDelete=removablePartnames)

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(
        OAI_DC_PARTNAME,
        "http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        "http://purl.org/dc/elements/1.1/")

    oai_oa_cerifJazz = OaiJazz(join(statePath, 'oai_cerif'))
    oai_oa_cerifJazz.updateMetadataFormat(
        OPENAIRE_PARTNAME,
        "https://www.openaire.eu/schema/cris/current/openaire-cerif-profile.xsd",
        "https://www.openaire.eu/cerif-profile/1.1/")
    # All of the following OAI-PMH sets shall be recognized by the CRIS,
    # even if not all of them are populated.
    crisSets = (
        ("openaire_cris_projects", "OpenAIRE_CRIS_projects"),
        ("openaire_cris_orgunits", "OpenAIRE_CRIS_orgunits"),
        ("openaire_cris_persons", "OpenAIRE_CRIS_persons"),
        ("openaire_cris_patents", "OpenAIRE_CRIS_patents"),
        ("openaire_cris_products", "OpenAIRE_CRIS_products"),
        ("openaire_cris_publications", "OpenAIRE_CRIS_publications"),
        ("openaire_cris_funding", "OpenAIRE_CRIS_funding"),
        ("openaire_cris_events", "OpenAIRE_CRIS_events"),
        ("openaire_cris_equipments", "OpenAIRE_CRIS_equipments"),
    )
    for setSpec, setName in crisSets:
        oai_oa_cerifJazz.updateSet(setSpec, setName)

    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite,
        ).filterAndModifier(),
    ]

    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records (default=1).
    # IntegrationTests need 1 second! Otherwise tests will fail!
    downloadPeriod = 1 if quickCommit else 10
    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=downloadPeriod),
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    # Shared by the SRU and the RSS branch below (same tuple, same instances).
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    def storageAdapterHelix():
        # A fresh StorageAdapter in front of the shared storage component.
        return (StorageAdapter(),
            (storage,)
        )

    def provenanceHelix():
        # A fresh OaiProvenance reading its values from the 'meta' and 'header' parts.
        return (OaiProvenance(
                nsMap=NAMESPACEMAP,
                baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                identifier=('header','//oai:identifier/text()'),
                datestamp=('header', '//oai:datestamp/text()')
            ),
            (storage,)
        )

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz, oai_oa_cerifJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter(["/oai"]),
                    (OaiPmh(
                            repositoryName="NARCIS OAI-pmh",
                            adminEmail="*****@*****.**",
                            externalUrl="http://oai.narcis.nl"),
                        (oaiJazz,),
                        storageAdapterHelix(),
                        (OaiBranding(
                            url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                            link="http://oai.narcis.nl",
                            title="Narcis - The gateway to scholarly information in The Netherlands"),
                        ),
                        provenanceHelix()
                    )
                ),
                (PathFilter(["/cerif"]),
                    (OaiPmhDans(
                            repositoryName="OpenAIRE CERIF",
                            adminEmail="*****@*****.**",
                            repositoryIdentifier="services.nod.dans.knaw.nl",
                            externalUrl="http://services.nod.dans.knaw.nl"),  # TODO: pathFilter should resemble proxy path
                        (oai_oa_cerifJazz,),
                        storageAdapterHelix(),
                        (OaiOpenAIREDescription(
                            serviceid='organisation:ORG1242054',
                            acronym='services.nod.dans.knaw.nl',
                            name='NARCIS',
                            description='Compliant with the OpenAIRE Guidelines for CRIS Managers v.1.1.',
                            website='https://www.narcis.nl',
                            baseurl='http://services.nod.dans.knaw.nl/oa-cerif',
                            subjectheading='',
                            orgunitid='organisation:ORG1242054',
                            owneracronym='DANS'),
                        ),
                        # (OaiBranding(
                        #     url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                        #     link="http://oai.narcis.nl",
                        #     title="Narcis - The gateway to scholarly information in The Netherlands"),
                        # ),
                        provenanceHelix()
                    )
                ),
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False),
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                storageAdapterHelix()
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (Rss(
                            supportedLanguages = ['nl','en'],  # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            storageAdapterHelix()
                        )
                    )
                )
            )
        )
    )
def testIncrementalHarvestWithFromAfterSomePeriod(self):
    """A finished harvest is resumed with 'from' only after the schedule period.

    The harvest finishes at t=1.0 with an incremental schedule of 10 seconds.
    buildRequest() must return None at t=1.0, 6.0 and 10.0, and a ListRecords
    request carrying the remembered 'from' date at t=11.1.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        incrementalHarvestSchedule=Schedule(period=10),
    )
    processor._time = lambda: 1.0
    processor.addObserver(observer)

    consume(processor.handle(parse(StringIO(LISTRECORDS_RESPONSE))))

    self.assertEqual(None, processor._resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', processor._from)
    self.assertEqual(None, processor.buildRequest())

    # Still too early: no request is built.
    processor._time = lambda: 6.0
    self.assertEqual(None, processor.buildRequest())
    processor._time = lambda: 10.0
    self.assertEqual(None, processor.buildRequest())

    # Past the scheduled moment: an incremental request with 'from' is built.
    processor._time = lambda: 11.1
    request = processor.buildRequest()
    self.assertTrue(
        request.startswith('GET /oai?verb=ListRecords&from=2002-06-01T19%3A20%3A30Z&metadataPrefix=oai_dc'),
        request)
def testHarvesterState(self):
    """getState() reflects the processor's settings, before and after a batch.

    Checks the initial state, the state after changing settings and handling
    a response with a resumption token, that the state object follows later
    changes to the processor, and that a second processor created on the same
    workingDirectory reports the same resumptionToken and from_.
    """
    observer = CallTrace(emptyGeneratorMethods=['add'])
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    processor.addObserver(observer)

    # Fresh processor: only the constructor arguments are visible.
    state = processor.getState()
    self.assertEqual(None, state.resumptionToken)
    self.assertEqual(None, state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual(None, state.name)
    self.assertEqual("/oai", state.path)
    self.assertEqual("oai_dc", state.metadataPrefix)
    self.assertEqual(None, state.set)
    self.assertEqual(0, state.nextRequestTime)

    processor.setSet('s')
    processor.setPath('/p')
    processor.setMetadataPrefix('pref')
    processor.observable_setName('aName')
    consume(processor.handle(parse(StringIO(LISTRECORDS_RESPONSE % RESUMPTION_TOKEN))))

    state = processor.getState()
    self.assertEqual("x?y&z", state.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state.from_)
    self.assertEqual(None, state.errorState)
    self.assertEqual('aName', state.name)
    self.assertEqual("/p", state.path)
    self.assertEqual("pref", state.metadataPrefix)
    self.assertEqual('s', state.set)
    self.assertEqual(0, state.nextRequestTime)

    # Change state of oaiDownloadProcessor -> changes stateView.
    processor.setSet('x')
    self.assertEqual('x', state.set)

    # A second processor on the same working directory.
    processor2 = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        err=StringIO(),
    )
    state2 = processor2.getState()
    self.assertEqual(None, state2.name)
    self.assertEqual("oai_dc", state2.metadataPrefix)
    self.assertEqual("x?y&z", state2.resumptionToken)
    self.assertEqual('2002-06-01T19:20:30Z', state2.from_)
    self.assertEqual(None, state2.errorState)
    # NOTE(review): this checks `state`, not `state2` -- looks like a
    # copy/paste slip; confirm whether state2.nextRequestTime was intended.
    self.assertEqual(0, state.nextRequestTime)
def testScheduleNextRequest(self):
    """scheduleNextRequest() controls when buildRequest() yields a request.

    With the clock fixed at 17: without arguments the earliest next request
    time becomes 0, with Schedule(period=0) it becomes 17 and with
    Schedule(period=120) it becomes 137, after which no request is built.
    """
    processor = OaiDownloadProcessor(path='/p', metadataPrefix='p', workingDirectory=self.tempdir)
    processor._time = lambda: 17
    consume(processor.handle(parse(StringIO(LISTRECORDS_RESPONSE % ''))))
    self.assertTrue(processor._earliestNextRequestTime > 17)

    # No schedule given: a request may be built right away.
    processor.scheduleNextRequest()
    self.assertEqual(0, processor._earliestNextRequestTime)
    self.assertEqual(True, processor._timeForNextRequest())
    self.assertNotEqual(None, processor.buildRequest())

    # Period 0: scheduled for the current time, so still allowed.
    processor.scheduleNextRequest(Schedule(period=0))
    self.assertEqual(17, processor._earliestNextRequestTime)
    self.assertEqual(True, processor._timeForNextRequest())
    self.assertNotEqual(None, processor.buildRequest())

    # Period 120: scheduled in the future, so nothing is built.
    processor.scheduleNextRequest(Schedule(period=120))
    self.assertEqual(137, processor._earliestNextRequestTime)
    self.assertEqual(False, processor._timeForNextRequest())
    self.assertEqual(None, processor.buildRequest())
def testUpdateRequestAfterSetResumptionToken(self):
    """With a resumption token set, the request carries only verb and token.

    The processor is created with a set and metadataPrefix and is given a
    'from'; the expected request contains none of these, only the changed
    path, the ListRecords verb and the resumptionToken.
    """
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        set="aSet",
        workingDirectory=self.tempdir,
        xWait=False,
    )
    processor.setPath('/otherOai')
    processor.setFrom('2014')
    processor.setResumptionToken('ReSumptionToken')

    expected = """GET /otherOai?verb=ListRecords&resumptionToken=ReSumptionToken HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % processor._identifier
    self.assertEqual(expected, processor.buildRequest())
def testSetIncrementalHarvestScheduleNotAllowedInCaseOfRestartAfterFinish(self):
    """Setting an incremental harvest schedule raises ValueError when restartAfterFinish=True."""
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=False,
        err=StringIO(),
        restartAfterFinish=True,
    )
    with self.assertRaises(ValueError):
        processor.setIncrementalHarvestSchedule(schedule=Schedule(period=3))
def testPartitionRequest(self):
    """A configured partition is sent URL-encoded as the x-partition argument."""
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        partition="1/2",
    )
    # '%%' is a literal percent sign here: the request contains 'x-partition=1%2F2'.
    expected = """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-partition=1%%2F2&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x\r\n\r\n""" % processor._identifier
    self.assertEqual(expected, processor.buildRequest())
def testRequestWithAdditionalUserAgent(self):
    """userAgentAddition is appended in parentheses to the User-Agent header."""
    processor = OaiDownloadProcessor(
        path="/oai",
        metadataPrefix="oai_dc",
        workingDirectory=self.tempdir,
        xWait=True,
        userAgentAddition="From a certain server",
    )
    expected = """GET /oai?verb=ListRecords&metadataPrefix=oai_dc&x-wait=True HTTP/1.0\r\nX-Meresco-Oai-Client-Identifier: %s\r\nUser-Agent: Meresco-Oai-DownloadProcessor/5.x (From a certain server)\r\n\r\n""" % processor._identifier
    self.assertEqual(expected, processor.buildRequest())