def testErrorReportedToGustos(self):
    """Harvest a deliberately truncated OAI-PMH response and assert that
    exactly one error is reported to Gustos for this repository/group.

    BUGFIX: ``my_group_log`` is now initialized to ``None`` before the search
    loop — previously, if no matching log entry existed (empty ``last_logs``),
    the test died with ``NameError`` instead of a clean assertion failure.
    Also joins the harvester thread (runOnce=True, so it terminates).
    """
    baseUrl = join(self.integrationTempdir, "choppy_oai.xml")
    filename = "{}?verb=ListRecords&metadataPrefix=oai_dc".format(baseUrl)
    # Write a truncated (invalid) ListRecords response; parsing it must fail.
    with open(filename, "w") as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?> <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2017-10-31T15:12:52Z</responseDate><request from="2017-10-04T11:52:57Z" metadataPrefix="didl_mods" verb="ListRecords">https://surfsharekit.nl/oai/hhs/</request><ListRecords><record><header><identifier>oai:surfsharekit.nl:b6ea6503-e2fc-4974-8941-2a4a405dc72f</identifier><datestamp>2017-10-04T14:16:22Z</datestamp></header><metadata><didl:DIDL xmlns:didl="urn:mpeg:mpeg21:2002:02-DIDL-NS" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS">urn:nbn:nl:hs:18-b6ea6503-e2fc-4974-8941-2a4a405dc72f</dii:Identifier> </didl:Statement> </didl:Descrip""")
    errorCount = len(self.gustosUdpListener.log())
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP,
                        baseUrl="file://{}".format(baseUrl))
    t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=True))
    t.start()
    try:
        sleepWheel(5)
        last_logs = [
            JsonDict.loads(l)['data']
            for l in self.gustosUdpListener.log()[errorCount:]
        ]
        # Find the most recent entry that mentions our repository group.
        my_group_log = None
        for data in reversed(last_logs):
            my_group_log = data.get(f'Harvester ({DOMAIN})', {}).get(
                f'{REPOSITORYGROUP}:{REPOSITORY}')
            if my_group_log is not None:
                break
        self.assertIsNotNone(my_group_log, last_logs)
        self.assertEqual({"count": 1}, my_group_log['errors'])
    finally:
        t.join()
def testDontHarvestDeletedRepository(self):
    # Integration test: after removing repository 'xyz', the harvester log
    # must show no additional '[xyz]' activity.
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    t = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    # Wait for the harvester to produce its first output, then a bit longer
    # so '[xyz]' lines have a chance to appear.
    while not listdir(self.dumpDir):
        sleep(0.1)
    sleepWheel(1)

    def _readFile(name):
        # Read a whole file; 'with' guarantees the handle is closed.
        with open(name) as fp:
            return fp.read()

    log = _readFile(stdoutfile)
    xyzOccurrences = log.count('[xyz]')
    self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    # NOTE(review): the log is re-read immediately after removal with no
    # extra wait (a sibling variant of this test sleeps 5s here) —
    # presumably removeRepository takes effect synchronously; confirm,
    # otherwise this assertion can pass trivially.
    log = _readFile(stdoutfile)
    try:
        newXyzOccurrences = log.count('[xyz]')
        self.assertEqual(
            xyzOccurrences, newXyzOccurrences,
            "%s!=%s\n%s" % (xyzOccurrences, newXyzOccurrences, log))
    finally:
        t.join()
def testGcWithoutWait(self):
    # Exercise SequentialStorage.gc(): after deleting a third of the records
    # and garbage-collecting, the on-disk size must eventually shrink below
    # the post-gc starting size. (Python 2 code: xrange.)
    directory = join(self.tempdir, 'store')
    for x in xrange(3):  # run the full cycle three times — presumably to
                         # exercise repeated create/gc on the same path
        try:
            s = SequentialStorage(directory)
            for i in xrange(99999):
                s.add('identifier%s' % i, 'data%s' % i)
            s.commit()
            size = s.getSizeOnDisk()
            self.assertTrue(size > 1000, size)
            for i in xrange(0, 99999, 3):  # delete some
                s.delete('identifier%s' % i)
            s.commit()
            newSize = s.getSizeOnDisk()
            # Deletes do not shrink the store by themselves.
            self.assertTrue(newSize >= size, (newSize, size))
            s.gc()
            newSizeAfterGcStart = s.getSizeOnDisk()
            self.assertTrue(newSizeAfterGcStart >= newSize,
                            (newSizeAfterGcStart, newSize))  # grows a little initially
            s.commit()
            newSize = s.getSizeOnDisk()
            self.assertTrue(newSize >= newSizeAfterGcStart,
                            (newSize, newSizeAfterGcStart))
            # After giving gc a second to complete, the size must drop.
            sleepWheel(1)
            s.commit()
            newSize = s.getSizeOnDisk()
            self.assertTrue(newSize < newSizeAfterGcStart,
                            (newSize, newSizeAfterGcStart))
        finally:
            s.close()
            rmtree(directory)
def testNearRealtimeOaiSavesState(self):
    """After a stop/start cycle the harvester must resume from saved state:
    only the record added in between ("id1") is harvested, not "id0".

    BUGFIX: ``start``/``stop`` declared ``global oaiPmhThread, harvestThread``,
    which silently bypassed the local variables initialized below and created
    module-level names instead; ``nonlocal`` binds the enclosing function's
    variables as intended (this is Python 3 code: f-less but uses b"" and the
    modern OaiJazz API elsewhere in the block).
    """
    observer = CallTrace("observer",
                         ignoredAttributes=["observer_init"],
                         # 'add' must return a generator (coroutine protocol)
                         methods={'add': lambda **kwargs: (x for x in [])})
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 1)
    oaiPmhThread = None
    harvestThread = None

    def start():
        nonlocal oaiPmhThread, harvestThread
        self.run = True
        portNumber = randint(50000, 60000)
        oaiPmhThread = Thread(
            None, lambda: self.startOaiPmh(
                portNumber, oaiJazz, storageComponent, suspendRegister))
        harvestThread = Thread(
            None, lambda: self.startOaiHarvester(portNumber, observer))
        oaiPmhThread.start()
        harvestThread.start()

    def stop():
        nonlocal oaiPmhThread, harvestThread
        self.run = False
        oaiPmhThread.join()
        oaiPmhThread = None
        harvestThread.join()
        harvestThread = None

    start()
    requests = 1
    sleepWheel(1.0 + 1.0 * requests)  # ~1s per expected request, plus slack
    self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                     [m.name for m in observer.calledMethods])
    kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
    self.assertTrue("id0" in kwarg, kwarg)
    stop()
    observer.calledMethods.reset()
    storageComponent.addData(identifier="id1", name="prefix", data=b"<a>a1</a>")
    oaiJazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])
    start()
    requests = 1
    sleepWheel(1.0 + 1.0 * requests)
    self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                     [m.name for m in observer.calledMethods])
    kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
    # State was saved: id0 must not be re-harvested, only the new id1.
    self.assertFalse("id0" in kwarg, kwarg)
    self.assertTrue("id1" in kwarg, kwarg)
    stop()
def testBigAndRandomlyOverwrittenStore(self):
    # Manual disk-space benchmark (Python 2): fill a SequentialStorage with N
    # records, then overwrite random identifiers M times, logging the store
    # directory size to diskspace.log for offline inspection.
    # NOTE(review): writes to a hard-coded path under /data — confirm this is
    # intentionally a developer-run test, not part of the automated suite.
    def makeId(i):
        return "http://example.org/identifier/%s" % i
    N = 100000
    M = N * 4  # number of random overwrites
    directory = '/data/test/diskspacetest'
    if isdir(directory):
        rmtree(directory)
    storeDir = join(directory, 'store')
    makedirs(storeDir)
    c = SequentialStorage(storeDir)
    print 'size???', getSimpleDirSize(storeDir)
    sys.stdout.flush()
    with open(join(directory, 'diskspace.log'), 'w') as f:
        # Phase 1: initial fill of N records.
        for i in xrange(N):
            identifier = makeId(i)
            data = RECORD % i
            c.add(identifier=identifier, data=data)
            if i % 1000 == 0:  # sample size every 1000 records
                f.write("%s, %s\n" % (i, getSimpleDirSize(storeDir)))
                print i
                sys.stdout.flush()
        c.commit()
        print 'committed'
        # Phase 2: M random overwrites of existing identifiers.
        for j in xrange(M):
            i = randint(1, N)
            identifier = makeId(i)
            data = RECORD % j
            c.add(identifier=identifier, data=data)
            if j % 1000 == 0:
                f.write("%s, %s\n" % (i, getSimpleDirSize(storeDir)))
                print j, i
                sleepWheel(1.0)
            if j % 10000 == 0:
                t = time()
                c.commit()
                print 'commit took %s' % (time() - t)
                sys.stdout.flush()
        sleepWheel(2.0)
        t = time()
        c.close()
        print 'close took %s' % (time() - t)
        # Final size sample after close.
        f.write("%s, %s\n" % (i, getSimpleDirSize(storeDir)))
def testContinuousHarvest(self):
    """A repository saved with continuous=1 keeps polling: first a plain
    ListRecords request, then a resumptionToken follow-up, then an
    incremental harvest carrying a 'from' argument."""
    previousLogCount = len(self.getLogs())
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)

    def runHarvester():
        self.startHarvester(concurrency=1, runOnce=False, repository=REPOSITORY)

    harvesterThread = Thread(target=runHarvester)
    harvesterThread.start()
    try:
        sleepWheel(5)
        newLogs = self.getLogs()[previousLogCount:]
        self.assertTrue(len(newLogs) > 2, newLogs)
        expectedFirstRequest = {
            'path': '/oai',
            'arguments': {'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']},
        }
        self.assertEqual(expectedFirstRequest, newLogs[0])
        self.assertTrue('resumptionToken' in newLogs[1]['arguments'], newLogs[1])
        self.assertTrue('from' in newLogs[2]['arguments'], newLogs[2])
    finally:
        harvesterThread.join()
def _createDatabase(self):
    # Populate the integration-test database by running testdata/upload.py
    # against our HTTP port; in fastMode the previously created data is
    # reused and this step is skipped entirely. (Python 2 code.)
    if self.fastMode:
        print "Reusing database in", self.integrationTempdir
        return
    start = time()
    print "Creating database in", self.integrationTempdir
    try:
        self._runExecutable(join(self.testdataDir, 'upload.py'),
                            processName='IntegrationUpload',
                            cwd=self.testdataDir,
                            port=self.httpPort,
                            redirect=False,
                            timeoutInSeconds=20)
        sleepWheel(5)  # give the services time to process the upload
        print "Finished creating database in %s seconds" % (time() - start)
    except Exception:
        print 'Error received while creating database for', self.stateName
        print_exc()
        exit(1)  # abort the whole run; the integration state is unusable
def _createDatabase(self):
    # Populate the integration-test database via testdata/upload.py, then
    # configure the Lucene server to commit after every update (commitCount=1)
    # so subsequent tests see their writes. Skipped in fastMode. (Python 2.)
    if self.fastMode:
        print "Reusing database in", self.integrationTempdir
        return
    start = time()
    print "Creating database in", self.integrationTempdir
    try:
        self._runExecutable(join(self.testdataDir, 'upload.py'),
                            processName='IntegrationUpload',
                            cwd=self.testdataDir,
                            port=self.httpPort,
                            redirect=False,
                            timeoutInSeconds=60)
        sleepWheel(5)  # give the services time to process the upload
        postRequest(self.luceneServerPort, "/default/settings/",
                    data=JsonDict(commitCount=1).dumps(),
                    parse=False)
        print "Finished creating database in %s seconds" % (time() - start)
    except Exception:
        print 'Error received while creating database for', self.stateName
        print_exc()
        exit(1)  # abort the whole run; the integration state is unusable
def testNearRealtimeOaiSavesState(self):
    # After stopping and restarting the OAI-PMH server + harvester, only the
    # record added in between ("id1") is harvested again: state was saved.
    # (Python 2 era: assertEquals, str data, old addOaiRecord signature.)
    observer = CallTrace("observer",
        ignoredAttributes=["observer_init"],
        # 'add' must return a generator (coroutine protocol)
        methods={'add': lambda **kwargs: (x for x in [])})
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 1)
    oaiPmhThread = None
    harvestThread = None

    def start():
        # NOTE(review): 'global' creates module-level names here; the local
        # oaiPmhThread/harvestThread initialized above are never actually
        # used. It works because start() always runs before stop(), but
        # 'nonlocal' (Python 3) would be the correct binding.
        global oaiPmhThread, harvestThread
        self.run = True
        portNumber = randint(50000, 60000)
        oaiPmhThread = Thread(None, lambda: self.startOaiPmh(
            portNumber, oaiJazz, storageComponent, suspendRegister))
        harvestThread = Thread(None, lambda: self.startOaiHarvester(
            portNumber, observer))
        oaiPmhThread.start()
        harvestThread.start()

    def stop():
        global oaiPmhThread, harvestThread
        self.run = False
        oaiPmhThread.join()
        oaiPmhThread = None
        harvestThread.join()
        harvestThread = None

    start()
    requests = 1
    sleepWheel(1.0 + 1.0 * requests)  # ~1s per expected request, plus slack
    self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch'],
                      [m.name for m in observer.calledMethods])
    kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
    self.assertTrue("id0" in kwarg, kwarg)
    stop()
    observer.calledMethods.reset()
    storageComponent.addData(identifier="id1", name="prefix", data="<a>a1</a>")
    oaiJazz.addOaiRecord(identifier="id1", sets=[],
                         metadataFormats=[("prefix", "", "")])
    start()
    requests = 1
    sleepWheel(1.0 + 1.0 * requests)
    self.assertEquals(['startOaiBatch', 'add', 'stopOaiBatch'],
                      [m.name for m in observer.calledMethods])
    kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
    # State was saved: id0 must not be re-harvested, only the new id1.
    self.assertFalse("id0" in kwarg, kwarg)
    self.assertTrue("id1" in kwarg, kwarg)
    stop()
def testStartHarvestingAddedRepository(self):
    """A repository added while the harvester is already running must be
    picked up: 'xyz' appears in the harvester's log within a few seconds.

    BUGFIX: the log file is now read via ``with open(...)`` instead of the
    leaking ``open(stdoutfile).read()``.
    """
    t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    # Wait until the harvester has produced its first output.
    while not listdir(self.dumpDir):
        sleep(0.1)
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    sleepWheel(5)
    with open(stdoutfile) as fp:
        log = fp.read()
    try:
        self.assertTrue('xyz' in log, log)
    finally:
        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        t.join()
def testErrorReportedToGustos(self):
    """Harvesting a deliberately truncated OAI-PMH response must add exactly
    one entry to the Gustos UDP listener's log."""
    baseUrl = join(self.integrationTempdir, "choppy_oai.xml")
    choppyResponseFile = "{}?verb=ListRecords&metadataPrefix=oai_dc".format(baseUrl)
    # A truncated (invalid XML) ListRecords response; parsing it must fail.
    brokenOaiResponse = """<?xml version="1.0" encoding="UTF-8"?> <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2017-10-31T15:12:52Z</responseDate><request from="2017-10-04T11:52:57Z" metadataPrefix="didl_mods" verb="ListRecords">https://surfsharekit.nl/oai/hhs/</request><ListRecords><record><header><identifier>oai:surfsharekit.nl:b6ea6503-e2fc-4974-8941-2a4a405dc72f</identifier><datestamp>2017-10-04T14:16:22Z</datestamp></header><metadata><didl:DIDL xmlns:didl="urn:mpeg:mpeg21:2002:02-DIDL-NS" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <dii:Identifier xmlns:dii="urn:mpeg:mpeg21:2002:01-DII-NS">urn:nbn:nl:hs:18-b6ea6503-e2fc-4974-8941-2a4a405dc72f</dii:Identifier> </didl:Statement> </didl:Descrip"""
    with open(choppyResponseFile, "w") as sink:
        sink.write(brokenOaiResponse)
    logLengthBefore = len(self.gustosUdpListener.log())
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP,
                        baseUrl="file://{}".format(baseUrl))
    harvesterThread = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=True))
    harvesterThread.start()
    sleepWheel(4)
    self.assertEqual(logLengthBefore + 1, len(self.gustosUdpListener.log()))
def _createDatabase(self): if self.fastMode: print "Reusing database in", self.integrationTempdir return start = time() print "Creating database in", self.integrationTempdir try: for f in listdir(self.testdataDir): postRequest(self.gatewayPort, '/update', data=open(join(self.testdataDir, f)).read(), parse=False) sleepWheel(2) print "Finished creating database in %s seconds" % (time() - start) except Exception: print 'Error received while creating database for', self.stateName print_exc() sleep(1) exit(1)
def testSleepWheelNoCallback(self):
    """sleepWheel without a callback sleeps roughly the requested duration
    (spinner output suppressed) and returns False."""
    started = time()
    with stdout_replaced():
        result = sleepWheel(0.01, interval=0.001)
    elapsed = time() - started
    # Upper bound is scaled by the environment factors T_ADJUSTMENT and T.
    upperBound = max(0.02, (0.02 * T_ADJUSTMENT * T))
    self.assertTrue(0.01 < elapsed < upperBound, elapsed)
    self.assertEqual(False, result)
def testSleepWheelNoCallback(self):
    """sleepWheel without a callback sleeps about the requested duration
    (spinner output suppressed) and returns False.

    FIX: replaced the deprecated ``assertEquals`` alias with ``assertEqual``.
    """
    t0 = time()
    with stdout_replaced():
        retval = sleepWheel(0.01, interval=0.001)
    t1 = time()
    delta = t1 - t0
    # Upper bound is scaled by the environment factors T_ADJUSTMENT and T.
    self.assertTrue(0.01 < delta < max(0.02, (0.02 * T_ADJUSTMENT * T)), delta)
    self.assertEqual(False, retval)
def testStartHarvestingAddedRepository(self):
    """Adding repository 'xyz' while the harvester runs must result in it
    being harvested ('xyz' shows up in the harvester log).

    BUGFIX: read the log with ``with open(...)`` so the file handle is
    closed (was the leaking ``open(stdoutfile).read()``).
    """
    t = Thread(
        target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    # Wait until the harvester has produced its first output.
    while not listdir(self.dumpDir):
        sleep(0.1)
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    sleepWheel(5)
    with open(stdoutfile) as fp:
        log = fp.read()
    try:
        self.assertTrue('xyz' in log, log)
    finally:
        self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
        t.join()
def testDontHarvestDeletedRepository(self):
    """After removing repository 'xyz', the harvester must stop touching it:
    no new '[xyz]' lines and no Traceback in the log.

    FIXES: log reads now use ``with open(...)`` (handles were leaked via
    ``open(...).read()``), and the deprecated ``assertEquals`` alias is
    replaced with ``assertEqual``.
    """
    stdoutfile = join(self.integrationTempdir,
                      "stdouterr-meresco-harvester-harvester.log")
    self.saveRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    t = Thread(target=lambda: self.startHarvester(concurrency=1, runOnce=False))
    t.start()
    # Wait for first harvester output, then a bit longer so '[xyz]' lines
    # have a chance to appear.
    while not listdir(self.dumpDir):
        sleep(0.1)
    sleepWheel(1)
    with open(stdoutfile) as fp:
        log = fp.read()
    xyzOccurrences = log.count('[xyz]')
    self.removeRepository(DOMAIN, 'xyz', REPOSITORYGROUP)
    sleepWheel(5)  # give the harvester time to notice the removal
    with open(stdoutfile) as fp:
        log = fp.read()
    try:
        self.assertFalse('Traceback' in log, log)
        newXyzOccurrences = log.count('[xyz]')
        self.assertEqual(xyzOccurrences, newXyzOccurrences,
                         "%s!=%s\n%s" % (xyzOccurrences, newXyzOccurrences, log))
    finally:
        t.join()
def _createDatabase(self): if self.fastMode: print "Reusing database in", self.integrationTempdir return start = time() print "Creating database in", self.integrationTempdir try: for f in sorted(glob(self.testdataDir + '/*.updateRequest')): # for f in listdir(self.testdataDir): print "Uploading file:", f postRequest(self.gatewayPort, '/update', data=open(join(self.testdataDir, f)).read(), parse=False) sleepWheel(2) print "Finished creating database in %s seconds" % (time() - start) except Exception: print 'Error received while creating database for', self.stateName print_exc() sleep(1) exit(1)
def testNearRealtimeOai(self):
    # Harvest 3 preloaded records; the harvester then suspends (near-realtime
    # long poll). Adding a 4th record must resume the suspend and deliver id3
    # within about a second, after which the harvester suspends again.
    self.run = True
    portNumber = randint(50000, 60000)
    suspendRegister = SuspendRegister()
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 3)
    oaiPmhThread = Thread(
        None, lambda: self.startOaiPmh(portNumber, oaiJazz, storageComponent,
                                       suspendRegister))
    observer = CallTrace("observer",
                         ignoredAttributes=["observer_init"],
                         # 'add' must return a generator (coroutine protocol)
                         methods={'add': lambda **kwargs: (x for x in [])})
    harvestThread = Thread(
        None, lambda: self.startOaiHarvester(portNumber, observer))
    oaiPmhThread.start()
    harvestThread.start()
    try:
        requests = 3
        sleepWheel(1.0 + 1.0 * requests)  # ~1s per expected request, plus slack
        # Records arrive over two batches: id0+id1, then id2.
        self.assertEqual([
            'startOaiBatch', 'add', 'add', 'stopOaiBatch', 'startOaiBatch',
            'add', 'stopOaiBatch'
        ], [m.name for m in observer.calledMethods])
        ids = [
            xpath(m.kwargs['lxmlNode'],
                  '//oai:header/oai:identifier/text()')
            for m in observer.calledMethods if m.name == 'add'
        ]
        self.assertEqual([['id0'], ['id1'], ['id2']], ids)
        # Everything harvested: the harvester is suspended, awaiting updates.
        self.assertEqual(1, len(suspendRegister))
        observer.calledMethods.reset()
        requests += 1
        storageComponent.addData(identifier="id3", name="prefix",
                                 data=b"<a>a3</a>")
        oaiJazz.addOaiRecord(identifier="id3", metadataPrefixes=["prefix"])
        sleepWheel(1)
        # Adding a record resumed the suspend; id3 arrived within a second.
        self.assertEqual(0, len(suspendRegister))
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                         [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id3" in kwarg, kwarg)
        sleepWheel(1.0)
        # Up to date again: the harvester suspends once more.
        self.assertEqual(1, len(suspendRegister))
    finally:
        self.run = False
        oaiPmhThread.join()
        harvestThread.join()
        oaiJazz.close()
def testContinuousHarvest(self):
    """With continuous=1 the harvester keeps requesting: a plain ListRecords,
    a resumptionToken follow-up, then an incremental harvest with 'from'."""
    knownLogs = self.getLogs()
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, continuous=1)
    worker = Thread(target=lambda: self.startHarvester(
        concurrency=1, runOnce=False, repository=REPOSITORY))
    worker.start()
    try:
        sleepWheel(5)
        recentLogs = self.getLogs()[len(knownLogs):]
        self.assertTrue(len(recentLogs) > 2, recentLogs)
        firstRequest = recentLogs[0]
        self.assertEqual({
            'path': '/oai',
            'arguments': {
                'verb': ['ListRecords'],
                'metadataPrefix': ['oai_dc'],
            },
        }, firstRequest)
        self.assertTrue('resumptionToken' in recentLogs[1]['arguments'],
                        recentLogs[1])
        self.assertTrue('from' in recentLogs[2]['arguments'], recentLogs[2])
    finally:
        worker.join()
def testSleepWheelCallbackFalsy(self):
    """A falsy-returning callback never aborts sleepWheel: the spinner is
    drawn, the callback fires once per interval, and False is returned."""
    invocations = []
    t0 = time()
    with stdout_replaced() as out:
        result = sleepWheel(0.01, interval=0.001,
                            callback=lambda: invocations.append(True))
    t1 = time()
    self.assertEqual(
        '\\\x08|\x08/\x08-\x08\\\x08|\x08/\x08-\x08\\\x08|\x08',
        out.getvalue())
    elapsed = t1 - t0
    # Upper bound is scaled by the environment factors T_ADJUSTMENT and T.
    self.assertTrue(0.01 < elapsed < max(0.02, (0.02 * T_ADJUSTMENT * T)),
                    elapsed)
    self.assertEqual(10, len(invocations))
    self.assertEqual(False, result)
def testSleepWheelCallbackFalsy(self):
    """A falsy-returning callback does not abort sleepWheel: the spinner is
    printed, the callback runs once per interval, and False is returned.

    FIX: replaced the deprecated ``assertEquals`` alias with ``assertEqual``
    (three occurrences).
    """
    calls = []

    def callback():
        calls.append(True)

    t0 = time()
    with stdout_replaced() as out:
        retval = sleepWheel(0.01, interval=0.001, callback=callback)
    t1 = time()
    self.assertEqual(
        '\\\x08|\x08/\x08-\x08\\\x08|\x08/\x08-\x08\\\x08|\x08',
        out.getvalue())
    delta = t1 - t0
    # Upper bound is scaled by the environment factors T_ADJUSTMENT and T.
    self.assertTrue(0.01 < delta < max(0.02, (0.02 * T_ADJUSTMENT * T)), delta)
    self.assertEqual(10, len(calls))
    self.assertEqual(False, retval)
def testNearRealtimeOai(self):
    """Harvest 3 preloaded records; the harvester then suspends. Adding a
    4th record must resume the suspend and deliver id3 within a second,
    after which the harvester suspends again.

    FIX: replaced the deprecated ``assertEquals`` alias with ``assertEqual``
    (six occurrences). The older OaiJazz API calls (``sets=``/
    ``metadataFormats=``) are preserved as-is.
    """
    self.run = True
    portNumber = randint(50000, 60000)
    suspendRegister = SuspendRegister()
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 3)
    oaiPmhThread = Thread(None, lambda: self.startOaiPmh(
        portNumber, oaiJazz, storageComponent, suspendRegister))
    observer = CallTrace("observer",
                         ignoredAttributes=["observer_init"],
                         # 'add' must return a generator (coroutine protocol)
                         methods={'add': lambda **kwargs: (x for x in [])})
    harvestThread = Thread(None, lambda: self.startOaiHarvester(
        portNumber, observer))
    oaiPmhThread.start()
    harvestThread.start()
    try:
        requests = 3
        sleepWheel(1.0 + 1.0 * requests)  # ~1s per expected request, plus slack
        # Records arrive over two batches: id0+id1, then id2.
        self.assertEqual(['startOaiBatch', 'add', 'add', 'stopOaiBatch',
                          'startOaiBatch', 'add', 'stopOaiBatch'],
                         [m.name for m in observer.calledMethods])
        ids = [xpath(m.kwargs['lxmlNode'],
                     '//oai:header/oai:identifier/text()')
               for m in observer.calledMethods if m.name == 'add']
        self.assertEqual([['id0'], ['id1'], ['id2']], ids)
        # Everything harvested: the harvester is suspended, awaiting updates.
        self.assertEqual(1, len(suspendRegister))
        observer.calledMethods.reset()
        requests += 1
        storageComponent.addData(identifier="id3", name="prefix",
                                 data="<a>a3</a>")
        oaiJazz.addOaiRecord(identifier="id3", sets=[],
                             metadataFormats=[("prefix", "", "")])
        sleepWheel(1)
        # Adding a record resumed the suspend; id3 arrived within a second.
        self.assertEqual(0, len(suspendRegister))
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                         [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id3" in kwarg, kwarg)
        sleepWheel(1.0)
        # Up to date again: the harvester suspends once more.
        self.assertEqual(1, len(suspendRegister))
    finally:
        self.run = False
        oaiPmhThread.join()
        harvestThread.join()
        oaiJazz.close()
def _createDatabase(self):
    # No upload performed here — presumably the running services build the
    # database themselves; this hook only waits for them. Skipped in
    # fastMode, which reuses the previously created data. (Python 2.)
    if self.fastMode:
        print "Reusing database in", self.integrationTempdir
        return
    print "Creating database in", self.integrationTempdir
    sleepWheel(28)  # give ErfGeoEnrichment service etc. some time to process and commit