def testRefreshWithIgnoredRecords(self):
    """A 'refresh' action re-harvests the repository from scratch.

    Seeds the harvester state with previously uploaded (one with an unsafe
    id containing '/&'), deleted and ignored/invalid records, then checks
    that after the refresh completes the old uploads and ignoreds are
    re-emitted (as deletes) while the invalid-ids administration is cleared.
    """
    log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
    log.startRepository()
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
        # Record 02 gets an id with characters that need escaping on disk.
        if uploadId == '%s:oai:record:02' % (REPOSITORY):
            uploadId = '%s:oai:record:02/&gkn' % (REPOSITORY)
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
        log.notifyHarvestedRecord(uploadId)
        log.logInvalidData(uploadId, 'ignored message')
        log.logIgnoredIdentifierWarning(uploadId)
    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()
    totalRecords = 15
    oldUploads = 2
    oldDeletes = 3  # NOTE(review): unused; kept to document the seeded state — TODO confirm it can be removed
    oldIgnoreds = 4
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')
    self.startHarvester(repository=REPOSITORY)  # Smooth init (was: "Smoot init")
    self.assertEquals(0, self.sizeDumpDir())
    self.startHarvester(repository=REPOSITORY)  # Smooth harvest
    self.startHarvester(repository=REPOSITORY)  # Smooth harvest
    self.assertEquals(totalRecords, self.sizeDumpDir())
    self.startHarvester(repository=REPOSITORY)  # Smooth finish
    self.assertEquals(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())
    # Use context managers so the id-administration files are closed (was a handle leak).
    with open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)) as f:
        invalidIds = f.readlines()
    self.assertEquals(0, len(invalidIds), invalidIds)
    with open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)) as f:
        ids = f.readlines()
    self.assertEquals(13, len(ids), ids)
def testClearWithInvalidRecords(self):
    """A 'clear' action removes everything previously harvested.

    Seeds uploaded, deleted and ignored/invalid records, runs a clear, then
    verifies that delete notifications are emitted for the old uploads and
    invalids and that both id-administration files end up empty.
    """
    log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
    log.startRepository()
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
        log.notifyHarvestedRecord(uploadId)
        log.logInvalidData(uploadId, 'ignored message')
        log.logIgnoredIdentifierWarning(uploadId)
    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()
    oldUploads = 4
    oldDeletes = 5  # NOTE(review): unused; kept to document the seeded state — TODO confirm it can be removed
    oldInvalids = 6
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
    self.startHarvester(repository=REPOSITORY)
    self.assertEquals(oldUploads + oldInvalids, self.sizeDumpDir())
    # Use context managers so the id-administration files are closed (was a handle leak).
    with open(join(self.harvesterStateDir, DOMAIN, "%s_invalid.ids" % REPOSITORY)) as f:
        invalidIds = f.readlines()
    self.assertEquals(0, len(invalidIds), invalidIds)
    with open(join(self.harvesterStateDir, DOMAIN, "%s.ids" % REPOSITORY)) as f:
        ids = f.readlines()
    self.assertEquals(0, len(ids), ids)
def testHasWorkBeforeAndAfterDoingWork(self):
    """Before a completed run hasWork() is True; after a run that finished
    without a resumption token it is False."""
    log = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertTrue(log.hasWork())
    log.startRepository()
    harvestTime = strftime("%Y-%m-%dT%H:%M:%SZ", log._state._gmtime())
    log.endRepository(None, harvestTime)
    log.close()
    reopened = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertFalse(reopened.hasWork())
def testLoggingAlwaysStartsNewline(self):
    """Tests an old situation that when a log was interrupted, it continued on the same line."""
    # Simulate an interrupted stats line (no trailing newline) left by a crash.
    with open(self.stateDir + '/name.stats', 'w') as f:
        f.write('Started: 2005-01-02 16:12:56, Harvested/Uploaded/Total: 199/200/1650, Don"crack"')
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()
    logger.close()
    # Use a context manager so the stats file is closed (was a handle leak).
    with open(self.stateDir + '/name.stats') as f:
        lines = f.readlines()
    # The new run must start on its own line, giving two lines in total.
    self.assertEqual(2, len(lines))
def testHasWorkWithResumptionTokenContinuous(self):
    """With a continuous interval there is work both before any run and
    after a run that ended with a resumption token."""
    log = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertTrue(log.hasWork(continuousInterval=60))
    log.startRepository()
    now = strftime("%Y-%m-%dT%H:%M:%SZ", log._state._gmtime())
    log.endRepository('resumptionToken', now)
    log.close()
    reopened = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertTrue(reopened.hasWork(continuousInterval=60))
    reopened.startRepository()
    justOverInterval = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime(time() - 60 - 1))
    reopened.endRepository('resumptionToken2', justOverInterval)
    reopened.close()
def testMarkDeleted(self):
    """markDeleted() resets the persisted state: token, from-date and
    the total id count are all cleared after reopening."""
    # Seed a stats file containing a resumption token (use 'with' — was a handle leak).
    with open(self.stateDir + '/name.stats', 'w') as f:
        f.write('Started: 2005-01-02 16:12:56, Harvested/Uploaded/Total: 199/200/1650, Done: 2005-04-22 11:48:30, ResumptionToken: resumption')
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertEquals('resumption', logger._state.token)
    logger.markDeleted()
    logger.close()
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    self.assertEquals(None, logger._state.token)
    self.assertEquals(None, logger._state.from_)
    self.assertEquals(0, logger.totalIds())
def testRefresh(self):
    """Full refresh cycle: init, two harvest passes (second one with the
    resumption token from the stats file), then a finishing pass that emits
    deletes for records no longer present upstream; afterwards the action is
    done and further runs do nothing."""
    oldlogs = self.getLogs()
    log = HarvesterLog(stateDir=join(self.harvesterStateDir, DOMAIN), logDir=join(self.harvesterLogDir, DOMAIN), name=REPOSITORY)
    log.startRepository()
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 7, 120, 121]]:
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123]]:
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)
    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()
    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')
    self.startHarvester(repository=REPOSITORY)  # init pass: no OAI requests yet
    logs = self.getLogs()[len(oldlogs):]
    self.assertEquals(0, len(logs))
    self.startHarvester(repository=REPOSITORY)  # first harvest pass
    logs = self.getLogs()
    self.assertEquals('/oai', logs[-1]["path"])
    self.assertEquals({'verb': ['ListRecords'], 'metadataPrefix': ['oai_dc']}, logs[-1]["arguments"])
    statsFile = join(self.harvesterStateDir, DOMAIN, '%s.stats' % REPOSITORY)
    # Read the resumption token with a context manager (was a handle leak).
    with open(statsFile) as f:
        token = getResumptionToken(f.readlines()[-1])
    self.startHarvester(repository=REPOSITORY)  # second harvest pass, resumes
    logs = self.getLogs()
    self.assertEquals('/oai', logs[-1]["path"])
    self.assertEquals({'verb': ['ListRecords'], 'resumptionToken': [token]}, logs[-1]["arguments"])
    self.assertEquals(15, self.sizeDumpDir())
    self.startHarvester(repository=REPOSITORY)  # finishing pass emits deletes
    self.assertEquals(17, self.sizeDumpDir())
    deleteFiles = [join(self.dumpDir, f) for f in listdir(self.dumpDir) if '_delete' in f]
    # Collect deleted record identifiers, closing each file (was a handle leak).
    deletedIds = set()
    for deleteFile in deleteFiles:
        with open(deleteFile) as f:
            deletedIds.add(xpathFirst(parse(f), '//ucp:recordIdentifier/text()'))
    self.assertEquals(set(['%s:oai:record:03' % REPOSITORY, '%s:oai:record:06' % REPOSITORY, '%s:oai:record:120' % REPOSITORY, '%s:oai:record:121' % REPOSITORY]), deletedIds)
    logs = self.getLogs()[len(oldlogs):]
    self.startHarvester(repository=REPOSITORY)
    self.assertEquals(len(logs), len(self.getLogs()[len(oldlogs):]), 'Action is over, expect nothing more.')
def testLogLineError(self):
    """An exception during harvesting is written to the stats file and
    logged as an ERROR event containing the full traceback."""
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()
    try:
        logger.notifyHarvestedRecord("name:uploadId1")
        logger.uploadIdentifier("name:uploadId1")
        logger.notifyHarvestedRecord("name:uploadId2")
        raise Exception('FATAL')
    except:
        exType, exValue, exTb = exc_info()
        logger.endWithException(exType, exValue, exTb)
    logger.close()
    # Use context managers so both log files are closed (was a handle leak).
    with open(self.stateDir + '/name.stats') as f:
        lines = f.readlines()
    with open(self.logDir + '/name.events') as f:
        eventline = f.readlines()[0].strip()
    # Total is now counted based upon the id's
    self.assertTrue('2/1/0/1, Error: ' in lines[0], lines[0])
    date, event, id, comments = LOGLINE_RE.match(eventline).groups()
    self.assertEquals('ERROR', event.strip())
    self.assertEquals('name', id)
    self.assertTrue(comments.startswith('Traceback (most recent call last):|File "'))
    self.assertTrue('harvesterlogtest.py", line ' in comments)
    self.assertTrue(comments.endswith(', in testLogLineError raise Exception(\'FATAL\')|Exception: FATAL'))
def testLogLine(self):
    """A completed run writes a SUCCES event line with harvest counts and
    stores the invalid-data message under invalid/<name>/<id>."""
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()
    logger.notifyHarvestedRecord("name:uploadId1")
    logger.uploadIdentifier("name:uploadId1")
    logger.notifyHarvestedRecord("name:uploadId1")
    logger.deleteIdentifier("name:uploadId1")
    logger.notifyHarvestedRecord("name:uploadId2")
    logger.logInvalidData("name:uploadId2", "Test Exception")
    logger.logIgnoredIdentifierWarning("name:uploadId2")
    logger.endRepository(None, '2012-01-01T09:00:00Z')
    logger.close()
    # Use context managers so all three files are closed (was a handle leak).
    with open(self.stateDir + '/name.stats') as f:
        lines = f.readlines()
    with open(self.logDir + '/name.events') as f:
        eventline = f.readlines()[1].strip()
    with open(self.logDir + '/invalid/name/uploadId2') as f:
        invalidUploadId2 = f.read()
    # Total is now counted based upon the id's
    self.assertTrue('3/1/1/0, Done:' in lines[0], lines[0])
    date, event, id, comments = LOGLINE_RE.match(eventline).groups()
    self.assertEquals('SUCCES', event.strip())
    self.assertEquals('name', id)
    self.assertEquals('Harvested/Uploaded/Deleted/Total: 3/1/1/0, ResumptionToken: None', comments)
    self.assertEquals('Test Exception', invalidUploadId2)