def testRefreshWithIgnoredRecords(self):
    """Run a 'refresh' against a log that already holds ignored records.

    A HarvesterLog is seeded with 4 uploaded, 5 deleted and 6 invalid
    (ignored) identifiers; the uploaded identifier for record 02 is given
    the suffix '/&gkn'. The repository is then saved with action='refresh'
    and the harvester is started four times. The dump directory size is
    asserted after the first, third and fourth run, followed by the number
    of lines left in the '_invalid.ids' and '.ids' state files.
    """
    stateDir = join(self.harvesterStateDir, DOMAIN)
    log = HarvesterLog(
        stateDir=stateDir,
        logDir=join(self.harvesterLogDir, DOMAIN),
        name=REPOSITORY)
    log.startRepository()

    # Previously uploaded records.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
        if uploadId == '%s:oai:record:02' % (REPOSITORY):
            # Use an identifier containing '/' and '&' for this record.
            uploadId = '%s:oai:record:02/&gkn' % (REPOSITORY)
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)

    # Previously deleted records.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)

    # Previously invalid records that were logged as ignored.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
        log.notifyHarvestedRecord(uploadId)
        log.logInvalidData(uploadId, 'ignored message')
        log.logIgnoredIdentifierWarning(uploadId)

    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()

    # NOTE(review): these counts are lower than the numbers seeded above;
    # presumably only ids that the refresh does not harvest again are
    # counted -- confirm against the repository fixture.
    totalRecords = 15
    oldUploads = 2
    oldIgnoreds = 4

    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='refresh')

    self.startHarvester(repository=REPOSITORY)  # Smooth init
    self.assertEqual(0, self.sizeDumpDir())

    self.startHarvester(repository=REPOSITORY)  # Smooth harvest
    self.startHarvester(repository=REPOSITORY)  # Smooth harvest
    self.assertEqual(totalRecords, self.sizeDumpDir())

    self.startHarvester(repository=REPOSITORY)  # Smooth finish
    self.assertEqual(totalRecords + oldUploads + oldIgnoreds, self.sizeDumpDir())

    # Read the state files inside 'with' so the handles are closed again.
    with open(join(stateDir, "%s_invalid.ids" % REPOSITORY)) as invalidIdsFile:
        invalidIds = invalidIdsFile.readlines()
    self.assertEqual(0, len(invalidIds), invalidIds)

    with open(join(stateDir, "%s.ids" % REPOSITORY)) as idsFile:
        ids = idsFile.readlines()
    self.assertEqual(13, len(ids), ids)
def testClearWithInvalidRecords(self):
    """Run a 'clear' against a log that already holds invalid records.

    A HarvesterLog is seeded with 4 uploaded, 5 deleted and 6 invalid
    (ignored) identifiers. The repository is saved with action='clear' and
    the harvester is started once. The dump directory is expected to hold
    one entry per old upload and per old invalid record, and both the
    '_invalid.ids' and '.ids' state files are expected to be empty.
    """
    stateDir = join(self.harvesterStateDir, DOMAIN)
    log = HarvesterLog(
        stateDir=stateDir,
        logDir=join(self.harvesterLogDir, DOMAIN),
        name=REPOSITORY)
    log.startRepository()

    # Previously uploaded records.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [1, 2, 120, 121]]:
        log.notifyHarvestedRecord(uploadId)
        log.uploadIdentifier(uploadId)

    # Previously deleted records.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [4, 5, 122, 123, 124]]:
        log.notifyHarvestedRecord(uploadId)
        log.deleteIdentifier(uploadId)

    # Previously invalid records that were logged as ignored.
    for uploadId in ['%s:oai:record:%02d' % (REPOSITORY, i) for i in [7, 8, 125, 126, 127, 128]]:
        log.notifyHarvestedRecord(uploadId)
        log.logInvalidData(uploadId, 'ignored message')
        log.logIgnoredIdentifierWarning(uploadId)

    log.endRepository('token', '2012-01-01T09:00:00Z')
    log.close()

    # Counts match the number of identifiers seeded in each loop above.
    oldUploads = 4
    oldInvalids = 6

    self.saveRepository(DOMAIN, REPOSITORY, REPOSITORYGROUP, action='clear')
    self.startHarvester(repository=REPOSITORY)
    self.assertEqual(oldUploads + oldInvalids, self.sizeDumpDir())

    # Read the state files inside 'with' so the handles are closed again.
    with open(join(stateDir, "%s_invalid.ids" % REPOSITORY)) as invalidIdsFile:
        invalidIds = invalidIdsFile.readlines()
    self.assertEqual(0, len(invalidIds), invalidIds)

    with open(join(stateDir, "%s.ids" % REPOSITORY)) as idsFile:
        ids = idsFile.readlines()
    self.assertEqual(0, len(ids), ids)
def testInvalidIDs(self):
    """invalidIds() lists the ids logged as invalid, in logging order."""
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()

    logger.notifyHarvestedRecord('id:1')
    logger.logInvalidData('id:1', 'exception message')
    logger.notifyHarvestedRecord('id:2')
    logger.logInvalidData('id:2', 'exception message')

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(['id:1', 'id:2'], logger.invalidIds())
def testLogInvalidData(self):
    """logInvalidData() writes the message to a file below 'invalid/'.

    The file path is expected to have the '/' characters of the identifier
    parts written as '%2F'. Notifying the same record as harvested again
    is expected to reset the invalid count to 0 and remove the file.
    """
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()

    logger.notifyHarvestedRecord('repo/id:oai:bla/bla')
    logger.logInvalidData('repo/id:oai:bla/bla', "Error")
    self.assertEqual(1, logger.totalInvalidIds())

    expectedFile = self.logDir + '/invalid/repo%2Fid/oai:bla%2Fbla'
    # Close the handle before the logger is expected to remove the file.
    with open(expectedFile) as invalidFile:
        self.assertEqual("Error", invalidFile.read())

    logger.notifyHarvestedRecord('repo/id:oai:bla/bla')
    self.assertEqual(0, logger.totalInvalidIds())
    self.assertFalse(isfile(expectedFile))
def testLogIgnoredIdentifierWarning(self):
    """logIgnoredIdentifierWarning() appends a WARNING line to the events log.

    Logging invalid data alone is expected to leave 'name.events' empty;
    the IGNORED warning line must only appear after
    logIgnoredIdentifierWarning(). Harvesting the record again is expected
    to reset the invalid count, after which the id can be uploaded.
    """
    eventsPath = self.logDir + '/name.events'
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()

    logger.notifyHarvestedRecord('repoid:oai:bla/bla')
    logger.logInvalidData('repoid:oai:bla/bla', 'bla/bla')
    # Read through 'with' so the handle is closed before the next write.
    with open(eventsPath) as eventsFile:
        self.assertEqual('', eventsFile.read())

    logger.logIgnoredIdentifierWarning('repoid:oai:bla/bla')
    with open(eventsPath) as eventsFile:
        events = eventsFile.read()
    self.assertTrue(events.endswith("\tWARNING\t[repoid:oai:bla/bla]\tIGNORED\n"))
    self.assertEqual(1, logger.totalInvalidIds())

    logger.notifyHarvestedRecord('repoid:oai:bla/bla')
    self.assertEqual(0, logger.totalInvalidIds())

    logger.uploadIdentifier('repoid:oai:bla/bla')
    self.assertEqual(1, logger.totalIds())
def testLogLine(self):
    """endRepository() writes the harvest summary to the stats and events logs.

    One id is uploaded and then deleted, a second id is logged as invalid
    and ignored. After endRepository() and close(), the first stats line
    and the second events line are both expected to report the counts
    '3/1/1/0', and the invalid-data file must hold the logged message.
    """
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()

    logger.notifyHarvestedRecord("name:uploadId1")
    logger.uploadIdentifier("name:uploadId1")
    logger.notifyHarvestedRecord("name:uploadId1")
    logger.deleteIdentifier("name:uploadId1")
    logger.notifyHarvestedRecord("name:uploadId2")
    logger.logInvalidData("name:uploadId2", "Test Exception")
    logger.logIgnoredIdentifierWarning("name:uploadId2")
    logger.endRepository(None, '2012-01-01T09:00:00Z')
    logger.close()

    # Read every file inside 'with' so no handle is left open.
    with open(self.stateDir + '/name.stats') as statsFile:
        lines = statsFile.readlines()
    with open(self.logDir + '/name.events') as eventsFile:
        # The second line is the one matched against LOGLINE_RE below.
        eventline = eventsFile.readlines()[1].strip()
    with open(self.logDir + '/invalid/name/uploadId2') as invalidFile:
        invalidUploadId2 = invalidFile.read()

    # Total is now counted based upon the ids.
    self.assertTrue('3/1/1/0, Done:' in lines[0], lines[0])

    # The first group (the date) is not asserted; 'repositoryId' avoids
    # shadowing the builtin id().
    _date, event, repositoryId, comments = LOGLINE_RE.match(eventline).groups()
    self.assertEqual('SUCCES', event.strip())
    self.assertEqual('name', repositoryId)
    self.assertEqual('Harvested/Uploaded/Deleted/Total: 3/1/1/0, ResumptionToken: None', comments)
    self.assertEqual('Test Exception', invalidUploadId2)
def testClearInvalidData(self):
    """clearInvalidData('repoid') drops only that repository's invalid data.

    Three ids are logged as invalid, two with the 'repoid' prefix and one
    with the 'repo2' prefix. After clearInvalidData('repoid') only the
    'repo2' id and its file below 'invalid/' are expected to remain.
    """
    invalidDir = self.logDir + '/invalid'
    logger = HarvesterLog(stateDir=self.stateDir, logDir=self.logDir, name='name')
    logger.startRepository()

    logger.notifyHarvestedRecord('repoid:oai:bla/bla')
    logger.logInvalidData('repoid:oai:bla/bla', "Error")
    self.assertTrue(isfile(invalidDir + '/repoid/oai:bla%2Fbla'))

    logger.notifyHarvestedRecord('repoid:recordid')
    logger.logInvalidData('repoid:recordid', "Error")
    self.assertTrue(isfile(invalidDir + '/repoid/recordid'))

    logger.notifyHarvestedRecord('repo2:1')
    logger.logInvalidData('repo2:1', "Error")
    self.assertTrue(isfile(invalidDir + '/repo2/1'))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(['repoid:oai:bla/bla', 'repoid:recordid', 'repo2:1'], logger.invalidIds())

    logger.clearInvalidData('repoid')

    self.assertEqual(['repo2:1'], logger.invalidIds())
    self.assertFalse(isfile(invalidDir + '/repoid/oai:bla%2Fbla'))
    self.assertFalse(isfile(invalidDir + '/repoid/recordid'))
    self.assertTrue(isfile(invalidDir + '/repo2/1'))