def setUp(self):
    """Build a test domain with invalid-record fixtures and a RepositoryStatus.

    Lays out ``<tempdir>/state/<domain>`` and
    ``<tempdir>/log/<domain>/invalid/<repoId>`` with diagnostic files and the
    matching ``*_invalid.ids`` state files, then wires a RepositoryStatus with
    a CallTrace observer that answers the repository-group queries.
    """
    SeecrTestCase.setUp(self)
    self.stateDir = join(self.tempdir, "state")
    self.logDir = join(self.tempdir, "log")
    self.domainId = "adomain"
    makedirs(join(self.stateDir, self.domainId))
    repoId1LogDir = join(self.logDir, self.domainId, "invalid", "repoId1")
    repoId2LogDir = join(self.logDir, self.domainId, "invalid", escapeFilename("repoId/2"))
    makedirs(repoId1LogDir)
    makedirs(repoId2LogDir)
    # Context managers close the handles deterministically; the original
    # relied on refcounting via open(...).write(...), which leaks on
    # non-refcounted interpreters.
    with open(join(repoId1LogDir, "invalidId1"), 'w') as f:
        f.write("<diagnostic>ERROR1</diagnostic>")
    with open(join(repoId1LogDir, "invalidId&2"), 'w') as f:
        f.write("<diagnostic>ERROR2</diagnostic>")
    with open(join(repoId2LogDir, escapeFilename("invalidId/3")), 'w') as f:
        f.write("<diagnostic>ERROR3</diagnostic>")
    with open(join(self.stateDir, self.domainId, "repoId1_invalid.ids"), 'w') as f:
        f.write("invalidId1\ninvalidId&2")
    with open(join(self.stateDir, self.domainId, escapeFilename("repoId/2_invalid.ids")), 'w') as f:
        f.write("invalidId/3")
    with open(join(self.stateDir, self.domainId, "repoId3_invalid.ids"), 'w') as f:
        f.write("")
    self.status = RepositoryStatus(self.logDir, self.stateDir)
    observer = CallTrace("HarvesterData")
    observer.returnValues["getRepositoryGroupIds"] = ["repoGroupId1", "repoGroupId2"]
    def getRepositoryIds(domainId, repositoryGroupId):
        if repositoryGroupId == "repoGroupId1":
            return ["repoId1", "repoId/2"]
        return ["repoId3", "anotherRepoId"]
    observer.methods["getRepositoryIds"] = getRepositoryIds
    def getRepositoryGroupId(domainId, repositoryId):
        return 'repoGroupId1' if repositoryId in ['repoId1', 'repoId/2'] else 'repoGroupId2'
    observer.methods["getRepositoryGroupId"] = getRepositoryGroupId
    self.status.addObserver(observer)
def setUp(self):
    """Create state/log directories with invalid-record fixtures and a stub observer."""
    SeecrTestCase.setUp(self)
    self.domainId = "adomain"
    self.stateDir = mkdir(self.tempdir, "state")
    mkdir(self.stateDir, self.domainId)
    self.logDir = mkdir(self.tempdir, "log")
    logDirRepo1 = mkdir(self.logDir, self.domainId, "invalid", "repoId1")
    logDirRepo2 = mkdir(self.logDir, self.domainId, "invalid", escapeFilename("repoId/2"))
    # Invalid-record diagnostics per repository.
    _writeFile(logDirRepo1, "invalidId1", data="<diagnostic>ERROR1</diagnostic>")
    _writeFile(logDirRepo1, "invalidId&2", data="<diagnostic>ERROR2</diagnostic>")
    _writeFile(logDirRepo2, escapeFilename("invalidId/3"), data="<diagnostic>ERROR3</diagnostic>")
    # Matching *_invalid.ids state files.
    _writeFile(self.stateDir, self.domainId, "repoId1_invalid.ids", data="invalidId1\ninvalidId&2")
    _writeFile(self.stateDir, self.domainId, escapeFilename("repoId/2_invalid.ids"), data="invalidId/3")
    _writeFile(self.stateDir, self.domainId, "repoId3_invalid.ids", data="")
    self.status = RepositoryStatus(self.logDir, self.stateDir)
    observer = CallTrace("HarvesterData")
    observer.returnValues["getRepositoryGroupIds"] = ["repoGroupId1", "repoGroupId2"]
    def getRepositoryIds(domainId, repositoryGroupId):
        if repositoryGroupId == "repoGroupId1":
            return ["repoId1", "repoId/2"]
        return ["repoId3", "anotherRepoId"]
    observer.methods["getRepositoryIds"] = getRepositoryIds
    def getRepositoryGroupId(domainId, repositoryId):
        if repositoryId in ['repoId1', 'repoId/2']:
            return 'repoGroupId1'
        return 'repoGroupId2'
    observer.methods["getRepositoryGroupId"] = getRepositoryGroupId
    self.status.addObserver(observer)
def assertName(self, name):
    """Assert that *name*, once escaped, can be created as a file under /tmp."""
    path = join('/tmp', escapeFilename(name))
    with open(path, 'w'):
        pass  # just create an empty file
    try:
        self.assertTrue(isfile(path))
    finally:
        remove(path)
def _getRepositoryJson(self, domainId, repositoryId):
    """Return the path of the repository's JSON file, or None when it is absent."""
    path = join(
        '/var/lib/meresco-harvester/data',
        escapeFilename("%s.%s.repository" % (domainId, repositoryId)))
    return path if isfile(path) else None
def prepareOaiPmh(dataDirs, tempDir, storage, batchSize):
    """Build an OaiPmh observer tree preloaded with the records from *dataDirs*.

    Each data file name is '<identifier>.<metadataPrefix>'; ADD/DEL actions are
    applied to an OaiJazz index, the raw files are registered with *storage*,
    and a `be`-composed OaiPmh tree is returned.
    """
    # Py3: 'print' statement converted to the print() function
    # (the rest of this file already uses print()).
    print('DATADIRS', dataDirs)
    oaiSuspendRegister = SuspendRegister()
    oaiJazz = OaiJazz(tempDir)
    oaiJazz.addObserver(oaiSuspendRegister)
    oaiJazzOperations = {
        'ADD': oaiJazz.addOaiRecord,
        'DEL': oaiJazz.deleteOaiRecord,
    }
    for dataDir in dataDirs:
        for action, filename, setSpecs in iterOaiData(dataDir):
            identifier, metadataPrefix = filename.rsplit('.', 1)
            oaiJazzOperations[action](
                identifier=identifier,
                setSpecs=setSpecs,
                metadataPrefixes=[metadataPrefix],
            )
            storage.addFile(filename, join(dataDir, escapeFilename(filename)))
            # presumably spaces records so each gets a distinct datestamp
            # — TODO confirm
            sleep(0.000001)
    oaiJazz.commit()
    oaiPmh = be(
        (IllegalFromFix(),
            (OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**',
                    supportXWait=True, batchSize=batchSize),
                (oaiJazz,),
                (oaiSuspendRegister,),
                (storage,),
            )
        )
    )
    return oaiPmh
def dumpOai(port, path, oaiDumpDir, metadataPrefix, set_=None, host=None, limit=None, append=False):
    """Harvest records over OAI-PMH and dump them into *oaiDumpDir*.

    Writes an 'oai.ids' manifest line per record ('ADD'/'DEL', filename,
    sorted setSpecs) plus one file per non-deleted record. When append is
    False the dump directory is removed and recreated first.
    """
    host = host or '127.0.0.1'
    baseurl = 'http://%s:%s%s' % (host, port, path)
    if not append:
        isdir(oaiDumpDir) and rmtree(oaiDumpDir)
        makedirs(oaiDumpDir)
    with open(join(oaiDumpDir, 'oai.ids'), 'a') as ids:
        for oaiItem in islice(
                iterateOaiPmh(baseurl=baseurl, metadataPrefix=metadataPrefix, set=set_),
                limit):
            filename = '%s.%s' % (oaiItem.identifier, metadataPrefix)
            ids.write('%s %s |%s|\n' % (
                'DEL' if oaiItem.deleted else 'ADD',
                filename,
                '|'.join(sorted(oaiItem.setSpecs))))
            if not oaiItem.deleted:
                # Close each record file deterministically
                # (was a leaked open(...).write(...)).
                with open(join(oaiDumpDir, escapeFilename(filename)), 'w') as fp:
                    fp.write(lxmltostring(oaiItem.metadata, pretty_print=True))
    print("Oai dump created in %s" % oaiDumpDir)
def add(self, uploadid):
    """Record the escaped *uploadid* in memory and in the ids file, once."""
    escaped = escapeFilename(uploadid)
    if escaped in self._ids:
        return
    self._ids.append(escaped)
    self._idsfile.write(escaped + "\n")
    self._idsfile.flush()
def getRunningStatesForDomain(self, domainId):
    """Return the running-state dicts of all repositories in *domainId*.

    Each state dict is merged with its 'repositoryId'; the list is sorted by
    'changedate', newest first. Repositories without a '.running' state file
    are skipped.
    """
    def loadState(filepath):
        # Close the state file deterministically; the original used
        # jsonLoad(open(filepath)) and leaked the handle.
        with open(filepath) as fp:
            return jsonLoad(fp)
    states = []
    for groupId in self.call.getRepositoryGroupIds(domainId=domainId):
        for repoId in self.call.getRepositoryIds(domainId=domainId, repositoryGroupId=groupId):
            filepath = join(self._statePath, domainId, escapeFilename("%s.running" % repoId))
            if isfile(filepath):
                states.append(mergeDicts(loadState(filepath), {'repositoryId': repoId}))
    return sorted(states, key=lambda d: d['changedate'], reverse=True)
def _invalidCount(self, domainId, repositoryId):
    """Number of invalid record ids recorded for the repository (0 if no file)."""
    path = join(self._statePath, domainId,
                escapeFilename("%s_invalid.ids" % repositoryId))
    if not isfile(path):
        return 0
    with open(path) as fp:
        return sum(1 for _ in fp)
def writeIds(filename, ids):
    """Write each id (escaped) on its own line to *filename*, overwriting it."""
    # 'with' replaces the explicit try/finally close; the loop variable no
    # longer shadows the builtin id().
    with open(filename, 'w') as f:
        for anId in ids:
            f.write(escapeFilename(anId))
            f.write('\n')
def _getStorage(self, name, mayCreate=False):
    """Return the cached SequentialStorage for *name*, opening it lazily.

    Raises KeyError when the storage directory does not exist and
    mayCreate is False.
    """
    existing = self._storage.get(name)
    if existing is not None:
        return existing
    directory = join(self._directory, escapeFilename(name))
    if not (isdir(directory) or mayCreate):
        raise KeyError(name)
    opened = SequentialStorage(directory)
    self._storage[name] = opened
    return opened
def invalidRecords(self, domainId, repositoryId):
    """Return the invalid record ids for a repository, most recent first.

    Returns [] when no '<repositoryId>_invalid.ids' file exists. Lines are
    unescaped and stripped of their trailing newline; blank lines are skipped.
    """
    invalidFile = join(self._statePath, domainId,
                       escapeFilename("%s_invalid.ids" % repositoryId))
    if not isfile(invalidFile):
        return []
    # Read eagerly inside 'with' so the handle is closed; the original
    # iterated a bare open(invalidFile) and leaked it.
    with open(invalidFile) as fp:
        unescaped = [unescapeFilename(line) for line in fp if line.strip()]
    return reversed([x[:-1] if x[-1] == '\n' else x for x in unescaped])
def writeIds(filename, ids):
    """Atomically persist *ids* (escaped, one per line) to *filename*.

    An empty or None *ids* removes the file instead. Content is written to a
    '<name>.new' sibling first and then renamed over the target, so readers
    never observe a partially written file.
    """
    path = pathlib.Path(filename)
    if ids is None or len(ids) == 0:
        # Deleting an already-absent ids file is not an error; the original
        # unconditional unlink() raised FileNotFoundError here.
        try:
            path.unlink()
        except FileNotFoundError:
            pass
        return
    idfilenew = path.with_name(path.name + '.new')
    with idfilenew.open('w') as fp:
        for anId in ids:
            fp.write('{}\n'.format(escapeFilename(anId)))
    idfilenew.rename(path)
def invalidRecords(self, domainId, repositoryId):
    """Invalid record ids for the repository, newest first; [] when no file exists."""
    path = join(self._statePath, domainId,
                escapeFilename("%s_invalid.ids" % repositoryId))
    if not isfile(path):
        return []
    records = []
    with open(path) as fp:
        for line in fp:
            if not line.strip():
                continue
            value = unescapeFilename(line)
            records.append(value[:-1] if value[-1] == '\n' else value)
    return reversed(records)
def delete(self, anUpload):
    """Delete *anUpload*: remove its dump file and append its id to
    'deleted_records', or — for oaiEnvelope targets — overwrite the file
    with the generated delete output. Logs the deletion either way."""
    filename = self._filenameFor(anUpload)
    if self._target.oaiEnvelope:
        xmlResult = self._createOutput(anUpload)
        with open(filename, 'w') as fd:
            fd.write(lxmltostring(xmlResult))
    else:
        if os.path.isfile(filename):
            os.remove(filename)
        with open(os.path.join(self._target.path, 'deleted_records'), 'a') as f:
            f.write('%s\n' % escapeFilename(anUpload.id))
    self._logDelete(anUpload.id)
def getRunningStatesForDomain(self, domainId):
    """All repositories' running states in *domainId*, sorted by 'changedate'
    descending; each state dict is augmented with its 'repositoryId'."""
    def readState(filepath):
        with open(filepath) as fp:
            return jsonLoad(fp)
    result = []
    for groupId in self.call.getRepositoryGroupIds(domainId=domainId):
        for repoId in self.call.getRepositoryIds(domainId=domainId, repositoryGroupId=groupId):
            filepath = join(self._statePath, domainId,
                            escapeFilename("%s.running" % repoId))
            if isfile(filepath):
                result.append(mergeDicts(readState(filepath), {'repositoryId': repoId}))
    result.sort(key=lambda d: d['changedate'], reverse=True)
    return result
def delete(self, anUpload):
    """Delete *anUpload*: remove its dump file and append its id to
    'deleted_records', or — for oaiEnvelope targets — overwrite the file
    with the generated delete output. Logs the deletion either way.
    """
    filename = self._filenameFor(anUpload)
    if not self._target.oaiEnvelope:
        os.path.isfile(filename) and os.remove(filename)
        # 'with' replaces the explicit try/finally close.
        with open(os.path.join(self._target.path, "deleted_records"), "a") as f:
            f.write("%s\n" % escapeFilename(anUpload.id))
    else:
        xmlResult = self._createOutput(anUpload)
        with open(filename, "w") as fd:
            fd.write(lxmltostring(xmlResult))
    self._logDelete(anUpload.id)
def getRssLogger(repositoryId, logfileDir):
    """Return a per-repository rotating-file logger, configuring it on first use.

    Loggers are cached by the logging module, so the presence of a handler
    means a previous call already configured this one.
    """
    logger = logging.getLogger(repositoryId)
    if logger.handlers:
        return logger  # already configured
    logger.setLevel(logging.WARNING)
    logFilename = join(logfileDir, escapeFilename(repositoryId))
    handler = logging.handlers.RotatingFileHandler(
        logFilename, maxBytes=MAXLOGSIZE, backupCount=BACKUPCOUNT)
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(message)s", "%Y-%m-%dT%H:%M:%SZ"))
    logger.addHandler(handler)
    return logger
def __init__(self, stateDir, logDir, name):
    """Initialize harvest state for repository *name* under stateDir/logDir.

    Creates both directories, derives every state-file path from the escaped
    repository name, and loads any previously persisted state.
    """
    self._statePath = pathlib.Path(stateDir)
    self.logPath = pathlib.Path(logDir)
    for directory in (self._statePath, self.logPath):
        directory.mkdir(parents=True, exist_ok=True)
    escaped = escapeFilename(name)
    self.invalidLogPath = self.logPath / INVALID_DATA_MESSAGES_DIR / escaped
    self._name = name
    # Persistent id bookkeeping, all keyed on the escaped repository name.
    self._ids = Ids(self._statePath / (escaped + '.ids'))
    self._invalidIds = Ids(self._statePath / (escaped + '_invalid.ids'))
    self._oldIds = Ids(self._statePath / (escaped + '.ids.old'))
    self._statsfilepath = self._statePath / (escaped + '.stats')
    self._forceFinalNewlineOnStatsFile()
    self._resumptionFilepath = self._statePath / (escaped + '.next')
    self._runningFilepath = self._statePath / (escaped + '.running')
    self._countFilepath = self._statePath / (escaped + '.count')
    # Runtime state, (re)populated by _readState().
    self.from_ = None
    self.token = None
    self._counts = None
    self.lastSuccessfulHarvest = None
    self._readState()
    self._statsfile = None
def _invalidDataMessageFilePath(self, uploadid):
    """Path of the invalid-data diagnostic for *uploadid* ('<repoId>:<recordId>')."""
    _repositoryId, recordId = uploadid.split(":", 1)
    return self._state.invalidLogPath / escapeFilename(recordId)
def add(self, uploadid):
    """Append *uploadid* to the in-memory ids and (escaped) to the ids file, once."""
    if uploadid in self._ids:
        return
    self._ids.append(uploadid)
    self._idsfile.write(escapeFilename(uploadid) + '\n')
    self._idsfile.flush()
def getInvalidRecord(self, domainId, repositoryId, recordId):
    """Parse and return the stored invalid-record diagnostic as an XML tree."""
    recordPath = join(
        self._logPath, domainId, INVALID_DATA_MESSAGES_DIR,
        escapeFilename(repositoryId), escapeFilename(recordId))
    with open(recordPath) as fp:
        return parse(fp)
def getInvalidRecord(self, domainId, repositoryId, recordId):
    """Parse and return the stored invalid-record diagnostic as an XML tree.

    The file handle is closed after parsing; the original parse(open(...))
    leaked it.
    """
    invalidDir = join(self._logPath, domainId, INVALID_DATA_MESSAGES_DIR)
    with open(join(invalidDir, escapeFilename(repositoryId),
                   escapeFilename(recordId))) as fp:
        return parse(fp)
def _invalidDataMessageFilePath(self, uploadid):
    """Filesystem path of the diagnostic for *uploadid* ('<repoId>:<recordId>')."""
    repositoryId, recordId = uploadid.split(":", 1)
    parts = (self._logDir, INVALID_DATA_MESSAGES_DIR,
             escapeFilename(repositoryId), escapeFilename(recordId))
    return join(*parts)
def remove(self, uploadid):
    """Drop the escaped *uploadid* if present, then rewrite the ids file by
    closing and reopening it."""
    escaped = escapeFilename(uploadid)
    if escaped not in self._ids:
        return
    self._ids.remove(escaped)
    self.close()
    self.open()
def _invalidCount(self, domainId, repositoryId):
    """Count of invalid record ids for the repository; 0 when no ids file exists."""
    invalidFile = join(self._statePath, domainId,
                       escapeFilename("%s_invalid.ids" % repositoryId))
    if not isfile(invalidFile):
        return 0
    # 'with' closes the handle (was a leaked open(...).readlines()); counting
    # the line iterator avoids materializing every line.
    with open(invalidFile) as fp:
        return sum(1 for _ in fp)
def dna(reactor, portNumber, config, tempDir, batchSize):
    """Build the observer tree for a mock OAI-PMH server.

    For each entry in *config* (a dict with 'path' and 'dirs'), an OaiJazz
    index is filled from the data directories and exposed under that path;
    a '/ready' endpoint always answers 'yes'. The TypeError fallbacks keep
    compatibility with meresco-oai versions preceding 5.16, which lack the
    preciseDatestamp keyword.
    """
    # Py3: 'print' statement converted to the print() function.
    print('Config', config)
    root = HandleRequestLog()
    storage = DataStorage()
    for data in config:
        oaiName = ''.join(data['path'].split('/'))
        oaiSuspendRegister = SuspendRegister()
        try:
            oaiJazz = OaiJazz(join(tempDir, oaiName), preciseDatestamp=True)
        except TypeError:
            # meresco-oai < 5.16 has no preciseDatestamp keyword
            oaiJazz = OaiJazz(join(tempDir, oaiName))
        oaiJazz = be(
            (oaiJazz,
                (oaiSuspendRegister,)
            )
        )
        oaiJazzOperations = {
            'ADD': oaiJazz.addOaiRecord,
            'DEL': oaiJazz.deleteOaiRecord,
        }
        for directory in data['dirs']:
            for action, filename, setSpecs in iterOaiData(directory):
                identifier, metadataPrefix = filename.rsplit('.', 1)
                oaiJazzOperations[action](
                    identifier=identifier,
                    setSpecs=setSpecs,
                    metadataPrefixes=[metadataPrefix],
                )
                storage.addFile(filename, join(directory, escapeFilename(filename)))
                # presumably spaces records so each gets a distinct datestamp
                # — TODO confirm
                sleep(0.000001)
        oaiJazz.commit()
        try:
            oaiPmh = OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**',
                            supportXWait=True, batchSize=batchSize, preciseDatestamp=True)
        except TypeError:
            # meresco-oai < 5.16 has no preciseDatestamp keyword
            oaiPmh = OaiPmh(repositoryName='Mock', adminEmail='*****@*****.**',
                            supportXWait=True, batchSize=batchSize)
        tree = be(
            (PathFilter(data['path'], excluding=['/ready']),
                (IllegalFromFix(),
                    (oaiPmh,
                        (oaiJazz,),
                        (oaiSuspendRegister,),
                        (storage,),
                    )
                )
            )
        )
        root.addObserver(tree)
    return \
        (Observable(),
            (ObservableHttpServer(reactor, portNumber),
                (LogCollector(),
                    (ApacheLogWriter(stdout),),
                    (root,
                        (PathFilter("/ready"),
                            (StringServer('yes', ContentTypePlainText),)
                        )
                    )
                )
            )
        )