def setUp(self):
    """Create a proxy whose ``_urlopen`` is replaced by a recording stub.

    Every call made through the stub is logged in ``self.requests``: the bare
    url when no (truthy) payload is passed, otherwise a ``(url, decoded
    payload)`` tuple. The stub always answers with ``self.response``
    serialised through ``JsonDict``.
    """
    SeecrTestCase.setUp(self)
    self.proxy = InternalServerProxy("http://localhost")
    self.requests = []

    def _recordingUrlopen(url, data=None):
        # Keep the payload in decoded form so tests can compare plain strings.
        entry = (url, data.decode()) if data else url
        self.requests.append(entry)
        return StringIO(JsonDict(self.response).dumps())

    self.proxy._urlopen = _recordingUrlopen
def setUp(self):
    """Create a proxy whose ``_urlopen`` is replaced by a recording stub.

    Every call made through the stub is logged in ``self.requests``: the bare
    url when no (truthy) payload is passed, otherwise a ``(url, payload)``
    tuple with the payload stored exactly as received. The stub always
    answers with ``self.response`` serialised through ``JsonDict``.
    """
    SeecrTestCase.setUp(self)
    self.proxy = InternalServerProxy("http://localhost")
    self.requests = []

    def _recordingUrlopen(url, data=None):
        entry = (url, data) if data else url
        self.requests.append(entry)
        return StringIO(JsonDict(self.response).dumps())

    self.proxy._urlopen = _recordingUrlopen
def __init__(self):
    """Parse the command line, fetch the server config and set up the proxy.

    Problems with the options (no domain, concurrency below 1) are reported
    through ``self.parser.error``. ``_logDir`` and ``_stateDir`` fall back to
    the ``logPath`` / ``statePath`` entries of the configuration fetched from
    ``<serverUrl>/info/config`` when they were not given on the command line.
    """
    if not argv[1:]:
        # Started without any argument: ask the parser for its help text.
        argv.append('-h')

    self.parser = OptionParser()
    parsedOptions = self.parse_args()
    # Every parsed option becomes an attribute of this instance.
    self.__dict__.update(parsedOptions.__dict__)

    if not self.domainId:
        self.parser.error("Specify domain")
    if self._concurrency < 1:
        self.parser.error("Concurrency must be at least 1.")

    configUrl = self.serverUrl + '/info/config'
    config = JsonDict.load(urlopen(configUrl))
    if self._logDir is None:
        self._logDir = config['logPath']
    if self._stateDir is None:
        self._stateDir = config['statePath']

    self.proxy = InternalServerProxy(self.serverUrl, self.setActionDone)

    # Without a repository id the (falsy) id itself is kept as the value.
    repository = self.repositoryId
    if repository:
        repository = self.proxy.getRepositoryObject(
            identifier=self.repositoryId,
            domainId=self.domainId)
    self.repository = repository
class InternalServerProxyTest(SeecrTestCase):
    """Tests for InternalServerProxy using a stubbed ``_urlopen``."""

    def setUp(self):
        """Install a stub that records requests and returns ``self.response``."""
        SeecrTestCase.setUp(self)
        self.proxy = InternalServerProxy("http://localhost")
        self.requests = []

        def _urlopen(url, data=None):
            # Record the url alone, or (url, payload) when a payload is sent.
            if data:
                self.requests.append((url, data))
            else:
                self.requests.append(url)
            return StringIO(JsonDict(self.response).dumps())

        self.proxy._urlopen = _urlopen

    def testGetRepository(self):
        """getRepository and getRepositoryObject issue the same GET request."""
        self.response = {
            'request': {'verb': 'GetRepository'},
            'response': {'GetRepository': {
                'identifier': 'repo1',
                'use': True,
                'complete': False,
            }},
        }
        repoDict = self.proxy.getRepository(identifier='repo1', domainId='domainId')
        self.assertEqual(
            'http://localhost/get?verb=GetRepository&identifier=repo1&domainId=domainId',
            self.requests[0])
        repo = self.proxy.getRepositoryObject(identifier='repo1', domainId='domainId')
        self.assertEqual(self.requests[0], self.requests[-1])
        self.assertEqual({'complete': False, 'identifier': 'repo1', 'use': True}, repoDict)
        self.assertEqual('repo1', repo.id)
        self.assertFalse(repo.complete)
        self.assertTrue(repo.use)

    def testGetStatus(self):
        """getStatus requests the GetStatus verb for the given domain."""
        self.response = {'response': {'GetStatus': '?'}}
        self.proxy.getStatus(domainId='domainId')
        self.assertEqual(
            'http://localhost/get?verb=GetStatus&domainId=domainId',
            self.requests[0])

    def testErrorInResponse(self):
        """An 'error' entry in the response is raised as ValueError."""
        self.response = {
            'request': {'verb': 'getUnknown'},
            'error': {'code': 'badVerb', 'message': 'Bad verb'},
        }
        try:
            self.proxy.getStatus(domainId='domainId')
            self.fail()
        # "except X as e" is valid on Python 2.6+ and 3; the old
        # "except X, e" form is a SyntaxError on Python 3.
        except ValueError as e:
            self.assertEqual('Bad verb', str(e))
class StartHarvester(object):
    """Command-line driver that harvests the repositories of one domain.

    Without ``--child`` the instance supervises: it starts one child process
    per repository through ``TimedProcess`` (at most ``--concurrency`` at a
    time), relays the children's stdout/stderr and reschedules repositories
    when their process ends. With ``--child`` it harvests the single
    repository selected with ``--repository``.
    """

    def __init__(self):
        """Parse the command line, fetch the server config and set up the proxy.

        Problems with the options (no domain, concurrency below 1) are
        reported through ``self.parser.error``. ``_logDir`` and ``_stateDir``
        fall back to the ``logPath`` / ``statePath`` entries of the
        configuration fetched from ``<serverUrl>/info/config``.
        """
        if not argv[1:]:
            # Started without any argument: ask the parser for its help text.
            argv.append('-h')
        self.parser = OptionParser()
        args = self.parse_args()
        # Every parsed option becomes an attribute of this instance.
        self.__dict__.update(args.__dict__)

        if not self.domainId:
            self.parser.error("Specify domain")
        if self._concurrency < 1:
            self.parser.error("Concurrency must be at least 1.")

        config = JsonDict.load(urlopen(self.serverUrl + '/info/config'))
        if self._logDir is None:
            self._logDir = config['logPath']
        if self._stateDir is None:
            self._stateDir = config['statePath']

        self.proxy = InternalServerProxy(self.serverUrl, self.setActionDone)
        # Stays falsy (the empty/None repository id) when no repository was given.
        self.repository = self.repositoryId and self.proxy.getRepositoryObject(
            identifier=self.repositoryId,
            domainId=self.domainId)

    def parse_args(self):
        """Register all options on ``self.parser`` and parse ``sys.argv``.

        Returns the parsed options object.
        Raises ValueError when a required option (``serverUrl``) ends up empty.
        """
        self.parser.add_option(
            "-d", "--domain",
            dest="domainId",
            help="Mandatory argument denoting the domain.",
            metavar="DOMAIN")
        self.parser.add_option(
            "-u", "--url",
            dest="serverUrl",
            help="The url of the Meresco Harvester Server",
            default="http://localhost:8888")
        self.parser.add_option(
            "-r", "--repository",
            dest="repositoryId",
            help="Process a single repository within the given domain. Defaults to all repositories from the domain.",
            metavar="REPOSITORY")
        self.parser.add_option(
            "", "--gustosId",
            dest="gustosId",
            help="Name this harvester sends to Gustos")
        self.parser.add_option(
            "", "--gustosHost",
            dest="gustosHost",
            help="Hostname for the gustos server")
        self.parser.add_option(
            "", "--gustosPort",
            dest="gustosPort",
            help="Portnumber of gustos on the gustos server",
            default=8001,
            type=int)
        self.parser.add_option(
            "-t", "--set-process-timeout",
            dest="processTimeout",
            type="int",
            default=60 * 60,
            metavar="TIMEOUT",
            help="Subprocess will be timed out after amount of seconds.")
        self.parser.add_option(
            "--logDir", "",
            dest="_logDir",
            help="Override the logDir in the apache configuration.",
            metavar="DIRECTORY",
            default=None)
        self.parser.add_option(
            "--stateDir",
            dest="_stateDir",
            help="Override the stateDir in the apache configuration.",
            metavar="DIRECTORY",
            default=None)
        self.parser.add_option(
            "--concurrency",
            dest="_concurrency",
            type="int",
            default=1,
            help="Number of repositories to be concurrently harvested. Defaults to 1 (no concurrency).",
            metavar="NUMBER")
        self.parser.add_option(
            "--force-target", "",
            dest="forceTarget",
            help="Overrides the repository's target",
            metavar="TARGETID")
        self.parser.add_option(
            "--force-mapping", "",
            dest="forceMapping",
            help="Overrides the repository's mapping",
            metavar="MAPPINGID")
        self.parser.add_option(
            "--no-action-done", "",
            action="store_false",
            dest="setActionDone",
            default=True,
            help="Do not set SAHARA's actions",
            metavar="TARGETID")
        self.parser.add_option(
            "--runOnce", "",
            dest="runOnce",
            action="store_true",
            default=False,
            help="Prevent harvester from looping (if combined with --repository)")
        # The next two options are hidden from the help output.
        self.parser.add_option(
            "--child", "",
            action="store_true",
            dest="child",
            default=False,
            help=SUPPRESS_HELP)
        self.parser.add_option(
            "--sleepTime", "",
            dest="sleepTime",
            type='int',
            default=1,
            help=SUPPRESS_HELP)

        (options, args) = self.parser.parse_args()
        for opt in ['serverUrl']:
            if not getattr(options, opt, None):
                raise ValueError('Missing option: %s' % repr(opt))
        return options

    def start(self):
        """Harvest one repository (``--child``) or supervise child processes."""
        if self.child:
            self._startRepository()
        else:
            self._startChildProcesses()

    def _startChildProcesses(self):
        """Run child processes for the repositories and relay their output.

        Loops while repositories are running or waiting. Any exception
        (including KeyboardInterrupt) terminates all known children before
        being re-raised.
        """
        running = set()
        if self.repository:
            waiting = [self.repositoryId]
        else:
            waiting = self.proxy.getRepositoryIds(self.domainId)
        # Maps the stdout and stderr file descriptor of each child to
        # (timed process, process, repositoryId).
        processes = {}
        try:
            while running or waiting:
                while waiting and (len(running) < self._concurrency):
                    repositoryId = waiting.pop(0)
                    self._createProcess(processes, repositoryId)
                    running.add(repositoryId)

                try:
                    readers, _, _ = select(list(processes), [], [])
                except error as e:
                    if e.args and e.args[0] == EINTR:
                        # Interrupted by a signal: nothing was selected, so
                        # retry instead of falling through with an unbound
                        # (first pass) or stale (later passes) ``readers``.
                        continue
                    raise

                for reader in readers:
                    if reader not in processes:
                        # Dropped earlier in this pass because its process ended.
                        continue
                    t, process, repositoryId = processes[reader]
                    try:
                        pipeContent = read(reader, 4096)
                    except OSError as e:
                        if e.errno == EAGAIN:
                            continue
                        raise
                    poFileno = process.stdout.fileno()
                    peFileno = process.stderr.fileno()
                    strm = stdout if reader == poFileno else stderr
                    if isinstance(pipeContent, bytes):
                        # A fixed-size chunk may end in the middle of a
                        # multi-byte character; replace undecodable bytes
                        # rather than raising and taking all children down.
                        pipeContent = pipeContent.decode(errors='replace')
                    strm.write(pipeContent)
                    strm.flush()

                    if process.poll() is not None:
                        exitstatus = t.stopScript(process)
                        running.remove(repositoryId)
                        del processes[poFileno]
                        del processes[peFileno]
                        if exitstatus == AGAIN_EXITCODE:
                            # The child asked to be run again right away.
                            waiting.insert(0, repositoryId)
                        else:
                            if exitstatus != 0:
                                stderr.write(
                                    "Process (for repository %s) exited with exitstatus %s.\n"
                                    % (repositoryId, exitstatus))
                                stderr.flush()
                            if not self.runOnce:
                                waiting.append(repositoryId)
                        # NOTE(review): the nesting of this call was rebuilt
                        # from whitespace-collapsed source; confirm it belongs
                        # to the "process finished" branch.
                        self._updateWaiting(waiting, running)
        except:
            # Deliberately catches everything so children never outlive a
            # failing supervisor; the exception is re-raised afterwards.
            for t in {t for t, _process, _repositoryId in processes.values()}:
                t.terminate()
            raise

    def _createProcess(self, processes, repositoryId):
        """Start a child for ``repositoryId`` and register both of its pipes."""
        t = TimedProcess(signal=SIGTERM)
        process = t.executeScript(self._createArgs(repositoryId), self.processTimeout)
        processes[process.stdout.fileno()] = t, process, repositoryId
        processes[process.stderr.fileno()] = t, process, repositoryId

    def _createArgs(self, repositoryId):
        """Return the child's argument list: own argv, ``--child`` and the repository."""
        args = argv + ["--child"]
        extraArg = '--repository=%s' % repositoryId
        if extraArg not in argv:
            args.append(extraArg)
        return args

    def _updateWaiting(self, waiting, running):
        """Synchronise ``waiting`` (in place) with the server's repository ids.

        Does nothing with ``--runOnce`` or when a single repository was
        selected. Otherwise ids unknown to the server are dropped and ids
        that are neither waiting nor running are appended.
        """
        if self.runOnce or self.repository:
            return
        repositoryIds = self.proxy.getRepositoryIds(self.domainId)
        # Slice assignment keeps the caller's list object.
        waiting[:] = [repoId for repoId in waiting if repoId in repositoryIds]
        for repoId in repositoryIds:
            if repoId not in waiting and repoId not in running:
                waiting.append(repoId)

    def _startRepository(self):
        """Harvest ``self.repository`` once and exit with AGAIN_EXITCODE if asked to."""
        if self.forceTarget:
            self.repository.targetId = self.forceTarget
        if self.forceMapping:
            self.repository.mappingId = self.forceMapping

        # Everything goes to stdout; errors and warnings also go to stderr.
        self._generalHarvestLog = CompositeLogger([
            (['*'], StreamEventLogger(stdout)),
            (['ERROR', 'WARN'], StreamEventLogger(stderr)),
        ])
        if self.gustosId:
            gustosClient = GustosClient(
                id=self.gustosId,
                gustosHost=self.gustosHost,
                gustosPort=self.gustosPort,
                threaded=False)
        else:
            gustosClient = None

        _messageIgnored, again = self.repository.do(
            stateDir=join(self._stateDir, self.domainId),
            logDir=join(self._logDir, self.domainId),
            generalHarvestLog=self._generalHarvestLog,
            gustosClient=gustosClient)
        sleep(self.sleepTime)
        if again:
            exit(AGAIN_EXITCODE)
class InternalServerProxyTest(SeecrTestCase):
    """Exercises InternalServerProxy against a stubbed ``_urlopen``."""

    def setUp(self):
        """Swap the proxy's ``_urlopen`` for a stub that logs every request."""
        SeecrTestCase.setUp(self)
        self.proxy = InternalServerProxy("http://localhost")
        self.requests = []

        def _recordingUrlopen(url, data=None):
            # Log the url alone, or (url, decoded payload) when a payload is
            # given, then answer with the canned response.
            entry = (url, data.decode()) if data else url
            self.requests.append(entry)
            return StringIO(JsonDict(self.response).dumps())

        self.proxy._urlopen = _recordingUrlopen

    def testGetRepository(self):
        """Both repository getters hit the same url and expose the response."""
        self.response = {
            'request': {'verb': 'GetRepository'},
            'response': {
                'GetRepository': {
                    'identifier': 'repo1',
                    'use': True,
                    'complete': False,
                },
            },
        }
        expectedUrl = 'http://localhost/get?verb=GetRepository&identifier=repo1&domainId=domainId'

        repoDict = self.proxy.getRepository(identifier='repo1', domainId='domainId')
        self.assertEqual(expectedUrl, self.requests[0])

        repo = self.proxy.getRepositoryObject(identifier='repo1', domainId='domainId')
        self.assertEqual(self.requests[0], self.requests[-1])

        self.assertEqual({'complete': False, 'identifier': 'repo1', 'use': True}, repoDict)
        self.assertEqual('repo1', repo.id)
        self.assertFalse(repo.complete)
        self.assertTrue(repo.use)

    def testGetStatus(self):
        """getStatus asks for the GetStatus verb of the given domain."""
        self.response = {'response': {'GetStatus': '?'}}
        self.proxy.getStatus(domainId='domainId')
        expectedUrl = 'http://localhost/get?verb=GetStatus&domainId=domainId'
        self.assertEqual(expectedUrl, self.requests[0])

    def testErrorInResponse(self):
        """An 'error' entry in the response surfaces as ValueError."""
        self.response = {
            'request': {'verb': 'getUnknown'},
            'error': {'code': 'badVerb', 'message': 'Bad verb'},
        }
        try:
            self.proxy.getStatus(domainId='domainId')
        except ValueError as e:
            self.assertEqual('Bad verb', str(e))
        else:
            # No exception at all means the error was swallowed.
            self.fail()

    def testSetActionDone(self):
        """repositoryActionDone posts the domain and repository identifiers."""
        self.response = {}
        self.proxy.repositoryActionDone(domainId='adomain', repositoryId='repo1')
        expectedRequest = (
            'http://localhost/action/repositoryDone',
            'domainId=adomain&identifier=repo1',
        )
        self.assertEqual(expectedRequest, self.requests[0])