def _submitJob(self, jobNum, module):
	fd, jdl = tempfile.mkstemp('.jdl')
	try:
		data = self.makeJDL(jobNum, module)
		utils.safeWrite(os.fdopen(fd, 'w'), data)
	except Exception:
		utils.removeFiles([jdl])
		raise BackendError('Could not write jdl data to %s.' % jdl)
	try:
		tmp = utils.filterDict(self._submitParams, vF = lambda v: v)
		# flatten the remaining (option, value) pairs into a single argument string
		params = str.join(' ', map(lambda kv: '%s %s' % kv, tmp.items()))
		log = tempfile.mktemp('.log')
		activity = utils.ActivityLog('submitting jobs')
		proc = utils.LoggedProcess(self._submitExec, '%s --nomsg --noint --logfile "%s" "%s"' % (params, log, jdl))
		wmsId = None
		for line in filter(lambda x: x.startswith('http'), map(str.strip, proc.iter())):
			wmsId = line
		retCode = proc.wait()
		del activity
		if (retCode != 0) or (wmsId is None):
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log, jdl = jdl)
	finally:
		utils.removeFiles([log, jdl])
	return (jobNum, utils.QM(wmsId, self._createId(wmsId), None), {'jdl': str.join('', data)})
def _submitJob(self, jobNum, module):
	fd, jdl = tempfile.mkstemp('.jdl')
	try:
		jdlData = self.makeJDL(jobNum, module)
		utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
	except Exception:
		utils.removeFiles([jdl])
		raise BackendError('Could not write jdl data to %s.' % jdl)
	try:
		submitArgs = []
		for key_value in utils.filterDict(self._submitParams, vF = lambda v: v).items():
			submitArgs.extend(key_value)
		submitArgs.append(jdl)
		activity = Activity('submitting job %d' % jobNum)
		proc = LocalProcess(self._submitExec, '--nomsg', '--noint', '--logfile', '/dev/stderr', *submitArgs)
		gcID = None
		for line in ifilter(lambda x: x.startswith('http'), imap(str.strip, proc.stdout.iter(timeout = 60))):
			gcID = line
		retCode = proc.status(timeout = 0, terminate = True)
		activity.finish()
		if (retCode != 0) or (gcID is None):
			if self.explainError(proc, retCode):
				pass
			else:
				self._log.log_process(proc, files = {'jdl': SafeFile(jdl).read()})
	finally:
		utils.removeFiles([jdl])
	return (jobNum, utils.QM(gcID, self._createId(gcID), None), {'jdl': str.join('', jdlData)})
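# --- Hedged example: how the submit-argument construction above behaves. ---
# Standalone sketch only; filter_dict stands in for utils.filterDict, and the
# option names/values below are illustrative, not taken from the source.
def filter_dict(mapping, value_filter):
	# keep only entries whose value passes the filter (here: truthy values)
	return dict((key, value) for key, value in mapping.items() if value_filter(value))

submit_params = {'--config': 'glite.conf', '-d': None}  # '-d': None gets dropped
submit_args = []
for key_value in filter_dict(submit_params, lambda v: v).items():
	submit_args.extend(key_value)  # flatten ('--config', 'glite.conf') into the list
submit_args.append('job_0.jdl')
print(submit_args)  # ['--config', 'glite.conf', 'job_0.jdl']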
def cancelJobs(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	waitFlag = False
	for ids in map(lambda x: allIds[x:x+5], range(0, len(allIds), 5)):
		# Delete jobs in groups of 5 - with 5 seconds between groups
		if waitFlag and not utils.wait(5):
			break
		waitFlag = True
		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)
		log = tempfile.mktemp('.log')
		activity = utils.ActivityLog('cancelling jobs')
		proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" -i "%s"' % (log, jobs))
		retCode = proc.wait()
		del activity
		# select cancelled jobs
		for deletedWMSId in filter(lambda x: x.startswith('- '), proc.iter()):
			deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
			yield (jobNumMap.get(deletedWMSId), deletedWMSId)
		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log)
		utils.removeFiles([log, jobs])
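# --- Hedged example: the slicing idiom used above to process ids in groups. ---
# Standalone sketch (plain Python, no grid-control imports); the group size of 5
# mirrors the comment in cancelJobs.
def chunks(seq, size):
	for start in range(0, len(seq), size):
		yield seq[start:start + size]

for group in chunks(list(range(12)), 5):
	print(group)  # [0..4], then [5..9], then [10, 11]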
def checkJobs(self, ids):
	if len(ids) == 0:
		raise StopIteration
	jobNumMap = dict(ids)
	jobs = ' '.join(self._getRawIDs(ids))
	log = tempfile.mktemp('.log')
	activity = utils.ActivityLog('checking job status')
	proc = utils.LoggedProcess(self._statusExec, '--level 0 --logfile "%s" %s' % (log, jobs))
	for jobOutput in proc.getOutput().split('******')[1:]:
		data = {}
		for statusRegexLevel0 in self._statusRegexLevel0:
			match = re.match(statusRegexLevel0, jobOutput.replace('\n', ' '))
			if match:
				data = match.groupdict()
				break
		data['id'] = self._createId(data['rawId'])
		yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data.get('status', 'DONE-FAILED')], data)
	retCode = proc.wait()
	del activity
	if retCode != 0:
		if self.explainError(proc, retCode):
			pass
		else:
			proc.logError(self.errorLog, log = log, jobs = jobs)
	utils.removeFiles([log])
def cancelJobs(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	waitFlag = False
	for ids in imap(lambda x: allIds[x:x+5], irange(0, len(allIds), 5)):
		# Delete jobs in groups of 5 - with 5 seconds between groups
		if waitFlag and not utils.wait(5):
			break
		waitFlag = True
		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)
		activity = utils.ActivityLog('cancelling jobs')
		proc = LocalProcess(self._cancelExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs)
		retCode = proc.status(timeout = 60, terminate = True)
		del activity
		# select cancelled jobs
		for deletedWMSId in ifilter(lambda x: x.startswith('- '), proc.stdout.iter()):
			deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
			yield (jobNumMap.get(deletedWMSId), deletedWMSId)
		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
		utils.removeFiles([jobs])
def _getJobsOutput(self, ids):
	if len(ids) == 0:
		raise StopIteration
	basePath = os.path.join(self._outputPath, 'tmp')
	try:
		if len(ids) == 1:
			# For single jobs create single subdir
			tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
		else:
			tmpPath = basePath
		utils.ensureDirExists(tmpPath)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)
	jobNumMap = dict(ids)
	jobs = self.writeWMSIds(ids)
	activity = Activity('retrieving %d job outputs' % len(ids))
	proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmpPath)
	# yield output dirs
	todo = list(jobNumMap.values())  # list() so that todo.remove() works on all Python versions
	currentJobNum = None
	for line in imap(str.strip, proc.stdout.iter(timeout = 60)):
		if line.startswith(tmpPath):
			todo.remove(currentJobNum)
			outputDir = line.strip()
			if os.path.exists(outputDir):
				if 'GC_WC.tar.gz' in os.listdir(outputDir):
					wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
					try:
						tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
						os.unlink(wildcardTar)
					except Exception:
						self._log.error('Can\'t unpack output files contained in %s', wildcardTar)
			yield (currentJobNum, line.strip())
			currentJobNum = None
		else:
			currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
	retCode = proc.status(timeout = 0, terminate = True)
	activity.finish()
	if retCode != 0:
		if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout = 0):
			utils.removeFiles([jobs, basePath])
			raise StopIteration
		else:
			self._log.log_process(proc, files = {'jobs': SafeFile(jobs).read()})
		self._log.error('Trying to recover from error ...')
		for dirName in os.listdir(basePath):
			yield (None, os.path.join(basePath, dirName))
	# return unretrievable jobs
	for jobNum in todo:
		yield (jobNum, None)
	utils.removeFiles([jobs, basePath])
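# --- Hedged example: the GC_WC.tar.gz unpacking step in isolation. ---
# Sketch only; it adds a context manager so the archive is closed even if
# extractall fails (the snippets above leave the TarFile object open).
import os
import tarfile

def unpack_wildcard_tar(output_dir, name='GC_WC.tar.gz'):
	archive = os.path.join(output_dir, name)
	if os.path.exists(archive):
		with tarfile.open(archive, 'r:gz') as tar:
			tar.extractall(output_dir)  # unpack wildcard output files next to the archive
		os.unlink(archive)  # drop the archive once its contents are unpacked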
def _readJobs(self, jobLimit):
	jobMap = {}
	maxJobs = 0
	if os.path.exists(self._dbFile):
		try:
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
		except Exception:  # Try to recover job archive
			self._log.warning('=' * 40 + '\nStarting recovery of broken job database' +
				' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
			os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
			os.rename(self._dbFile, self._dbFile + '.broken')
			os.rename(self._dbFile + '.tmp', self._dbFile)
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			removeFiles([self._dbFile + '.broken'])
			brokenList = []
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				try:
					fp = tar.open(fnTarInfo)
					try:
						fp.read()
					finally:
						fp.close()
				except Exception:
					clear_current_exception()
					brokenList.append(fnTarInfo)  # record unreadable entries for removal below
			for broken in brokenList:
				os.system('zip %s -d %s' % (self._dbFile, broken))
			self._log.info('Recover completed!')
		activity = Activity('Reading job transactions')
		maxJobs = len(tar.namelist())
		tMap = {}
		for idx, fnTarInfo in enumerate(tar.namelist()):
			(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
			if tid < tMap.get(jobNum, 0):
				continue
			try:
				data = self._fmt.parse(tar.open(fnTarInfo).read())
			except Exception:
				continue
			jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
			tMap[jobNum] = tid
			if idx % 100 == 0:
				activity.update('Reading job transactions %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
		activity.finish()
	self._serial = maxJobs
	return jobMap
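# --- Hedged example: the archive member naming implied by the parsing above. ---
# tuple(imap(lambda s: int(s[1:]), name.split('_', 1))) strips one prefix
# character per part, so members presumably look like 'J<jobNum>_T<tid>'; the
# exact prefix letters are an assumption, only the slicing comes from the source.
def parse_member_name(name):
	job_part, tid_part = name.split('_', 1)
	return int(job_part[1:]), int(tid_part[1:])

print(parse_member_name('J12_T3'))  # -> (12, 3); the highest tid per job wins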
def image(self):
	cherrypy.response.headers['Content-Type'] = 'image/png'
	nodes = ["MetadataSplitter", "RunSplitter"]
	edges = [("MetadataSplitter", "RunSplitter")]
	nodeStr = str.join('', map(lambda x: '%s [label="%s", fillcolor="/set312/1", style="filled"]\n' % (x, x), nodes))
	edgeStr = str.join('', map(lambda x: '%s -> %s' % x, edges))
	inp = "digraph mygraph { overlap=False; ranksep=1.5; %s; %s; }" % (nodeStr, edgeStr)
	fd, fn = tempfile.mkstemp()
	fp = os.fdopen(fd, 'w')
	fp.write(inp)
	fp.close()  # flush the graph description before neato reads the file
	proc = utils.LoggedProcess('neato', '%s -Tpng' % fn)
	result = proc.getOutput()
	utils.removeFiles([fn])
	return result
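# --- Hedged example: the same graph rendering with plain subprocess. ---
# utils.LoggedProcess is grid-control specific; this sketch assumes a local
# graphviz installation providing 'neato' and pipes the graph in via stdin
# instead of a temporary file.
import subprocess

dot_input = 'digraph g { a [label="MetadataSplitter"]; b [label="RunSplitter"]; a -> b; }'
png_bytes = subprocess.run(['neato', '-Tpng'],
	input=dot_input.encode(), stdout=subprocess.PIPE, check=True).stdout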
def __init__(self, config, jobLimit = -1, jobSelector = None):
	dbPath = config.getWorkPath('jobs')
	self._dbFile = config.getWorkPath('jobs.zip')
	if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
		activity = Activity('Converting job database')
		self._serial = 0
		try:
			oldDB = TextFileJobDB(config)
			for jobNum in oldDB.getJobs():
				self.commit(jobNum, oldDB.get(jobNum))
		except Exception:
			removeFiles([self._dbFile])
			raise
		activity.finish()
	ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
def _getJobsOutput(self, ids):
	if not len(ids):
		raise StopIteration
	activity = Activity('retrieving %d job outputs' % len(ids))
	for gcID, jobNum in ids:
		path = self._sandbox_helper.get_sandbox(gcID)
		if path is None:
			yield (jobNum, None)
			continue
		# Cleanup sandbox
		outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
		utils.removeFiles(ifilter(lambda x: x not in outFiles, imap(lambda fn: os.path.join(path, fn), os.listdir(path))))
		yield (jobNum, path)
	activity.finish()
def _getJobsOutput(self, ids):
	if not len(ids):
		raise StopIteration
	activity = utils.ActivityLog('retrieving job outputs')
	for wmsId, jobNum in ids:
		path = self._getSandbox(wmsId)
		if path is None:
			yield (jobNum, None)
			continue
		# Cleanup sandbox
		outFiles = utils.listMapReduce(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles)
		utils.removeFiles(filter(lambda x: x not in outFiles, map(lambda fn: os.path.join(path, fn), os.listdir(path))))
		yield (jobNum, path)
	del activity
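# --- Hedged example: the sandbox cleanup in isolation. ---
# Standalone sketch using itertools.chain in place of the grid-control helpers
# (lchain / listMapReduce); it returns what would be deleted instead of deleting.
import glob
import os
from itertools import chain

def surplus_sandbox_files(path, output_patterns):
	# files matching any declared output pattern are kept
	keep = set(chain.from_iterable(
		glob.glob(os.path.join(path, pat)) for pat in output_patterns))
	all_files = (os.path.join(path, fn) for fn in os.listdir(path))
	return [fn for fn in all_files if fn not in keep]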
def __init__(self, config, jobLimit=-1, jobSelector=None): dbPath = config.getWorkPath("jobs") self._dbFile = config.getWorkPath("jobs.zip") if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile): log = utils.ActivityLog("Converting job database...") self._serial = 0 try: oldDB = JobDB(config) oldDB.readJobs(-1) for jobNum in oldDB.getJobs(): self.commit(jobNum, oldDB.get(jobNum)) except Exception: utils.removeFiles([dbFile]) raise del log ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
def _getJobsOutput(self, ids):
	if not len(ids):
		raise StopIteration
	activity = utils.ActivityLog('retrieving job outputs')
	for wmsId, jobNum in ids:
		path = self._getSandbox(wmsId)
		if path is None:
			yield (jobNum, None)
			continue
		# Cleanup sandbox
		outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
		utils.removeFiles(ifilter(lambda x: x not in outFiles, imap(lambda fn: os.path.join(path, fn), os.listdir(path))))
		yield (jobNum, path)
	del activity
def readJobs(self, jobLimit):
	jobMap = {}
	maxJobs = 0
	if os.path.exists(self._dbFile):
		try:
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
		except Exception:  # Try to recover job archive
			utils.eprint('=' * 40 + '\nStarting recovery of broken job database')
			utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
			os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
			os.rename(self._dbFile, self._dbFile + '.broken')
			os.rename(self._dbFile + '.tmp', self._dbFile)
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			utils.removeFiles([self._dbFile + '.broken'])
			brokenList = []
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				try:
					fp = tar.open(fnTarInfo)
					try:
						fp.read()
					finally:
						fp.close()
				except Exception:
					brokenList.append(fnTarInfo)  # record unreadable entries for removal below
			for broken in brokenList:
				os.system('zip %s -d %s' % (self._dbFile, broken))
			utils.eprint('Recover completed!')
		activity = utils.ActivityLog('Reading job transactions ...')
		maxJobs = len(tar.namelist())
		tMap = {}
		for idx, fnTarInfo in enumerate(tar.namelist()):
			(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
			if tid < tMap.get(jobNum, 0):
				continue
			data = utils.DictFormat(escapeString = True).parse(tar.open(fnTarInfo).read())
			jobMap[jobNum] = Job.loadData(fnTarInfo, data)
			tMap[jobNum] = tid
			if idx % 100 == 0:
				activity.finish()
				activity = utils.ActivityLog('Reading job transactions ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
	self._serial = maxJobs
	return jobMap
def cancelJobs(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	waitFlag = False
	for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
		# Delete jobs in chunks of _nJobsPerChunk - with 5 seconds between chunks
		if waitFlag and not utils.wait(5):
			break
		waitFlag = True
		jobNumMap = dict(ids)
		jobs = ' '.join(self._getRawIDs(ids))
		log = tempfile.mktemp('.log')
		activity = utils.ActivityLog('cancelling jobs')
		proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" %s' % (log, jobs))
		retCode = proc.wait()
		del activity
		# select cancelled jobs
		for rawId in self._getRawIDs(ids):
			deletedWMSId = self._createId(rawId)
			yield (jobNumMap.get(deletedWMSId), deletedWMSId)
		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log)
		purgeLog = tempfile.mktemp('.log')
		purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, jobs))
		retCode = purgeProc.wait()
		if retCode != 0:
			if self.explainError(purgeProc, retCode):
				pass
			else:
				purgeProc.logError(self.errorLog, log = purgeLog, jobs = jobs)  # fixed: logged the wrong process before
		utils.removeFiles([log, purgeLog])
def checkJobs(self, ids):
	if len(ids) == 0:
		raise StopIteration
	jobNumMap = dict(ids)
	jobs = self.writeWMSIds(ids)
	activity = utils.ActivityLog('checking job status')
	proc = LocalProcess(self._statusExec, '--verbosity', 1, '--noint', '--logfile', '/dev/stderr', '-i', jobs)
	for data in self._parseStatus(proc.stdout.iter(timeout = 60)):
		data['id'] = self._createId(data['id'])
		yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data['status']], data)
	retCode = proc.status(timeout = 0, terminate = True)
	del activity
	if retCode != 0:
		if self.explainError(proc, retCode):
			pass
		else:
			self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
	utils.removeFiles([jobs])
def readJobs(self, jobLimit): jobMap = {} maxJobs = 0 if os.path.exists(self._dbFile): try: tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED) except Exception: # Try to recover job archive utils.eprint("=" * 40 + "\nStarting recovery of broken job database") utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + "=" * 40) os.system("zip -FF %s --out %s.tmp 2> /dev/null" % (self._dbFile, self._dbFile)) os.rename(self._dbFile, self._dbFile + ".broken") os.rename(self._dbFile + ".tmp", self._dbFile) tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED) utils.removeFiles([self._dbFile + ".broken"]) brokenList = [] for idx, fnTarInfo in enumerate(tar.namelist()): (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1))) try: rawData = tar.open(fnTarInfo).read() except Exception: pass for broken in brokenList: os.system("zip %s -d %s" % (self._dbFile, broken)) utils.eprint("Recover completed!") log = None maxJobs = len(tar.namelist()) tMap = {} for idx, fnTarInfo in enumerate(tar.namelist()): (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1))) if tid < tMap.get(jobNum, 0): continue data = utils.DictFormat(escapeString=True).parse(tar.open(fnTarInfo).read()) jobMap[jobNum] = Job.loadData(fnTarInfo, data) tMap[jobNum] = tid if idx % 100 == 0: del log log = utils.ActivityLog("Reading job transactions ... %d [%d%%]" % (idx, (100.0 * idx) / maxJobs)) self._serial = maxJobs return jobMap
def checkJobs(self, ids):
	if len(ids) == 0:
		raise StopIteration
	jobNumMap = dict(ids)
	jobs = self.writeWMSIds(ids)
	log = tempfile.mktemp('.log')
	activity = utils.ActivityLog('checking job status')
	proc = utils.LoggedProcess(self._statusExec, '--verbosity 1 --noint --logfile "%s" -i "%s"' % (log, jobs))
	for data in self._parseStatus(proc.iter()):
		data['id'] = self._createId(data['id'])
		yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data['status']], data)
	retCode = proc.wait()
	del activity
	if retCode != 0:
		if self.explainError(proc, retCode):
			pass
		else:
			proc.logError(self.errorLog, log = log, jobs = jobs)
	utils.removeFiles([log, jobs])
def bulkSubmissionBegin(self):
	self._submitParams.update({ '-d': None })
	if self._discovery_module:
		self._submitParams.update({ '-e': self._discovery_module.getWMS() })
	if not self._useDelegate:
		self._submitParams.update({ '-a': ' ' })
		return True
	log = tempfile.mktemp('.log')
	try:
		dID = 'GCD' + md5(str(time.time())).hexdigest()[:10]
		activity = utils.ActivityLog('creating delegate proxy for job submission')
		proc = utils.LoggedProcess(self._delegateExec, '%s -d %s --noint --logfile "%s"' %
			(utils.QM(self._configVO, '--config "%s"' % self._configVO, ''), dID, log))
		output = proc.getOutput(wait = True)
		if ('glite-wms-job-delegate-proxy Success' in output) and (dID in output):
			self._submitParams.update({ '-d': dID })
		del activity
		if proc.wait() != 0:
			proc.logError(self.errorLog, log = log)
		return (self._submitParams.get('-d', None) is not None)
	finally:
		utils.removeFiles([log])
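# --- Hedged example: the delegation-id construction used above. ---
# Standalone sketch; hashlib.md5 needs bytes in Python 3, hence the encode().
import time
from hashlib import md5

delegation_id = 'GCD' + md5(str(time.time()).encode()).hexdigest()[:10]
print(delegation_id)  # e.g. 'GCD5f2a9c...' - a short, quasi-unique token per submission cycle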
def _getJobsOutput(self, ids): if len(ids) == 0: raise StopIteration basePath = os.path.join(self._outputPath, 'tmp') try: if len(ids) == 1: # For single jobs create single subdir tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest()) else: tmpPath = basePath utils.ensureDirExists(tmpPath) except Exception: raise BackendError('Temporary path "%s" could not be created.' % tmpPath, RuntimeError) jobNumMap = dict(ids) jobs = self.writeWMSIds(ids) log = tempfile.mktemp('.log') activity = utils.ActivityLog('retrieving job outputs') proc = utils.LoggedProcess(self._outputExec, '--noint --logfile "%s" -i "%s" --dir "%s"' % (log, jobs, tmpPath)) # yield output dirs todo = jobNumMap.values() currentJobNum = None for line in map(str.strip, proc.iter()): if line.startswith(tmpPath): todo.remove(currentJobNum) outputDir = line.strip() if os.path.exists(outputDir): if 'GC_WC.tar.gz' in os.listdir(outputDir): wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz') try: tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir) os.unlink(wildcardTar) except Exception: utils.eprint("Can't unpack output files contained in %s" % wildcardTar) pass yield (currentJobNum, line.strip()) currentJobNum = None else: currentJobNum = jobNumMap.get(self._createId(line), currentJobNum) retCode = proc.wait() del activity if retCode != 0: if 'Keyboard interrupt raised by user' in proc.getError(): utils.removeFiles([log, jobs, basePath]) raise StopIteration else: proc.logError(self.errorLog, log = log) utils.eprint('Trying to recover from error ...') for dirName in os.listdir(basePath): yield (None, os.path.join(basePath, dirName)) # return unretrievable jobs for jobNum in todo: yield (jobNum, None) utils.removeFiles([log, jobs, basePath])
def _getJobsOutput(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	basePath = os.path.join(self._outputPath, 'tmp')
	try:
		if len(allIds) == 1:
			# For single jobs create single subdir
			basePath = os.path.join(basePath, md5(allIds[0][0]).hexdigest())
		utils.ensureDirExists(basePath)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % basePath, BackendError)
	activity = utils.ActivityLog('retrieving job outputs')
	for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
		jobNumMap = dict(ids)
		jobs = ' '.join(self._getRawIDs(ids))
		log = tempfile.mktemp('.log')
		proc = utils.LoggedProcess(self._outputExec, '--noint --logfile "%s" --dir "%s" %s' % (log, basePath, jobs))
		# yield output dirs
		todo = jobNumMap.values()
		done = []
		currentJobNum = None
		for line in imap(str.strip, proc.iter()):
			match = re.match(self._outputRegex, line)
			if match:
				currentJobNum = jobNumMap.get(self._createId(match.groupdict()['rawId']))
				todo.remove(currentJobNum)
				done.append(match.groupdict()['rawId'])
				outputDir = match.groupdict()['outputDir']
				if os.path.exists(outputDir):
					if 'GC_WC.tar.gz' in os.listdir(outputDir):
						wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
						try:
							tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
							os.unlink(wildcardTar)
						except Exception:
							utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
				yield (currentJobNum, outputDir)
				currentJobNum = None
		retCode = proc.wait()
		if retCode != 0:
			if 'Keyboard interrupt raised by user' in proc.getError():
				utils.removeFiles([log, basePath])
				raise StopIteration
			else:
				proc.logError(self.errorLog, log = log)
			utils.eprint('Trying to recover from error ...')
			for dirName in os.listdir(basePath):
				yield (None, os.path.join(basePath, dirName))
	del activity
	# return unretrievable jobs - note: 'todo' and 'done' only reflect the last chunk here
	for jobNum in todo:
		yield (jobNum, None)
	purgeLog = tempfile.mktemp('.log')
	purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, " ".join(done)))
	retCode = purgeProc.wait()
	if retCode != 0:
		if self.explainError(purgeProc, retCode):
			pass
		else:
			purgeProc.logError(self.errorLog, log = purgeLog, jobs = done)  # fixed: logged the wrong process before
	utils.removeFiles([log, purgeLog, basePath])
def submitJobs(self, jobNumListFull, module):
	submitBatch = 25
	for index in range(0, len(jobNumListFull), submitBatch):
		jobNumList = jobNumListFull[index:index + submitBatch]
		self.debugOut("\nStarted submitting: %s" % jobNumList)
		self.debugPool()

		# get the full job config path and basename
		def _getJobCFG(jobNum):
			return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' % jobNum), 'job_%d.var' % jobNum

		activity = utils.ActivityLog('preparing jobs')
		# construct a temporary JDL for this batch of jobs
		jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
		jdlSubmitPath = jdlFilePath
		self.debugOut("Writing temporary jdl to: " + jdlSubmitPath)
		try:
			data = self.makeJDLdata(jobNumList, module)
			utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
		except Exception:
			utils.removeFiles([jdlFilePath])
			raise BackendError('Could not write jdl data to %s.' % jdlFilePath)

		# create the _jobconfig.sh file containing the actual data
		for jobNum in jobNumList:
			try:
				self._writeJobConfig(_getJobCFG(jobNum)[0], jobNum, module)
			except Exception:
				raise BackendError('Could not write _jobconfig data for %s.' % jobNum)

		self.debugOut("Copying to remote")
		# copy infiles to ssh/gsissh remote pool if required
		if self.remoteType == poolType.SSH or self.remoteType == poolType.GSISSH:
			activity = utils.ActivityLog('preparing remote scheduler')
			self.debugOut("Copying to sandbox")
			workdirBase = self.getWorkdirPath()
			# TODO: check whether shared remote files already exist and copy otherwise
			for fileDescr, fileSource, fileTarget in self._getSandboxFilesIn(module):
				copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(workdirBase, fileTarget))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()
			# copy job config files
			self.debugOut("Copying job configs")
			for jobNum in jobNumList:
				fileSource, fileTarget = _getJobCFG(jobNum)
				copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(self.getWorkdirPath(jobNum), fileTarget))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()
			# copy jdl
			self.debugOut("Copying jdl")
			jdlSubmitPath = os.path.join(workdirBase, os.path.basename(jdlFilePath))
			copyProcess = self.Pool.LoggedCopyToRemote(jdlFilePath, jdlSubmitPath)
			if copyProcess.wait() != 0:
				if self.explainError(copyProcess, copyProcess.wait()):
					pass
				else:
					copyProcess.logError(self.errorLog, brief=True)
			self.debugFlush()
			# copy proxy
			for authFile in self.proxy.getAuthFiles():
				self.debugOut("Copying proxy")
				copyProcess = self.Pool.LoggedCopyToRemote(authFile, os.path.join(self.getWorkdirPath(), os.path.basename(authFile)))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()

		self.debugOut("Starting jobs")
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
			activity = utils.ActivityLog('queuing jobs at scheduler')
			proc = self.Pool.LoggedProcess(self.submitExec, ' -verbose %(JDL)s' % { "JDL": jdlSubmitPath })
			self.debugOut("AAAAA")
			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			wmsJobIdList = []
			for line in proc.iter():
				if "GridControl_GCIDtoWMSID" in line:
					GCWMSID = line.split('=')[1].strip(' "\n').split('@')
					GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# - i.e. skip every second, but better be sure
					if (not wmsJobIdList) or (GCID not in zip(*wmsJobIdList)[0]):
						wmsJobIdList.append((self._createId(WMSID), GCID))
				if "GridControl_GCtoWMSID" in line:
					self.debugOut("o : %s" % line)
					self.debugOut("o : %s" % wmsJobIdList)
			retCode = proc.wait()
			if (retCode != 0) or (len(wmsJobIdList) < len(jobNumList)):
				if self.explainError(proc, retCode):
					pass
				else:
					utils.eprint("Submitted %4d jobs of %4d expected" % (len(wmsJobIdList), len(jobNumList)))
					proc.logError(self.errorLog, jdl = jdlFilePath)
		finally:
			utils.removeFiles([jdlFilePath])
		self.debugOut("Done Submitting")

		# yield the (jobNum, WMS ID, other data) of each job successively
		for index in range(len(wmsJobIdList)):
			yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {})
		self.debugOut("Yielded submitted job")
		self.debugFlush()
def submitJobs(self, jobNumListFull, module):
	submitBatch = 25
	for index in irange(0, len(jobNumListFull), submitBatch):
		jobNumList = jobNumListFull[index:index + submitBatch]
		self.debugOut("\nStarted submitting: %s" % jobNumList)
		self.debugPool()

		# get the full job config path and basename
		def _getJobCFG(jobNum):
			return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' % jobNum), 'job_%d.var' % jobNum

		activity = utils.ActivityLog('preparing jobs')
		# construct a temporary JDL for this batch of jobs
		jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
		jdlSubmitPath = jdlFilePath
		self.debugOut("Writing temporary jdl to: " + jdlSubmitPath)
		try:
			data = self.makeJDLdata(jobNumList, module)
			utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
		except Exception:
			utils.removeFiles([jdlFilePath])
			raise BackendError('Could not write jdl data to %s.' % jdlFilePath)

		# create the _jobconfig.sh file containing the actual data
		for jobNum in jobNumList:
			try:
				self._writeJobConfig(_getJobCFG(jobNum)[0], jobNum, module, {})
			except Exception:
				raise BackendError('Could not write _jobconfig data for %s.' % jobNum)

		self.debugOut("Copying to remote")
		# copy infiles to ssh/gsissh remote pool if required
		if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
			activity = utils.ActivityLog('preparing remote scheduler')
			self.debugOut("Copying to sandbox")
			workdirBase = self.getWorkdirPath()
			# TODO: check whether shared remote files already exist and copy otherwise
			for _, fileSource, fileTarget in self._getSandboxFilesIn(module):
				copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(workdirBase, fileTarget))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()
			# copy job config files
			self.debugOut("Copying job configs")
			for jobNum in jobNumList:
				fileSource, fileTarget = _getJobCFG(jobNum)
				copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(self.getWorkdirPath(jobNum), fileTarget))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()
			# copy jdl
			self.debugOut("Copying jdl")
			jdlSubmitPath = os.path.join(workdirBase, os.path.basename(jdlFilePath))
			copyProcess = self.Pool.LoggedCopyToRemote(jdlFilePath, jdlSubmitPath)
			if copyProcess.wait() != 0:
				if self.explainError(copyProcess, copyProcess.wait()):
					pass
				else:
					copyProcess.logError(self.errorLog, brief=True)
			self.debugFlush()
			# copy proxy
			for authFile in self._token.getAuthFiles():
				self.debugOut("Copying proxy")
				copyProcess = self.Pool.LoggedCopyToRemote(authFile, os.path.join(self.getWorkdirPath(), os.path.basename(authFile)))
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()

		self.debugOut("Starting jobs")
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
			activity = utils.ActivityLog('queuing jobs at scheduler')
			proc = self.Pool.LoggedExecute(self.submitExec, ' -verbose %(JDL)s' % {"JDL": jdlSubmitPath})
			self.debugOut("AAAAA")
			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			wmsJobIdList = []
			for line in proc.iter():
				if "GridControl_GCIDtoWMSID" in line:
					GCWMSID = line.split('=')[1].strip(' "\n').split('@')
					GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# - i.e. skip every second, but better be sure
					if (not wmsJobIdList) or (GCID not in lzip(*wmsJobIdList)[0]):
						wmsJobIdList.append((self._createId(WMSID), GCID))
				if "GridControl_GCtoWMSID" in line:
					self.debugOut("o : %s" % line)
					self.debugOut("o : %s" % wmsJobIdList)
			retCode = proc.wait()
			activity.finish()
			if (retCode != 0) or (len(wmsJobIdList) < len(jobNumList)):
				if self.explainError(proc, retCode):
					pass
				else:
					utils.eprint("Submitted %4d jobs of %4d expected" % (len(wmsJobIdList), len(jobNumList)))
					proc.logError(self.errorLog, jdl=jdlFilePath)
		finally:
			utils.removeFiles([jdlFilePath])
		self.debugOut("Done Submitting")

		# yield the (jobNum, WMS ID, other data) of each job successively
		for index in irange(len(wmsJobIdList)):
			yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {})
		self.debugOut("Yielded submitted job")
		self.debugFlush()
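# --- Hedged example: the ClassAd line parsing used in both submitJobs variants. ---
# The submit wrapper is assumed to emit lines like the one below; only the
# split('=')/strip/split('@') chain is taken from the source.
line = 'GridControl_GCIDtoWMSID = "7@1234.0"'
gc_part, wms_part = line.split('=')[1].strip(' "\n').split('@')
gc_id, wms_id = int(gc_part), wms_part.strip()
print(gc_id, wms_id)  # 7 1234.0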