def forceMove(source, target):
    """Remove an existing ``target`` tree so that ``source`` can replace it.

    Returns False (after printing a warning) when ``target`` exists but cannot
    be removed; otherwise the visible body falls through and returns None.

    NOTE(review): despite the name, nothing visible here moves ``source`` -
    this definition looks truncated; confirm against the complete function.
    """
    try:
        if os.path.exists(target):
            shutil.rmtree(target)
    # shutil.rmtree reports failures as OSError, which is not an IOError on
    # Python 2 - catch the common base class so the warning path is reachable.
    except EnvironmentError as e:
        utils.eprint('Warning: "%s" cannot be removed: %s' % (target, str(e)))
        return False
def cancelJobs(self, ids):
    """Cancel the given jobs and delete their sandboxes.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, wmsId)`` for every job whose sandbox directory was removed.
    Jobs without a sandbox are reported and skipped.

    Raises BackendError when a sandbox exists but cannot be deleted.
    """
    if not ids:
        # A plain return ends the generator on every Python version;
        # "raise StopIteration" becomes a RuntimeError under PEP 479.
        return

    activity = utils.ActivityLog('cancelling jobs')
    proc = utils.LoggedProcess(self.cancelExec, self.getCancelArguments(self._getRawIDs(ids)))
    if proc.wait() != 0:
        # Report every error line except those that mention an unknown job id
        for line in proc.getError().splitlines():
            if self.unknownID() not in line:
                utils.eprint(line.strip())
    del activity

    activity = utils.ActivityLog('waiting for jobs to finish')
    time.sleep(5)
    for wmsId, jobNum in ids:
        path = self._getSandbox(wmsId)
        if path is None:
            utils.eprint('Sandbox for job %d with wmsId "%s" could not be found' % (jobNum, wmsId))
            continue
        try:
            shutil.rmtree(path)
        except Exception:
            raise BackendError('Sandbox for job %d with wmsId "%s" could not be deleted' % (jobNum, wmsId))
        yield (jobNum, wmsId)
    del activity
def checkJobs(self, ids):
    """Query the status of the given jobs.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, wmsId, state, info)`` for every requested job; jobs that do not
    show up in the status output are reported as ``Job.DONE`` with empty info.
    """
    if not ids:
        # A plain return ends the generator on every Python version;
        # "raise StopIteration" becomes a RuntimeError under PEP 479.
        return

    activity = utils.ActivityLog('checking job status')
    proc = utils.LoggedProcess(self.statusExec, self.getCheckArguments(self._getRawIDs(ids)))

    # Collect the complete status output first, indexed by WMS id
    tmp = {}
    for data in self.parseStatus(proc.iter()):
        wmsId = self._createId(data['id'])
        tmp[wmsId] = (wmsId, self.parseJobState(data['status']), data)

    for wmsId, jobNum in ids:
        if wmsId not in tmp:
            yield (jobNum, wmsId, Job.DONE, {})
        else:
            yield tuple([jobNum] + list(tmp[wmsId]))

    retCode = proc.wait()
    del activity

    if retCode != 0:
        # Report every error line except those that mention an unknown job id
        for line in proc.getError().splitlines():
            if self.unknownID() not in line:
                utils.eprint(line)
def parseJobInfo(fn):
    """Read the job info file ``fn``.

    Prints a warning and returns the result of ``utils.eprint`` when the file
    is missing or unreadable.

    NOTE(review): the visible body only reads the content and then falls
    through (returning None) - the parsing part appears to be cut off.
    """
    if not os.path.exists(fn):
        return utils.eprint('Warning: "%s" does not exist.' % fn)
    try:
        fp = open(fn, 'r')
        try:
            # Close the handle explicitly instead of relying on garbage collection
            info_content = fp.read()
        finally:
            fp.close()
    except Exception as ex:
        return utils.eprint('Warning: Unable to read "%s"!\n%s' % (fn, str(ex)))
def doTransfer(self, listDescSourceTarget):
    """Copy every (description, source, target) entry to all configured SE paths.

    Raises ConfigError when no storage path is configured and StorageError
    when a copy failed and the user does not confirm that the file is
    available on the storage element anyway.
    """
    for (desc, source, target) in listDescSourceTarget:
        if not self.smPaths:
            raise ConfigError(
                "%s can't be transferred because '%s path wasn't set" % (desc, self.smOptPrefix))
        for seIndex, seBase in enumerate(set(self.smPaths)):
            utils.vprint('Copy %s to SE %d ' % (desc, seIndex + 1), -1, newline=False)
            sys.stdout.flush()
            destination = os.path.join(seBase, target)
            copyProc = se_copy(source, destination, self.smForce)
            exitCode = copyProc.status(timeout=5 * 60, terminate=True)
            if exitCode == 0:
                utils.vprint('finished', -1)
                continue
            # Copy failed: show the tool output and let the user decide
            utils.vprint('failed', -1)
            utils.eprint(copyProc.stderr.read(timeout=0))
            utils.eprint('Unable to copy %s! You can try to copy it manually.' % desc)
            question = 'Is %s (%s) available on SE %s?' % (desc, source, seBase)
            if not utils.getUserBool(question, False):
                raise StorageError('%s is missing on SE %s!' % (desc, seBase))
def cancelJobs(self, ids):
    """Cancel the given jobs and delete their sandboxes.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, wmsId)`` for every job whose sandbox directory was removed.
    Jobs without a sandbox are reported and skipped.

    Raises RuntimeError when a sandbox exists but cannot be deleted.
    """
    if not ids:
        # A plain return ends the generator on every Python version;
        # "raise StopIteration" becomes a RuntimeError under PEP 479.
        return

    activity = utils.ActivityLog('cancelling jobs')
    proc = utils.LoggedProcess(self.cancelExec, self.getCancelArguments(self._getRawIDs(ids)))
    if proc.wait() != 0:
        # Report every error line except those that mention an unknown job id
        for line in proc.getError().splitlines():
            if self.unknownID() not in line:
                utils.eprint(line.strip())
    del activity

    activity = utils.ActivityLog('waiting for jobs to finish')
    time.sleep(5)
    for wmsId, jobNum in ids:
        path = self._getSandbox(wmsId)
        if path is None:
            utils.eprint('Sandbox for job %d with wmsId "%s" could not be found' % (jobNum, wmsId))
            continue
        try:
            shutil.rmtree(path)
        # "except Exception" instead of a bare except, so KeyboardInterrupt,
        # SystemExit and GeneratorExit are not converted into a RuntimeError
        except Exception:
            raise RuntimeError('Sandbox for job %d with wmsId "%s" could not be deleted' % (jobNum, wmsId))
        yield (jobNum, wmsId)
    del activity
def _getJobsOutput(self, ids):
    """Retrieve the output sandboxes of the given jobs.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, outputDir)`` for every retrieved job, ``(None, dir)`` for
    leftover directories found after a failed retrieval and
    ``(jobNum, None)`` for jobs whose output could not be retrieved.

    Raises BackendError when the temporary output directory cannot be created.
    """
    if len(ids) == 0:
        # "raise StopIteration" inside a generator is a RuntimeError under PEP 479
        return

    basePath = os.path.join(self._outputPath, 'tmp')
    # Bound before the try block so the error message can always be formatted
    tmpPath = basePath
    try:
        if len(ids) == 1:
            # For single jobs create single subdir
            tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
        utils.ensureDirExists(tmpPath)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)

    jobNumMap = dict(ids)
    jobs = self.writeWMSIds(ids)

    activity = utils.ActivityLog('retrieving job outputs')
    proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr',
        '-i', jobs, '--dir', tmpPath)

    # yield output dirs
    todo = list(jobNumMap.values())
    currentJobNum = None
    for line in imap(str.strip, proc.stdout.iter(timeout = 60)):
        if line.startswith(tmpPath):
            # Guard the removal: a directory line without a preceding known
            # job id would otherwise abort the retrieval with a ValueError
            if currentJobNum in todo:
                todo.remove(currentJobNum)
            outputDir = line  # already stripped by imap(str.strip, ...)
            if os.path.exists(outputDir):
                if 'GC_WC.tar.gz' in os.listdir(outputDir):
                    wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                    try:
                        # NOTE(review): extractall trusts the member paths stored in the job's archive
                        archive = tarfile.TarFile.open(wildcardTar, 'r:gz')
                        try:
                            archive.extractall(outputDir)
                        finally:
                            archive.close()
                        os.unlink(wildcardTar)
                    except Exception:
                        utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
            yield (currentJobNum, outputDir)
            currentJobNum = None
        else:
            # Any other line may name the job whose directory follows
            currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
    retCode = proc.status(timeout = 0, terminate = True)
    del activity

    if retCode != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout = 0):
            utils.removeFiles([jobs, basePath])
            return
        self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
        utils.eprint('Trying to recover from error ...')
        for dirName in os.listdir(basePath):
            yield (None, os.path.join(basePath, dirName))

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)

    utils.removeFiles([jobs, basePath])
def submitJobs(self, jobNumList, module):
    """Submit jobs via GridWMS.submitJobs after trying to delegate the proxy.

    Raises BackendError when delegation fails and delegation is enforced;
    otherwise falls back to automatic delegation. Yields whatever
    GridWMS.submitJobs yields.
    """
    delegationOk = self.bulkSubmissionBegin()
    if (not delegationOk) and self._forceDelegate:
        # User switched on forcing delegation => exception
        raise BackendError('Unable to delegate proxy!')
    if not delegationOk:
        # Trying to delegate proxy failed
        utils.eprint('Unable to delegate proxy! Continue with automatic delegation...')
        self._submitParams.update({'-a': ' '})
        self._useDelegate = False
    baseSubmission = GridWMS.submitJobs(self, jobNumList, module)
    for result in baseSubmission:
        yield result
def __init__(self, config, source):
    """Set up the adapter and bring the job <-> parameter mapping up to date.

    Depending on the stored state and the parameter hash, the mapping is
    either loaded from the cached files, resynchronised, or written anew.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source. Confirm in particular that
    config.setState(True, 'init', detail='config') belongs to the
    "hash changed" branch and that the final config.set() runs after the
    if/elif chain rather than inside the doInit branch.
    """
    self._rawSource = source
    BasicParameterAdapter.__init__(self, config, source)
    self._mapJob2PID = {}
    if not os.path.isdir(config.getWorkPath()):
        os.makedirs(config.getWorkPath())
    self._pathJob2PID = config.getWorkPath('params.map.gz')
    self._pathParams = config.getWorkPath('params.dat.gz')

    # Find out if init should be performed - overrides userResync!
    userInit = config.getState('init', detail='parameters')
    needInit = False
    if not (os.path.exists(self._pathParams) and os.path.exists(self._pathJob2PID)):
        needInit = True  # Init needed if no parameter log exists
    if userInit and not needInit and (source.getMaxParameters() is not None):
        utils.eprint(
            'Re-Initialization will overwrite the current mapping between jobs and parameter/dataset content! This can lead to invalid results!'
        )
        # Answering yes downgrades the requested init (a resync may still be triggered below)
        if utils.getUserBool(
                'Do you want to perform a syncronization between the current mapping and the new one to avoid this?',
                True):
            userInit = False
    doInit = userInit or needInit

    # Find out if resync should be performed
    userResync = config.getState('resync', detail='parameters')
    # The resync request is cleared right after it has been read
    config.setState(False, 'resync', detail='parameters')
    needResync = False
    pHash = self._rawSource.getHash()
    # The current hash is passed as the default of the stored option
    self.storedHash = config.get('parameter hash', pHash, persistent=True)
    if self.storedHash != pHash:
        needResync = True  # Resync needed if parameters have changed
        self._log.info('Parameter hash has changed')
        self._log.debug('\told hash: %s', self.storedHash)
        self._log.debug('\tnew hash: %s', pHash)
        config.setState(True, 'init', detail='config')
    # A (re-)initialisation takes precedence over a resync
    doResync = (userResync or needResync) and not doInit

    if not doResync and not doInit:  # Reuse old mapping
        activity = utils.ActivityLog(
            'Loading cached parameter information')
        self.readJob2PID()
        activity.finish()
        return
    elif doResync:  # Perform sync
        activity = utils.ActivityLog('Syncronizing parameter information')
        self.storedHash = None
        self._resyncState = self.resync()
        activity.finish()
    elif doInit:  # Write current state
        self.writeJob2PID(self._pathJob2PID)
        ParameterSource.getClass('GCDumpParameterSource').write(
            self._pathParams, self)
    # Remember the hash of the parameter source that is now in effect
    config.set('parameter hash', self._rawSource.getHash())
def getEntries(self, path, metadata, events, seList, objStore):
    """Yield one entry tuple for every name listed in ``self.path``.

    The ``path`` argument is not used; the directory listed is ``self.path``,
    which is also stored in ``metadata['GC_SOURCE_DIR']``. Errors of the
    listing process are printed after the listing has been consumed.
    """
    metadata['GC_SOURCE_DIR'] = self.path
    progress = None
    listing = storage.se_ls(self.path)
    for numSeen, rawName in enumerate(listing.iter()):
        progress = utils.ActivityLog('Reading source directory - [%d]' % numSeen)
        entryPath = os.path.join(self.path, rawName.strip())
        yield (entryPath, metadata, events, seList, objStore)
    if listing.wait():
        utils.eprint(listing.getError())
def _socketHandler(self, maxFailCount=5):
    """Refresh the SSH link and keep the socket arguments in sync with it.

    A successful refresh restores the default socket arguments. A failed one
    increments the failure counter and clears the arguments. Once the counter
    exceeds ``maxFailCount`` the link is disabled.
    """
    if not self.sshLink:
        return
    linkAlive = self._refreshSSHLink()
    if not linkAlive:
        self.socketFailCount += 1
        if self.socketArgs != "":
            self.socketArgs = ""
    elif self.socketArgs != self.socketArgsDef:
        self.socketArgs = self.socketArgsDef
    tooManyFailures = self.socketFailCount > maxFailCount
    if tooManyFailures:
        eprint("Failed to create secure socket %s more than %s times!\nDisabling further attempts." % (self.sshLink, maxFailCount))
        self.sshLink = False
def processBlocks():
    # Generator over the blocks of self.getBlocksInternal(): fills in default
    # fields, applies the file / site / event filters and yields the blocks
    # that remain. It reads `self` from the enclosing scope.
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source.
    # Validation, Filtering & Naming:
    for block in self.getBlocksInternal():
        block.setdefault(DataProvider.BlockName, '0')
        block.setdefault(DataProvider.Provider, self.__class__.__name__)
        if self._datasetID:
            block[DataProvider.DatasetID] = self._datasetID
        if self._datasetNick:
            block[DataProvider.Nickname] = self._datasetNick
        else:
            block[DataProvider.Nickname] = self._nickProducer.process(block)

        # Sum the entries of all files and cross-check against the block-level
        # number (which defaults to that sum if the block has none)
        events = sum(map(lambda x: x[DataProvider.NEntries], block[DataProvider.FileList]))
        if block.setdefault(DataProvider.NEntries, events) != events:
            utils.eprint('WARNING: Inconsistency in block %s#%s: Number of events doesn\'t match (b:%d != f:%d)'
                % (block[DataProvider.Dataset], block[DataProvider.BlockName], block[DataProvider.NEntries], events))

        # Filter ignored and empty files
        block[DataProvider.FileList] = filter(lambda x: x[DataProvider.URL] not in self.ignoreURL, block[DataProvider.FileList])
        if self.emptyFiles:
            block[DataProvider.FileList] = filter(lambda x: x[DataProvider.NEntries] != 0, block[DataProvider.FileList])

        # Filter dataset sites (only when the block carries location information)
        if block.setdefault(DataProvider.Locations, None) != None:
            sites = utils.doBlackWhiteList(block[DataProvider.Locations], self.sitefilter, onEmpty = [], preferWL = False)
            if len(sites) == 0 and len(block[DataProvider.FileList]) != 0:
                utils.eprint('WARNING: Block %s#%s is not available at any site!'
                    % (block[DataProvider.Dataset], block[DataProvider.BlockName]))
            block[DataProvider.Locations] = sites

        # Filter by number of files
        # (QM presumably acts as a ternary helper: no slice limit for a negative limitFiles - confirm)
        block[DataProvider.FileList] = block[DataProvider.FileList][:QM(self.limitFiles < 0, None, self.limitFiles)]

        # Filter by event count
        class EventCounter:
            # Accepts files while the running entry count stays within `limit`;
            # a negative limit accepts everything.
            def __init__(self, start, limit):
                (self.counter, self.limit) = (start, limit)
            def accept(self, fi):
                if (self.limit < 0) or (self.counter + fi[DataProvider.NEntries] <= self.limit):
                    self.counter += fi[DataProvider.NEntries]
                    return True
                return False
        # The counter starts at the number of entries accepted in earlier blocks
        eventCounter = EventCounter(self.allEvents, self.limitEvents)
        block[DataProvider.FileList] = filter(eventCounter.accept, block[DataProvider.FileList])
        # Entries of this block = growth of the running total
        block[DataProvider.NEntries] = eventCounter.counter - self.allEvents
        self.allEvents = eventCounter.counter

        # Filter empty blocks: a block is dropped only if self.emptyBlock is set and it holds no entries
        if not (self.emptyBlock and block[DataProvider.NEntries] == 0):
            yield block
def retrieveJobs(self, ids):  # Process output sandboxes returned by getJobsOutput
    # NOTE(review): only the nested helper is visible here - the part that
    # consumes the retrieved sandboxes appears to be cut off.

    # Function to force moving a directory
    def forceMove(source, target):
        """Replace ``target`` by ``source``; returns False after a warning on failure.

        NOTE(review): no success return value is visible in this excerpt.
        """
        try:
            if os.path.exists(target):
                shutil.rmtree(target)
        # shutil.rmtree raises OSError, which is not an IOError on Python 2 -
        # catch the common base class instead
        except EnvironmentError as e:
            utils.eprint('Warning: "%s" cannot be removed: %s' % (target, str(e)))
            return False
        try:
            shutil.move(source, target)
        # shutil.move can raise OSError or shutil.Error as well as IOError
        except EnvironmentError as e:
            utils.eprint('Warning: Error moving job output directory from "%s" to "%s": %s' % (source, target, str(e)))
            return False
def __init__(self, config, source):
    """Set up the adapter and bring the job <-> parameter mapping up to date.

    Depending on the stored state and the parameter hash, the mapping is
    either loaded from the cached files, resynchronised, or written anew.

    NOTE(review): the block structure was reconstructed from a
    whitespace-collapsed source. Confirm in particular that
    config.setState(True, 'init', detail = 'config') belongs to the
    "hash changed" branch and that the final config.set() runs after the
    if/elif chain rather than inside the doInit branch.
    """
    self._rawSource = source
    BasicParameterAdapter.__init__(self, config, source)
    self._mapJob2PID = {}
    if not os.path.isdir(config.getWorkPath()):
        os.makedirs(config.getWorkPath())
    self._pathJob2PID = config.getWorkPath('params.map.gz')
    self._pathParams = config.getWorkPath('params.dat.gz')

    # Find out if init should be performed - overrides userResync!
    userInit = config.getState('init', detail = 'parameters')
    needInit = False
    if not (os.path.exists(self._pathParams) and os.path.exists(self._pathJob2PID)):
        needInit = True # Init needed if no parameter log exists
    if userInit and not needInit and (source.getMaxParameters() is not None):
        utils.eprint('Re-Initialization will overwrite the current mapping between jobs and parameter/dataset content! This can lead to invalid results!')
        # Answering yes downgrades the requested init (a resync may still be triggered below)
        if utils.getUserBool('Do you want to perform a syncronization between the current mapping and the new one to avoid this?', True):
            userInit = False
    doInit = userInit or needInit

    # Find out if resync should be performed
    userResync = config.getState('resync', detail = 'parameters')
    # The resync request is cleared right after it has been read
    config.setState(False, 'resync', detail = 'parameters')
    needResync = False
    pHash = self._rawSource.getHash()
    # The current hash is passed as the default of the stored option
    self._storedHash = config.get('parameter hash', pHash, persistent = True)
    if self._storedHash != pHash:
        needResync = True # Resync needed if parameters have changed
        self._log.info('Parameter hash has changed')
        self._log.debug('\told hash: %s', self._storedHash)
        self._log.debug('\tnew hash: %s', pHash)
        config.setState(True, 'init', detail = 'config')
    # A (re-)initialisation takes precedence over a resync
    doResync = (userResync or needResync) and not doInit

    if not doResync and not doInit: # Reuse old mapping
        activity = utils.ActivityLog('Loading cached parameter information')
        self._readJob2PID()
        activity.finish()
        return
    elif doResync: # Perform sync
        activity = utils.ActivityLog('Syncronizing parameter information')
        self._storedHash = None
        self._resyncState = self.resync()
        activity.finish()
    elif doInit: # Write current state
        self._writeJob2PID(self._pathJob2PID)
        ParameterSource.getClass('GCDumpParameterSource').write(self._pathParams, self)
    # Remember the hash of the parameter source that is now in effect
    config.set('parameter hash', self._rawSource.getHash())
def doTransfer(self, listDescSourceTarget):
    """Copy every (description, source, target) entry to all configured SE paths.

    Raises ConfigError when no storage path is configured and StorageError
    when a copy failed and the user does not confirm that the file is
    available on the storage element anyway.
    """
    copyTimeout = 5*60
    for (desc, source, target) in listDescSourceTarget:
        if not self.smPaths:
            raise ConfigError("%s can't be transferred because '%s path wasn't set" % (desc, self.smOptPrefix))
        for position, storagePath in enumerate(set(self.smPaths)):
            utils.vprint('Copy %s to SE %d ' % (desc, position + 1), -1, newline = False)
            sys.stdout.flush()
            transfer = se_copy(source, os.path.join(storagePath, target), self.smForce)
            succeeded = (transfer.status(timeout = copyTimeout, terminate = True) == 0)
            if succeeded:
                utils.vprint('finished', -1)
                continue
            # Copy failed: show the tool output and let the user decide
            utils.vprint('failed', -1)
            utils.eprint(transfer.stderr.read(timeout = 0))
            utils.eprint('Unable to copy %s! You can try to copy it manually.' % desc)
            userConfirmed = utils.getUserBool('Is %s (%s) available on SE %s?' % (desc, source, storagePath), False)
            if not userConfirmed:
                raise StorageError('%s is missing on SE %s!' % (desc, storagePath))
def _submitJob(self, jobNum, module):
    """Submit a single job and return ``(jobNum, wmsId or None, {'sandbox': path})``.

    A fresh sandbox directory is created below ``self.sandPath``, the input
    files are transferred into it and the job config is written before the
    submit executable is started.

    Raises BackendError when the sandbox directory cannot be created.
    """
    activity = utils.ActivityLog('submitting jobs')

    try:
        sandbox = self.sandPath  # defined here for exception message in case os.mkdir fails
        if not os.path.exists(self.sandPath):
            os.mkdir(self.sandPath)
        sandbox = tempfile.mkdtemp('', '%s.%04d.' % (module.taskID, jobNum), self.sandPath)
    except Exception:
        raise BackendError('Unable to create sandbox directory "%s"!' % sandbox)

    # Input files are placed relative to the sandbox name below self.sandPath
    sbPrefix = sandbox.replace(self.sandPath, '').lstrip('/')
    def translateTarget(d, s, t):
        return (d, s, os.path.join(sbPrefix, t))
    self.smSBIn.doTransfer(ismap(translateTarget, self._getSandboxFilesIn(module)))

    cfgPath = os.path.join(sandbox, '_jobconfig.sh')
    self._writeJobConfig(cfgPath, jobNum, module, {'GC_SANDBOX': sandbox,
        'GC_SCRATCH_SEARCH': str.join(' ', self.scratchPath)})
    reqs = self.brokerSite.brokerAdd(module.getRequirements(jobNum), WMS.SITES)
    reqs = dict(self.brokerQueue.brokerAdd(reqs, WMS.QUEUES))
    if (self.memory > 0) and (reqs.get(WMS.MEMORY, 0) < self.memory):
        reqs[WMS.MEMORY] = self.memory  # local jobs need higher (more realistic) memory requirements

    (stdout, stderr) = (os.path.join(sandbox, 'gc.stdout'), os.path.join(sandbox, 'gc.stderr'))
    jobName = module.getDescription(jobNum).jobName
    proc = utils.LoggedProcess(self.submitExec, '%s %s "%s" %s' % (self.submitOpts,
        self.getSubmitArguments(jobNum, jobName, reqs, sandbox, stdout, stderr),
        utils.pathShare('gc-local.sh'), self.getJobArguments(jobNum, sandbox)))
    retCode = proc.wait()
    wmsIdText = proc.getOutput().strip()
    try:
        wmsId = self.parseSubmitOutput(wmsIdText)
    except Exception:
        wmsId = None

    del activity

    if retCode != 0:
        utils.eprint('WARNING: %s failed:' % self.submitExec)
    elif wmsId is None:
        utils.eprint('WARNING: %s did not yield job id:\n%s' % (self.submitExec, wmsIdText))

    if wmsId:
        wmsId = self._createId(wmsId)
        # Create an empty marker file named after the job id inside the sandbox
        # and close it right away instead of leaking the handle
        open(os.path.join(sandbox, wmsId), 'w').close()
    else:
        proc.logError(self.errorLog)
    return (jobNum, utils.QM(wmsId, wmsId, None), {'sandbox': sandbox})
def checkJobsDirect(self, ids):
    """Query the status of every job directly, one request per job.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, wmsId, state, data)`` for every job whose status could be
    determined. Errors are collected and printed once at the end.

    NOTE(review): the placement of the abort check inside the except branch
    was reconstructed from a whitespace-collapsed source - confirm.
    """
    if len(ids) == 0:
        # "raise StopIteration" inside a generator is a RuntimeError under PEP 479
        return

    activity = utils.ActivityLog('checking job status')
    errors = []
    for (wmsId, jobNum) in ids:
        try:
            # Drop empty and '0' values from the raw status information
            data = utils.filterDict(dict(getStatusDirect(self._splitId(wmsId)[0])),
                vF = lambda v: (v != '') and (v != '0'))
            data['id'] = self._createId(data.get('jobid', wmsId))
            data['dest'] = data.get('destination', 'N/A')
            yield (jobNum, data['id'], self._statusMap[data['status'].lower()], data)
        # Not a bare except: closing the generator raises GeneratorExit at the
        # yield above, which must propagate instead of being recorded as an error
        except Exception:
            errors.append(repr(sys.exc_info()[1]))
            if utils.abort():
                break
    del activity
    if errors:
        utils.eprint('The following glite errors have occured:\n%s' % str.join('\n', errors))
def _getUserSource(self, pExpr, parent):
    """Build the parameter source described by the expression ``pExpr``.

    Returns ``parent`` unchanged for an empty expression. Otherwise the
    expression is evaluated with one factory function per entry of
    ``ParameterSource.managerMap`` and the result is combined with ``parent``
    in a ZipLongParameterSource.

    Raises ParameterError from within the factory functions; any other
    evaluation error is re-raised after the available functions were printed.
    """
    if not pExpr:
        return parent

    # Wrap psource factory functions
    def createWrapper(clsName):
        def wrapper(*args):
            try:
                parameterClass = ParameterSource.getClass(clsName)
            except Exception:
                raise ParameterError('Unable to create parameter source "%r"!' % clsName)
            try:
                return parameterClass.create(self.paramConfig, *args)
            except Exception:
                raise ParameterError('Error while creating "%r" with arguments "%r"' % (parameterClass.__name__, args))
        return wrapper

    # Generator expression instead of a tuple-unpacking lambda (removed by PEP 3113)
    userFun = dict((key, createWrapper(clsName))
        for (key, clsName) in ParameterSource.managerMap.items())
    try:
        # NOTE(review): eval() executes arbitrary code - pExpr must come from a trusted configuration
        source = eval(pExpr, userFun)
    except Exception:
        utils.eprint('Available functions: %s' % list(userFun.keys()))
        raise
    return ZipLongParameterSource(parent, source)
def checkJobsDirect(self, ids):
    """Query the status of every job directly, one request per job.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, wmsId, state, data)`` for every job whose status could be
    determined. Errors are collected and printed once at the end.

    NOTE(review): the placement of the abort check inside the except branch
    was reconstructed from a whitespace-collapsed source - confirm.
    """
    if len(ids) == 0:
        # "raise StopIteration" inside a generator is a RuntimeError under PEP 479
        return

    activity = utils.ActivityLog('checking job status')
    errors = []
    for (wmsId, jobNum) in ids:
        try:
            # Drop empty and '0' values from the raw status information
            data = utils.filterDict(dict(
                getStatusDirect(self._splitId(wmsId)[0])),
                vF=lambda v: (v != '') and (v != '0'))
            data['id'] = self._createId(data.get('jobid', wmsId))
            data['dest'] = data.get('destination', 'N/A')
            yield (jobNum, data['id'],
                self._statusMap[data['status'].lower()], data)
        except Exception:
            errors.append(repr(sys.exc_info()[1]))
            if utils.abort():
                break
    del activity
    if errors:
        utils.eprint('The following glite errors have occured:\n%s' % str.join('\n', errors))
def scriptThread(self, script, jobNum = None, jobObj = None, allDict = None):
    """Run ``script`` with task and job information exported as GC_* variables.

    The variables are collected from the submit info, the job object, the work
    directory / config file, the task and job config and finally ``allDict``
    (later sources override earlier ones). They are written to ``os.environ``
    and substituted into ``script``. A GCError is reported, not propagated.
    """
    try:
        tmp = {}
        if jobNum is not None:
            tmp.update(self.task.getSubmitInfo(jobNum))
        if jobObj is not None:
            tmp.update(jobObj.getAll())
        tmp.update({'WORKDIR': self.config.getWorkPath(), 'CFGFILE': self.config.configFile})
        tmp.update(self.task.getTaskConfig())
        tmp.update(self.task.getJobConfig(jobNum))
        if jobNum is not None:
            # Applied a second time so the submit info overrides task / job config
            tmp.update(self.task.getSubmitInfo(jobNum))
        # None replaces the former mutable default argument
        tmp.update(allDict or {})
        for key, value in tmp.items():
            os.environ["GC_%s" % key] = str(value)

        script = self.task.substVars(script, jobNum, tmp)
        if self.silent:
            utils.LoggedProcess(script).wait()
        else:
            os.system(script)
    except GCError as ex:
        # Report the message of the caught exception - the previous
        # "GCError.message" referred to the class, not to this error
        utils.eprint(getattr(ex, 'message', None) or str(ex))
def readJobs(self, jobLimit):
    """Load the job files ``job_<num>.txt`` from the work directory.

    Returns a dict mapping job number to the loaded job object. Reading stops
    with a warning once ``jobLimit`` jobs were read (a negative limit disables
    the check). Files whose name does not contain a job number are skipped.

    Raises JobError when the work directory cannot be created.
    """
    try:
        if not os.path.exists(self._dbPath):
            os.mkdir(self._dbPath)
    except Exception:
        raise JobError("Problem creating work directory '%s'" % self._dbPath)

    candidates = fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt')
    (jobMap, log, maxJobs) = ({}, None, len(candidates))
    for idx, jobFile in enumerate(candidates):
        if (jobLimit >= 0) and (len(jobMap) >= jobLimit):
            # Report the number of job files found, not the number already read
            # (which equals the limit at this point)
            utils.eprint('Stopped reading job infos! The number of job infos in the work directory (%d) ' % maxJobs, newline = False)
            utils.eprint('is larger than the maximum number of jobs (%d)' % jobLimit)
            break
        try:  # 2xsplit is faster than regex
            jobNum = int(jobFile.split(".")[0].split("_")[1])
        except Exception:
            continue
        jobObj = Job.load(os.path.join(self._dbPath, jobFile))
        jobMap[jobNum] = jobObj
        if idx % 100 == 0:
            # Drop the previous activity message before showing the new one
            del log
            log = utils.ActivityLog('Reading job infos ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
    return jobMap
def findCollision(tName, nameDict, varDict, hashKeys, keyFmt = lambda x: x):
    """Report keys of ``nameDict`` that are mapped to the same name.

    For every colliding name the affected keys are printed together with
    those entries of ``varDict[keyFmt(key)]`` whose key is in ``hashKeys``.
    The user is asked once whether to continue; a negative answer exits the
    program. Returns None.
    """
    # Copy into a list: dict views have no remove() on Python 3
    targetNames = list(nameDict.values())
    # Removing one occurrence of every distinct name leaves only the duplicates
    for name in list(set(targetNames)):
        targetNames.remove(name)
    if len(targetNames):
        ask = True
        for name in targetNames:
            utils.eprint("Multiple %s keys are mapped to the same %s name '%s'!" % (tName, tName, keyFmt(name)))
            for key in nameDict:
                if nameDict[key] == name:
                    utils.eprint('\t%s hash %s using:' % (tName, keyFmt(key)))
                    # Plain loop instead of a tuple-unpacking lambda (removed by PEP 3113)
                    for item in varDict[keyFmt(key)].items():
                        if item[0] in hashKeys:
                            utils.eprint('\t\t%s = %s' % item)
            if ask and not utils.getUserBool('Do you want to continue?', False):
                sys.exit(0)
            ask = False
def readJobs(self, jobLimit):
    """Load all jobs from the zip job database ``self._dbFile``.

    Every archive member is named ``<x><jobNum>_<y><transactionId>``. For each
    job the member with the highest transaction id seen so far wins. Returns a
    dict mapping job number to job object and stores the number of archive
    members in ``self._serial``. ``jobLimit`` is not used here.

    If the archive cannot be opened, a repair with ``zip -FF`` is attempted.
    """
    jobMap = {}
    maxJobs = 0
    if os.path.exists(self._dbFile):
        try:
            tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
        except Exception:  # Try to recover job archive
            utils.eprint('=' * 40 + '\nStarting recovery of broken job database')
            utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
            os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
            os.rename(self._dbFile, self._dbFile + '.broken')
            os.rename(self._dbFile + '.tmp', self._dbFile)
            tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
            utils.removeFiles([self._dbFile + '.broken'])
            # NOTE(review): brokenList is never filled - unreadable members are
            # silently ignored below, so the "zip -d" loop is a no-op. Confirm intent.
            brokenList = []
            for idx, fnTarInfo in enumerate(tar.namelist()):
                (jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                try:
                    fp = tar.open(fnTarInfo)
                    try:
                        fp.read()
                    finally:
                        fp.close()
                except Exception:
                    pass
            for broken in brokenList:
                os.system('zip %s -d %s' % (self._dbFile, broken))
            utils.eprint('Recover completed!')
        try:
            activity = utils.ActivityLog('Reading job transactions ...')
            memberNames = tar.namelist()
            maxJobs = len(memberNames)
            tMap = {}
            for idx, fnTarInfo in enumerate(memberNames):
                (jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                if tid < tMap.get(jobNum, 0):
                    continue  # an entry with a higher transaction id was already loaded
                # Close every member handle instead of leaking one per job
                fp = tar.open(fnTarInfo)
                try:
                    rawData = fp.read()
                finally:
                    fp.close()
                data = utils.DictFormat(escapeString = True).parse(rawData)
                jobMap[jobNum] = Job.loadData(fnTarInfo, data)
                tMap[jobNum] = tid
                if idx % 100 == 0:
                    activity.finish()
                    activity = utils.ActivityLog('Reading job transactions ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
        finally:
            # Release the archive file handle
            tar.close()

    self._serial = maxJobs
    return jobMap
def readJobs(self, jobLimit):
    """Load all jobs from the zip job database ``self._dbFile``.

    Every archive member is named ``<x><jobNum>_<y><transactionId>``. For each
    job the member with the highest transaction id seen so far wins. Returns a
    dict mapping job number to job object and stores the number of archive
    members in ``self._serial``. ``jobLimit`` is not used here.

    If the archive cannot be opened, a repair with ``zip -FF`` is attempted.
    """
    jobMap = {}
    maxJobs = 0
    if os.path.exists(self._dbFile):
        try:
            tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED)
        except Exception:  # Try to recover job archive
            utils.eprint("=" * 40 + "\nStarting recovery of broken job database")
            utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + "=" * 40)
            os.system("zip -FF %s --out %s.tmp 2> /dev/null" % (self._dbFile, self._dbFile))
            os.rename(self._dbFile, self._dbFile + ".broken")
            os.rename(self._dbFile + ".tmp", self._dbFile)
            tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED)
            utils.removeFiles([self._dbFile + ".broken"])
            # NOTE(review): brokenList is never filled - unreadable members are
            # silently ignored below, so the "zip -d" loop is a no-op. Confirm intent.
            brokenList = []
            for idx, fnTarInfo in enumerate(tar.namelist()):
                (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1)))
                try:
                    fp = tar.open(fnTarInfo)
                    try:
                        fp.read()
                    finally:
                        fp.close()
                except Exception:
                    pass
            for broken in brokenList:
                os.system("zip %s -d %s" % (self._dbFile, broken))
            utils.eprint("Recover completed!")
        try:
            log = None
            memberNames = tar.namelist()
            maxJobs = len(memberNames)
            tMap = {}
            for idx, fnTarInfo in enumerate(memberNames):
                (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1)))
                if tid < tMap.get(jobNum, 0):
                    continue  # an entry with a higher transaction id was already loaded
                # Close every member handle instead of leaking one per job
                fp = tar.open(fnTarInfo)
                try:
                    rawData = fp.read()
                finally:
                    fp.close()
                data = utils.DictFormat(escapeString=True).parse(rawData)
                jobMap[jobNum] = Job.loadData(fnTarInfo, data)
                tMap[jobNum] = tid
                if idx % 100 == 0:
                    # Drop the previous activity message before showing the new one
                    del log
                    log = utils.ActivityLog("Reading job transactions ... %d [%d%%]" % (idx, (100.0 * idx) / maxJobs))
        finally:
            # Release the archive file handle
            tar.close()

    self._serial = maxJobs
    return jobMap
def _getJobsOutput(self, ids):
    """Retrieve the output sandboxes of the given jobs.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, outputDir)`` for every retrieved job, ``(None, dir)`` for
    leftover directories found after a failed retrieval and
    ``(jobNum, None)`` for jobs whose output could not be retrieved.

    Raises BackendError when the temporary output directory cannot be created.
    """
    if len(ids) == 0:
        # "raise StopIteration" inside a generator is a RuntimeError under PEP 479
        return

    basePath = os.path.join(self._outputPath, 'tmp')
    # Bound before the try block so the error message can always be formatted
    tmpPath = basePath
    try:
        if len(ids) == 1:
            # For single jobs create single subdir
            tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
        utils.ensureDirExists(tmpPath)
    except Exception:
        raise BackendError(
            'Temporary path "%s" could not be created.' % tmpPath,
            BackendError)

    jobNumMap = dict(ids)
    jobs = self.writeWMSIds(ids)

    activity = utils.ActivityLog('retrieving job outputs')
    proc = LocalProcess(self._outputExec, '--noint', '--logfile',
        '/dev/stderr', '-i', jobs, '--dir', tmpPath)

    # yield output dirs
    todo = list(jobNumMap.values())
    currentJobNum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=60)):
        if line.startswith(tmpPath):
            # Guard the removal: a directory line without a preceding known
            # job id would otherwise abort the retrieval with a ValueError
            if currentJobNum in todo:
                todo.remove(currentJobNum)
            outputDir = line  # already stripped by imap(str.strip, ...)
            if os.path.exists(outputDir):
                if 'GC_WC.tar.gz' in os.listdir(outputDir):
                    wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                    try:
                        # NOTE(review): extractall trusts the member paths stored in the job's archive
                        archive = tarfile.TarFile.open(wildcardTar, 'r:gz')
                        try:
                            archive.extractall(outputDir)
                        finally:
                            archive.close()
                        os.unlink(wildcardTar)
                    except Exception:
                        utils.eprint(
                            "Can't unpack output files contained in %s" % wildcardTar)
            yield (currentJobNum, outputDir)
            currentJobNum = None
        else:
            # Any other line may name the job whose directory follows
            currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
    retCode = proc.status(timeout=0, terminate=True)
    del activity

    if retCode != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
            utils.removeFiles([jobs, basePath])
            return
        self._log.log_process(proc, files={'jobs': utils.safeRead(jobs)})
        utils.eprint('Trying to recover from error ...')
        for dirName in os.listdir(basePath):
            yield (None, os.path.join(basePath, dirName))

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)

    utils.removeFiles([jobs, basePath])
def _getJobsOutput(self, allIds):
    # Retrieve job outputs in chunks of self._nJobsPerChunk ids and purge the
    # retrieved jobs afterwards. Yields (jobNum, outputDir) for retrieved jobs,
    # (None, dir) for leftover directories after a failed retrieval and
    # (jobNum, None) for jobs that could not be retrieved.
    #
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source. As laid out here, "todo" and "done" refer to
    # the LAST chunk only when the unretrievable jobs are reported and the purge
    # is started - confirm whether that tail belongs inside the chunk loop.
    # NOTE(review): "raise StopIteration" inside a generator becomes a
    # RuntimeError on Python 3.7+ (PEP 479); a plain "return" is equivalent.
    if len(allIds) == 0:
        raise StopIteration

    basePath = os.path.join(self._outputPath, 'tmp')
    try:
        if len(allIds) == 1:
            # For single jobs create single subdir
            basePath = os.path.join(basePath, md5(allIds[0][0]).hexdigest())
        utils.ensureDirExists(basePath)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % basePath, BackendError)

    activity = utils.ActivityLog('retrieving job outputs')
    # Process the ids in slices of self._nJobsPerChunk
    for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
        jobNumMap = dict(ids)
        jobs = ' '.join(self._getRawIDs(ids))
        # NOTE(review): tempfile.mktemp only returns a name; it is documented as insecure
        log = tempfile.mktemp('.log')

        proc = utils.LoggedProcess(self._outputExec,
            '--noint --logfile "%s" --dir "%s" %s' % (log, basePath, jobs))

        # yield output dirs
        todo = jobNumMap.values()
        done = []
        currentJobNum = None
        for line in imap(str.strip, proc.iter()):
            match = re.match(self._outputRegex, line)
            if match:
                # The regex provides the named groups 'rawId' and 'outputDir'
                currentJobNum = jobNumMap.get(self._createId(match.groupdict()['rawId']))
                # NOTE(review): raises ValueError if the id is unknown (None) or reported twice
                todo.remove(currentJobNum)
                done.append(match.groupdict()['rawId'])
                outputDir = match.groupdict()['outputDir']
                if os.path.exists(outputDir):
                    if 'GC_WC.tar.gz' in os.listdir(outputDir):
                        wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                        try:
                            tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
                            os.unlink(wildcardTar)
                        except Exception:
                            utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
                yield (currentJobNum, outputDir)
                currentJobNum = None
        retCode = proc.wait()

        if retCode != 0:
            if 'Keyboard interrupt raised by user' in proc.getError():
                utils.removeFiles([log, basePath])
                raise StopIteration
            else:
                proc.logError(self.errorLog, log = log)
            utils.eprint('Trying to recover from error ...')
            # Report whatever is present in the output directory as leftovers
            for dirName in os.listdir(basePath):
                yield (None, os.path.join(basePath, dirName))
    del activity

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)

    # Purge the jobs whose output was retrieved
    purgeLog = tempfile.mktemp('.log')
    purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, " ".join(done)))
    retCode = purgeProc.wait()
    if retCode != 0:
        if self.explainError(purgeProc, retCode):
            pass
        else:
            # NOTE(review): this logs the retrieval process "proc", not "purgeProc" - looks like a copy/paste slip
            proc.logError(self.errorLog, log = purgeLog, jobs = done)
    utils.removeFiles([log, purgeLog, basePath])
def _getJobsOutput(self, ids):
    """Retrieve the output sandboxes of the given jobs.

    ``ids`` is a sequence of ``(wmsId, jobNum)`` pairs. Yields
    ``(jobNum, outputDir)`` for every retrieved job, ``(None, dir)`` for
    leftover directories found after a failed retrieval and
    ``(jobNum, None)`` for jobs whose output could not be retrieved.

    Raises BackendError when the temporary output directory cannot be created.
    """
    if len(ids) == 0:
        # "raise StopIteration" inside a generator is a RuntimeError under PEP 479
        return

    basePath = os.path.join(self._outputPath, 'tmp')
    # Bound before the try block so the error message can always be formatted
    tmpPath = basePath
    try:
        if len(ids) == 1:
            # For single jobs create single subdir
            tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
        utils.ensureDirExists(tmpPath)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmpPath, RuntimeError)

    jobNumMap = dict(ids)
    jobs = self.writeWMSIds(ids)
    # NOTE(review): tempfile.mktemp only returns a name and is documented as insecure
    log = tempfile.mktemp('.log')

    activity = utils.ActivityLog('retrieving job outputs')
    proc = utils.LoggedProcess(self._outputExec,
        '--noint --logfile "%s" -i "%s" --dir "%s"' % (log, jobs, tmpPath))

    # yield output dirs
    todo = list(jobNumMap.values())
    currentJobNum = None
    for line in map(str.strip, proc.iter()):
        if line.startswith(tmpPath):
            # Guard the removal: a directory line without a preceding known
            # job id would otherwise abort the retrieval with a ValueError
            if currentJobNum in todo:
                todo.remove(currentJobNum)
            outputDir = line  # already stripped by map(str.strip, ...)
            if os.path.exists(outputDir):
                if 'GC_WC.tar.gz' in os.listdir(outputDir):
                    wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                    try:
                        # NOTE(review): extractall trusts the member paths stored in the job's archive
                        archive = tarfile.TarFile.open(wildcardTar, 'r:gz')
                        try:
                            archive.extractall(outputDir)
                        finally:
                            archive.close()
                        os.unlink(wildcardTar)
                    except Exception:
                        utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
            yield (currentJobNum, outputDir)
            currentJobNum = None
        else:
            # Any other line may name the job whose directory follows
            currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
    retCode = proc.wait()
    del activity

    if retCode != 0:
        if 'Keyboard interrupt raised by user' in proc.getError():
            utils.removeFiles([log, jobs, basePath])
            return
        proc.logError(self.errorLog, log = log)
        utils.eprint('Trying to recover from error ...')
        for dirName in os.listdir(basePath):
            yield (None, os.path.join(basePath, dirName))

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)

    utils.removeFiles([log, jobs, basePath])
for inJobNum, dir in self._getJobsOutput(ids): # inJobNum != None, dir == None => Job could not be retrieved if dir == None: if inJobNum not in retrievedJobs: yield (inJobNum, -1, {}) continue # inJobNum == None, dir != None => Found leftovers of job retrieval if inJobNum == None: continue # inJobNum != None, dir != None => Job retrieval from WMS was ok info = os.path.join(dir, 'job.info') info_content = None if not os.path.exists(info): utils.eprint('Warning: "%s" does not exist.' % info) else: try: info_content = open(info, 'r').read() except Exception, ex: utils.eprint('Warning: Unable to read "%s"!\n%s' % (info, str(ex))) if info_content: try: # Function to parse job info file data = utils.DictFormat().parse(info_content, keyParser = {None: str}) jobNum = data['JOBID'] if jobNum != inJobNum: raise RuntimeError('Invalid job id in job file %s' % info) if forceMove(dir, os.path.join(self._outputPath, 'job_%d' % jobNum)): retrievedJobs.append(inJobNum) yield (jobNum, data['EXITCODE'], data)
def submitJobs(self, jobNumListFull, module):
    # Submit the jobs in batches of `submitBatch`. For every batch a temporary
    # JDL file and the per-job config files are written, copied to the remote
    # pool when the pool type is SSH/GSISSH, and handed to the submit
    # executable. Yields (jobNum, wmsId, {}) for every job id found in the
    # verbose submit output.
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source.
    submitBatch=25
    for index in range(0,len(jobNumListFull),submitBatch):
        jobNumList=jobNumListFull[index:index+submitBatch]
        self.debugOut("\nStarted submitting: %s" % jobNumList)
        self.debugPool()
        # get the full job config path and basename
        def _getJobCFG(jobNum):
            return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' % jobNum), 'job_%d.var' % jobNum
        activity = utils.ActivityLog('preparing jobs')
        # construct a temporary JDL for this batch of jobs
        jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
        # path handed to the submit executable; replaced by the remote path below for SSH/GSISSH pools
        jdlSubmitPath = jdlFilePath
        self.debugOut("Writing temporary jdl to: "+jdlSubmitPath)
        try:
            data = self.makeJDLdata(jobNumList, module)
            utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
        except Exception:
            utils.removeFiles([jdlFilePath])
            raise BackendError('Could not write jdl data to %s.' % jdlFilePath)

        # create the _jobconfig.sh file containing the actual data
        for jobNum in jobNumList:
            try:
                self._writeJobConfig(_getJobCFG(jobNum)[0], jobNum, module)
            except Exception:
                raise BackendError('Could not write _jobconfig data for %s.'
                    % jobNum)

        self.debugOut("Copying to remote")
        # copy infiles to ssh/gsissh remote pool if required
        if self.remoteType == poolType.SSH or self.remoteType == poolType.GSISSH:
            activity = utils.ActivityLog('preparing remote scheduler')
            self.debugOut("Copying to sandbox")
            workdirBase = self.getWorkdirPath()
            # TODO: check whether shared remote files already exist and copy otherwise
            for fileDescr, fileSource, fileTarget in self._getSandboxFilesIn(module):
                copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(workdirBase, fileTarget))
                # a failed copy is only logged (unless explainError handles it); submission continues
                if copyProcess.wait() != 0:
                    if self.explainError(copyProcess, copyProcess.wait()):
                        pass
                    else:
                        copyProcess.logError(self.errorLog, brief=True)
                self.debugFlush()
            # copy job config files
            self.debugOut("Copying job configs")
            for jobNum in jobNumList:
                fileSource, fileTarget = _getJobCFG(jobNum)
                copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(self.getWorkdirPath(jobNum), fileTarget))
                if copyProcess.wait() != 0:
                    if self.explainError(copyProcess, copyProcess.wait()):
                        pass
                    else:
                        copyProcess.logError(self.errorLog, brief=True)
                self.debugFlush()
            # copy jdl
            self.debugOut("Copying jdl")
            jdlSubmitPath = os.path.join(workdirBase, os.path.basename(jdlFilePath))
            copyProcess = self.Pool.LoggedCopyToRemote(jdlFilePath, jdlSubmitPath )
            if copyProcess.wait() != 0:
                if self.explainError(copyProcess, copyProcess.wait()):
                    pass
                else:
                    copyProcess.logError(self.errorLog, brief=True)
            self.debugFlush()
            # copy proxy
            for authFile in self.proxy.getAuthFiles():
                self.debugOut("Copying proxy")
                copyProcess = self.Pool.LoggedCopyToRemote(authFile, os.path.join(self.getWorkdirPath(), os.path.basename(authFile)))
                if copyProcess.wait() != 0:
                    if self.explainError(copyProcess, copyProcess.wait()):
                        pass
                    else:
                        copyProcess.logError(self.errorLog, brief=True)
                self.debugFlush()

        self.debugOut("Starting jobs")
        try:
            # submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
            activity = utils.ActivityLog('queuing jobs at scheduler')
            proc = self.Pool.LoggedProcess(self.submitExec, ' -verbose %(JDL)s' % { "JDL": jdlSubmitPath })

            self.debugOut("AAAAA")
            # extract the Condor ID (WMS ID) of the jobs from output ClassAds
            wmsJobIdList = []
            for line in proc.iter():
                if "GridControl_GCIDtoWMSID" in line:
                    # the value has the form <GCID>@<WMSID>
                    GCWMSID=line.split('=')[1].strip(' "\n').split('@')
                    GCID,WMSID=int(GCWMSID[0]),GCWMSID[1].strip()
                    # Condor creates a default job then overwrites settings on any subsequent job - i.e. skip every second, but better be sure
                    # NOTE(review): entries are (wmsId, GCID) tuples, so zip(*...)[0] holds the WMS ids,
                    # not the GCIDs - this duplicate check looks ineffective. zip(...)[0] is also Python 2 only.
                    if ( not wmsJobIdList ) or ( GCID not in zip(*wmsJobIdList)[0] ):
                        wmsJobIdList.append((self._createId(WMSID),GCID))
                if "GridControl_GCtoWMSID" in line:
                    self.debugOut("o : %s" % line)
                    self.debugOut("o : %s" % wmsJobIdList)

            retCode = proc.wait()
            if (retCode != 0) or ( len(wmsJobIdList) < len(jobNumList) ):
                if self.explainError(proc, retCode):
                    pass
                else:
                    utils.eprint("Submitted %4d jobs of %4d expected" % (len(wmsJobIdList),len(jobNumList)))
                    proc.logError(self.errorLog, jdl = jdlFilePath)
        finally:
            # the local temporary JDL is removed whether or not the submission worked
            utils.removeFiles([jdlFilePath])
        self.debugOut("Done Submitting")

        # yield the (jobNum, WMS ID, other data) of each job successively
        for index in range(len(wmsJobIdList)):
            yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {} )
        self.debugOut("Yielded submitted job")
        self.debugFlush()
def submitJobs(self, jobNumListFull, module):
    """Submit jobs to the scheduler in batches and yield their IDs.

    The job numbers are processed in batches of 25. For every batch a
    temporary JDL file is written, the per-job config files are created,
    everything is copied to the remote pool when the pool is reached via
    SSH/GSISSH, and the JDL is handed to the submit executable. The
    verbose submit output is scanned for ``GridControl_GCIDtoWMSID``
    entries to map job numbers to scheduler IDs.

    Parameters:
        jobNumListFull: sliceable sequence of integer job numbers.
        module: passed through to ``makeJDLdata``, ``_writeJobConfig``
            and ``_getSandboxFilesIn``.

    Yields:
        ``(jobNum, wmsId, {})`` for each job found in the submit output,
        at most once per job number.

    Raises:
        BackendError: if the JDL or a job config file cannot be written.
    """
    submitBatch = 25

    def _getJobCFG(jobNum):
        # full local path and basename of the job config file
        cfgName = 'job_%d.var' % jobNum
        return os.path.join(self.getSandboxPath(jobNum), cfgName), cfgName

    def _copyToRemote(fileSource, fileTarget):
        # copy one file to the remote pool; log failures explainError cannot explain
        copyProcess = self.Pool.LoggedCopyToRemote(fileSource, fileTarget)
        copyCode = copyProcess.wait()
        if (copyCode != 0) and not self.explainError(copyProcess, copyCode):
            copyProcess.logError(self.errorLog, brief=True)
        self.debugFlush()

    for index in irange(0, len(jobNumListFull), submitBatch):
        jobNumList = jobNumListFull[index:index + submitBatch]
        self.debugOut("\nStarted submitting: %s" % jobNumList)
        self.debugPool()

        # the name is rebound below for each phase; it is never read except for finish()
        activity = utils.ActivityLog('preparing jobs')
        # construct a temporary JDL for this batch of jobs
        jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
        jdlSubmitPath = jdlFilePath
        self.debugOut("Writing temporary jdl to: " + jdlSubmitPath)

        wmsJobIdList = []  # (wmsId, jobNum) in order of appearance
        seenJobNums = []  # job numbers already recorded in wmsJobIdList
        # From here on the temporary JDL exists on disk: the finally clause
        # removes it no matter which preparation or submission step fails.
        try:
            try:
                data = self.makeJDLdata(jobNumList, module)
                utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
            except Exception:
                raise BackendError('Could not write jdl data to %s.' % jdlFilePath)

            # create the _jobconfig.sh file containing the actual data
            for jobNum in jobNumList:
                try:
                    self._writeJobConfig(_getJobCFG(jobNum)[0], jobNum, module, {})
                except Exception:
                    raise BackendError('Could not write _jobconfig data for %s.' % jobNum)

            self.debugOut("Copying to remote")
            # copy infiles to ssh/gsissh remote pool if required
            if self.remoteType in (PoolType.SSH, PoolType.GSISSH):
                activity = utils.ActivityLog('preparing remote scheduler')
                self.debugOut("Copying to sandbox")
                workdirBase = self.getWorkdirPath()
                # TODO: check whether shared remote files already exist and copy otherwise
                for _, fileSource, fileTarget in self._getSandboxFilesIn(module):
                    _copyToRemote(fileSource, os.path.join(workdirBase, fileTarget))
                # copy job config files
                self.debugOut("Copying job configs")
                for jobNum in jobNumList:
                    fileSource, fileTarget = _getJobCFG(jobNum)
                    _copyToRemote(fileSource, os.path.join(self.getWorkdirPath(jobNum), fileTarget))
                # copy jdl - the submit call must use the remote location
                self.debugOut("Copying jdl")
                jdlSubmitPath = os.path.join(workdirBase, os.path.basename(jdlFilePath))
                _copyToRemote(jdlFilePath, jdlSubmitPath)
                # copy proxy
                for authFile in self._token.getAuthFiles():
                    self.debugOut("Copying proxy")
                    _copyToRemote(authFile, os.path.join(self.getWorkdirPath(), os.path.basename(authFile)))

            self.debugOut("Starting jobs")
            # submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
            activity = utils.ActivityLog('queuing jobs at scheduler')
            proc = self.Pool.LoggedExecute(self.submitExec, ' -verbose %(JDL)s' % {"JDL": jdlSubmitPath})
            self.debugOut("AAAAA")
            # extract the Condor ID (WMS ID) of the jobs from output ClassAds
            for line in proc.iter():
                if "GridControl_GCIDtoWMSID" in line:
                    GCWMSID = line.split('=')[1].strip(' "\n').split('@')
                    GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
                    # Condor creates a default job then overwrites settings on any subsequent job
                    # - i.e. skip every second, but better be sure.
                    # The check is done on the job number (second tuple element), not the WMS ID.
                    if GCID not in seenJobNums:
                        seenJobNums.append(GCID)
                        wmsJobIdList.append((self._createId(WMSID), GCID))
                # NOTE(review): this marker differs from the one parsed above
                # ("GCtoWMSID" vs "GCIDtoWMSID") - confirm whether it can ever match
                if "GridControl_GCtoWMSID" in line:
                    self.debugOut("o : %s" % line)
                    self.debugOut("o : %s" % wmsJobIdList)
            retCode = proc.wait()
            activity.finish()
            if (retCode != 0) or (len(wmsJobIdList) < len(jobNumList)):
                if not self.explainError(proc, retCode):
                    utils.eprint("Submitted %4d jobs of %4d expected" % (len(wmsJobIdList), len(jobNumList)))
                    # logError is called before the finally clause removes the JDL it is given
                    proc.logError(self.errorLog, jdl=jdlFilePath)
        finally:
            utils.removeFiles([jdlFilePath])

        self.debugOut("Done Submitting")
        # yield the (jobNum, WMS ID, other data) of each job successively
        for wmsId, jobNum in wmsJobIdList:
            yield (jobNum, wmsId, {})
        self.debugOut("Yielded submitted job")
        self.debugFlush()
def dlfs_rm(path, msg):
    """Remove `path` via storage.se_rm and report a failed removal.

    If the removal process exits with a non-zero code, a notice naming
    `msg` is printed and the process message is passed to utils.eprint.
    Nothing is returned in either case.
    """
    removal = storage.se_rm(path)
    if removal.wait() == 0:
        return
    # single parenthesised argument: prints the same text as the statement form
    print('\t\tUnable to remove %s!' % msg)
    utils.eprint('%s\n\n' % removal.getMessage())
parser.add_option('-a', '--action', dest='action', default=None) parser.add_option('-J', '--job-selector', dest='selector', default=None) parser.add_option('-m', '--max-retry', dest='maxRetry', default=None, type='int') parser.add_option('-v', '--verbose', dest='verbosity', default=0, action='count') parser.add_option('-G', '--gui', dest='gui', action='store_const', const = 'ANSIGUI') parser.add_option('-W', '--webserver', dest='gui', action='store_const', const = 'CPWebserver') # Deprecated options - refer to new report script instead parser.add_option('-r', '--report', dest='old_report', default=False, action='store_true') parser.add_option('-R', '--site-report', dest='old_report', default=False, action='store_true') parser.add_option('-T', '--time-report', dest='old_report', default=False, action='store_true') parser.add_option('-M', '--task-report', dest='old_report', default=False, action='store_true') parser.add_option('-D', '--detail-report', dest='old_report', default=False, action='store_true') parser.add_option('', '--help-vars', dest='old_report', default=False, action='store_true') (opts, args) = parser.parse_args() if opts.help: utils.eprint('%s\n%s' % (usage, open(utils.pathShare('help.txt'), 'r').read())) sys.exit(os.EX_USAGE) utils.verbosity(opts.verbosity) logging.getLogger().setLevel(logging.DEFAULT - opts.verbosity) if opts.debug: logging.getLogger('exception').addHandler(logging.StreamHandler(sys.stdout)) # we need exactly one positional argument (config file) if len(args) != 1: utils.exitWithUsage(usage, 'Config file not specified!') if opts.old_report: utils.deprecated('Please use the more versatile report tool in the scripts directory!') # Config filler which collects data from command line arguments class OptsConfigFiller(ConfigFiller):