def main():
    """Prepare configuration and work directory, build the workflow and run it.

    Reads the module-level ``args``, ``parser`` and ``opts``.  Exits with
    ``os.EX_OK`` right after printing the config help or after handling a
    job delete / reset request; otherwise the workflow is run.
    """
    factory = createConfigFactory(configFile = args[0], additional = [OptsConfigFiller(parser)])
    config = factory.getConfig()
    loggingView = config.changeView(setSections = ['logging'])
    logging_setup(loggingView)

    # Check work dir validity (default work directory is the config file name)
    workDirMissing = not os.path.exists(config.getWorkPath())
    if workDirMissing:
        if not config.getState('init'):
            utils.vprint('Will force initialization of %s if continued!' % config.getWorkPath(), -1)
            config.setState(True, 'init')
        question = 'Do you want to create the working directory %s?' % config.getWorkPath()
        if config.getChoiceYesNo('workdir create', True, interactive = question):
            utils.ensureDirExists(config.getWorkPath(), 'work directory')

    # Create workflow and freeze config settings
    workflowPlugin = config.changeView(setSections = ['global']).getPlugin(
        'workflow', 'Workflow:global', cls = Workflow)
    workflow = workflowPlugin.getInstance()
    factory.freezeConfig(writeConfig = config.getState('init', detail = 'config'))

    # Give config help
    if opts.help_cfg or opts.help_scfg:
        config.write(
            sys.stdout,
            printDefault = opts.help_cfg,
            printUnused = False,
            printMinimal = opts.help_scfg,
            printSource = opts.help_cfg)
        sys.exit(os.EX_OK)

    # Check if user requested deletion / reset of jobs
    if opts.delete:
        workflow.jobManager.delete(workflow.wms, opts.delete)
        sys.exit(os.EX_OK)
    if opts.reset:
        workflow.jobManager.reset(workflow.wms, opts.reset)
        sys.exit(os.EX_OK)

    # Run the configured workflow
    workflow.run()
def __init__(self, config, name):
    """Resolve the ROOT installation path and the executable, then defer to UserTask.

    Raises:
        ConfigError: if neither the ``root path`` option nor the ``ROOTSYS``
            environment variable yields a non-empty path.
    """
    # Determine ROOT path from previous settings / environment / config file
    envRootPath = os.environ.get('ROOTSYS', '')
    self._rootpath = config.get('root path', envRootPath,
        persistent = True, onChange = changeInitNeeded('sandbox'))
    if not self._rootpath:
        raise ConfigError('Either set environment variable "ROOTSYS" or set option "root path"!')
    utils.vprint('Using the following ROOT path: %s' % self._rootpath, -1)

    # Special handling for executables bundled with ROOT
    self._executable = config.get('executable', onChange = changeInitNeeded('sandbox'))
    bundledPath = os.path.join(self._rootpath, 'bin', self._executable.lstrip('/'))
    self.builtIn = os.path.exists(bundledPath)
    if self.builtIn:
        # An executable found below <root path>/bin is not sent along
        config.set('send executable', 'False')
        # store resolved built-in executable path?

    # Apply default handling from UserTask
    UserTask.__init__(self, config, name)
    self.updateErrorDict(utils.pathShare('gc-run.root.sh'))

    # Collect lib files needed by executable
    self.libFiles = []
def _cfgFindUninitialized(self, config, cfgFiles, autoPrepare, mustPrepare):
    """Determine which config files have to be copied to the work directory.

    For every entry of ``cfgFiles`` the instrumentation state is taken from the
    copy in the work directory if that copy exists, otherwise from the file
    itself.  A status table of all files is printed.

    Args:
        config: provides ``getWorkPath`` to locate the work directory copy.
        cfgFiles: paths of the config files to inspect.
        autoPrepare, mustPrepare: if either is true, files which are not yet
            instrumented are scheduled for preparation.

    Returns:
        List of ``(source, work dir copy, doPrepare)`` tuples for all files
        which need to be copied.
    """
    comPath = os.path.dirname(os.path.commonprefix(cfgFiles))
    cfgTodo = []
    cfgStatus = []
    for cfg in cfgFiles:
        cfg_new = config.getWorkPath(os.path.basename(cfg))
        cfg_new_exists = os.path.exists(cfg_new)
        if cfg_new_exists:
            isInstrumented = self._cfgIsInstrumented(cfg_new)
        else:
            isInstrumented = self._cfgIsInstrumented(cfg)
        doPrepare = (mustPrepare or autoPrepare) and not isInstrumented
        # Missing work dir copies are always copied; existing ones only when preparing
        doCopy = (not cfg_new_exists) or doPrepare
        if doCopy:
            cfgTodo.append((cfg, cfg_new, doPrepare))
        # comPath is a leading part of every entry, so slicing strips it.
        # str.split would raise ValueError for an empty comPath
        # (files without a common directory).
        displayName = cfg[len(comPath):].lstrip('/')
        cfgStatus.append({1: displayName, 2: cfg_new_exists, 3: isInstrumented, 4: doPrepare})
    utils.vprint('', -1)
    utils.printTabular([(1, 'Config file'), (2, 'Work dir'), (3, 'Instrumented'), (4, 'Scheduled')], cfgStatus, 'lccc')
    utils.vprint('', -1)
    return cfgTodo
def jobCycle(self, wait = utils.wait):
    """Run the check / retrieve / submit cycle until aborted or, if not continuous, once.

    Args:
        wait: callable used to pause between actions; its return value is
            stored as the "did wait" flag of the current cycle.
    """
    # Time of the last low-disk-space warning.  It has to live outside the
    # loop - resetting it in every cycle would defeat the 5 minute throttle.
    lastSpaceMsg = 0
    while True:
        didWait = False
        # Check whether wms can submit
        if not self.wms.canSubmit(self.task.wallTime, self._submitFlag):
            self._submitFlag = False
        # Check free disk space
        if (self._checkSpace > 0) and utils.freeSpace(self._workDir) < self._checkSpace:
            if time.time() - lastSpaceMsg > 5 * 60:
                utils.vprint('Not enough space left in working directory', -1, True)
                lastSpaceMsg = time.time()
        else:
            # Actions are selected by their first letter (case-insensitive)
            for action in map(str.lower, self._actionList):
                if action.startswith('c') and not utils.abort():  # check for jobs
                    if self.jobManager.check(self.wms):
                        didWait = wait(self.wms.getTimings()[1])
                elif action.startswith('r') and not utils.abort():  # retrieve finished jobs
                    if self.jobManager.retrieve(self.wms):
                        didWait = wait(self.wms.getTimings()[1])
                elif action.startswith('s') and not utils.abort() and self._submitFlag:
                    if self.jobManager.submit(self.wms):
                        didWait = wait(self.wms.getTimings()[1])

        # quit if abort flag is set or not in continuous mode
        if utils.abort() or not self.runContinuous:
            break
        # idle timeout
        if not didWait:
            wait(self.wms.getTimings()[0])
def check(self, wms, maxsample = 100):
    """Check the status of processing jobs and react to timeouts and interventions.

    Args:
        wms: backend handed on to the job list check, cancel and intervention calls.
        maxsample: number of jobs sampled per call in continuous mode
            (-1 is passed to ``self.sample`` otherwise).

    Returns:
        False if the job list check was aborted (it reported ``None``),
        otherwise the change flag - forced to True if jobs timed out.
    """
    jobList = self.sample(self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING)),
        utils.QM(self.continuous, maxsample, -1))

    # Check jobs in the joblist and return changes, timeouts and successfully reported jobs
    (change, timeoutList, _) = self.checkJobList(wms, jobList)
    if change is None:  # neither True or False => abort
        return False

    # Cancel jobs which took too long
    if len(timeoutList):
        change = True
        # Call form with a single argument prints the same on Python 2 and 3
        print('\nTimeout for the following jobs:')
        self.cancel(wms, timeoutList)

    # Process task interventions
    self.processIntervention(wms, self._task.getIntervention())

    # Quit when all jobs are finished
    if self.jobDB.getJobsN(ClassSelector(JobClass.ENDSTATE)) == len(self.jobDB):
        self.logDisabled()
        self._eventhandler.onTaskFinish(len(self.jobDB))
        if self._task.canFinish():
            utils.vprint('Task successfully completed. Quitting grid-control!', -1, True)
            utils.abort(True)

    return change
def display(self):
    """Print one table row per job category with job counts in four coarse states.

    Job states without an entry in the coarse state map are counted as 'WAITING'.
    """
    (stateSummary, descSummary, _) = CategoryReport._getCategoryStateSummary(self)
    coarseState = {
        Job.SUCCESS: 'SUCCESS',
        Job.FAILED: 'FAILED',
        Job.RUNNING: 'RUNNING',
        Job.DONE: 'RUNNING',
    }
    coarseOrder = ['WAITING', 'RUNNING', 'FAILED', 'SUCCESS']

    rows = []
    descKeys = set()
    for catKey in descSummary:
        row = dict(descSummary[catKey])
        # Only the description keys become leading columns - collect them
        # before the state counters are merged into the row
        descKeys.update(row.keys())
        jobStates = stateSummary[catKey]
        for stateKey in jobStates:
            bucket = coarseState.get(stateKey, 'WAITING')
            row[bucket] = row.get(bucket, 0) + jobStates[stateKey]
        rows.append(row)

    def fmtCount(value):
        return '%7d' % parseStr(value, int, 0)

    columns = [(key, key) for key in sorted(descKeys) + coarseOrder]
    utils.vprint(level = -1)
    utils.printTabular(columns, rows, 'c' * len(descKeys),
        fmt = dict.fromkeys(coarseOrder, fmtCount))
    utils.vprint(level = -1)
def setupJobParameters(self, config, pm):
    """Set up dataset provider, splitter and the 'data' parameter source.

    Returns early (leaving ``dataSplitter`` / ``dataRefresh`` as None) when the
    ``dataset`` option is empty.

    Raises:
        UserError: if the splitter reports a maximum of zero jobs.
    """
    config = config.addSections(['dataset']).addTags([self])
    self.dataSplitter = None
    self.dataRefresh = None
    self.dataset = config.get('dataset', '').strip()
    if not self.dataset:
        return

    config.set('se output pattern', '@NICK@_job_@MY_JOBID@_@X@', override = False)
    config.set('default lookup', 'DATASETNICK', override = False)

    providerName = config.get('dataset provider', 'ListProvider')
    dataProvider = DataProvider.create(config, self.dataset, providerName)
    requestedSplitter = DataSplitter.getClass(config.get('dataset splitter', 'FileBoundarySplitter'))
    self.dataSplitter = dataProvider.checkSplitter(requestedSplitter)(config)
    self.checkSE = config.getBool('dataset storage check', True, onChange = None)

    # Create and register dataset parameter plugin
    paramSource = DataParameterSource(config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, self.initDataProcessor())
    DataParameterSource.datasetsAvailable['data'] = paramSource

    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if not (self.dataRefresh > 0):
        paramSource.resyncSetup(interval = 0)
    else:
        # The provider's query limit is the lower bound for the interval
        paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)

    # SIGUSR2 forces a resync of the dataset parameter source
    def externalRefresh(sig, frame):
        paramSource.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def getSubmissionJobs(self, maxsample, static = {'showBlocker': True}):
    """Select the jobs to submit now.

    Ready jobs must be below the retry limit (if ``self.maxRetry >= 0``) and be
    accepted by ``self._task.canSubmit``.  The number of returned jobs is capped
    by the in-queue / in-flight limits and, in continuous mode, by ``maxsample``.

    NOTE: the mutable default ``static`` is used on purpose - it keeps the
    'showBlocker' flag between calls so the blocker message is not repeated.
    """
    # Get list of submittable jobs
    readyList = self.jobDB.getJobs(ClassSelector(JobClass.READY))
    retryOK = readyList
    defaultJob = Job()

    def belowRetryLimit(jobNum):
        return self.jobDB.get(jobNum, defaultJob).attempt - 1 < self.maxRetry

    if self.maxRetry >= 0:
        retryOK = filter(belowRetryLimit, readyList)
    modOK = filter(self._task.canSubmit, readyList)
    jobList = set.intersection(set(retryOK), set(modOK))

    if static['showBlocker'] and len(readyList) > 0 and len(jobList) == 0:  # No submission but ready jobs
        retryBlocked = utils.QM(len(retryOK) > 0 and len(modOK) == 0, [], ['have hit their maximum number of retries'])
        taskBlocked = utils.QM(len(retryOK) == 0 and len(modOK) > 0, [], ['are vetoed by the task module'])
        err = [] + retryBlocked + taskBlocked
        utils.vprint('All remaining jobs %s!' % str.join(utils.QM(retryOK or modOK, ' or ', ' and '), err), -1, True)
    static['showBlocker'] = not (len(readyList) > 0 and len(jobList) == 0)

    # Determine number of jobs to submit
    limits = [len(jobList)]
    if self.inQueue > 0:
        limits.append(self.inQueue - self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS)))
    if self.inFlight > 0:
        limits.append(self.inFlight - self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING)))
    if self.continuous:
        limits.append(maxsample)
    submit = max(min(limits), 0)

    if self.doShuffle:
        return self.sample(jobList, submit)
    return sorted(jobList)[:submit]
def __init__(self, config, wmsName):
    """Set up work directory paths, the run library, access token and storage managers."""
    WMS.__init__(self, config, wmsName)
    className = self.__class__.__name__
    if self.wmsName == className.upper():
        utils.vprint('Using batch system: %s' % self.wmsName, -1)
    else:
        utils.vprint('Using batch system: %s (%s)' % (className, self.wmsName), -1)

    self.errorLog = config.getWorkPath('error.tar')
    self._runlib = config.getWorkPath('gc-run.lib')
    # Create the work dir copy of the run library with the version filled in
    if not os.path.exists(self._runlib):
        fp = SafeFile(self._runlib, 'w')
        template = SafeFile(utils.pathShare('gc-run.lib')).read()
        version = __import__('grid_control').__version__
        fp.write(template.replace('__GC_VERSION__', version))
        fp.close()
    self._outputPath = config.getWorkPath('output')
    utils.ensureDirExists(self._outputPath, 'output directory')
    self._failPath = config.getWorkPath('fail')

    # Initialise access token, broker and storage manager
    self._token = config.getCompositePlugin(
        ['proxy', 'access token'], 'TrivialAccessToken', 'MultiAccessToken',
        cls = AccessToken, inherit = True, tags = [self])

    # UI -> SE -> WN
    self.smSEIn = config.getPlugin(
        'se input manager', 'SEStorageManager',
        cls = StorageManager, tags = [self], pargs = ('se', 'se input', 'SE_INPUT'))
    self.smSBIn = config.getPlugin(
        'sb input manager', 'LocalSBStorageManager',
        cls = StorageManager, tags = [self], pargs = ('sandbox', 'sandbox', 'SB_INPUT'))
    # UI <- SE <- WN
    self.smSEOut = config.getPlugin(
        'se output manager', 'SEStorageManager',
        cls = StorageManager, tags = [self], pargs = ('se', 'se output', 'SE_OUTPUT'))
    self.smSBOut = None

    self.fileNamesEnvironment = config.getBool("file names environment", True, onChange = None)
def __init__(self, config, name):
    """Create task, monitoring, backend, job manager and GUI instances for this workflow."""
    NamedObject.__init__(self, config, name)
    self._workDir = config.getWorkPath()

    # Initialise task module
    taskClass = config.getClass(['task', 'module'], cls = TaskModule, tags = [self])
    self.task = taskClass.getInstance()
    utils.vprint('Current task ID: %s' % self.task.taskID, -1)
    utils.vprint('Task started on %s' % self.task.taskDate, -1)

    # Initialise monitoring module
    monitorFactory = ClassFactory(config,
        ('monitor', 'scripts'), ('monitor manager', 'MultiMonitor'),
        cls = Monitoring, tags = [self.task])
    self.monitor = monitorFactory.getInstance(self.task)

    # Initialise workload management interface
    wmsFactory = ClassFactory(config,
        ('backend', 'grid'), ('backend manager', 'MultiWMS'),
        cls = WMS, tags = [self.task])
    self.wms = wmsFactory.getInstance()

    # Initialise job database
    jobManagerCls = config.getClass('job manager', 'SimpleJobManager',
        cls = JobManager, tags = [self.task, self.wms])
    self.jobManager = jobManagerCls.getInstance(self.task, self.monitor)

    # Prepare work package
    self.wms.deployTask(self.task, self.monitor)

    # The job related options are read from a clone of the config
    jobsConfig = config.clone()
    self._actionList = jobsConfig.getList('jobs', 'action', ['check', 'retrieve', 'submit'], onChange = None)
    self.runContinuous = jobsConfig.getBool('jobs', 'continuous', False, onChange = None)

    self._checkSpace = config.getInt('workdir space', 10, onChange = None)
    self._submitFlag = config.getBool('submission', True, onChange = None)
    self._gui = config.getClass('gui', 'SimpleConsole', cls = GUI, onChange = None).getInstance(config, self)
def __init__(self, remoteType="", **kwargs):
    """Build the command / copy templates for the requested remote type and test them.

    Args:
        remoteType: name of an ``RPHType`` entry (case-insensitive).
        **kwargs: ``host`` is required for the SSH and GSISSH types;
            ``cmdargs``, ``cpargs`` and ``args`` are optional and default to ''.

    Raises:
        ConfigError: for an unknown type, or a missing host for SSH / GSISSH.
        CondorProcessError: if the test command exits with a non-zero code.
    """
    self.cmd = False
    # pick requested remote connection
    try:
        self.remoteType = getattr(self.RPHType, remoteType.upper())
        template = self.RPHTemplate[self.remoteType]
        self.cmd = template["command"]
        self.copy = template["copy"]
        self.path = template["path"]
        self.argFormat = template["argFormat"]
    except Exception:
        raise ConfigError("Request to initialize RemoteProcessHandler of unknown type: %s" % remoteType)

    # destination should be of type: [user@]host
    if self.remoteType in (self.RPHType.SSH, self.RPHType.GSISSH):
        try:
            hostInfo = { "rhost" : kwargs["host"] }
            self.cmd = self.cmd % hostInfo
            self.copy = self.copy % hostInfo
            self.host = kwargs["host"]
        except Exception:
            raise ConfigError("Request to initialize RemoteProcessHandler of type %s without remote host." % self.RPHType.enumList[self.remoteType])

    # add default arguments for all commands
    sharedArgs = kwargs.get("args", "")
    self.cmd = self.cmd % { "cmdargs" : kwargs.get("cmdargs", ""), "args" : sharedArgs }
    self.copy = self.copy % { "cpargs" : kwargs.get("cpargs", ""), "args" : sharedArgs }

    # test connection once
    proc = LoggedProcess(self.cmd % { "cmd" : "exit" })
    exitCode = proc.getAll()[0]
    if exitCode != 0:
        raise CondorProcessError('Validation of remote connection failed!', proc)
    vprint('Remote interface initialized:\n Cmd: %s\n Cp : %s' % (self.cmd, self.copy), level=2)
def __init__(self, remoteType="", **kwargs):
    """Select the remote access templates, fill in host and arguments, verify the connection.

    ``kwargs`` may carry ``host`` (required for SSH / GSISSH), ``cmdargs``,
    ``cpargs`` and ``args``.  ConfigError is raised for an unknown type or a
    missing host, CondorProcessError if the test command does not return 0.
    """
    self.cmd = False

    # pick requested remote connection
    try:
        self.remoteType = getattr(self.RPHType, remoteType.upper())
        settings = self.RPHTemplate[self.remoteType]
        (self.cmd, self.copy) = (settings["command"], settings["copy"])
        (self.path, self.argFormat) = (settings["path"], settings["argFormat"])
    except Exception:
        raise ConfigError("Request to initialize RemoteProcessHandler of unknown type: %s" % remoteType)

    # destination should be of type: [user@]host
    needsHost = (self.remoteType == self.RPHType.SSH) or (self.remoteType == self.RPHType.GSISSH)
    if needsHost:
        try:
            self.cmd = self.cmd % { "rhost" : kwargs["host"] }
            self.copy = self.copy % { "rhost" : kwargs["host"] }
            self.host = kwargs["host"]
        except Exception:
            raise ConfigError("Request to initialize RemoteProcessHandler of type %s without remote host." % self.RPHType.enumList[self.remoteType])

    # add default arguments for all commands
    cmdDefaults = { "cmdargs" : kwargs.get("cmdargs", ""), "args" : kwargs.get("args", "") }
    self.cmd = self.cmd % cmdDefaults
    copyDefaults = { "cpargs" : kwargs.get("cpargs", ""), "args" : kwargs.get("args", "") }
    self.copy = self.copy % copyDefaults

    # test connection once
    proc = LoggedProcess(self.cmd % { "cmd" : "exit" })
    status, _stdout, _stderr = proc.getAll()
    if status != 0:
        raise CondorProcessError('Validation of remote connection failed!', proc)
    vprint('Remote interface initialized:\n Cmd: %s\n Cp : %s' % (self.cmd, self.copy), level=2)
def _displaySetup(self, dsPath, head): if os.path.exists(dsPath): nickNames = set() for block in DataProvider.loadFromFile(dsPath).getBlocks(): nickNames.add(block[DataProvider.Nickname]) utils.vprint('Mapping between nickname and other settings:\n', -1) report = [] for nick in sorted(nickNames): lumi_filter_str = formatLumi( self._nmLumi.lookup(nick, '', is_selector=False)) if len(lumi_filter_str) > 4: nice_lumi_filter = '%s ... %s (%d entries)' % ( lumi_filter_str[0], lumi_filter_str[-1], len(lumi_filter_str)) else: nice_lumi_filter = str.join(', ', lumi_filter_str) config_files = self._nmCfg.lookup(nick, '', is_selector=False) tmp = { 0: nick, 1: str.join(', ', imap(os.path.basename, config_files)), 2: nice_lumi_filter } lookupvars = {'DATASETNICK': nick} for src in self._pm.lookupSources: src.fillParameterInfo(None, lookupvars) tmp.update(lookupvars) report.append(tmp) utils.printTabular(head, report, 'cl') utils.vprint(level=-1)
def _cfgFindUninitialized(self, config, cfgFiles, autoPrepare, mustPrepare):
    """Find the config files which still have to be copied to the work directory.

    The instrumentation state of each file is read from its work directory
    copy if present, else from the original.  Prints a status table covering
    all given files.

    Args:
        config: provides ``getWorkPath`` to locate the work directory copy.
        cfgFiles: paths of the config files to inspect.
        autoPrepare, mustPrepare: if either is true, files which are not yet
            instrumented are scheduled for preparation.

    Returns:
        List of ``(source, work dir copy, doPrepare)`` tuples, one for every
        file which has to be copied.
    """
    comPath = os.path.dirname(os.path.commonprefix(cfgFiles))
    cfgTodo = []
    cfgStatus = []
    for cfg in cfgFiles:
        cfg_new = config.getWorkPath(os.path.basename(cfg))
        cfg_new_exists = os.path.exists(cfg_new)
        if cfg_new_exists:
            isInstrumented = self._cfgIsInstrumented(cfg_new)
            doCopy = False
        else:
            isInstrumented = self._cfgIsInstrumented(cfg)
            doCopy = True
        doPrepare = (mustPrepare or autoPrepare) and not isInstrumented
        doCopy = doCopy or doPrepare
        if doCopy:
            cfgTodo.append((cfg, cfg_new, doPrepare))
        # Strip the common directory by slicing: comPath is a leading part of
        # every entry, and str.split raises ValueError for an empty separator
        # (i.e. when the files share no common directory).
        shortName = cfg[len(comPath):].lstrip('/')
        cfgStatus.append({
            1: shortName,
            2: cfg_new_exists,
            3: isInstrumented,
            4: doPrepare
        })
    utils.vprint('', -1)
    utils.printTabular([(1, 'Config file'), (2, 'Work dir'),
        (3, 'Instrumented'), (4, 'Scheduled')], cfgStatus, 'lccc')
    utils.vprint('', -1)
    return cfgTodo
def _discover(self, discoverFun, cached=True): if not cached or (self._itemsDiscovered is False): self._itemsDiscovered = discoverFun() msg = 'an unknown number of' if self._itemsDiscovered is not None: msg = str(len(self._itemsDiscovered)) utils.vprint('Broker discovered %s %s' % (msg, self._itemName)) return self._itemsDiscovered
def _discover(self, discoverFun, cached = True): if not cached or (self._itemsDiscovered == False): self._itemsDiscovered = discoverFun() msg = 'an unknown number of' if self._itemsDiscovered != None: msg = str(len(self._itemsDiscovered)) utils.vprint('Broker discovered %s %s' % (msg, self._itemName)) return self._itemsDiscovered
def getCMSDatasets(self):
    """Return the list of resolved dataset paths for ``self.datasetPath``.

    A path without '*' is returned as single-element list.  A path with '*'
    is expanded via ``self.getCMSDatasetsImpl`` and the selection is printed.

    Raises:
        DatasetError: if the wildcard does not select any dataset.
    """
    if '*' not in self.datasetPath:
        return [self.datasetPath]

    matches = list(self.getCMSDatasetsImpl(self.datasetPath))
    if not matches:
        raise DatasetError('No datasets selected by DBS wildcard %s !' % self.datasetPath)
    utils.vprint('DBS dataset wildcard selected:\n\t%s\n' % str.join('\n\t', matches), -1)
    return matches  # List of resolved datasetPaths
def __init__(self, config, name):
    """Read the nickname dependent settings, print them as a table, then defer to CMSSW.

    Raises:
        ConfigError: if 'config file' is set together with 'nickname config',
            or if the plain 'lumi filter' option is set.
    """
    head = [(0, "Nickname")]

    # Mapping between nickname and config files:
    def parseCfgList(value):
        return map(str.strip, value.split(","))

    def formatCfgList(value):
        return str.join(",", value)

    cfgList = config.get("nickname config", "")
    self.nmCfg = config.getDict("nickname config", {}, parser=parseCfgList, str=formatCfgList)[0]
    if cfgList:
        if "config file" in config.getOptions():
            raise ConfigError("Please use 'nickname config' instead of 'config file'")
        config.set("config file", str.join("\n", utils.flatten(self.nmCfg.values())))
        head.append((1, "Config file"))

    # Mapping between nickname and constants:
    self.nmCName = map(str.strip, config.get("nickname constants", "").split())
    self.nmConst = {}
    for var in self.nmCName:
        varDict = config.getDict(var, {})[0]
        for (nick, value) in varDict.items():
            # False-like values are stored as empty string
            self.nmConst.setdefault(nick, {})[var] = value if value else ""
        head.append((var, var))

    # Mapping between nickname and lumi filter:
    if "lumi filter" in config.getOptions():
        raise ConfigError("Please use 'nickname lumi filter' instead of 'lumi filter'")

    def lumiParse(value):
        return formatLumi(parseLumiFilter(value))

    self.nmLumi = config.getDict("nickname lumi filter", {}, parser=lumiParse)[0]
    if self.nmLumi:
        for dataset in config.get("dataset", "").splitlines():
            (datasetNick, datasetProvider, datasetExpr) = DataProvider.parseDatasetExpr(config, dataset, None)
            nickLumi = utils.flatten(fromNM(self.nmLumi, datasetNick, []))
            config.set("dataset %s" % datasetNick, "lumi filter", str.join(",", nickLumi))
        config.set("lumi filter", str.join(",", self.nmLumi.get(None, [])))
        head.append((2, "Lumi filter"))

    utils.vprint("Mapping between nickname and other settings:\n", -1)

    def report():
        # One table row per nickname known to any of the three mappings
        allNicks = set(self.nmCfg.keys() + self.nmConst.keys() + self.nmLumi.keys())
        for nick in sorted(allNicks):
            row = {
                0: nick,
                1: str.join(", ", map(os.path.basename, self.nmCfg.get(nick, ""))),
                2: self.displayLumi(self.nmLumi.get(nick, "")),
            }
            yield utils.mergeDicts([row, self.nmConst.get(nick, {})])

    utils.printTabular(head, report(), "cl")
    utils.vprint(level=-1)
    CMSSW.__init__(self, config, name)
def logDisabled(self):
    """Write the disabled jobs to ``self.disableLog`` and warn if there are any.

    The log contains one entry per line and is rewritten on every call.

    Raises:
        RuntimeError: if the log file could not be written.
    """
    disabled = self.jobDB.getJobs(ClassSelector(JobClass.DISABLED))
    try:
        # The context manager closes (and thereby flushes) the file even if
        # the write fails - the bare open(...).write(...) left that to the
        # garbage collector.
        with open(self.disableLog, 'w') as fp:
            fp.write(str.join('\n', map(str, disabled)))
    except Exception:
        raise RuntimeError('Could not write disabled jobs to file %s!' % self.disableLog)
    if len(disabled) > 0:
        utils.vprint('There are %d disabled jobs in this task!' % len(disabled), -1, True)
        utils.vprint('Please refer to %s for a complete list.' % self.disableLog, -1, True, once = True)
def _checkTimeleft(self, neededTime): # check for time left delta = time.time() - self._lastUpdate timeleft = max(0, self._getTimeleft(cached = True) - delta) # recheck proxy => after > 30min have passed or when time is running out (max every 5 minutes) if (delta > self._minQueryTime) or (timeleft < neededTime and delta > self._maxQueryTime): self._lastUpdate = time.time() timeleft = self._getTimeleft(cached = False) verbosity = QM(timeleft < neededTime, -1, 0) utils.vprint('The proxy now has %s left' % utils.strTime(timeleft), verbosity, printTime = True) return timeleft >= neededTime
def canSubmit(self, neededTime, canCurrentlySubmit):
    """Check the proxy lifetime against the lower limit plus ``neededTime``.

    Returns:
        False if the lifetime is insufficient while ``canCurrentlySubmit`` is
        true (after reporting that submission is disabled), True otherwise.

    Raises:
        UserError: if the lifetime is below ``self._lowerLimit``.
    """
    if not self._checkTimeleft(self._lowerLimit):
        secondsLeft = self._getTimeleft(cached = True)
        raise UserError('Your proxy only has %d seconds left! (Required are %s)' %
            (secondsLeft, utils.strTime(self._lowerLimit)))

    # The lifetime check is always performed, independent of the submit flag
    sufficient = self._checkTimeleft(self._lowerLimit + neededTime)
    if sufficient or not canCurrentlySubmit:
        return True

    utils.vprint('Proxy lifetime (%s) does not meet the proxy and walltime (%s) requirements!' %
        (utils.strTime(self._getTimeleft(cached = False)), utils.strTime(self._lowerLimit + neededTime)), -1, printTime = True)
    utils.vprint('Disabling job submission', -1, printTime = True)
    return False
def __init__(self, config, wmsName):
    """Initialise the Condor/GlideInWMS backend: debug log, task ID, JDL / pool settings,
    pool interfaces, sandbox path, history file and site broker."""
    utils.vprint('Using batch system: Condor/GlideInWMS', -1)
    ### WMSname=condor is a hardcoded hack until interface is clear
    BasicWMS.__init__(self, config, wmsName, 'condor')

    def backend():
        # Sections searched for all backend options below
        return self._getSections("backend")

    # special debug out/messages/annotations - may have noticeable effect on storage and performance!
    debugLogPath = config.get(backend(), "debugLog", "")
    if debugLogPath:
        self.debug = open(debugLogPath, 'a')
    else:
        self.debug = False
    ######
    self.taskID = config.get('condor', 'task id', md5(str(time.time())).hexdigest(), persistent = True) # FIXME!
    banner = """ ############################# Initialized Condor/GlideInWMS ############################# Config: %s taskID: %s Name: %s ############################# """
    self.debugOut(banner % (config.confName, self.taskID, wmsName))

    # finalize config state by reading values or setting to defaults
    jdlSettings = {
        "Universe" : config.get(backend(), "Universe", "vanilla"),
        "NotifyEmail" : config.get(backend(), "NotifyEmail", ""),
        "ClassAdData" : config.getList(backend(), "ClassAdData", []),
        "JDLData" : config.getList(backend(), "JDLData", []),
    }
    poolSettings = {
        "hosts" : config.getList(backend(), "PoolHostList", []),
    }
    self.settings = { "jdl" : jdlSettings, "pool" : poolSettings }

    # prepare interfaces for local/remote/ssh pool access
    self._initPoolInterfaces(config)

    # load keys for condor pool ClassAds
    self.poolReqs = config.getDict(backend(), 'poolArgs req', {})[0]
    self.poolQuery = config.getDict(backend(), 'poolArgs query', {})[0]
    self._formatStatusReturnQuery(config)

    # Sandbox base path where individual job data is stored, staged and returned to
    self.sandPath = config.getPath(self._getSections("local"), 'sandbox path',
        config.getWorkPath('sandbox'), mustExist = False)

    # history query is faster with split files - check if and how this is used
    # default condor_history command works WITHOUT explicitly specified file
    self.historyFile = None
    if self.remoteType == poolType.LOCAL:
        rotation = commands.getoutput(self.configValExec + " ENABLE_HISTORY_ROTATION")
        if rotation.lower() == "true":
            historyFile = commands.getoutput(self.configValExec + " HISTORY")
            # Only keep the reported file if it really exists
            if os.path.isfile(historyFile):
                self.historyFile = historyFile

    # broker for selecting Sites
    brokerClass = config.getClass('site broker', 'UserBroker', cls = Broker, tags = [self])
    self.brokerSite = brokerClass.getInstance('sites', 'sites', self.getSites)
    self.debugFlush()
def __init__(self, config, wmsName):
    """Prepare run library, output / fail paths, access token and storage managers."""
    WMS.__init__(self, config, wmsName)

    # Mention the WMS name only if it differs from the upper-cased class name
    backendInfo = self.wmsName
    if self.wmsName != self.__class__.__name__.upper():
        backendInfo = '%s (%s)' % (self.__class__.__name__, self.wmsName)
    utils.vprint('Using batch system: %s' % backendInfo, -1)

    self.errorLog = config.getWorkPath('error.tar')
    self._runlib = config.getWorkPath('gc-run.lib')
    runlibExists = os.path.exists(self._runlib)
    if not runlibExists:
        # Write a copy of the shared run library with the version placeholder replaced
        fp = SafeFile(self._runlib, 'w')
        content = SafeFile(utils.pathShare('gc-run.lib')).read()
        content = content.replace('__GC_VERSION__', __import__('grid_control').__version__)
        fp.write(content)
        fp.close()

    self._outputPath = config.getWorkPath('output')
    utils.ensureDirExists(self._outputPath, 'output directory')
    self._failPath = config.getWorkPath('fail')

    # Initialise access token, broker and storage manager
    self._token = config.getCompositePlugin(['proxy', 'access token'],
        'TrivialAccessToken', 'MultiAccessToken',
        cls = AccessToken, inherit = True, tags = [self])

    # UI -> SE -> WN
    self.smSEIn = config.getPlugin('se input manager', 'SEStorageManager',
        cls = StorageManager, tags = [self],
        pargs = ('se', 'se input', 'SE_INPUT'))
    self.smSBIn = config.getPlugin('sb input manager', 'LocalSBStorageManager',
        cls = StorageManager, tags = [self],
        pargs = ('sandbox', 'sandbox', 'SB_INPUT'))

    # UI <- SE <- WN
    self.smSEOut = config.getPlugin('se output manager', 'SEStorageManager',
        cls = StorageManager, tags = [self],
        pargs = ('se', 'se output', 'SE_OUTPUT'))
    self.smSBOut = None

    self.fileNamesEnvironment = config.getBool("file names environment", True, onChange = None)
def _CreateSocket(self, duration = 60):
    """Start the ControlMaster process and wait for the socket link to appear.

    ``self.sshLink`` is polled every 0.5 seconds.  A notice is printed after
    5 seconds.

    Args:
        duration: argument of the ``sleep`` command run by the master process.

    Returns:
        False if the link is still missing after 10 seconds.  Nothing is
        returned explicitly (i.e. None) once the link exists.
    """
    command = " ".join([
        self.cmd,
        self.defaultArgs,
        "-o ControlMaster=yes",
        self.socketArgsDef,
        self.remoteHost,
        self._argFormat("sleep %d" % duration),
    ])
    self.__ControlMaster = LoggedProcess(command)

    waited = 0
    while not os.path.exists(self.sshLink):
        time.sleep(0.5)
        waited += 0.5
        if waited == 5:
            vprint("SSH socket still not available after 5 seconds...\n%s" % self.sshLink, level=1)
            vprint('Socket process: %s' % (self.__ControlMaster.cmd), level=2)
        if waited == 10:
            return False
def __init__(self, config, wmsName):
    """Initialise the Condor/GlideInWMS backend: debug log, task ID, JDL / pool settings,
    pool interfaces, sandbox path, history file and site broker."""
    utils.vprint('Using batch system: Condor/GlideInWMS', -1)
    BasicWMS.__init__(self, config, wmsName)

    # special debug out/messages/annotations - may have noticeable effect on storage and performance!
    debugLogFN = config.get('debugLog', '')
    if debugLogFN:
        self.debug = open(debugLogFN, 'a')
    else:
        self.debug = False
    ######
    self.taskID = config.get('task id', md5(str(time.time())).hexdigest(), persistent = True) # FIXME!
    initMessage = """ ############################# Initialized Condor/GlideInWMS ############################# Config: %s taskID: %s Name: %s ############################# """
    self.debugOut(initMessage % (config.getConfigName(), self.taskID, wmsName))

    # finalize config state by reading values or setting to defaults
    jdlSettings = {
        'Universe' : config.get('Universe', 'vanilla'),
        'NotifyEmail' : config.get('NotifyEmail', ''),
        'ClassAdData' : config.getList('ClassAdData', []),
        'JDLData' : config.getList('JDLData', []),
    }
    poolSettings = {
        'hosts' : config.getList('PoolHostList', []),
    }
    self.settings = { 'jdl' : jdlSettings, 'pool' : poolSettings }

    # prepare interfaces for local/remote/ssh pool access
    self._initPoolInterfaces(config)

    # load keys for condor pool ClassAds
    self.poolReqs = config.getDict('poolArgs req', {})[0]
    self.poolQuery = config.getDict('poolArgs query', {})[0]
    self._formatStatusReturnQuery(config)

    # Sandbox base path where individual job data is stored, staged and returned to
    defaultSandbox = config.getWorkPath('sandbox')
    self.sandPath = config.getPath('sandbox path', defaultSandbox, mustExist = False)

    # history query is faster with split files - check if and how this is used
    # default condor_history command works WITHOUT explicitly specified file
    self.historyFile = None
    if self.remoteType == PoolType.LOCAL:
        rotation = getoutput(self.configValExec + ' ENABLE_HISTORY_ROTATION')
        if rotation.lower() == 'true':
            historyFile = getoutput(self.configValExec + ' HISTORY')
            # Only keep the reported file if it really exists
            if os.path.isfile(historyFile):
                self.historyFile = historyFile

    # broker for selecting Sites
    self.brokerSite = config.getPlugin('site broker', 'UserBroker',
        cls = Broker, tags = [self],
        pargs = ('sites', 'sites', self.getSites))
    self.debugFlush()
def _setupJobParameters(self, config):
    """Set up dataset provider, splitter and the 'data' parameter source.

    Returns early (``dataSplitter`` stays None, ``_data_refresh`` stays -1)
    when no dataset provider is configured.

    Raises:
        UserError: if the splitter reports a maximum of zero jobs; also raised
            by the change handler when a dataset is added to a task which had none.
    """
    data_config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['dataset'])
    self.dataSplitter = None
    self._data_refresh = -1

    # Change handler of the 'dataset' option: flags a resync and a config rewrite
    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        if (old_obj == '') and (cur_obj != ''):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._log.info('Dataset setup was changed - forcing resync...')
        config.setState(True, 'resync', detail = 'dataset')
        config.setState(True, 'init', detail = 'config') # This will trigger a write of the new options
        return cur_obj

    dataProvider = data_config.getCompositePlugin('dataset', '', ':MultiDatasetProvider:',
        cls = DataProvider, requirePlugin = False, onChange = userRefresh)
    # Remember the resync request and clear the state flag
    self._forceRefresh = config.getState('resync', detail = 'dataset')
    config.setState(False, 'resync', detail = 'dataset')
    if not dataProvider:
        return

    storageView = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['storage'])
    storageView.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    parameterView = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['parameters'])
    parameterView.set('default lookup', 'DATASETNICK')

    requestedSplitter = DataSplitter.getClass(data_config.get('dataset splitter', 'FileBoundarySplitter'))
    self.dataSplitter = dataProvider.checkSplitter(requestedSplitter)(data_config)

    # Create and register dataset parameter source
    partProcessor = data_config.getCompositePlugin('partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls = PartitionProcessor, onChange = triggerResync(['parameters']))
    dataPSClass = ParameterSource.getClass('DataParameterSource')
    self._dataPS = dataPSClass(data_config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, partProcessor)
    dataPSClass.datasetsAvailable['data'] = self._dataPS

    # Select dataset refresh rate
    self._data_refresh = data_config.getTime('dataset refresh', -1, onChange = None)
    if not (self._data_refresh > 0):
        self._dataPS.resyncSetup(interval = 0)
    else:
        # The provider's query limit is the lower bound for the interval
        self._dataPS.resyncSetup(interval = max(self._data_refresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % strTime(self._data_refresh), -1)
    if self._forceRefresh:
        self._dataPS.resyncSetup(force = True)

    # SIGUSR2 forces a resync of the dataset parameter source
    def externalRefresh(sig, frame):
        self._dataPS.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def setupJobParameters(self, config, pm):
    """Set up the dataset provider, splitter and dataset parameter source.

    ``config`` is replaced by a view that includes the 'dataset' section;
    ``pm`` is not used in this method.  Leaves ``self.dataSplitter`` as
    ``None`` and returns early when the 'dataset' option does not yield a
    provider.  Otherwise registers the created parameter source under the
    key 'data', configures its resync behaviour and installs a SIGUSR2
    handler that forces a resync.

    Raises UserError if a dataset is attached to a task that previously had
    none, or if the splitter reports zero jobs.
    """
    config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['dataset'])
    self.dataSplitter = None
    self.dataRefresh = -1
    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        # onChange callback of the 'dataset' option: refuse to go from an
        # empty to a non-empty setting, otherwise flag a dataset resync
        if (old_obj == '') and (cur_obj != ''):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._log.info('Dataset setup was changed - forcing resync...')
        config.setState(True, 'resync', detail = 'dataset')
        config.setState(True, 'init', detail = 'config') # This will trigger a write of the new options
        return cur_obj
    dataProvider = config.getCompositePlugin('dataset', '', ':MultiDatasetProvider:', cls = DataProvider, requirePlugin = False, onChange = userRefresh)
    # Remember the resync request (possibly set by userRefresh above) and clear the flag
    self._forceRefresh = config.getState('resync', detail = 'dataset')
    config.setState(False, 'resync', detail = 'dataset')
    if not dataProvider:
        return

    # Set dataset specific values in the 'storage' and 'parameters' sections
    tmp_config = config.changeView(viewClass = 'TaggedConfigView', setClasses = None, setNames = None, setTags = [], addSections = ['storage'])
    tmp_config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    tmp_config = config.changeView(viewClass = 'TaggedConfigView', setClasses = None, setNames = None, setTags = [], addSections = ['parameters'])
    tmp_config.set('default lookup', 'DATASETNICK')

    # The provider gets the chance to check / replace the requested splitter class
    splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(config)

    # Create and register dataset parameter source
    partProcessor = config.getCompositePlugin('partition processor', 'BasicPartitionProcessor LocationPartitionProcessor', 'MultiPartitionProcessor', cls = PartitionProcessor)
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    self._dataPS = DataParameterSource(config.getWorkPath(), 'data', dataProvider, self.dataSplitter, partProcessor)
    DataParameterSource.datasetsAvailable['data'] = self._dataPS

    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if self.dataRefresh > 0:
        # The interval passed on is never shorter than the provider's query limit
        self._dataPS.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % strTime(self.dataRefresh), -1)
    else:
        self._dataPS.resyncSetup(interval = 0)
    if self._forceRefresh:
        self._dataPS.resyncSetup(force = True)
    def externalRefresh(sig, frame):
        # Signal handler: request a forced resync of the dataset source
        self._dataPS.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def _getCMSSWPaths(self, config):
    """Collect (variable name, path) pairs of local CMSSW installations.

    Locations are only gathered while the 'init' state of the sandbox is
    set.  Every location found is announced to the user.  Returns the list
    of pairs (possibly empty).
    """
    locations = []
    if config.getState('init', detail = 'sandbox'):
        userDir = config.get('cmssw dir', '')
        if userDir != '':
            locations.append(('CMSSW_DIR_USER', userDir))
        releaseTop = self.scramEnv.get('RELEASETOP', None)
        if releaseTop:
            # The project directory is located four levels above RELEASETOP
            locations.append(('CMSSW_DIR_PRO', os.path.normpath('%s/../../../../' % releaseTop)))
    if not locations:
        return locations
    utils.vprint('Local jobs will try to use the CMSSW software located here:', -1)
    position = 0
    for (_varName, path) in locations:
        position += 1
        utils.vprint(' %i) %s' % (position, path), -1)
    return locations
def _configureSCRAMSettings(self, config):
    """Determine the SCRAM project settings from the config / project area.

    Either 'scram project' (exactly two entries: PROJECT VERSION) or
    'project area' may be given, but not both.  With a project area, the
    SCRAM environment is parsed from '.SCRAM/Environment' and from the
    architecture specific '.SCRAM/<arch>/Environment' file.

    Sets self.projectArea, self.scramEnv, self.scramArch, self.scramVersion
    and - for project areas - self.pattern.

    Raises ConfigError on conflicting / malformed options, a missing or
    unparsable project area, or if the project is not CMSSW.
    """
    def parseEnvFile(path):
        # Parse one SCRAM environment file; the handle is always released
        # (the original code left both files open)
        fp = open(path, 'r')
        try:
            return utils.DictFormat().parse(fp, keyParser = {None: str})
        finally:
            fp.close()

    scramProject = config.getList('scram project', [])
    if len(scramProject):
        self.projectArea = config.getPath('project area', '')
        if len(self.projectArea):
            raise ConfigError('Cannot specify both SCRAM project and project area')
        if len(scramProject) != 2:
            raise ConfigError('SCRAM project needs exactly 2 arguments: PROJECT VERSION')
    else:
        self.projectArea = config.getPath('project area')

    if len(self.projectArea):
        self.pattern = config.getList('area files', ['-.*', '-config', 'bin', 'lib', 'python', 'module', '*/data', '*.xml', '*.sql', '*.db', '*.cf[if]', '*.py', '-*/.git', '-*/.svn', '-*/CVS', '-*/work.*'])
        if os.path.exists(self.projectArea):
            utils.vprint('Project area found in: %s' % self.projectArea, -1)
        else:
            raise ConfigError('Specified config area %r does not exist!' % self.projectArea)
        scramPath = os.path.join(self.projectArea, '.SCRAM')
        # try to open it
        try:
            self.scramEnv = parseEnvFile(os.path.join(scramPath, 'Environment'))
        except Exception:
            raise ConfigError('Project area file %s/.SCRAM/Environment cannot be parsed!' % self.projectArea)

        for key in ['SCRAM_PROJECTNAME', 'SCRAM_PROJECTVERSION']:
            if key not in self.scramEnv:
                raise ConfigError('Installed program in project area not recognized.')

        # The first visible directory below .SCRAM is the default architecture;
        # noDefault is appended so that the lookup never fails on an empty list
        default_archs = lfilter(lambda x: os.path.isdir(os.path.join(scramPath, x)) and not x.startswith('.'), os.listdir(scramPath)) + [noDefault]
        default_arch = default_archs[0]
        self.scramArch = config.get('scram arch', default_arch)
        try:
            self.scramEnv.update(parseEnvFile(os.path.join(scramPath, self.scramArch, 'Environment')))
        except Exception:
            raise ConfigError('Project area file .SCRAM/%s/Environment cannot be parsed!' % self.scramArch)
    else:
        self.scramEnv = {
            'SCRAM_PROJECTNAME': scramProject[0],
            'SCRAM_PROJECTVERSION': scramProject[1]
        }
        self.scramArch = config.get('scram arch')

    self.scramVersion = config.get('scram version', 'scramv1')
    if self.scramEnv['SCRAM_PROJECTNAME'] != 'CMSSW':
        raise ConfigError('Project area contains no CMSSW project')
def display(self):
    """Print a table with one entry per job that has left the INIT state.

    Each job contributes a row with job number, state and WMS id.  In
    verbose mode the job history is appended (in reversed item order,
    skipping "N/A" destinations); otherwise only the current destination
    is shown, if there is one.
    """
    reports = []
    for jobNum in self._jobs:
        jobObj = self._jobDB.get(jobNum)
        if not jobObj or (jobObj.state == Job.INIT):
            continue
        reports.append({0: jobNum, 1: Job.enum2str(jobObj.state), 2: jobObj.wmsId})
        if utils.verbosity() > 0:
            # On Python 3 dict.items() is a view without reverse() - copy it
            # into a list first (a harmless copy on Python 2)
            history = list(jobObj.history.items())
            history.reverse()
            for at, dest in history:
                if dest != "N/A":
                    reports.append({1: at, 2: " -> " + dest})
        elif jobObj.get("dest", "N/A") != "N/A":
            reports.append({2: " -> " + jobObj.get("dest")})
    # Materialise the header: zip() is a one-shot iterator on Python 3
    header = list(zip(range(3), ["Job", "Status / Attempt", "Id / Destination"]))
    utils.printTabular(header, reports, "rcl")
    utils.vprint()
def getGCBlocks(self, usePhedex):
    """Yield one block dictionary per non-empty CMS dataset block.

    usePhedex -- if true, the block locations are taken from a phedex
                 query running in parallel to the file query; otherwise
                 the site list delivered with the block is used.

    Every yielded dictionary holds the DataProvider keys Dataset,
    BlockName, FileList, Locations and - if lumi sections were selected -
    Metadata.  Blocks without files are skipped.

    Raises DatasetError on duplicated blocks or if a dataset does not
    provide any block with files.
    """
    seenBlocks = set() # set instead of list: constant time duplicate check
    for datasetPath in self.getCMSDatasets():
        counter = 0
        for (blockPath, listSE) in self.getCMSBlocks(datasetPath, getSites = not usePhedex):
            if blockPath in seenBlocks:
                raise DatasetError('CMS source provided duplicate blocks! %s' % blockPath)
            seenBlocks.add(blockPath)
            blockParts = blockPath.split('#')
            result = {}
            result[DataProvider.Dataset] = blockParts[0]
            result[DataProvider.BlockName] = blockParts[1]

            if usePhedex: # Start parallel phedex query
                dictSE = {}
                tPhedex = utils.gcStartThread("Query phedex site info for %s" % blockPath, self.getPhedexSEList, blockPath, dictSE)

            if self.selectedLumis:
                result[DataProvider.Metadata] = ['Runs']
                if self.includeLumi:
                    result[DataProvider.Metadata].append('Lumi')
            result[DataProvider.FileList] = list(self.getCMSFiles(blockPath))
            if self.checkUnique:
                uniqueURLs = set(map(lambda x: x[DataProvider.URL], result[DataProvider.FileList]))
                if len(result[DataProvider.FileList]) != len(uniqueURLs):
                    # The message has two placeholders - the block name was
                    # missing from the arguments, which raised a TypeError
                    utils.vprint('Warning: The webservice returned %d duplicated files in dataset block %s! Continuing with unique files...' %
                        (len(result[DataProvider.FileList]) - len(uniqueURLs), blockPath), -1)
                uniqueFIs = []
                # Keep only the first file entry for each URL
                for fi in result[DataProvider.FileList]:
                    if fi[DataProvider.URL] in uniqueURLs:
                        uniqueURLs.remove(fi[DataProvider.URL])
                        uniqueFIs.append(fi)
                result[DataProvider.FileList] = uniqueFIs

            if usePhedex:
                # Wait for the phedex query and use its result as location list
                tPhedex.join()
                listSE = dictSE.get(blockPath)
            result[DataProvider.Locations] = listSE

            if len(result[DataProvider.FileList]):
                counter += 1
                yield result

        if (counter == 0) and self.selectedLumis:
            raise DatasetError('Dataset %s does not contain the requested run/lumi sections!' % datasetPath)
        elif counter == 0:
            raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
def _getCMSSWPaths(self, config):
    """Return the list of (variable name, path) pairs of usable CMSSW areas.

    The user directory ('cmssw dir' option) and the project directory
    derived from RELEASETOP are only considered while the 'init' state of
    the sandbox is set.  A non-empty result is reported to the user.
    """
    found = []
    collect = config.getState('init', detail='sandbox')
    if collect:
        customDir = config.get('cmssw dir', '')
        if customDir != '':
            found.append(('CMSSW_DIR_USER', customDir))
        if self.scramEnv.get('RELEASETOP', None):
            # The project directory is located four levels above RELEASETOP
            relativeDir = '%s/../../../../' % self.scramEnv['RELEASETOP']
            found.append(('CMSSW_DIR_PRO', os.path.normpath(relativeDir)))
    if found:
        utils.vprint('Local jobs will try to use the CMSSW software located here:', -1)
        for idx, entry in enumerate(found):
            line = ' %i) %s' % (idx + 1, entry[1])
            utils.vprint(line, -1)
    return found
def display(self):
    """Print a table with one entry per job that has left the INIT state.

    Each job contributes a row with job number, state name and WMS id.  In
    verbose mode the job history is appended (in reversed item order,
    skipping 'N/A' destinations); otherwise only the current destination
    is shown, if there is one.
    """
    reports = []
    for jobNum in self._jobs:
        jobObj = self._jobDB.get(jobNum)
        if not jobObj or (jobObj.state == Job.INIT):
            continue
        reports.append({0: jobNum, 1: Job.states[jobObj.state], 2: jobObj.wmsId})
        if utils.verbosity() > 0:
            # On Python 3 dict.items() is a view without reverse() - copy it
            # into a list first (a harmless copy on Python 2)
            history = list(jobObj.history.items())
            history.reverse()
            for at, dest in history:
                if dest != 'N/A':
                    reports.append({1: at, 2: ' -> ' + dest})
        elif jobObj.get('dest', 'N/A') != 'N/A':
            reports.append({2: ' -> ' + jobObj.get('dest')})
    # Materialise the header: zip() is a one-shot iterator on Python 3
    header = list(zip(range(3), ['Job', 'Status / Attempt', 'Id / Destination']))
    utils.printTabular(header, reports, 'rcl')
    utils.vprint()
def displayWorkflow(self):
    """Show the report, announce the run mode and start the job cycle.

    A negative workflow duration is announced as continuous mode, a
    positive one as a time limited run; a duration of zero is not announced.
    """
    utils.vprint(level=-1)
    self._report.display()
    utils.vprint(level=-1)
    duration = self._workflow.duration
    if duration > 0:
        utils.vprint('Running for %s' % strTimeShort(duration), -1)
    elif duration < 0:
        utils.vprint('Running in continuous mode. Press ^C to exit.', -1)
    self._workflow.jobCycle()
def setupJobParameters(self, config, pm):
    """Set up the dataset provider, splitter and dataset parameter source.

    ``config`` is replaced by a view that includes the 'dataset' section
    and is tagged with this object; ``pm`` is not used in this method.
    Leaves ``self.dataSplitter`` as ``None`` and returns early when the
    'dataset' option is empty.  Otherwise registers the created parameter
    source under the key 'data', configures its resync behaviour and
    installs a SIGUSR2 handler that forces a resync.

    Raises UserError if a dataset is attached to a task that previously had
    none, or if the splitter reports zero jobs.
    """
    config = config.changeView(viewClass = TaggedConfigView, addSections = ['dataset'], addTags = [self])
    self.dataSplitter = None
    self.dataRefresh = None
    self._forceRefresh = config.getState('resync', detail = 'dataset', default = False)
    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        # onChange callback of the 'dataset' option: refuse to go from an
        # empty to a non-empty setting, otherwise request a forced resync
        if ((old_obj == '') and (cur_obj != '')):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._forceRefresh = True
        return cur_obj
    self.dataset = config.get('dataset', '', onChange = userRefresh).strip()
    if self.dataset == '':
        return
    config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    config.set('default lookup', 'DATASETNICK')

    defaultProvider = config.get('dataset provider', 'ListProvider')
    dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
    # The provider gets the chance to check / replace the requested splitter class
    splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(config)

    # Create and register dataset parameter source
    paramSplitProcessor = config.getCompositePlugin('dataset processor', 'BasicDataSplitProcessor SECheckSplitProcessor', 'MultiDataSplitProcessor', cls = DataSplitProcessor).getInstance(config)
    paramSource = DataParameterSource(config.getWorkPath(), 'data', dataProvider, self.dataSplitter, paramSplitProcessor)
    DataParameterSource.datasetsAvailable['data'] = paramSource

    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if self.dataRefresh > 0:
        # The interval passed on is never shorter than the provider's query limit
        paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
    else:
        paramSource.resyncSetup(interval = 0)
    if self._forceRefresh:
        paramSource.resyncSetup(force = True)
    def externalRefresh(sig, frame):
        # Signal handler: request a forced resync of the dataset source
        paramSource.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def display(self):
    """Print one table row per job category with its job counts.

    Job states are folded into the four columns WAITING, RUNNING, FAILED
    and SUCCESS; DONE counts as RUNNING and every state without an explicit
    mapping counts as WAITING.  The category description keys form the
    leading (sorted) columns.
    """
    (catStateDict, catDescDict, _) = CategoryReport._getCategoryStateSummary(self)
    rows = []
    descKeys = set()
    groupOf = {Job.SUCCESS: 'SUCCESS', Job.FAILED: 'FAILED',
        Job.RUNNING: 'RUNNING', Job.DONE: 'RUNNING'}
    for catKey in catDescDict:
        row = dict(catDescDict[catKey])
        descKeys.update(row.keys())
        stateCounts = catStateDict[catKey]
        for stateKey in stateCounts:
            group = groupOf.get(stateKey, 'WAITING')
            row[group] = row.get(group, 0) + stateCounts[stateKey]
        rows.append(row)

    def fmtCount(value):
        # Missing / unparsable counts are displayed as 0
        return '%7d' % parseStr(value, int, 0)

    columns = ['WAITING', 'RUNNING', 'FAILED', 'SUCCESS']
    utils.vprint(level = -1)
    header = lmap(lambda name: (name, name), sorted(descKeys) + columns)
    utils.printTabular(header, rows, 'c' * len(descKeys), fmt = dict.fromkeys(columns, fmtCount))
    utils.vprint(level = -1)
def getPhedexSEList(self, blockPath, dictSE):
    """Store the replica locations of a block in dictSE[blockPath].

    The block replicas are queried from the phedex data service.  Replicas
    rejected by self.nodeFilter are ignored; the remaining ones are added
    in the format selected by self.locationFormat ('hostname', 'sitedb' or
    'both').  Replicas without usable location trigger a warning.
    """
    found = []
    dictSE[blockPath] = found
    url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
    reply = readJSON(url, {'block': blockPath})
    for phedexBlock in reply['phedex']['block']:
        for replica in phedexBlock['replica']:
            if not self.nodeFilter(replica['node'], replica['complete'] == 'y'):
                continue
            fmt = self.locationFormat
            location = None
            if fmt == 'hostname':
                location = replica.get('se')
            elif fmt == 'sitedb':
                location = replica.get('node')
            elif fmt == 'both':
                if replica.get('node') or replica.get('se'):
                    location = '%s/%s' % (replica.get('node'), replica.get('se'))
            if not location:
                utils.vprint('Warning: Dataset block %s replica at %s / %s is skipped!' %
                    (blockPath, replica.get('node'), replica.get('se')) , -1)
                continue
            found.append(location)
def initPSpace(self):
    """Return the list of (parameter number, lookup index) combinations.

    For every parameter number of the underlying source (or None, if the
    source has no parameter limit), the parameter info is filled and handed
    to the matcher; each lookup result contributes one entry.  A message is
    shown if nothing matched at all.
    """
    entries = []

    def collect(pNum):
        info = {ParameterInfo.ACTIVE: True, ParameterInfo.REQS: []}
        self._psource.fillParameterInfo(pNum, info)
        matches = self._matcher.lookup(info)
        if not matches:
            return
        entries.extend([(pNum, idx) for (idx, _match) in enumerate(matches)])

    if self._psource.getMaxParameters() is not None:
        for pNum in irange(self._psource.getMaxParameters()):
            collect(pNum)
    else:
        collect(None)
    if not len(entries):
        utils.vprint('Lookup parameter "%s" has no matching entries!' % self._key, -1)
    return entries
def __init__(self, config, name, abort = None):
    """Create task, backend, monitoring, job manager and GUI of the workflow.

    abort -- if set to 'task', the setup stops right after the task module
             was created.
    """
    NamedPlugin.__init__(self, config, name)

    # Workdir settings
    self._workDir = config.getWorkPath()
    self._checkSpace = config.getInt('workdir space', 10, onChange = None)

    # Initialise task module
    self.task = config.getPlugin(['module', 'task'], cls = TaskModule, tags = [self])
    if abort == 'task':
        return
    utils.vprint('Current task ID: %s' % self.task.taskID, -1)
    utils.vprint('Task started on %s' % self.task.taskDate, -1)

    # Initialise workload management interface
    self.wms = config.getCompositePlugin('backend', 'grid', 'MultiWMS',
        cls = WMS, tags = [self, self.task])

    # Subsequent config calls also include section "jobs":
    jobsView = config.changeView(viewClass = 'TaggedConfigView',
        addSections = ['jobs'], addTags = [self])

    # Initialise monitoring module
    self.monitor = jobsView.getCompositePlugin('monitor', 'scripts', 'MultiMonitor',
        cls = Monitoring, tags = [self, self.task], pargs = (self.task,))

    # Initialise job database
    self.jobManager = jobsView.getPlugin('job manager', 'SimpleJobManager',
        cls = JobManager, tags = [self, self.task, self.wms],
        pargs = (self.task, self.monitor))

    # Prepare work package
    self.wms.deployTask(self.task, self.monitor)

    # Configure workflow settings
    self._actionList = jobsView.getList('action', ['check', 'retrieve', 'submit'], onChange = None)
    # The legacy option 'continuous' only changes the default of 'duration'
    if jobsView.getBool('continuous', False, onChange = None):
        defaultDuration = -1
    else:
        defaultDuration = 0
    self.duration = jobsView.getTime('duration', defaultDuration, onChange = None)
    self._submitFlag = jobsView.getBool('submission', True, onChange = None)
    self._submitTime = jobsView.getTime('submission time requirement',
        self.task.wallTime, onChange = None)

    # Initialise GUI
    self._gui = jobsView.getPlugin('gui', 'SimpleConsole', cls = GUI,
        onChange = None, pargs = (self,))
def getJobConfig(self, jobNum):
    """Return the CMSSW job config extended by nickname specific settings.

    The variables belonging to the dataset nickname of the job are merged
    into the config and the lumi range is replaced by the active lumi
    filter.  In verbose mode the resulting settings are displayed.
    """
    jobConfig = CMSSW.getJobConfig(self, jobNum)
    nickVars = self.getVarsForNick(jobConfig.get("DATASETNICK"))
    jobConfig.update(nickVars)
    jobConfig["LUMI_RANGE"] = self.getActiveLumiFilter(jobConfig["LUMI_RANGE"], jobNum)
    if utils.verbosity() <= 0:
        return jobConfig
    utils.vprint("Nickname: %s" % jobConfig.get("DATASETNICK"), 1)
    utils.vprint(" * Config files: %s" % jobConfig["CMSSW_CONFIG"], 1)
    utils.vprint(" * Lumi range: %s" % jobConfig["LUMI_RANGE"], 1)
    # Config files and lumi range were already shown above
    shown = ["CMSSW_CONFIG", "LUMI_RANGE"]
    otherVars = utils.filterDict(nickVars, lambda k: k not in shown)
    utils.vprint(" * Variables: %s" % otherVars, 1)
    return jobConfig
def _getUserSource(self, pExpr, parent):
    """Translate a parameter expression into a single parameter source.

    pExpr  -- parameter expression string
    parent -- optional parameter source that is appended to the sources
              derived from the expression

    An available, but so far unused dataset source is put in front of the
    other sources.  The sources are crossed if more than one of them has a
    parameter limit and zipped otherwise.  Finally the elevated sources
    are wrapped around the result.
    """
    operators = self.precedence.keys()
    rawTokens = tokenize(pExpr, lchain([operators, list('()[]<>')]))
    tokens = list(tok2inlinetok(rawTokens, list(self.precedence.keys())))
    utils.vprint('Parsing parameter string: "%s"' % str.join(' ', imap(str, tokens)), 0)
    tree = tok2tree(tokens, self.precedence)

    sources = self.tree2expr(tree)
    if DataParameterSource.datasetsAvailable and not DataParameterSource.datasetsUsed:
        sources.insert(0, DataParameterSource.create())
    if parent:
        sources.append(parent)

    limited = [src for src in sources if src.getMaxParameters() is not None]
    if len(limited) > 1:
        combiner = CrossParameterSource
    else:
        combiner = ZipLongParameterSource # zip more efficient
    combined = self.combineSources(combiner, sources)
    assert(len(combined) == 1)
    result = combined[0]
    for (PSourceClass, args) in self.elevatedSwitch:
        result = PSourceClass(result, *args)
    utils.vprint('Parsing output: %r' % result, 0)
    return result
def initPSpace(self):
    """Build the list of (parameter number, lookup index) pairs.

    Each parameter number of the underlying source - or just None for a
    source without parameter limit - is matched against the lookup table;
    every lookup result yields one pair.  The user is informed if the
    resulting list is empty.
    """
    pairs = []

    def register(pNum):
        info = {ParameterInfo.ACTIVE: True, ParameterInfo.REQS: []}
        self._psource.fillParameterInfo(pNum, info)
        found = self._matcher.lookup(info)
        if found:
            position = 0
            for _unused in found:
                pairs.append((pNum, position))
                position += 1

    unlimited = self._psource.getMaxParameters() is None
    if unlimited:
        register(None)
    else:
        for pNum in irange(self._psource.getMaxParameters()):
            register(pNum)
    if len(pairs) == 0:
        message = 'Lookup parameter "%s" has no matching entries!' % self._key
        utils.vprint(message, -1)
    return pairs
def doTransfer(self, listDescSourceTarget):
    """Copy each (description, source, target) entry to all storage paths.

    A copy may take up to five minutes before it is terminated.  After a
    failed copy the user is asked whether the file is available anyway.

    Raises ConfigError if no storage path is set and StorageError if a
    file is missing on a storage element.
    """
    for (desc, source, target) in listDescSourceTarget:
        if not self.smPaths:
            raise ConfigError("%s can't be transferred because '%s path wasn't set" % (desc, self.smOptPrefix))
        for idx, sePath in enumerate(set(self.smPaths)):
            utils.vprint('Copy %s to SE %d ' % (desc, idx + 1), -1, newline=False)
            sys.stdout.flush()
            destination = os.path.join(sePath, target)
            proc = se_copy(source, destination, self.smForce)
            exitCode = proc.status(timeout=5 * 60, terminate=True)
            if exitCode == 0:
                utils.vprint('finished', -1)
                continue
            utils.vprint('failed', -1)
            utils.eprint(proc.stderr.read(timeout=0))
            utils.eprint('Unable to copy %s! You can try to copy it manually.' % desc)
            question = 'Is %s (%s) available on SE %s?' % (desc, source, sePath)
            if not utils.getUserBool(question, False):
                raise StorageError('%s is missing on SE %s!' % (desc, sePath))
def display(self):
    """Print a table with one entry per job that has left the INIT state.

    Each job contributes a row with job number, state and WMS id.  In
    verbose mode the job history is appended (in reversed item order,
    skipping 'N/A' destinations); otherwise only the current destination
    is shown, if there is one.
    """
    reports = []
    for jobNum in self._jobs:
        jobObj = self._jobDB.get(jobNum)
        if not jobObj or (jobObj.state == Job.INIT):
            continue
        reports.append({
            0: jobNum,
            1: Job.enum2str(jobObj.state),
            2: jobObj.wmsId
        })
        if utils.verbosity() > 0:
            # On Python 3 dict.items() is a view without reverse() - copy it
            # into a list first (a harmless copy on Python 2)
            history = list(jobObj.history.items())
            history.reverse()
            for at, dest in history:
                if dest != 'N/A':
                    reports.append({1: at, 2: ' -> ' + dest})
        elif jobObj.get('dest', 'N/A') != 'N/A':
            reports.append({2: ' -> ' + jobObj.get('dest')})
    utils.printTabular(
        lzip(irange(3), ['Job', 'Status / Attempt', 'Id / Destination']),
        reports, 'rcl')
    utils.vprint()
def getPhedexSEList(self, blockPath, dictSE):
    """Store the replica locations of a block in dictSE[blockPath].

    The block replicas are queried from the phedex data service.  Replicas
    rejected by self.nodeFilter are ignored; the remaining ones are added
    in the format selected by self._locationFormat (hostname, siteDB or
    both).  Replicas without usable location trigger a warning.
    """
    locations = []
    dictSE[blockPath] = locations
    url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
    blockList = readJSON(url, {'block': blockPath})['phedex']['block']
    for phedexBlock in blockList:
        for replica in phedexBlock['replica']:
            accepted = self.nodeFilter(replica['node'], replica['complete'] == 'y')
            if not accepted:
                continue
            selected = self._locationFormat
            location = None
            if selected == CMSLocationFormat.hostname:
                location = replica.get('se')
            elif selected == CMSLocationFormat.siteDB:
                location = replica.get('node')
            elif selected == CMSLocationFormat.both:
                if replica.get('node') or replica.get('se'):
                    location = '%s/%s' % (replica.get('node'), replica.get('se'))
            if location:
                locations.append(location)
                continue
            utils.vprint('Warning: Dataset block %s replica at %s / %s is skipped!' %
                (blockPath, replica.get('node'), replica.get('se')), -1)
def _CleanSocket(self):
    """Wait until the ssh socket link is gone; always returns True.

    NOTE(review): the command that closes the socket and both timeouts are
    commented out, so this blocks for as long as self.sshLink exists.
    """
    if os.path.exists(self.sshLink):
        vprint("Killing Socket %s" % self.sshLink)
        # killSocket = LoggedProcess( " ".join([self.cmd, self.defaultArgs, self.socketArgsDef, "-O exit", self.remoteHost]) )
        # while killSocket.poll() == -1:
        #     print "poll", killSocket.poll()
        #     time.sleep(0.5)
        #     timeout += 0.5
        #     if timeout == 5:
        #         vprint("Failed to cancel ssh Socket...\n%s" % self.sshLink, level=1)
        #         return False
        # print "done", killSocket.poll()
        waited = 0
        while os.path.exists(self.sshLink):
            vprint("exists %d" % waited)
            time.sleep(0.5)
            waited += 0.5
            #if timeout == 5:
            #    vprint("Failed to remove ssh Socket...\n%s" % self.sshLink, level=1)
            #    return False
    else:
        vprint("No Socket %s" % self.sshLink)
    return True
def submitJobs(self, jobNumList, task):
    """Report that the submission request is discarded; nothing is submitted."""
    # jobNumList = [1, 2, ...]
    message = 'Inactive WMS (%s): Discarded submission of %d jobs' % (
        self.wmsName, len(jobNumList))
    utils.vprint(message, -1)
def display(self):
    """Print the report summary with job counts and percentages per state.

    Percentages are relative to the size of the job database.  Always
    returns 0.
    """
    summary = lmap(lambda x: 0.0, Job.enumNames)
    defaultJob = Job()
    for jobNum in self._jobs:
        summary[self._jobDB.get(jobNum, defaultJob).state] += 1

    njobs_total = len(self._jobDB)
    # Clamp the divisor: an empty job database used to raise ZeroDivisionError
    divisor = max(1, njobs_total)

    def makeSum(*states):
        # Number of jobs in any of the given states
        return sum(imap(lambda z: summary[z], states))

    def makePer(*states):
        # [number of jobs, rounded percentage] for the given states
        count = makeSum(*states)
        return [count, round(count / divisor * 100.0)]

    # Print report summary
    self._printHeader('REPORT SUMMARY:')
    jobov_succ = makePer(Job.SUCCESS)
    utils.vprint('Total number of jobs:%9d Successful jobs:%8d %3d%%' % tuple([njobs_total] + jobov_succ), -1)
    njobs_assigned = makeSum(Job.SUBMITTED, Job.WAITING, Job.READY, Job.QUEUED, Job.RUNNING)
    jobov_fail = makePer(Job.ABORTED, Job.CANCELLED, Job.FAILED)
    utils.vprint('Jobs assigned to WMS:%9d Failing jobs:%8d %3d%%' % tuple([njobs_assigned] + jobov_fail), -1)
    utils.vprint(' ' * 65 + '\nDetailed Status Information: ', -1, newline=False)
    # Jobs of the database that were not selected for this report
    ignored = njobs_total - sum(summary)
    if ignored:
        utils.vprint('(Jobs IGNORED:%8d %3d%%)' % (ignored, ignored / divisor * 100.0), -1)
    else:
        utils.vprint(' ' * 31, -1)
    # Two states are shown per output line (newline only after odd state numbers)
    for stateNum, category in enumerate(Job.enumNames):
        utils.vprint('Jobs %9s:%8d %3d%% ' % tuple([category] + makePer(stateNum)), -1, newline=stateNum % 2)
    utils.vprint('-' * 65, -1)
    return 0
def _printHeader(self, message, level=-1):
    """Print a 65 character wide header with message (left) and self._header (right)."""
    width = 65
    utils.vprint('-' * width, level)
    padded = self._header.rjust(width - len(message))
    utils.vprint(message + padded, level)
    shortRule = '-' * 15
    utils.vprint(shortRule.ljust(width), level)
def __init__(self, config, name, abort=None):
    """Create task, backend, monitoring, job manager and GUI of the workflow.

    abort -- if set to 'task', the setup stops right after the task module
             was created.
    """
    NamedPlugin.__init__(self, config, name)

    # Workdir settings
    self._workDir = config.getWorkPath()
    self._checkSpace = config.getInt('workdir space', 10, onChange=None)

    # Initialise task module
    self.task = config.getPlugin(['module', 'task'], cls=TaskModule, tags=[self])
    if abort == 'task':
        return
    utils.vprint('Current task ID: %s' % self.task.taskID, -1)
    utils.vprint('Task started on %s' % self.task.taskDate, -1)

    # Initialise workload management interface
    self.wms = config.getCompositePlugin(
        'backend', 'grid', 'MultiWMS', cls=WMS, tags=[self, self.task])

    # Subsequent config calls also include section "jobs":
    cfgJobs = config.changeView(
        viewClass='TaggedConfigView', addSections=['jobs'], addTags=[self])

    # Initialise monitoring module
    self.monitor = cfgJobs.getCompositePlugin(
        'monitor', 'scripts', 'MultiMonitor', cls=Monitoring,
        tags=[self, self.task], pargs=(self.task, ))

    # Initialise job database
    self.jobManager = cfgJobs.getPlugin(
        'job manager', 'SimpleJobManager', cls=JobManager,
        tags=[self, self.task, self.wms], pargs=(self.task, self.monitor))

    # Prepare work package
    self.wms.deployTask(self.task, self.monitor)

    # Configure workflow settings
    defaultActions = ['check', 'retrieve', 'submit']
    self._actionList = cfgJobs.getList('action', defaultActions, onChange=None)
    fallbackDuration = 0
    if cfgJobs.getBool('continuous', False, onChange=None): # legacy option
        fallbackDuration = -1
    self.duration = cfgJobs.getTime('duration', fallbackDuration, onChange=None)
    self._submitFlag = cfgJobs.getBool('submission', True, onChange=None)
    self._submitTime = cfgJobs.getTime(
        'submission time requirement', self.task.wallTime, onChange=None)

    # Initialise GUI
    self._gui = cfgJobs.getPlugin(
        'gui', 'SimpleConsole', cls=GUI, onChange=None, pargs=(self, ))
def checkJobs(self, ids):
    """Report that the check request is discarded; no job is checked."""
    # ids = [(WMS-61226, 1), (WMS-61227, 2), ...]
    message = 'Inactive WMS (%s): Discarded check of %d jobs' % (
        self.wmsName, len(ids))
    utils.vprint(message, -1)
def cancelJobs(self, ids):
    """Report that the abort request is discarded; no job is cancelled."""
    message = 'Inactive WMS (%s): Discarded abort of %d jobs' % (
        self.wmsName, len(ids))
    utils.vprint(message, -1)
def checkJobs(self, wmsJobIdList):
    """Yield (job id, wms id, status, job info) for the requested jobs.

    wmsJobIdList -- list of (wms id, job number) pairs

    The status executable is queried first.  If it finished successfully,
    every requested job that it did not report is yielded as Job.DONE.
    The additional history query is currently disabled.

    Raises BackendError if a status line cannot be processed.
    """
    if len(wmsJobIdList) == 0:
        # A plain return ends the generator; raising StopIteration inside a
        # generator is turned into a RuntimeError by Python 3.7+ (PEP 479)
        return
    self.debugOut('Started checking: %s' % set(lzip(*wmsJobIdList)[0]))
    self.debugPool()

    wmsIdList = list(self._getRawIDs(wmsJobIdList))
    wmsIdArgument = ' '.join(wmsIdList)
    wmsToJobMap = dict(wmsJobIdList)

    activity = utils.ActivityLog('fetching job status')
    statusProcess = self.Pool.LoggedExecute(self.statusExec, '%(format)s %(jobIDs)s' % {"jobIDs": wmsIdArgument, "format": self.statusReturnFormat})
    activity.finish()

    activity = utils.ActivityLog('checking job status')
    # process all lines of the status executable output
    utils.vprint('querrying condor_q', 2)
    for statusReturnLine in statusProcess.iter():
        try:
            # test if wmsID job was requested, then extract data and remove from check list
            if statusReturnLine.split()[0] in wmsIdList:
                (jobID, wmsID, status, jobinfo) = self._statusReturnLineRead(statusReturnLine)
                wmsIdList.remove(wmsID)
                yield (jobID, self._createId(wmsID), status, jobinfo)
        except Exception:
            raise BackendError('Error reading job status info:\n%s' % statusReturnLine)

    # cleanup after final yield
    retCode = statusProcess.wait()
    if retCode != 0:
        if self.explainError(statusProcess, retCode):
            pass
        else:
            statusProcess.logError(self.errorLog, brief=True)
    activity.finish()

    self.debugOut("Remaining after condor_q: %s" % wmsIdList)
    # jobs not in queue have either succeeded or failed - both is considered 'Done' for GC
    # if no additional information is required, consider everything we couldn't find as done
    if retCode == 0:
        for wmsID in list(wmsIdList): # iterate over a copy - the list is modified
            wmsIdList.remove(wmsID)
            wmsID = self._createId(wmsID)
            yield (wmsToJobMap[wmsID], wmsID, Job.DONE, {})
    # TODO: query log on properly configured pool
    # querying the history can be SLOW! only do when necessary and possible
    # (this branch is deliberately disabled by the leading 'False')
    if False and len(wmsIdList) > 0 and self.remoteType != PoolType.SPOOL:
        utils.vprint('querrying condor_history', 2)
        # querying the history can be VERY slow! Only do so bit by bit if possible
        if self.historyFile:
            historyList = sorted(["-f " + file for file in ifilter(os.path.isfile, glob.glob(self.historyFile + "*"))])
        else:
            historyList = [""]
        # query the history file by file until no more jobs need updating
        for historyFile in historyList:
            if len(wmsIdList) > 0:
                statusArgs = '%(fileQuery)s %(format)s %(jobIDs)s' % {"fileQuery": historyFile, "jobIDs": " ", "format": self.statusReturnFormat}
                statusProcess = self.Pool.LoggedExecute(self.historyExec, statusArgs)
                for statusReturnLine in statusProcess.iter():
                    # test if line starts with a number and was requested
                    try:
                        # test if wmsID job was requested, then extract data and remove from check list
                        if statusReturnLine.split()[0] in wmsIdList:
                            (jobID, wmsID, status, jobinfo) = self._statusReturnLineRead(statusReturnLine)
                            wmsIdList.remove(wmsID)
                            yield (jobID, self._createId(wmsID), status, jobinfo)
                    except Exception:
                        raise BackendError('Error reading job status info:\n%s' % statusReturnLine)

                # cleanup after final yield
                retCode = statusProcess.wait()
                if retCode != 0:
                    if self.explainError(statusProcess, retCode):
                        pass
                    else:
                        statusProcess.logError(self.errorLog, brief=True)
    self.debugFlush()
def __init__(self, config, wmsName):
    """Set up the Condor/GlideInWMS backend from the given config.

    Reads the debug log, task id, JDL and pool settings, initialises the
    pool interfaces and the status query format, determines the sandbox
    path and the condor history file and creates the site broker.
    """
    utils.vprint('Using batch system: Condor/GlideInWMS', -1)
    BasicWMS.__init__(self, config, wmsName)
    # special debug out/messages/annotations - may have noticeable effect on storage and performance!
    debugLogFN = config.get('debugLog', '')
    # self.debug is either False or the debug log file opened for appending
    self.debug = False
    if debugLogFN:
        self.debug = open(debugLogFN, 'a')
    ######
    # The default task id is derived from the current time and stored persistently
    self.taskID = config.get('task id', md5(str(time.time())).hexdigest(), persistent=True) # FIXME!
    self.debugOut(""" ############################# Initialized Condor/GlideInWMS ############################# Config: %s taskID: %s Name: %s ############################# """ % (config.getConfigName(), self.taskID, wmsName))
    # finalize config state by reading values or setting to defaults
    self.settings = {
        'jdl': {
            'Universe': config.get('Universe', 'vanilla'),
            'NotifyEmail': config.get('NotifyEmail', ''),
            'ClassAdData': config.getList('ClassAdData', []),
            'JDLData': config.getList('JDLData', [])
        },
        'pool': {
            'hosts': config.getList('PoolHostList', [])
        }
    }
    # prepare interfaces for local/remote/ssh pool access
    self._initPoolInterfaces(config)
    # load keys for condor pool ClassAds
    # (only the first element returned by getDict is kept)
    self.poolReqs = config.getDict('poolArgs req', {})[0]
    self.poolQuery = config.getDict('poolArgs query', {})[0]
    self._formatStatusReturnQuery(config)
    # Sandbox base path where individual job data is stored, staged and returned to
    self.sandPath = config.getPath('sandbox path', config.getWorkPath('sandbox'), mustExist=False)
    # history query is faster with split files - check if and how this is used
    # default condor_history command works WITHOUT explicitly specified file
    self.historyFile = None
    if self.remoteType == PoolType.LOCAL and getoutput(self.configValExec + ' ENABLE_HISTORY_ROTATION').lower() == 'true':
        self.historyFile = getoutput(self.configValExec + ' HISTORY')
        # Ignore the reported history file if it does not exist
        if not os.path.isfile(self.historyFile):
            self.historyFile = None
    # broker for selecting Sites
    self.brokerSite = config.getPlugin('site broker', 'UserBroker', cls=Broker, tags=[self], pargs=('sites', 'sites', self.getSites))
    self.debugFlush()
def retrieveJobs(self, ids):
    """Report that the retrieval request is discarded; nothing is retrieved."""
    message = 'Inactive WMS (%s): Discarded retrieval of %d jobs' % (
        self.wmsName, len(ids))
    utils.vprint(message, -1)