def __init__(self, config):
    """
    Set up the plugin from the config.BossAir.MockPlugin section.

    Reads the optional 'mockPluginProcesses' (default 4) and 'jobRunTime'
    (default 120) settings and the mandatory 'fakeReport' setting, which
    must name an existing file.

    Raises BossAirPluginException if the MockPlugin section is missing, if
    'fakeReport' is not set, or if the file it names does not exist.
    """
    BasePlugin.__init__(self, config)
    self.config = config
    self.pool = []
    self.createdReport = []
    self.myinput = None

    # Identity comparison: None is a singleton, and '==' could be answered
    # by a custom __eq__ on the configuration object.
    mockConfig = getattr(config.BossAir, 'MockPlugin', None)
    if mockConfig is None:
        msg = "Missing required config.BossAir.MockPlugin section"
        raise BossAirPluginException(msg)

    self.nProcess = getattr(mockConfig, 'mockPluginProcesses', 4)
    # Default job running time is two hours (the value is in minutes)
    self.jobRunTime = getattr(mockConfig, 'jobRunTime', 120)
    logging.info('Job Running time set to minutes %s', self.jobRunTime)

    self.fakeReport = getattr(mockConfig, 'fakeReport', None)
    if self.fakeReport is None:
        msg = 'config.BossAir.MockPlugin.fakeReport is a required parameter'
        raise BossAirPluginException(msg)
    if not os.path.isfile(self.fakeReport):
        msg = 'Cannot find %s file' % self.fakeReport
        raise BossAirPluginException(msg)

    self.jobsScheduledEnd = {}

    self.states = ['New', 'Timeout', 'Submitted', 'Waiting', 'Ready',
                   'Scheduled', 'Running', 'Done(failed)', 'Done',
                   'Aborted', 'Cleared', 'Cancelled by user', 'Cancelled']
def updateJobInformation(self, workflow, task, **kwargs):
    """
    _updateJobInformation_

    Update job information for all jobs in the workflow and task,
    the change will take effect if the job is Idle or becomes idle.

    The currently supported changes are only priority for which both the task
    (taskPriority) and workflow priority (requestPriority) must be provided.
    If either keyword is missing, nothing is done.

    The update is performed with condor_qedit. When it exits non-zero,
    condor_q is used to check whether any job still carries a different
    priority; only then is a BossAirPluginException raised, reporting the
    exit code and stderr of the failed condor_qedit call.
    """
    if 'taskPriority' not in kwargs or 'requestPriority' not in kwargs:
        return

    # Do a priority update
    priority = (int(kwargs['requestPriority']) +
                int(kwargs['taskPriority'] * self.maxTaskPriority))
    command = 'condor_qedit -constraint \'WMAgent_SubTaskName == "%s" && WMAgent_RequestName == "%s" ' % (task, workflow)
    command += '&& (JobPrio != %d)\' JobPrio %d' % (priority, priority)
    command = shlex.split(command)
    proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    _, stderr = proc.communicate()
    # Remember the outcome of the edit itself; 'proc' is reused below.
    editExitCode = proc.returncode
    if editExitCode == 0:
        return

    # Check if there are actually jobs to update
    command = 'condor_q -constraint \'WMAgent_SubTaskName == "%s" && WMAgent_RequestName == "%s"' % (task, workflow)
    command += ' && (JobPrio != %d)\'' % priority
    # The backslash before %d is passed through literally to condor_q
    command += " -format 'WMAgentID:\\%d:::' WMAgent_JobID"
    command = shlex.split(command)
    proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Truthiness works for both str and bytes output; comparing bytes to ''
    # would always be unequal.
    if stdout:
        msg = 'HTCondor edit failed with exit code %d\n' % editExitCode
        msg += 'Error was: %s' % stderr
        raise BossAirPluginException(msg)

    return
def processWorker(myinput, tmp):
    """
    Worker loop that writes a pickled report into each job's cache directory.

    Items are read from myinput as (job, report, lcreport) tuples until the
    job element equals 'STOP'. For every job the report is pickled to
    <cache_dir>/Report.0.pkl unless that file already exists. Jobs whose
    cache_dir contains "Production/LogCollect" get lcreport instead of
    report; a missing lcreport raises BossAirPluginException.

    The task name is taken from the sixth '/'-separated component of
    cache_dir. Any exception is logged and ends the worker loop.

    The tmp argument is not used.
    """
    try:
        while True:
            jj, report, lcreport = myinput.get()
            if jj == 'STOP':
                return

            targetDir = jj['cache_dir']
            outfile = os.path.join(targetDir, "Report.0.pkl")

            # Never overwrite a report that is already there
            if os.path.isfile(outfile):
                continue

            taskName = targetDir.split('/')[5]
            if targetDir.count("Production/LogCollect") > 0:
                if lcreport is None:
                    msg = "Parameter lcFakeReport is mandatory if you are using logCollect jobs"
                    raise BossAirPluginException(msg)
                lcreport.task = "/" + taskName + "/Production/LogCollect"
                # Pickle data is binary; the context manager also closes
                # the handle, which this branch previously left open.
                with open(outfile, 'wb') as f:
                    logging.debug('Process worker is dumping the LogCollect report to ' + f.name)
                    pickle.dump(lcreport, f)
                continue

            # Ensure each lfn of each output file in the job is unique by
            # adding the jobid
            jobid = str(jj['id'])
            if hasattr(report, 'cmsRun1') and hasattr(report.cmsRun1.output, 'output'):
                outputFile = report.cmsRun1.output.output.files.file0
                outputFile.lfn = outputFile.lfn.split('.root')[0] + jobid + '.root'
            if hasattr(report, 'logArch1') and hasattr(report.logArch1, 'output'):
                archiveFile = report.logArch1.output.logArchive.files.file0
                # Put back the extension that was stripped ('.tar.gz')
                # rather than turning the log archive into a '.root' name.
                archiveFile.lfn = archiveFile.lfn.split('.tar.gz')[0] + jobid + '.tar.gz'

            # Set the task name
            report.task = "/" + taskName + "/Production"

            # Pickle the report again
            with open(outfile, 'wb') as f:
                logging.debug('Process worker is dumping the report to ' + f.name)
                pickle.dump(report, f)
    except Exception as ex:
        logging.exception(ex)
def __init__(self, config):
    """
    Configure the plugin from the Agent and BossAir configuration sections.

    Sets up the site-information DAO, locates Unpacker.py under getWMBASE(),
    reads submission, priority, accounting-group and proxy/glexec settings,
    makes sure the proxy directory exists, and fetches a proxy through
    setupMyProxy() when server cert, server key and MyProxy server are all
    configured.

    Raises WMException when glexec or MyProxy settings are incomplete, and
    BossAirPluginException when the proxy directory cannot be created or is
    not a directory.
    """
    self.config = config
    BasePlugin.__init__(self, config)

    self.locationDict = {}

    currentThread = threading.currentThread()
    factory = DAOFactory(package="WMCore.WMBS",
                         logger=currentThread.logger,
                         dbinterface=currentThread.dbi)
    self.locationAction = factory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, fall back to the installed layout
    sourceTreeUnpacker = os.path.join(getWMBASE(),
                                      'src/python/WMCore/WMRuntime/Unpacker.py')
    if os.path.exists(sourceTreeUnpacker):
        self.unpacker = sourceTreeUnpacker
    else:
        self.unpacker = os.path.join(getWMBASE(),
                                     'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None
    self.scriptFile = None
    self.submitDir = None

    bossAir = config.BossAir
    self.removeTime = getattr(bossAir, 'removeTime', 60)
    self.useGSite = getattr(bossAir, 'useGLIDEINSites', False)
    self.submitWMSMode = getattr(bossAir, 'submitWMSMode', False)
    self.errorThreshold = getattr(bossAir, 'submitErrorThreshold', 10)
    self.errorCount = 0
    self.defaultTaskPriority = getattr(bossAir, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(bossAir, 'maxTaskPriority', 1e7)

    # Required for global pool accounting
    self.acctGroup = getattr(bossAir, 'acctGroup', "production")
    self.acctGroupUser = getattr(bossAir, 'acctGroupUser', "cmsdataops")

    # Worker pool bookkeeping
    self.pool = []
    self.input = None
    self.result = None
    self.nProcess = getattr(self.config.BossAir, 'nCondorProcesses', 4)

    # Proxy and glexec settings
    self.setupScript = getattr(bossAir, 'UISetupScript', None)
    self.proxy = None
    self.serverCert = getattr(bossAir, 'delegatedServerCert', None)
    self.serverKey = getattr(bossAir, 'delegatedServerKey', None)
    self.myproxySrv = getattr(bossAir, 'myproxyServer', None)
    self.proxyDir = getattr(bossAir, 'proxyDir', '/tmp/')
    self.serverHash = getattr(bossAir, 'delegatedServerHash', None)
    self.glexecPath = getattr(bossAir, 'glexecPath', None)
    self.glexecWrapScript = getattr(bossAir, 'glexecWrapScript', None)
    self.glexecUnwrapScript = getattr(bossAir, 'glexecUnwrapScript', None)
    self.jdlProxyFile = None     # Proxy name to put in JDL (owned by submit user)
    self.glexecProxyFile = None  # Copy of same file owned by submit user

    if self.glexecPath and not (self.myproxySrv and self.proxyDir):
        raise WMException('glexec requires myproxyServer and proxyDir to be set.')
    if self.myproxySrv and not (self.serverCert and self.serverKey):
        raise WMException('MyProxy server requires serverCert and serverKey to be set.')

    # Make the directory for the proxies
    if self.proxyDir and not os.path.exists(self.proxyDir):
        logging.debug("proxyDir not found: creating it.")
        try:
            os.makedirs(self.proxyDir, 0o1777)
        except Exception as ex:
            msg = "Error: problem when creating proxyDir directory - '%s'" % str(ex)
            raise BossAirPluginException(msg)
    elif not os.path.isdir(self.proxyDir):
        msg = "Error: proxyDir '%s' is not a directory" % self.proxyDir
        raise BossAirPluginException(msg)

    if self.serverCert and self.serverKey and self.myproxySrv:
        self.proxy = self.setupMyProxy()

    # Requirements string; a configured condorRequirementsString replaces
    # the built-in default.
    defaultRequirements = (
        '(Memory >= 1 && OpSys == "LINUX" ) '
        '&& (Arch == "INTEL" || Arch == "X86_64") '
        '&& stringListMember(GLIDEIN_CMSSite, DESIRED_Sites) '
        '&& ((REQUIRED_OS=="any") || (GLIDEIN_REQUIRED_OS==REQUIRED_OS))'
    )
    self.reqStr = getattr(config.BossAir, 'condorRequirementsString',
                          defaultRequirements)
def __init__(self, config):
    """
    Configure the plugin from the Agent and BossAir configuration sections.

    Sets up the site-information DAO, locates Unpacker.py under getWMBASE(),
    reads submission, priority and proxy/glexec settings, and creates the
    proxy directory when it is configured but does not exist yet.

    Raises WMException when glexec or MyProxy settings are incomplete, and
    BossAirPluginException when the proxy directory cannot be created.
    """
    self.config = config

    BasePlugin.__init__(self, config)

    self.locationDict = {}

    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.locationAction = daoFactory(classname="Locations.GetSiteInfo")

    self.packageDir = None

    # Prefer the source-tree layout, fall back to the installed layout
    if os.path.exists(os.path.join(getWMBASE(),
                                   'src/python/WMCore/WMRuntime/Unpacker.py')):
        self.unpacker = os.path.join(getWMBASE(),
                                     'src/python/WMCore/WMRuntime/Unpacker.py')
    else:
        self.unpacker = os.path.join(getWMBASE(),
                                     'WMCore/WMRuntime/Unpacker.py')

    self.agent = getattr(config.Agent, 'agentName', 'WMAgent')
    self.sandbox = None
    self.scriptFile = None
    self.submitDir = None
    self.removeTime = getattr(config.BossAir, 'removeTime', 60)
    self.multiTasks = getattr(config.BossAir, 'multicoreTaskTypes', [])
    self.useGSite = getattr(config.BossAir, 'useGLIDEINSites', False)
    self.submitWMSMode = getattr(config.BossAir, 'submitWMSMode', False)
    self.errorThreshold = getattr(config.BossAir, 'submitErrorThreshold', 10)
    self.errorCount = 0
    self.defaultTaskPriority = getattr(config.BossAir, 'defaultTaskPriority', 0)
    self.maxTaskPriority = getattr(config.BossAir, 'maxTaskPriority', 1e7)

    # Build ourselves a pool
    self.pool = []
    self.input = None
    self.result = None
    self.nProcess = getattr(self.config.BossAir, 'nCondorProcesses', 4)

    # Set up my proxy and glexec stuff
    self.setupScript = getattr(config.BossAir, 'UISetupScript', None)
    self.proxy = None
    self.serverCert = getattr(config.BossAir, 'delegatedServerCert', None)
    self.serverKey = getattr(config.BossAir, 'delegatedServerKey', None)
    self.myproxySrv = getattr(config.BossAir, 'myproxyServer', None)
    self.proxyDir = getattr(config.BossAir, 'proxyDir', '/tmp/')
    self.serverHash = getattr(config.BossAir, 'delegatedServerHash', None)
    self.glexecPath = getattr(config.BossAir, 'glexecPath', None)
    self.glexecWrapScript = getattr(config.BossAir, 'glexecWrapScript', None)
    self.glexecUnwrapScript = getattr(config.BossAir, 'glexecUnwrapScript', None)
    self.jdlProxyFile = None     # Proxy name to put in JDL (owned by submit user)
    self.glexecProxyFile = None  # Copy of same file owned by submit user

    if self.glexecPath:
        if not (self.myproxySrv and self.proxyDir):
            raise WMException('glexec requires myproxyServer and proxyDir to be set.')
    if self.myproxySrv:
        if not (self.serverCert and self.serverKey):
            raise WMException('MyProxy server requires serverCert and serverKey to be set.')

    # Make the directory for the proxies
    if self.proxyDir and not os.path.exists(self.proxyDir):
        logging.debug("proxyDir not found: creating it.")
        try:
            # '0o' octal literal and 'except ... as' are accepted by
            # Python 2.6+ and Python 3; the old spellings are Python 2 only.
            os.makedirs(self.proxyDir, 0o1777)
        except Exception as ex:
            msg = "Error: problem when creating proxyDir directory - '%s'" % str(ex)
            raise BossAirPluginException(msg)
def track(self, jobs, info = None):
    """
    Query ngstat for the given jobs and sort them by their current state.

    The job list is written out through self.createJobsFile() and passed to
    'ngstat -t 180 -i <file>'. Each chunk yielded by splitNgstatOutput() is
    matched to a job through its grid job id; the job's 'globalState',
    'status' and 'status_time' entries are updated in place.

    Returns a tuple (runningList, changeList, completeList):
      - changeList: jobs whose status differs from the one stored before
      - completeList: jobs whose global state is "Complete" or "Error"
      - runningList: all other jobs

    Raises BossAirPluginException if ngstat exits non-zero, if a chunk
    carries no grid job id, or if ngstat reports a malformed URL.

    The info argument is not used.
    """
    changeList = []
    completeList = []
    runningList = []

    jobsFile, arcId2job = self.createJobsFile(jobs)

    s, output = executeCommand("ngstat -t 180 -i %s" % jobsFile.name)
    if s != 0:
        raise BossAirPluginException("ngstat failed:" + output)

    for js in splitNgstatOutput(output):
        # Reset per chunk so an id or status from the previous job can
        # never leak into this one.
        arcStat = None
        arcId = None

        if "Job information not found" in js:
            # Membership test: str.find() returns -1 (truthy) on a miss,
            # so it cannot be used directly as a condition.
            if "job was only very recently submitted" in js:
                arcStat = "NOT_FOUND_NEW"
            else:
                arcStat = "LOST"
            arcIdMatch = re.search(r"(\w+://([a-zA-Z0-9.-]+)\S*/\d*)", js)
            if not arcIdMatch:
                raise BossAirPluginException("No grid job ID!")
            arcId = arcIdMatch.group(1)

        elif "Malformed URL:" in js:
            # This shouldn't be possible, since we are passing arcIDs to
            # ngstat.
            arcIdMatch = re.search(r"URL: (\w+://([a-zA-Z0-9.-]+)\S*/\d*)", js)
            badUrl = arcIdMatch.group(1) if arcIdMatch else js
            raise BossAirPluginException("Malformed URL for job " + badUrl)

        else:
            # With special cases taken care of above, we are left with
            # "normal" jobs. They are assumed to have the format
            #
            # Job <arcId>
            #   Status: <status>
            #   Whatever: blah
            #
            for line in js.split('\n'):
                arcIdMatch = re.match(r"Job +(\w+://([a-zA-Z0-9.-]+)\S*/\d*)", line)
                if arcIdMatch:
                    arcId = arcIdMatch.group(1)
                    continue

                statusMatch = re.match(r" +Status: *(.+)", line)
                if statusMatch:
                    arcStat = statusMatch.group(1)
                    continue

            if arcId is None:
                raise BossAirPluginException("No grid job ID!")

        j = arcId2job[arcId]

        if arcStat == "NOT_FOUND_NEW":
            statusTime = j['status_time']
            # Same truth table as the original 'A and B or C' expression,
            # but the age is only computed when a timestamp exists.
            neverTimed = j['status'] in ["New", "ACCEPTING"] and not statusTime
            changedRecently = bool(statusTime) and int(time.time()) - statusTime < 60
            if neverTimed or changedRecently:
                arcStat = "ACCEPTING"   # Probably approximately true
            else:
                arcStat = "LOST"

        globalState = ARCPlugin.stateMap()[arcStat]
        j['globalState'] = globalState

        if arcStat != j['status']:
            j['status'] = arcStat
            j['status_time'] = int(time.time())
            changeList.append(j)
            logging.debug("Job %s has status %s" % (j['gridid'], j['status']))

        if globalState not in ["Complete", "Error"]:
            runningList.append(j)
        else:
            completeList.append(j)

    return runningList, changeList, completeList
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription

    Jobs are grouped with grouper(jobs, self.jobsPerWorker). For each group
    a JDL file is written into self.submitDir and a condor_submit command
    (wrapped in glexec when self.glexecPath is set) is queued for the
    worker processes. The results are then collected and each job is
    filed as successful or failed.

    Returns a tuple (successfulJobs, failedJobs). Failed jobs get a 'fwjr'
    entry holding a Report with error 61202.

    Raises BossAirPluginException once self.errorCount exceeds
    self.errorThreshold.

    The info argument is not used.
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up PyCondorPlugin worker pool")
        self.inputQueue = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker,
                                        args=(self.inputQueue, self.result, timeout))
            p.start()
            self.pool.append(p)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Submit the jobs
    nSubmits = 0
    queueError = False
    for jobsReady in grouper(jobs, self.jobsPerWorker):

        if queueError:
            # If the queue has failed, then we must not process any more jobs this cycle.
            break

        idList = [x['id'] for x in jobsReady]
        jdlList = self.makeSubmit(jobList=jobsReady)
        if not jdlList:
            # Then we got nothing
            logging.error("No JDL file made!")
            # NOTE(review): this is a dict, unlike the tuple returned
            # everywhere else; kept as-is because callers are not visible
            # from here.
            return {'NoResult': [0]}
        jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])

        with open(jdlFile, 'w') as handle:
            handle.writelines(jdlList)
        jdlFiles.append(jdlFile)

        # Now submit them
        logging.info("About to submit %i jobs", len(jobsReady))
        if self.glexecPath:
            command = 'CS=`which condor_submit`; '
            if self.glexecWrapScript:
                command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
            command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
            command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
            command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
            if self.glexecUnwrapScript:
                command += '%s %s -- $CS %s' % (self.glexecPath,
                                                self.glexecUnwrapScript,
                                                jdlFile)
            else:
                command += '%s $CS %s' % (self.glexecPath, jdlFile)
        else:
            command = "condor_submit %s" % jdlFile

        try:
            self.inputQueue.put({'command': command, 'idList': idList})
        except AssertionError as ex:
            msg = "Critical error: input pipeline probably closed.\n"
            msg += str(ex)
            msg += "Error Procedure: Something critical has happened in the worker process\n"
            msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
            msg += "Then refresh the worker pool\n"
            logging.error(msg)
            queueError = True
            break
        nSubmits += 1

    # Index the jobs by id once (first occurrence wins, like the linear
    # scan it replaces) so each result can be matched without rescanning
    # the whole job list.
    jobsById = {}
    for job in jobs:
        jobsById.setdefault(job.get('id', None), job)

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for dummy in range(nSubmits):
        try:
            res = self.result.get(block=True, timeout=timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds.", timeout)
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            continue

        try:
            dummyOut = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                # Best effort only: the result may not be printable, but
                # KeyboardInterrupt/SystemExit must not be swallowed.
                pass
            msg += str(ex)
            logging.error(msg)
            continue

        if exitCode != 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error=error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                if jobID in jobsById:
                    job = jobsById[jobID]
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                if jobID in jobsById:
                    successfulJobs.append(jobsById[jobID])

        # If we get a lot of errors in a row it's probably time to
        # stop the component
        if self.errorCount > self.errorThreshold:
            msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
            logging.error(msg)
            raise BossAirPluginException(msg)

    # Remove JDL files unless commanded otherwise
    if self.deleteJDLFiles:
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in PyCondorPlugin")
    return successfulJobs, failedJobs
class CondorPlugin(BasePlugin): """ _CondorPlugin_ Condor plugin for glide-in submissions """ @staticmethod def stateMap(): """ For a given name, return a global state """ stateDict = { 'New': 'Pending', 'Idle': 'Pending', 'Running': 'Running', 'Held': 'Error', 'Complete': 'Complete', 'Error': 'Error', 'Timeout': 'Error', 'Removed': 'Running', 'Unknown': 'Error' } # This call is optional but needs to for testing #BasePlugin.verifyState(stateDict) return stateDict def __init__(self, config): self.config = config BasePlugin.__init__(self, config) self.locationDict = {} myThread = threading.currentThread() daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.locationAction = daoFactory(classname="Locations.GetSiteInfo") self.packageDir = None if os.path.exists( os.path.join(getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py')): self.unpacker = os.path.join( getWMBASE(), 'src/python/WMCore/WMRuntime/Unpacker.py') else: self.unpacker = os.path.join(getWMBASE(), 'WMCore/WMRuntime/Unpacker.py') self.agent = getattr(config.Agent, 'agentName', 'WMAgent') self.sandbox = None self.scriptFile = None self.submitDir = None self.removeTime = getattr(config.BossAir, 'removeTime', 60) self.multiTasks = getattr(config.BossAir, 'multicoreTaskTypes', []) self.useGSite = getattr(config.BossAir, 'useGLIDEINSites', False) self.submitWMSMode = getattr(config.BossAir, 'submitWMSMode', False) self.errorThreshold = getattr(config.BossAir, 'submitErrorThreshold', 10) self.errorCount = 0 # Build ourselves a pool self.pool = [] self.input = None self.result = None self.nProcess = getattr(self.config.BossAir, 'nCondorProcesses', 4) # Set up my proxy and glexec stuff self.setupScript = getattr(config.BossAir, 'UISetupScript', None) self.proxy = None self.serverCert = getattr(config.BossAir, 'delegatedServerCert', None) self.serverKey = getattr(config.BossAir, 'delegatedServerKey', None) self.myproxySrv = getattr(config.BossAir, 'myproxyServer', 
None) self.proxyDir = getattr(config.BossAir, 'proxyDir', '/tmp/') self.serverHash = getattr(config.BossAir, 'delegatedServerHash', None) self.glexecPath = getattr(config.BossAir, 'glexecPath', None) self.glexecWrapScript = getattr(config.BossAir, 'glexecWrapScript', None) self.glexecUnwrapScript = getattr(config.BossAir, 'glexecUnwrapScript', None) self.jdlProxyFile = None # Proxy name to put in JDL (owned by submit user) self.glexecProxyFile = None # Copy of same file owned by submit user if self.glexecPath: if not (self.myproxySrv and self.proxyDir): raise WMException( 'glexec requires myproxyServer and proxyDir to be set.') if self.myproxySrv: if not (self.serverCert and self.serverKey): raise WMException( 'MyProxy server requires serverCert and serverKey to be set.' ) # Make the directory for the proxies if self.proxyDir and not os.path.exists(self.proxyDir): logging.debug("proxyDir not found: creating it.") try: os.makedirs(self.proxyDir, 01777) except Exception, ex: msg = "Error: problem when creating proxyDir directory - '%s'" % str( ex) raise BossAirPluginException(msg) elif not os.path.isdir(self.proxyDir): msg = "Error: proxyDir '%s' is not a directory" % self.proxyDir raise BossAirPluginException(msg)