class JobSpecExpander:

    def __init__(self, jobSpecFile):
        self.jobSpec = JobSpec()
        self.jobSpec.load(jobSpecFile)
        self.taskState = TaskState(os.getcwd())
        self.taskState.loadRunResDB()
        self.workflowSpec = WorkflowSpec()
        self.workflowSpec.load(os.environ["PRODAGENT_WORKFLOW_SPEC"])

        self.config = self.taskState.configurationDict()

        finder = NodeFinder(self.taskState.taskName())
        self.jobSpec.payload.operate(finder)
        self.jobSpecNode = finder.result

        wffinder = NodeFinder(self.taskState.taskName())
        self.workflowSpec.payload.operate(wffinder)
        self.workflowNode = wffinder.result

        if self.jobSpecNode.jobType != "Merge":
            if self.config.has_key('Configuration'):
                try:
                    self.createPSet()
                except Exception, ex:
                    msg = "Unable to generate cmsRun Config from JobSpec:\n"
                    msg += str(ex)
                    print msg
                    badfile = open("exit.status", 'w')
                    badfile.write("10040")
                    badfile.close()
        else:
            # //
            # // Merge job
            #//
            self.createMergePSet()
def queueJob(jobSpecFile, priorityMap, jobSpec = None, status = "new"):
    """
    _queueJob_

    Add a JobSpec to the JobQueue with a priority looked up from the
    priority map by job type. This queues a single job and is potentially
    slow for large groups of jobs.

    jobSpec is a JobSpec instance. If the JobSpec is already available,
    pass it in along with the jobSpecFile location for performance:
    jobSpecFile won't be reloaded, but it is still needed to update the
    database.

    The status parameter can be either "new" or "held". Jobs marked "new"
    will be released as resources become available, while jobs marked
    "held" will sit in the JobQueue until they are explicitly released.
    """
    if jobSpec != None and jobSpec.__class__ is JobSpec:
        spec = jobSpec
    else:
        spec = JobSpec()
        try:
            spec.load(jobSpecFile)
        except Exception, ex:
            msg = "Unable to read JobSpec File:\n"
            msg += "%s\n" % jobSpecFile
            msg += "Error: %s\n" % str(ex)
            logging.error(msg)
            return
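# Usage sketch (illustrative only): queue a single spec using a priority map
# keyed by job type, as the docstring above describes. The path and priority
# values below are hypothetical.
#
#     priorityMap = {"Processing": 10, "Merge": 50, "CleanUp": 1}
#     queueJob("/path/to/JobSpec.xml", priorityMap, status="held")
#
# Passing a preloaded JobSpec instance avoids re-reading the XML:
#
#     spec = JobSpec()
#     spec.load("/path/to/JobSpec.xml")
#     queueJob("/path/to/JobSpec.xml", priorityMap, jobSpec=spec)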
def recreateJob(jobspecFile, jobQueue):
    """
    Re-create the processing job
    """
    # remove entries from tr_Trigger/Action tables to be on the safe side
    clean_tr_tables(jobspecFile)

    # create job if not merge
    spec = JobSpec()
    spec.load(jobspecFile)

    # //
    # // clean spec id from the job queue
    #//  No easy way to do this in JobQueueAPI so use nekkid SQL for now
    Session.set_database(dbConfig)
    Session.connect()
    sqlStr1 = "DELETE FROM jq_queue WHERE job_spec_id=\"%s\";" % (
        spec.parameters['JobName'],)
    Session.execute(sqlStr1)
    Session.commit_all()

    if spec.parameters['JobType'] in ('Processing', 'CleanUp', 'LogCollect', 'Harvesting'):
        # publish CreateJob
        print "- Resubmit Processing job"
        print "--> Publishing CreateJob for %s" % jobspecFile
        ms = MessageService()
        ms.registerAs("Test")
        if jobQueue:
            ms.publish("QueueJob", jobspecFile)
        else:
            ms.publish("CreateJob", jobspecFile)
        ms.commit()
    elif spec.parameters['JobType'] == "Merge":
        try:
            jobname = spec.parameters['JobName']
        except Exception, ex:
            msg = "Problem extracting jobspec name from JobSpec File: %s Details: %s" % (
                jobspecFile, str(ex))
            print msg
            return
        print "- Resubmit Merge job"
        print "--> Publishing GeneralJobFailure for %s" % jobname
        ms = MessageService()
        ms.registerAs("TestMA")
        ms.publish("GeneralJobFailure", jobname)
        ms.commit()
        time.sleep(1)
        print "--> Publishing MergeSensor:ReSubmit for %s" % jobname
        ms = MessageService()
        ms.registerAs("Test")
        ms.publish("MergeSensor:ReSubmit", jobname)
        ms.commit()
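# Usage sketch (illustrative): resubmit a job from its spec file. The path is
# hypothetical; pass jobQueue=True to route through the JobQueue instead of
# publishing CreateJob directly.
#
#     recreateJob("/path/to/JobSpec.xml", jobQueue=True)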
def clean_tr_tables(jobspecFile):
    """
    Remove job entries from the tr_Trigger and tr_Action tables
    """
    spec = JobSpec()
    spec.load(jobspecFile)
    try:
        jobspecid = spec.parameters['JobName']
    except Exception, ex:
        msg = "Problem extracting jobspec name from JobSpec File: %s Details: %s" % (
            jobspecFile, str(ex))
        print msg
        return
def loadJobSpec():
    """
    _loadJobSpec_

    Load the JobSpec
    """
    try:
        jobSpecFile = os.path.expandvars(os.environ["PRODAGENT_JOBSPEC"])
        jobSpec = JobSpec()
        jobSpec.load(jobSpecFile)
    except Exception, ex:
        print "ERROR: Cannot load JobSpec file!!!"
        jobSpec = None
    return jobSpec
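# Usage sketch (illustrative): the function reads the spec location from the
# environment, so callers set PRODAGENT_JOBSPEC first. The path below is
# hypothetical.
#
#     os.environ["PRODAGENT_JOBSPEC"] = "/path/to/JobSpec.xml"
#     jobSpec = loadJobSpec()
#     if jobSpec is None:
#         sys.exit(1)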
def __init__(self, jobSpecFile):
    jobSpec = JobSpec()
    jobSpec.load(jobSpecFile)
    taskState = TaskState(os.getcwd())
    taskState.loadRunResDB()
    config = taskState.configurationDict()

    finder = NodeFinder(taskState.taskName())
    jobSpec.payload.operate(finder)
    self.jobSpecNode = finder.result

    self.checkArgs()
    self.writeCfg()
    self.createArgs()
def createJobSpec(self):
    """
    _createJobSpec_

    Create a tree of JobSpecNodes from this WorkflowSpec instance's tree
    of payload nodes. The same tree structure will be used, but the nodes
    will be JobSpecNode instances containing the details from the
    corresponding PayloadNode in this instance.

    To be used to create JobSpecNode trees from a Workflow to represent
    a job created from the general workflow.
    """
    self._NodeMap = {}
    result = JobSpec()
    result.payload = self._CloneTreeNode(self.payload)
    result.parameters.update(self.parameters)
    return result
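# Usage sketch (illustrative): derive a JobSpec tree from a loaded workflow,
# assuming the usual WorkflowSpec/JobSpec accessors; the paths and job name
# are hypothetical.
#
#     workflow = WorkflowSpec()
#     workflow.load("/path/to/WorkflowSpec.xml")
#     jobSpec = workflow.createJobSpec()
#     jobSpec.setJobName("SomeWorkflow-1")
#     jobSpec.save("/path/to/JobSpec-1.xml")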
def loadJobSpecNode(self):
    """
    _loadJobSpecNode_

    Load the job spec file referenced by the PRODAGENT_JOBSPEC env var
    and extract the node from it matching this task's name
    """
    if not os.environ.has_key("PRODAGENT_JOBSPEC"):
        print " No PRODAGENT_JOBSPEC set"
        return
    specFile = os.environ['PRODAGENT_JOBSPEC']
    if not os.path.exists(specFile):
        print "Job Spec File %s does not exist" % specFile
        return

    jobSpec = JobSpec()
    jobSpec.load(specFile)
    self.jobSpec = jobSpec
    self.jobSpecNode = jobSpec.findNode(self.taskAttrs['Name'])
    self.jobSpecLoaded = True
    return
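# Usage sketch (illustrative): the method expects the environment to point at
# the spec and taskAttrs['Name'] to name the node to extract; the path below
# is hypothetical.
#
#     os.environ["PRODAGENT_JOBSPEC"] = "/path/to/JobSpec.xml"
#     task.loadJobSpecNode()    # sets task.jobSpec and task.jobSpecNode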
def stageOut():
    """
    _stageOut_

    Main function for this module. Loads data from the task and manages
    the stage out process for a single attempt
    """
    state = TaskState(os.getcwd())
    state.loadRunResDB()
    workflow = WorkflowSpec()
    workflow.load(os.environ['PRODAGENT_WORKFLOW_SPEC'])

    jobSpecFile = os.environ.get('PRODAGENT_JOBSPEC')
    jobSpecId = None
    if jobSpecFile is not None:
        jobSpec = JobSpec()
        jobSpec.load(jobSpecFile)
        jobSpecId = jobSpec.parameters.get('JobName')

    print workflow
    print state.taskName()
    print jobSpecId

    stageOutFor, override, controls = StageOutUtils.getStageOutConfig(
        workflow, state.taskName())

    toplevelReport = os.path.join(os.environ['PRODAGENT_JOB_DIR'],
                                  "FrameworkJobReport.xml")
    exitCode = 0

    # //
    # // find inputs by locating the task for which we are staging out
    #//  and loading its TaskState
    for inputTask in stageOutFor:
        print "Attempting to stage out files for node %s" % inputTask
        try:
            inputState = getTaskState(inputTask)
            msg = "Loaded Input Task: %s " % inputTask
        except Exception, ex:
            msg = "Error loading TaskState for task %s\n" % inputTask
            msg += "%s\n" % str(ex)
            inputState = None
        print msg

        if inputState == None:
            # exit with init error
            # generate failure report in this dir, since we can't find the
            # input state dir
            inputReport = FwkJobReport()
            inputReport.name = inputTask
            inputReport.jobSpecId = jobSpecId
            exitCode = 60311
            errRep = inputReport.addError(60311, "TaskStateError")
            errRep['Description'] = msg
            inputReport.status = "Failed"
            inputReport.exitCode = 60311
            updateReport(toplevelReport, inputReport)
            print "TaskState is None, exiting..."
            return exitCode

        try:
            inputReport = inputState.getJobReport()
            msg = "Loaded JobReport for Task : %s\n" % inputTask
            msg += "File: %s\n" % inputState.jobReport
        except Exception, ex:
            msg = "Error loading input report : %s" % str(ex)
            inputReport = None
        newReport = FwkJobReport()
        newReport.jobSpecId = jobSpecPayload.jobName
        newReport.jobType = jobSpecPayload.type
        newReport.workflowSpecId = jobSpecPayload.workflow
        newReport.name = jobSpecPayload.name
        # get information from the super class
        newReport.siteDetails['SiteName'] = workerNodeInfo['SiteName']
        # HostName is the same as worker_node name
        newReport.siteDetails['HostName'] = workerNodeInfo['HostName']
        newReport.siteDetails['se-name'] = workerNodeInfo['se-name']
        newReport.siteDetails['ce-name'] = workerNodeInfo['ce-name']
        newReport.addLogFile("/path/to/log/archive", "some.random.se.cern.ch")
        return jobSpecPayload, newReport
    except Exception, ex:
        #msg = "Unable to Publish Report for %s\n" % jobSpecPayload.jobName
        #msg += "Since It is not known to the JobState System:\n"
        msg = str(ex)
        logging.error(msg)
        raise RuntimeError, msg


registerPlugin(EmulatorReportPlugin, EmulatorReportPlugin.__name__)

if __name__ == "__main__":
    from ProdCommon.MCPayloads.JobSpec import JobSpec
    jobSpec = JobSpec()
    jobSpec.load("/home/sryu")
def factoriseJobSpec(jobSpecInstance, jobSpecDir, njobs=[], eventCount=0, **args):
    """
    _factoriseJobSpec_

    njobs is an array of globally unique run numbers

    TODO: <<<<NEEDS PILEUP DETAILS>>>>
    """
    generators = GeneratorMaker()
    jobSpecInstance.payload.operate(generators)
    #AFgenerators(jobSpecInstance.payload)

    runNumber = int(
        args.get("RunNumber", int(jobSpecInstance.parameters['RunNumber'])))
    firstEvent = int(args.get("FirstEvent", 0))
    maxRunNumber = args.get("MaxRunNumber", None)
    eventsPerJob = int(math.ceil(float(eventCount) / float(len(njobs))))

    result = []
    workflowName = jobSpecInstance.payload.workflow
    template = jobSpecInstance.makeIMProv()

    currentRun = runNumber
    currentEvent = firstEvent
    for run_number in njobs:
        #jobName = "%s-%s" % (workflowName, run_number)
        jobName = jobSpecInstance.parameters[
            'JobName'] + '_jobcut-' + workflowName + '-' + str(run_number)

        newSpec = JobSpec()
        newSpec.loadFromNode(template)
        newSpec.setJobName(jobName)
        newSpec.parameters['RunNumber'] = run_number
        newSpec.payload.operate(DefaultLFNMaker(newSpec))

        maker = CfgMaker(generators, JobName=jobName, RunNumber=run_number,
                         MaxEvents=eventsPerJob, SkipEvents=currentEvent)
        newSpec.payload.operate(maker)
        newSpec.payload.operate(maker.generateCmsGenConfig)

        newSpec.parameters['FirstEvent'] = currentEvent
        newSpec.parameters['RunNumber'] = run_number
        newSpec.parameters['EventCount'] = eventsPerJob

        jobSpecLocation = jobSpecDir + '/' + newSpec.parameters['JobName'] + '.xml'
        newSpec.save(jobSpecLocation)

        result.append({
            'id': newSpec.parameters['JobName'],
            'spec': jobSpecLocation,
            'events': eventsPerJob
        })

        currentRun += 1
        currentEvent += eventsPerJob
        if (eventsPerJob + currentEvent) > (firstEvent + int(eventCount)):
            eventsPerJob = firstEvent + int(eventCount) - currentEvent
        if maxRunNumber != None:
            if currentRun > maxRunNumber:
                break

    return result
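# Usage sketch (illustrative): split a 1000-event spec into four jobs keyed
# by globally unique run numbers, giving ceil(1000/4) = 250 events per job.
# Paths and run numbers are hypothetical.
#
#     spec = JobSpec()
#     spec.load("/path/to/JobSpec.xml")
#     jobs = factoriseJobSpec(spec, "/path/to/specs",
#                             njobs=[2001, 2002, 2003, 2004],
#                             eventCount=1000)
#     for job in jobs:
#         print job['id'], job['spec'], job['events']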
    gen.creator = creatorInst
    gen.workflowCache = wfCache
    gen.workflowFile = workflowSpecFile
    print "Generator on Workflow Spec:"
    gen.actOnWorkflowSpec(workflowSpec, wfCache)

    del gen
    del creatorInst

    # //
    # // Now process the job spec
    #//
    if jobSpecFile == None:
        sys.exit(0)

    try:
        jobSpec = JobSpec()
        jobSpec.load(jobSpecFile)
    except StandardError, ex:
        msg = "Error loading job spec file:\n"
        msg += jobSpecFile
        msg += "\n"
        msg += str(ex)
        print msg
        sys.exit(1)

    jobname = jobSpec.parameters['JobName']
    jobCache = os.path.join(wfCache, jobname)
    if not os.path.exists(jobCache):
        os.makedirs(jobCache)

    statsFile = "TestHarness_%s_%s.prof" % (generator, creator)
class OfflineDQMSetup:
    """
    _OfflineDQMSetup_

    Generate the PSet for the job on the fly
    """
    def __init__(self):
        self.jobSpec = JobSpec()
        self.jobSpec.load(os.environ['PRODAGENT_JOBSPEC'])
        self.taskState = TaskState(os.getcwd())
        self.taskState.loadRunResDB()
        self.workflowSpec = WorkflowSpec()
        self.workflowSpec.load(os.environ["PRODAGENT_WORKFLOW_SPEC"])

        self.config = self.taskState.configurationDict()

        finder = NodeFinder(self.taskState.taskName())
        self.jobSpec.payload.operate(finder)
        self.jobSpecNode = finder.result

        wffinder = NodeFinder(self.taskState.taskName())
        self.workflowSpec.payload.operate(wffinder)
        self.workflowNode = wffinder.result

        self.inputFiles = self.jobSpecNode.cfgInterface.inputFiles
        self.globalTag = self.jobSpecNode.cfgInterface.conditionsTag
        self.inputDataset = self.jobSpecNode._InputDatasets[0]
        self.runNumber = self.jobSpec.parameters['RunNumber']
        self.scenario = self.jobSpec.parameters.get('Scenario', 'relvalmc')
        self.refHistKey = self.jobSpec.parameters.get('RefHistKey', None)

    def __call__(self):
        """
        _operator()_

        Invoke the setup tool
        """
        msg = "Creating Harvesting Configuration for:\n"
        msg += " => Dataset: %s\n" % self.inputDataset.name()
        msg += " => Run Number: %s\n" % self.runNumber
        msg += " => Global Tag: %s\n" % self.globalTag
        msg += " => Input Files:\n"
        for inputfile in self.inputFiles:
            msg += "    => %s\n" % inputfile
        print msg

        process = self.importConfigurationLibrary()

        pycfgDump = open("PyCfgFileDump.log", 'w')
        try:
            pycfgDump.write(process.dumpPython())
        except Exception, ex:
            msg = "Error writing python format cfg dump:\n"
            msg += "%s\n" % str(ex)
            msg += "This needs to be reported to the framework team"
            pycfgDump.write(msg)
        pycfgDump.close()

        # //
        # // Save the edited config as PSet.py
        #//
        handle = open("PSet.py", 'w')
        handle.write("import pickle\n")
        handle.write("pickledCfg=\"\"\"%s\"\"\"\n" % pickle.dumps(process))
        handle.write("process = pickle.loads(pickledCfg)\n")
        handle.close()
        print "Wrote PSet.py for harvesting"
        return
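# Usage sketch (illustrative): the setup object is driven entirely by the
# PRODAGENT_JOBSPEC / PRODAGENT_WORKFLOW_SPEC environment and the current
# working directory, so invocation is just construct-and-call.
#
#     setup = OfflineDQMSetup()
#     setup()    # writes PSet.py for the harvesting job in os.getcwd()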
    prodToMergeDatasets[dsName] = mergeProdSpecs[dsName]

emulator2 = EmulatorReportPlugin()
wnInfo = {
    "SiteName": "TN_SITE_CH",
    "HostID": "host",
    "HostName": "workernode.element.edu",
    "se-name": "storage.element.edu",
    "ce-name": "compute.element.edu",
}
for prodJob in prodJobs:
    jobSpec = JobSpec()
    jobSpec.load(prodJob['JobSpecFile'])
    jobReport = "%s/%s-JobReport.xml" % (productionDir, jobSpec.payload.jobName)
    repInstance = emulator2.createSuccessReport(jobSpec, wnInfo, jobReport)
    for fileinfo in repInstance.files:
        lfn = fileinfo['LFN']
        for dataset in [x.name() for x in fileinfo.dataset]:
            prodFiles[dataset].add(lfn)


class MergeMaker:
    def __init__(self, mergeSpec, mergeWorkDir):
"""
Usage: cleanAndRetry.py /path/to/JobSpec.xml
"""
import sys

import JobState.JobStateAPI.JobStateChangeAPI as JobStates
from MessageService.MessageService import MessageService
from ProdCommon.MCPayloads.JobSpec import JobSpec

ms = MessageService()
ms.registerAs("Test")

specFile = sys.argv[1]

spec = JobSpec()
spec.load(specFile)

specIds = []
if spec.isBulkSpec():
    specIds.extend(spec.bulkSpecs.keys())
else:
    specIds.append(spec.parameters['JobName'])

#print specIds
for specId in specIds:
    JobStates.cleanout(specId)
class Prestager:
    """
    _Prestager_

    Castor Prestager
    """
    def __init__(self):
        self.state = TaskState(os.getcwd())
        self.siteConf = self.state.getSiteConfig()
        self.spec = JobSpec()
        self.numberOfRetries = 3
        self.retryInterval = 600  # seconds

        jobSpec = os.environ.get("PRODAGENT_JOBSPEC", None)
        if jobSpec == None:
            msg = "Unable to find JobSpec from PRODAGENT_JOBSPEC variable\n"
            msg += "Unable to proceed\n"
            raise RuntimeError, msg
        if not os.path.exists(jobSpec):
            msg = "Cannot find JobSpec file:\n %s\n" % jobSpec
            msg += "Unable to proceed\n"
            raise RuntimeError, msg

        self.specFile = jobSpec
        self.spec.load(jobSpec)
        self.spec.payload.loadConfiguration()
        self.files = self.spec.payload.cfgInterface.inputFiles
        print self.files
        self.lfnToPfn = {}
        self.localFiles = {}

    def executeCommand(self, command):
        """
        _executeCommand_

        Utility to execute the command provided in a popen object
        """
        child = popen2.Popen3(command, 1)  # capture stdout and stderr from command
        child.tochild.close()              # don't need to talk to child
        outfile = child.fromchild
        outfd = outfile.fileno()
        errfile = child.childerr
        errfd = errfile.fileno()
        makeNonBlocking(outfd)  # don't deadlock!
        makeNonBlocking(errfd)
        outdata = errdata = ''
        outeof = erreof = 0
        stdoutBuffer = ""
        while 1:
            ready = select.select([outfd, errfd], [], [])  # wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '':
                    outeof = 1
                stdoutBuffer += outchunk
                sys.stdout.write(outchunk)
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '':
                    erreof = 1
                sys.stderr.write(errchunk)
            if outeof and erreof:
                break
            select.select([], [], [], .1)  # give a little time for buffers to fill

        try:
            exitCode = child.poll()
        except Exception, ex:
            msg = "Error retrieving child exit code: %s\n" % ex
            msg += "while executing command:\n"
            msg += command
            print msg
            return 1

        if exitCode:
            msg = "Error executing command:\n"
            msg += command
            msg += "Exited with code: %s\n" % exitCode
            print msg
        return exitCode
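# Usage sketch (illustrative): executeCommand streams the child's stdout and
# stderr through this process and returns the shell exit code; the stager
# command and Castor path shown here are hypothetical.
#
#     stager = Prestager()
#     rc = stager.executeCommand("stager_get -M /castor/cern.ch/some/file")
#     if rc != 0:
#         print "prestage attempt failed with exit code %s" % rc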
logging.info("TestHarness:Instantiating Submitter %s" % submitter) submitterInstance = retrieveSubmitter(submitter) logging.info("TestHarness:Submitter Instantiated OK") jobToSubmit = os.path.join(workingDir, jobname) if jobSpecFile == None: jobSpecFile = os.path.join(workingDir, "%s-JobSpec.xml" % jobname) cacheMap = { jobname : workingDir } logging.debug("TestHarness:Jobname=%s" % jobname) logging.debug("TestHarness:WorkingDir=%s" % workingDir) logging.debug("TestHarness:JobToSubmit=%s" % jobToSubmit) logging.debug("TestHarness:JobSpecFile=%s" % jobSpecFile) try: jobSpecInstance = JobSpec() jobSpecInstance.load("file://%s" % jobSpecFile) except StandardError, ex: msg = "TestHarness:Failed to read JobSpec File for Job %s\n" % jobname msg += "From: %s\n" % jobSpecFile msg += str(ex) logging.error(msg) sys.exit(1) logging.info("TestHarness: Invoking Submitter %s" % submitter) submitterInstance( workingDir,
class JobSpecExpander:

    def __init__(self, jobSpecFile):
        self.jobSpec = JobSpec()
        self.jobSpec.load(jobSpecFile)
        self.taskState = TaskState(os.getcwd())
        self.taskState.loadRunResDB()
        self.workflowSpec = WorkflowSpec()
        self.workflowSpec.load(os.environ["PRODAGENT_WORKFLOW_SPEC"])

        self.config = self.taskState.configurationDict()

        finder = NodeFinder(self.taskState.taskName())
        self.jobSpec.payload.operate(finder)
        self.jobSpecNode = finder.result

        wffinder = NodeFinder(self.taskState.taskName())
        self.workflowSpec.payload.operate(wffinder)
        self.workflowNode = wffinder.result

        tier0Merge = self.workflowSpec.parameters.get("Tier0Merge", "False")
        if self.jobSpecNode.jobType != "Merge" or tier0Merge == "True":
            if self.config.has_key('Configuration'):
                #try:
                self.createPSet()
                #except Exception, ex:
                #    msg = "Unable to generate cmsRun Config from JobSpec:\n"
                #    msg += str(ex)
                #    print msg
                #    badfile = open("exit.status", 'w')
                #    badfile.write("10040")
                #    badfile.close()
        else:
            # //
            # // Merge job
            #//
            self.createMergePSet()

        # do after pset created to get correct input files
        self.setJobDetails()

        if self.config.has_key('UserSandbox'):
            self.userSandbox()

    def handleInputLink(self, config, inpLink):
        """
        _handleInputLink_

        Generate the information for the input link between this
        task and the task specified
        """
        msg = "Input Link Detected:\n"
        for k, v in inpLink.items():
            msg += " %s = %s\n" % (k, v)
        print msg

        inputTask = getTaskState(inpLink['InputNode'])
        if inputTask == None:
            msg = "Unable to create InputLink for task: %s\n" % (
                inpLink['InputNode'],)
            msg += "Input TaskState could not be retrieved..."
            raise RuntimeError, msg

        inputTask.loadJobReport()
        inputReport = inputTask.getJobReport()
        if inputReport == None:
            msg = "Unable to create InputLink for task: %s\n" % (
                inpLink['InputNode'],)
            msg += "Unable to load input job report file"
            raise RuntimeError, msg

        # add files to override catalog
        inputFileList = []
        tfc = None
        for file in inputReport.files:
            if not file['ModuleLabel'] == inpLink['OutputModule']:
                continue
            # link to file via lfn (in tfc) if link isn't standalone and we
            # have a valid lfn. Else refer to file via pfn
            if not inpLink['AppearStandalone'] and \
                   file.get('LFN', None) not in (None, '', 'None'):
                if not tfc:
                    tfc = TrivialFileCatalog.TrivialFileCatalog()
                inputFileList.append(file['LFN'])
                tfc.addLfnToPfnRule('override', file['LFN'], file['PFN'])
            else:
                inputFileList.append("file:%s" % file['PFN'])

        if tfc:
            print "Creating override tfc, contents below"
            print str(tfc)
            tfc.write(os.path.join(os.getcwd(), 'override_catalog.xml'))

        if inpLink['InputSource'] == "source":
            # //
            # // feed into main source
            #//
            config.inputFiles = inputFileList
            if tfc:
                config.inputOverrideCatalog = os.path.join(
                    os.getcwd(), 'override_catalog.xml')
            msg = "Input Link created to input source for files:\n"
            for f in inputFileList:
                msg += "  %s\n" % f
            print msg
            return

        # //
        # // Need to add to secondary source with name provided
        #//
        raise NotImplementedError, \
            "Haven't implemented secondary source input links at present..."
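# Sketch (illustrative) of the input link structure handleInputLink consumes;
# the keys follow the loop that prints them above, the values here are
# hypothetical.
#
#     inpLink = {
#         "InputNode": "processing1",      # task whose output we consume
#         "OutputModule": "output",        # module label to select files by
#         "InputSource": "source",         # feed files into the main source
#         "AppearStandalone": False,       # use LFNs via the override TFC
#     }
#     self.handleInputLink(self.jobSpecNode.cfgInterface, inpLink)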
def localCustomization(self, config, merge = False):
    """
    Apply site specific customizations to the config
    """
    site_config = self.taskState.getSiteConfig()
    self.ioCustomization(config, site_config.io_config, merge)

def ioCustomization(self, config, custom_config, merge = False):
    """
    Apply site specific io customizations
    """
    # Don't do anything if no customization or job has no input files
    if not custom_config or (merge is False and not config.inputFiles):
        return

    import re
    version = lambda x: tuple(int(x) for x in re.compile('(\d+)').findall(x))
    cmssw_version = version(os.environ['CMSSW_VERSION'])

    # Only implemented in CMSSW_2_1_8 and above
    if cmssw_version < (2, 1, 8):
        return

    print "Site specific IO parameters will be used:"

    # cacheSize is a property of InputSource
    cache_size = custom_config.get('cacheSize', None)
    if cache_size:
        # Merge pset creates process on the fly so can't use CMSSWConfig object
        if merge:
            from ProdCommon.CMSConfigTools.ConfigAPI.InputSource import InputSource
            inputSource = InputSource(config.source)
            inputSource.setCacheSize(cache_size)
        else:
            config.sourceParams['cacheSize'] = cache_size

    if merge:
        from FWCore.ParameterSet.Modules import Service
        config.add_(Service('AdaptorConfig'))

    for param in custom_config:
        print "  %s %s" % (param, custom_config[param])
        if param == 'cacheSize':
            continue

        if merge:
            import FWCore.ParameterSet.Types as CfgTypes
            adaptor = config.services['AdaptorConfig']
            setattr(adaptor, param,
                    CfgTypes.untracked(CfgTypes.string(str(custom_config[param]))))
        else:
            config.tFileAdaptorConfig[param] = custom_config[param]
    return

def createPSet(self):
    """
    _createPSet_

    Create the PSet cfg File
    """
    cfgFile = self.config['Configuration'].get("CfgFile", "PSet.py")[0]
    cfgFile = str(cfgFile)

    self.jobSpecNode.loadConfiguration()
    self.jobSpecNode.cfgInterface.rawCfg = self.workflowNode.cfgInterface.rawCfg

    # taken from cmssw environment
    # pylint: disable-msg=F0401
    import FWCore.ParameterSet.Types as CfgTypes
    # pylint: enable-msg=F0401

    workingDir = os.path.join(os.getcwd(), 'prestage')
    if os.path.exists(workingDir + '/prestageTFC.xml'):
        rawCfg = pickle.loads(self.jobSpecNode.cfgInterface.rawCfg)
        rawCfg.source.overrideCatalog = CfgTypes.untracked(
            CfgTypes.string(
                'trivialcatalog_file:%s/prestageTFC.xml?protocol=local-stage-in' % workingDir))
        self.jobSpecNode.cfgInterface.rawCfg = pickle.dumps(rawCfg)

    # Apply site specific customizations
    self.localCustomization(self.jobSpecNode.cfgInterface)

    for inpLink in self.jobSpecNode._InputLinks:
        # //
        # // We have in-job input links to be resolved
        #//
        self.handleInputLink(self.jobSpecNode.cfgInterface, inpLink)

    cmsProcess = self.jobSpecNode.cfgInterface.makeConfiguration()

    pycfgDump = open("PyCfgFileDump.log", 'w')
    try:
        pycfgDump.write(cmsProcess.dumpPython())
    except Exception, ex:
        msg = "Error writing python format cfg dump:\n"
        msg += "%s\n" % str(ex)
        msg += "This needs to be reported to the framework team"
        pycfgDump.write(msg)
    pycfgDump.close()

    handle = open(cfgFile, 'w')
    handle.write("import pickle\n")
    handle.write("pickledCfg=\"\"\"%s\"\"\"\n" % pickle.dumps(cmsProcess))
    handle.write("process = pickle.loads(pickledCfg)\n")
    handle.close()
    return
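# Sketch (illustrative) of a site io_config that ioCustomization would apply.
# Parameters other than cacheSize are passed straight through to the
# AdaptorConfig service / tFileAdaptorConfig, so the extra keys here are
# examples, not a definitive list.
#
#     io_config = {
#         "cacheSize": "20000000",        # bytes, set on the InputSource
#         "cacheHint": "lazy-download",
#         "readHint": "auto-detect",
#     }
#     self.ioCustomization(self.jobSpecNode.cfgInterface, io_config)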
class RepackerSetup:
    """
    _RepackerSetup_

    Object to manipulate the Configuration files for a repacker job

    - Extract the details of the repacker job entity stored in the config
    - Pull in the lumi server information and add it to the config
    """
    def __init__(self, workflowSpec, jobSpec):
        self.workflowSpec = WorkflowSpec()
        self.workflowSpec.load(workflowSpec)
        self.jobSpec = JobSpec()
        self.jobSpec.load(jobSpec)

        taskState = TaskState(os.getcwd())
        taskState.loadRunResDB()

        jobSpecFinder = NodeFinder(taskState.taskName())
        self.jobSpec.payload.operate(jobSpecFinder)
        self.jobSpecNode = jobSpecFinder.result

        workflowFinder = NodeFinder(taskState.taskName())
        self.workflowSpec.payload.operate(workflowFinder)
        self.workflowNode = workflowFinder.result

        self.run = None
        self.lumis = []
        self.streamerFiles = []
        self.activeDatasets = []

    def unpackJobEntity(self):
        """
        _unpackJobEntity_

        Get the StreamerJobEntity from the JobSpec node
        """
        repackJobEntity = self.jobSpecNode.cfgInterface.extensions.get('Streamer', None)
        if repackJobEntity == None:
            msg = "No StreamerJobEntity in JobSpec configuration\n"
            msg += "This is required for repacker jobs\n"
            raise RuntimeError, msg

        # Get run and lumi numbers for this job
        self.run = repackJobEntity.data['runNumber']
        self.lumis = repackJobEntity.data['lumiSections']
        print "Repacker Job Handling Run:%s\n LumiSections: %s\n" % (self.run, self.lumis)

        # Sort streamer input by lumi ID for time ordering
        self.streamerFiles = sortByValue(repackJobEntity.data['streamerFiles'])
        msg = "Streamer Files for this job are:\n"
        for strmr in self.streamerFiles:
            msg += "  %s\n" % strmr
        print msg

        # Get list of active datasets for this job
        ## self.activeDatasets = repackJobEntity.data['activeOutputModules']
        ## msg = "This Job Will repack datasets:\n"
        ## for dataset in self.activeDatasets:
        ##     msg += "  %s\n" % dataset
        ## print msg

        return

    def backupPSet(self, filename, process):
        """
        _backupPSet_

        Write a backup copy of the current PSet to disk.
        """
        print "Writing current configuration as %s" % filename
        handle = open(filename, 'w')
        handle.write("import pickle\n")
        handle.write("pickledCfg=\"\"\"%s\"\"\"\n" % pickle.dumps(process))
        handle.write("process = pickle.loads(pickledCfg)\n")
        handle.close()
        return

    def importAndBackupProcess(self):
        """
        _importAndBackupProcess_

        Try to import the process object for the job, which is contained
        in PSet.py, and save a backup copy of it.
        """
        try:
            from PSet import process
        except ImportError, ex:
            msg = "Failed to import PSet module containing cmsRun Config\n"
            msg += str(ex)
            raise RuntimeError, msg
        print "PSet.py imported"

        self.backupPSet("PSetPreRepack.log", process)
        return process
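# Sketch (illustrative): the backup written by backupPSet is itself a Python
# module, so a saved configuration can be recovered by executing it and
# reading back the process object. The filename matches the call above.
#
#     namespace = {}
#     execfile("PSetPreRepack.log", namespace)
#     process = namespace["process"]    # the pre-repack cms.Process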
def __init__(self, jobSpecFile):
    self.jobSpec = JobSpec()
    self.jobSpec.load(jobSpecFile)
    self.taskState = TaskState(os.getcwd())
    self.taskState.loadRunResDB()
    self.config = self.taskState.configurationDict()

    finder = NodeFinder(self.taskState.taskName())
    self.jobSpec.payload.operate(finder)
    self.jobSpecNode = finder.result
    self.jobSpecNode.loadConfiguration()

    # //
    # // Update output dataset information in RunResDB to
    #//  match input job spec for merges
    taskName = self.taskState.taskAttrs['Name']
    del self.config['Output']['Datasets']
    self.taskState._RunResDB = RunResComponent()
    runresData = {taskName: self.config}
    self.taskState._RunResDB.populate(runresData)
    for dataset in self.jobSpecNode._OutputDatasets:
        if dataset['DataTier'] == "":
            continue
        dsPath = "/%s/Output/Datasets%s" % (taskName, dataset.name())
        self.taskState._RunResDB.addPath(dsPath)
        for key, val in dataset.items():
            self.taskState._RunResDB.addData("/%s/%s" % (dsPath, key), str(val))

    handle = open(self.taskState.runresdb, 'w')
    dom = self.taskState._RunResDB.makeDOMElement()
    handle.write(dom.toprettyxml())
    handle.close()
    self.taskState.loadRunResDB()

    cfgInt = self.jobSpecNode.cfgInterface
    inputFiles = cfgInt.inputFiles
    fileList = ""
    for inputfile in inputFiles:
        inputfile = inputfile.replace("\'", "")
        inputfile = inputfile.replace("\"", "")
        fileList += "%s " % inputfile

    outMod = cfgInt.outputModules['Merged']
    lfn = outMod['logicalFileName']
    catalog = outMod['catalog']
    pfn = outMod['fileName']

    pfn = pfn.replace("\'", "")
    pfn = pfn.replace("\"", "")
    lfn = lfn.replace("\'", "")
    lfn = lfn.replace("\"", "")
    catalog = catalog.replace("\'", "")
    catalog = catalog.replace("\"", "")

    handle = open("EdmFastMerge-setup.sh", "w")
    handle.write("export EDM_MERGE_INPUTFILES=\"%s\"\n" % fileList)
    handle.write("export EDM_MERGE_OUTPUT_PFN=\"%s\"\n" % pfn)
    handle.write("export EDM_MERGE_OUTPUT_LFN=\"%s\"\n" % lfn)
    handle.write("export EDM_MERGE_CATALOG=\"%s\"\n" % catalog)
    handle.close()
    os.system("chmod +x EdmFastMerge-setup.sh")
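# Sketch (illustrative) of the EdmFastMerge-setup.sh this produces; the PFN,
# LFN and catalog values below are hypothetical.
#
#     export EDM_MERGE_INPUTFILES="/store/unmerged/file1.root /store/unmerged/file2.root "
#     export EDM_MERGE_OUTPUT_PFN="/store/merged/Merged.root"
#     export EDM_MERGE_OUTPUT_LFN="/store/merged/Merged.root"
#     export EDM_MERGE_CATALOG="trivialcatalog_file:/path/to/storage.xml?protocol=local"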
def testA(self):
    try:
        ###shell start####
        self.ms.publish("ProdMgrInterface:StartDebug", '')
        self.ms.commit()
        # this means we are using the size the allocation gives us
        self.ms.publish("ProdMgrInterface:JobSize", '-1')
        self.ms.commit()
        # this means that if we get a job from the prodmgr we cut it in
        # jobs with this number of events.
        self.ms.publish("ProdMgrInterface:JobCutSize", '12')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID0?Priority=3')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '4')
        self.ms.commit()
        print('Waiting for 4*9=36 CreateJob messages')
        ###shell end ####

        for i in xrange(0, 36):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 9
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        ###shell start####
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '10')
        self.ms.commit()
        print('Waiting for 10*9=90 CreateJob messages')
        ###shell end ####

        ProdMgrUnitTests.__jobSpecId = []
        for i in xrange(0, 90):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 9
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        sys.exit(0)

        ###shell start####
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID1?Priority=4')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '10')
        self.ms.commit()
        print('Waiting for 10 CreateJob messages')
        ###shell end ####

        ProdMgrUnitTests.__jobSpecId = []
        for i in xrange(0, 10):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 3
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        print('ProdMgr is left with 8 allocations as 2 allocations successfully finished')

        ###shell start####
        self.ms.publish("ProdMgrInterface:RemoveIdlingAllocs", '00:00:01')
        print('All idling allocations should have been removed since we used a small time interval')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '20')
        self.ms.commit()
        print('ProdAgent should now have 20 active allocations (including the ones that were removed)')
        ###shell end ####

        ProdMgrUnitTests.__jobSpecId = []
        for i in xrange(0, 20):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        #raw_input("Shut down the server to test queueing capability (check the log to see when no more messages enter)\n")
        for jobspecid in ProdMgrUnitTests.__jobSpecId[0:4]:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 3
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        #raw_input("Start server again\n")
        for jobspecid in ProdMgrUnitTests.__jobSpecId[4:]:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 3
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        ###shell start####
        print('Adding a non existing request')
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=NOTEXISTINGREQUEST?Priority=1')
        self.ms.commit()
        print('Adding a request that already finished')
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID0?Priority=2')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID2?Priority=5')
        print('There should now be 3 additional requests in the request queue')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:JobSize", '5')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '10')
        self.ms.commit()
        ###shell end ####

        ProdMgrUnitTests.__jobSpecId = []
        for i in xrange(0, 10):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId[0:4]:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 2
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId[4:]:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 5
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("JobSuccess", reportLocation)
            self.ms.commit()

        ###shell start####
        print('Emitting resources available which should get allocations of multiple requests')
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '15')
        self.ms.commit()
        print('There should be now 15 active allocations and the finished request and nonexisting request are removed')
        ###shell end ####

        ProdMgrUnitTests.__jobSpecId = []
        for i in xrange(0, 15):
            type, payload = self.ms.get()
            print("Message type: " + str(type) + ", payload: " + str(payload))
            # retrieve the job spec id (jobname)
            jobspec = JobSpec()
            jobspec.load(payload)
            ProdMgrUnitTests.__jobSpecId.append(jobspec.parameters['JobName'])
            self.ms.commit()

        for jobspecid in ProdMgrUnitTests.__jobSpecId:
            print("handling jobspecid: " + str(jobspecid))
            reportFile = 'FrameworkJobReport.xml'
            report = readJobReport(reportFile)
            for fileinfo in report[-1].files:
                if fileinfo['TotalEvents'] != None:
                    fileinfo['TotalEvents'] = 3
            report[-1].jobSpecId = jobspecid
            report[-1].status = "Success"
            reportLocation = self.jobReportDir + '/' + jobspecid.replace('/', '_') + ".xml"
            report[-1].write(reportLocation)
            self.ms.publish("GeneralJobFailure", reportLocation)
            self.ms.commit()

        ###shell start####
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID5?Priority=3')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID6?Priority=3')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:AddRequest",
            'https://localhost:8443/clarens/?Request_id=requestID7?Priority=3')
        self.ms.commit()
        self.ms.publish("ProdMgrInterface:ResourcesAvailable", '15')
        self.ms.commit()
        ###shell end ####

    except StandardError, ex:
        msg = "Failed testA\n"
        msg += str(ex)
        self.fail(msg)