def httpPost(self, filename):
    """
    _httpPost_

    perform an HTTP POST operation to a webserver
    """
    args = {}

    # Preparing a checksum
    blockSize = 0x10000

    def upd(m, data):
        m.update(data)
        return m

    fd = open(filename, 'rb')
    try:
        contents = iter(lambda: fd.read(blockSize), '')
        m = reduce(upd, contents, md5())
    finally:
        fd.close()

    args['checksum'] = 'md5:%s' % m.hexdigest()
    # args['checksum'] = 'md5:%s' % md5.new(filename).read()).hexdigest()
    args['size'] = os.path.getsize(filename)

    msg = "HTTP Upload is about to start:\n"
    msg += " => URL: %s\n" % self.step.upload.URL
    msg += " => Filename: %s\n" % filename
    logging.info(msg)

    try:
        for uploadURL in self.step.upload.URL.split(';'):
            (headers, data) = self.upload(uploadURL, args, filename)
            msg = 'HTTP upload finished successfully with response:\n'
            msg += 'Status code: %s\n' % headers.get("Dqm-Status-Code", None)
            msg += 'Message: %s\n' % headers.get("Dqm-Status-Message", None)
            msg += 'Detail: %s\n' % headers.get("Dqm-Status-Detail", None)
            msg += 'Data: %s\n' % str(data)
            logging.info(msg)
    except urllib2.HTTPError as ex:
        msg = 'HTTP upload failed with response:\n'
        msg += 'Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None)
        msg += 'Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None)
        msg += 'Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None)
        msg += 'Error: %s\n' % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60318, "DQMUploadFailure", msg)
    except Exception as ex:
        msg = 'HTTP upload failed with response:\n'
        msg += 'Problem unknown.\n'
        msg += 'Error: %s\n' % str(ex)
        msg += 'Traceback: %s\n' % traceback.format_exc()
        logging.error(msg)
        raise WMExecutionFailure(60318, "DQMUploadFailure", msg)

    return
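# Illustrative sketch (not part of the original module; the helper name
# chunked_md5 is hypothetical): the checksum above is computed in fixed-size
# blocks so large DQM files never have to be read into memory at once. A
# standalone equivalent of that pattern looks like this.
import hashlib


def chunked_md5(path, blockSize=0x10000):
    """Return the hex md5 digest of `path`, reading it block by block."""
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for block in iter(lambda: handle.read(blockSize), b''):
            digest.update(block)
    return digest.hexdigest()

# Usage (hypothetical file name):
# print(chunked_md5("DQM_V0001_example.root"))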
def execute(self, emulator=None): """ _execute_ """ stepModule = "WMTaskSpace.%s" % self.stepName if (emulator != None): return emulator.emulate(self.step, self.job) # write the wrapper script to a temporary location # I don't pass it directly through os.system because I don't # trust that there won't be shell-escape shenanigans with # arbitrary input files scramSetup = self.step.application.setup.softwareEnvironment scramCommand = self.step.application.setup.scramCommand scramProject = self.step.application.setup.scramProject scramArch = self.step.application.setup.scramArch cmsswVersion = self.step.application.setup.cmsswVersion jobReportXML = self.step.output.jobReport cmsswCommand = self.step.application.command.executable cmsswConfig = self.step.application.command.configuration cmsswArguments = self.step.application.command.arguments logging.info("Executing MulticoreCMSSW step") multicoreSettings = self.step.application.multicore numberOfCores = multicoreSettings.numberOfCores logging.info("Multicore configured for %s cores" % numberOfCores) #Todo: Create input file list from job writing filelist one per line to filelist = open(multicoreSettings.inputfilelist, 'w') for inputFile in self.job['input_files']: filelist.write("%s\n" % inputFile['lfn']) filelist.close() # # scram bootstrap # scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Running SCRAM") try: projectOutcome = scram.project() except Exception, ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(60513, "ScramSetupFailure", msg)
def execute(self, emulator = None): """ _execute_ """ #Are we using emulators again? if (emulator != None): return emulator.emulate( self.step, self.job ) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) seName = overrides.get('seName', "srm-cms.cern.ch") lfnPrefix = overrides.get('lfnPrefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms") lfnBase = overrides.get('lfnBase', "/store/user/jsmith") userLogs = overrides.get('userLogs', False) dontStage = overrides.get('dontStage', False) stageOutParams = {"command": "srmv2-lcg", "se-name": seName, "lfn-prefix": lfnPrefix} #Okay, we need a stageOut Manager useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ (overrides.has_key('newStageOut') and overrides.get('newStageOut')): useNewStageOutCode = True try: if not useNewStageOutCode: # old style deleteMgr = DeleteMgr.DeleteMgr() stageInMgr = StageInMgr.StageInMgr() stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams) else: # new style logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE") stageOutMgr = WMCore.Storage.FileManager.StageOutMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount, **overrides) stageInMgr = WMCore.Storage.FileManager.StageInMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount ) deleteMgr = WMCore.Storage.FileManager.DeleteMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount ) deleteMgr = DeleteMgr.DeleteMgr() stageInMgr = StageInMgr.StageInMgr() stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams) except StandardError, ex: msg = "Unable to load StageIn/Out/Delete Impl: %s" % str(ex) logging.error(msg) raise WMExecutionFailure(60312, "MgrImplementationError", msg)
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)
class CMSSW(Executor): """ _CMSWW_ Execute a CMSSW Step """ def pre(self, emulator=None): """ _pre_ Pre execution checks """ if (emulator != None): return emulator.emulatePre(self.step) logging.info("Pre-executing CMSSW step") if hasattr(self.step.application.configuration, 'configCacheUrl'): # means we have a configuration & tweak in the sandbox psetFile = self.step.application.command.configuration psetTweak = self.step.application.command.psetTweak self.stepSpace.getFromSandbox(psetFile) if psetTweak: self.stepSpace.getFromSandbox(psetTweak) if hasattr(self.step, "pileup"): self.stepSpace.getFromSandbox("pileupconf.json") # if step is multicore Enabled, add the multicore setup # that generates the input file manifest and per process splitting stepHelper = WMStepHelper(self.step) typeHelper = stepHelper.getTypeHelper() if typeHelper.multicoreEnabled(): self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore") # add in ths scram env PSet manip script whatever happens self.step.runtime.scramPreScripts.append("SetupCMSSWPset") return None def execute(self, emulator=None): """ _execute_ """ stepModule = "WMTaskSpace.%s" % self.stepName if (emulator != None): return emulator.emulate(self.step, self.job) # write the wrapper script to a temporary location # I don't pass it directly through os.system because I don't # trust that there won't be shell-escape shenanigans with # arbitrary input files scramSetup = self.step.application.setup.softwareEnvironment scramCommand = self.step.application.setup.scramCommand scramProject = self.step.application.setup.scramProject scramArch = self.step.application.setup.scramArch cmsswVersion = self.step.application.setup.cmsswVersion jobReportXML = self.step.output.jobReport cmsswCommand = self.step.application.command.executable cmsswConfig = self.step.application.command.configuration cmsswArguments = self.step.application.command.arguments userTarball = ','.join(self.step.user.inputSandboxes) userFiles = ','.join(self.step.user.userFiles) logging.info('User files are %s' % userFiles) logging.info('User sandboxes are %s' % userTarball) multicoreSettings = self.step.application.multicore multicoreEnabled = multicoreSettings.enabled if multicoreEnabled: numberOfCores = multicoreSettings.numberOfCores logging.info("***Multicore CMSSW Enabled***") logging.info("Multicore configured for %s cores" % numberOfCores) #create input file list used to generate manifest filelist = open(multicoreSettings.inputfilelist, 'w') for inputFile in self.job['input_files']: filelist.write("%s\n" % inputFile['lfn']) filelist.close() logging.info("Executing CMSSW step") # # set any global environment variables # try: os.environ['FRONTIER_ID'] = 'wmagent_%s' % ( self.report.data.workload) except Exception, ex: logging.error('Have critical error in setting FRONTIER_ID: %s' % str(ex)) logging.error( 'Continuing, as this is not a critical function yet.') pass # # scram bootstrap # scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Runing SCRAM") try: projectOutcome = scram.project() except Exception, ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
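# Illustrative sketch (hypothetical helper; assumes the Scram wrapper behaves
# as it is used in this module, i.e. project() and runtime() return shell exit
# codes and diagnostic() returns the captured output, and that logging and
# WMExecutionFailure are the module-level names already used above). It
# condenses the SCRAM bootstrap sequence the executors repeat into one place.
def bootstrapScram(scram):
    """Run `scram project` plus the runtime setup, raising on any failure."""
    for stageName, stageCall in (("project", scram.project),
                                 ("runtime", scram.runtime)):
        try:
            outcome = stageCall()
        except Exception as ex:
            msg = "Exception raised while running scram %s.\n%s" % (stageName, str(ex))
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        if outcome > 0:
            # non-zero exit code: surface the scram diagnostic output
            msg = scram.diagnostic()
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
    return scram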
def multicoreMerge(self, scramRef, applicationStart): """ If this step is a multicore configured step, the output from the subprocesses needs to be merged together and the job report updated. """ logging.info("***Multicore Post processing started***") # // # // Now go through the reports and merge the outputs #// outputReports = readMultiJobReports(self.step.output.jobReport, self.stepName, self.step.builder.workingDir) self.report = None mergeBuckets = {} inputFiles = {} aggregator = ReportUtils.Aggregator() # // # // loop over the sub reports and combine the inputs for recording and outputs for merges #// for o in outputReports: # use one of the reports as the template for the master report if self.report == None: self.report = o aggregator.add(o.report.performance) # store list of all unique input files into the job, adding them together if they # are fragments of the same file for inp in o.getAllInputFiles(): lfn = inp['lfn'] if not inputFiles.has_key(lfn): inputFiles[lfn] = inp else: addFiles(inputFiles[lfn], inp) # // # // process the output modules from each subreport #// for f in o.getAllFiles(): lfn = f['outputModule'] if not mergeBuckets.has_key(lfn): mergeBuckets[lfn] = MergeBucket( f['lfn'], f['outputModule'], self.stepName, self.step.builder.workingDir) mergeBuckets[lfn].append(f) # clean up the master report from the template reportData = getattr(self.report.data, self.stepName) for src in reportData.input.listSections_(): delattr(reportData.input, src) for omod in reportData.outputModules: delattr(reportData.output, omod) reportData.outputModules.remove(omod) # // # // Add the reduced list of input files to the master report #// for f in inputFiles.values(): self.report.addInputFile(f['module_label'], **f) # // # // Now roll through the merges, run the merge and edit the master job report with the outcome #// mergesStart = time.time() mergeTiming = [] for b in mergeBuckets.values(): # write the merge config file thisMergeStarts = time.time() b.writeConfig() # run the merge as a scram enabled command logging.info(" Invoking command: %s" % b.merge_command) logfile = "%s.log" % b.merge_pset_file retCode = scramRef(b.merge_command, False, logfile, self.step.builder.workingDir) if retCode > 0: msg = "Error running merge job:\n%s\n" % b.merge_command msg += "Merge Config:\n%s\n" % b.mergeConfig() msg += "%s\n " % retCode msg += scram.diagnostic() logging.critical(msg) raise WMExecutionFailure(60666, "MulticoreMergeFailure", msg) # // # // add report from merge to master #// ToDo: try/except here in case report is missing b.editReport(self.report) thisMergeTime = time.time() - thisMergeStarts mergeTiming.append(thisMergeTime) mergesComplete = time.time() totalJobTime = mergesComplete - applicationStart self.report.report.performance = aggregator.aggregate() ReportUtils.updateMulticoreReport(self.report, len(mergeBuckets), mergesStart, mergesComplete, totalJobTime, *mergeTiming) logging.info("***Multicore Post processing complete***") return
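# Illustrative sketch (names are hypothetical): the merge bookkeeping above
# boils down to bucketing every output file by its output module, so that each
# module's fragments can be merged by a single merge job. A plain-dict version:
def bucketByOutputModule(files):
    """Group file dicts (each carrying an 'outputModule' key) per output module."""
    buckets = {}
    for fileInfo in files:
        buckets.setdefault(fileInfo['outputModule'], []).append(fileInfo)
    return buckets

# Example:
# bucketByOutputModule([{'outputModule': 'RECOoutput', 'lfn': '/store/a.root'},
#                       {'outputModule': 'RECOoutput', 'lfn': '/store/b.root'}])
# -> {'RECOoutput': [<a.root entry>, <b.root entry>]}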
logging.info("Runing SCRAM") try: projectOutcome = scram.project() except Exception, ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) if projectOutcome > 0: msg = scram.diagnostic() #self.report.addError(60513, "ScramSetupFailure", msg) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) runtimeOutcome = scram.runtime() if runtimeOutcome > 0: msg = scram.diagnostic() #self.report.addError(60513, "ScramSetupFailure", msg) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) # # pre scripts # logging.info("Running PRE scripts") for script in self.step.runtime.preScripts: # TODO: Exception handling and error handling & logging scriptProcess = subprocess.Popen(
def execute(self, emulator=None): """ _execute_ """ # Are we using emulators again? if emulator is not None: return emulator.emulate(self.step, self.job) logging.info("Steps.Executors.%s.execute called", self.__class__.__name__) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() logging.info("Using the following overrides: %s ", overrides) # Find alternate stageout location self.altLFN = overrides.get('altLFN', None) self.failedPreviousStep = overrides.get('previousCmsRunFailure', False) logging.info("Step configuration is: %s", self.step) # Wait timeout for stageOut waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) matchFiles = [ ".log$", # matches the scram, wmagent and cmsRun logs "FrameworkJobReport.xml", "Report.pkl", "^PSet.py$", "^PSet.pkl$", "_condor_std*", # condor wrapper logs at the pilot top level ] ignoredDirs = ['Utils', 'WMCore', 'WMSandbox'] # Okay, we need a stageOut Manager useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ ('newStageOut' in overrides and overrides.get('newStageOut')): useNewStageOutCode = True if not useNewStageOutCode: # old style manager = StageOutMgr.StageOutMgr(**overrides) manager.numberOfRetries = self.step.retryCount manager.retryPauseTime = self.step.retryDelay else: # new style logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE") manager = WMCore.Storage.FileManager.StageOutMgr(retryPauseTime=self.step.retryDelay, numberOfRetries=self.step.retryCount, **overrides) # Now we need to find all the reports # The log search follows this structure: ~pilotArea/jobArea/WMTaskSpaceArea/StepsArea # Start looking at the pilot scratch area first, such that we find the condor logs # Then look at the job area in order to find the wmagentJob log # Finally, at the taskspace area to find the cmsRun/FWJR/PSet files pilotScratchDir = os.path.join(self.stepSpace.taskSpace.location, '../../') logFilesToArchive = self.findFilesInDirectory(pilotScratchDir, matchFiles, ignoredDirs) # What if it's empty? 
if len(logFilesToArchive) == 0: msg = "Couldn't find any log files in the job" logging.error(msg) return logFilesToArchive # Now that we've gone through all the steps, we have to tar it out tarName = 'logArchive.tar.gz' tarBallLocation = os.path.join(self.stepSpace.location, tarName) with tarfile.open(tarBallLocation, 'w:gz') as tarBall: for fName in logFilesToArchive: altName = fName.replace(pilotScratchDir, '', 1) tarBall.add(name=fName, arcname=altName) fileInfo = {'LFN': self.getLFN(tarName), 'PFN': tarBallLocation, 'PNN': None, 'GUID': None } signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: manager(fileInfo) self.report.addOutputModule(moduleName="logArchive") (adler32, cksum) = calculateChecksums(tarBallLocation) reportFile = {"lfn": fileInfo["LFN"], "pfn": fileInfo["PFN"], "location": fileInfo["PNN"], "module_label": "logArchive", "events": 0, "size": 0, "merged": False, "checksums": {'adler32': adler32, 'cksum': cksum}} self.report.addOutputFile(outputModule="logArchive", aFile=reportFile) except Alarm: msg = "Indefinite hang during stageOut of logArchive" logging.error(msg) self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg) self.saveReport() raise WMExecutionFailure(60404, "LogArchiveTimeout", msg) except WMException as ex: self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex)) self.saveReport() raise ex except Exception as ex: self.report.addError(self.stepName, 60405, "LogArchiveFailure", str(ex)) self.saveReport() msg = "Failure in transferring logArchive tarball\n" logging.exception(msg) raise WMException("LogArchiveFailure", message=str(ex)) signal.alarm(0) signal.alarm(waitTime) self.sendLogToEOS(overrides, tarBallLocation, useNewStageOutCode) signal.alarm(0) return
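# Illustrative sketch of the SIGALRM guard wrapped around the stage-out calls
# in this module (assumption: Alarm and alarmHandler simply turn SIGALRM into
# a Python exception, as the executors above use them; the runWithTimeout
# helper is hypothetical).
import signal


class Alarm(Exception):
    """Raised when the watchdog timer fires."""


def alarmHandler(signum, frame):
    raise Alarm()


def runWithTimeout(callable_, seconds, *args, **kwargs):
    """Run callable_, but give up (raise Alarm) after `seconds` seconds."""
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(seconds)
    try:
        return callable_(*args, **kwargs)
    finally:
        signal.alarm(0)  # always cancel the pending alarm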
def execute(self, emulator=None): """ _execute_ """ # Are we using emulators again? if emulator != None: return emulator.emulate(self.step, self.job) logging.info("Steps.Executors.%s.execute called", self.__class__.__name__) # Search through steps for analysis files for step in self.stepSpace.taskSpace.stepSpaces(): if step == self.stepName: # Don't try to parse your own report; it's not there yet continue stepLocation = os.path.join(self.stepSpace.taskSpace.location, step) logging.info("Beginning report processing for step %s", step) reportLocation = os.path.join(stepLocation, 'Report.pkl') if not os.path.isfile(reportLocation): logging.error("Cannot find report for step %s in space %s", step, stepLocation) continue # First, get everything from a file and 'unpersist' it stepReport = Report() stepReport.unpersist(reportLocation, step) # Don't upload nor stage out files from bad steps. if not stepReport.stepSuccessful(step): continue # Pulling out the analysis files from each step analysisFiles = stepReport.getAnalysisFilesFromStep(step) # make sure all conditions from this job get the same uuid uuid = makeUUID() condFiles2copy = [] lumiFiles2copy = [] # Working on analysis files for analysisFile in analysisFiles: # deal with sqlite files if analysisFile.FileClass == "ALCA": sqlitefile = analysisFile.fileName.replace( 'sqlite_file:', '', 1) filenamePrefix = "Run%d@%s@%s" % ( self.step.condition.runNumber, analysisFile.inputtag, uuid) filenameDB = filenamePrefix + ".db" filenameTXT = filenamePrefix + ".txt" shutil.copy2(os.path.join(stepLocation, sqlitefile), filenameDB) textoutput = "prepMetaData %s\n" % analysisFile.prepMetaData textoutput += "prodMetaData %s\n" % analysisFile.prodMetaData with open(filenameTXT, "w") as fout: fout.write(textoutput) os.chmod( filenameDB, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) os.chmod( filenameTXT, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) condFiles2copy.append(filenameDB) condFiles2copy.append(filenameTXT) # deal with text files containing lumi info elif analysisFile.FileClass == "ALCATXT": shutil.copy2( os.path.join(stepLocation, analysisFile.fileName), analysisFile.fileName) lumiFiles2copy.append(analysisFile.fileName) # copy conditions files out and fake the job report addedOutputFJR = False if self.step.condition.lfnbase: logging.info("Copy out conditions files to %s", self.step.condition.lfnbase) for file2copy in condFiles2copy: logging.info("==> copy %s", file2copy) targetLFN = os.path.join(self.step.condition.lfnbase, file2copy) targetPFN = "root://eoscms//eos/cms%s" % targetLFN command = "env XRD_WRITERECOVERY=0 xrdcp -s -f %s %s" % ( file2copy, targetPFN) p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = p.communicate()[0] if p.returncode > 0: msg = 'Failure during condition copy to EOS:\n' msg += ' %s\n' % output logging.error(msg) raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg) # add fake output file to job report addedOutputFJR = True stepReport.addOutputFile(self.step.condition.outLabel, aFile={ 'lfn': targetLFN, 'pfn': targetPFN, 'module_label': self.step.condition.outLabel }) # copy luminosity files out if self.step.luminosity.url: logging.info("Copy out luminosity files to %s", self.step.luminosity.url) for file2copy in lumiFiles2copy: logging.info("==> copy %s", file2copy) targetPFN = rootUrlJoin(self.step.luminosity.url, 
file2copy) if not targetPFN: msg = 'No valid URL for lumi copy:\n' msg += ' %s\n' % self.step.luminosity.url logging.error(msg) raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg) command = "env XRD_WRITERECOVERY=0 xrdcp -s -f %s %s" % ( file2copy, targetPFN) p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = p.communicate()[0] if p.returncode > 0: msg = 'Failure during copy to EOS:\n' msg += ' %s\n' % output logging.error(msg) raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg) if not addedOutputFJR: # no conditions from AlcaHarvest is a valid result, can # happen if calibration algorithms produced no output # due to not enough statistics or other reasons # # add fake placeholder output file to job report logging.info( "==> no sqlite files from AlcaHarvest job, creating placeholder file record" ) stepReport.addOutputFile(self.step.condition.outLabel, aFile={ 'lfn': "/no/output", 'pfn': "/no/output", 'module_label': self.step.condition.outLabel }) # Am DONE with report # Persist it stepReport.persist(reportLocation) return
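# Illustrative sketch (helper name and target PFN are hypothetical): the EOS
# copies above all follow the same pattern, i.e. shell out to xrdcp with write
# recovery disabled and treat a non-zero return code as a fatal step error.
import subprocess


def xrdcpToEos(localFile, targetPFN):
    """Copy localFile to targetPFN with xrdcp, returning the command output."""
    command = "env XRD_WRITERECOVERY=0 xrdcp -s -f %s %s" % (localFile, targetPFN)
    proc = subprocess.Popen(command, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    output = proc.communicate()[0]
    if proc.returncode > 0:
        raise RuntimeError("xrdcp failed (%s):\n%s" % (proc.returncode, output))
    return output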
            logging.info(msg)
    except urllib2.HTTPError as ex:
        msg = 'HTTP upload failed with response:\n'
        msg += 'Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None)
        msg += 'Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None)
        msg += 'Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None)
        msg += 'Error: %s\n' % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60318, "DQMUploadFailure", msg)
    except Exception as ex:
        msg = 'HTTP upload failed with response:\n'
        msg += 'Problem unknown.\n'
        msg += 'Error: %s\n' % str(ex)
        msg += 'Traceback: %s\n' % traceback.format_exc()
        logging.error(msg)
        raise WMExecutionFailure(60318, "DQMUploadFailure", msg)

    return

def filetype(self, filename):
    return guess_type(filename)[0] or 'application/octet-stream'

def encode(self, args, files):
    """
    Encode form (name, value) and (name, filename, type) elements into
    multipart/form-data. We don't actually need to know what we are
    uploading here, so just claim it's all text/plain.
    """
    boundary = '----------=_DQM_FILE_BOUNDARY_=-----------'
    (body, crlf) = ('', '\r\n')
    for (key, value) in args.items():
def execute(self, emulator=None): """ _execute_ """ #Are we using emulators again? if (emulator != None): return emulator.emulate(self.step, self.job) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get( 'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) seName = overrides.get('seName', "srm-cms.cern.ch") lfnPrefix = overrides.get( 'lfnPrefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms") lfnBase = overrides.get('lfnBase', "/store/user/jsmith") userLogs = overrides.get('userLogs', False) dontStage = overrides.get('dontStage', False) stageOutParams = { "command": "srmv2-lcg", "se-name": seName, "lfn-prefix": lfnPrefix } #Okay, we need a stageOut Manager useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ ('newStageOut' in overrides and overrides.get('newStageOut')): useNewStageOutCode = True try: if not useNewStageOutCode: # old style deleteMgr = DeleteMgr.DeleteMgr() stageInMgr = StageInMgr.StageInMgr() stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams) else: # new style logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE") stageOutMgr = WMCore.Storage.FileManager.StageOutMgr( retryPauseTime=self.step.retryDelay, numberOfRetries=self.step.retryCount, **overrides) stageInMgr = WMCore.Storage.FileManager.StageInMgr( retryPauseTime=self.step.retryDelay, numberOfRetries=self.step.retryCount) deleteMgr = WMCore.Storage.FileManager.DeleteMgr( retryPauseTime=self.step.retryDelay, numberOfRetries=self.step.retryCount) deleteMgr = DeleteMgr.DeleteMgr() stageInMgr = StageInMgr.StageInMgr() stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams) except Exception as ex: msg = "Unable to load StageIn/Out/Delete Impl: %s" % str(ex) logging.error(msg) raise WMExecutionFailure(60312, "MgrImplementationError", msg) # Now we need the logs if not dontStage: # Don't stage or delete files logs = [] for file in self.job["input_files"]: logs.append({"LFN": file["lfn"]}) readyFiles = [] for file in logs: signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: output = stageInMgr(**file) readyFiles.append(output) self.report.addInputFile(sourceName="logArchives", lfn=file['LFN']) except Alarm: msg = "Indefinite hang during stageIn of LogCollect" logging.error(msg) self.report.addError(self.stepName, 60407, "LogCollectTimeout", msg) self.report.persist("Report.pkl") except StageOutFailure as ex: msg = "Unable to StageIn %s" % file['LFN'] logging.error(msg) # Don't do anything other then record it self.report.addSkippedFile(file.get('PFN', None), file['LFN']) except Exception as ex: raise signal.alarm(0) if len(readyFiles) == 0: # Then we have no output; all the files failed # Panic! 
msg = "No logs staged in during LogCollect step" logging.error(msg) raise WMExecutionFailure(60312, "LogCollectError", msg) now = datetime.datetime.now() tarPFN = self.createArchive(readyFiles) if userLogs: lfn = "%s/%s/logs/%s" % (lfnBase, self.report.data.workload, os.path.basename(tarPFN)) else: lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % ( now.year, now.month, "WMAgent", self.report.data.workload, os.path.basename(tarPFN)) tarInfo = {'LFN': lfn, 'PFN': tarPFN, 'SEName': None, 'GUID': None} signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: stageOutMgr(tarInfo) self.report.addOutputFile(outputModule="LogCollect", file=tarInfo) except Alarm: msg = "Indefinite hang during stageOut of LogCollect" logging.error(msg) raise WMExecutionFailure(60409, "LogCollectTimeout", msg) except Exception as ex: msg = "Unable to stage out log archive:\n" msg += str(ex) print "MSG: %s" % msg raise WMExecutionFailure(60408, "LogCollectStageOutError", msg) signal.alarm(0) # If we're still here we didn't die on stageOut for file in self.job["input_files"]: signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: fileToDelete = { "LFN": file["lfn"], "PFN": None, "SEName": None, "StageOutCommand": None } deleteMgr(fileToDelete=fileToDelete) except Alarm: msg = "Indefinite hang during delete of LogCollect" logging.error(msg) raise WMExecutionFailure(60411, "DeleteTimeout", msg) except Exception as ex: msg = "Unable to delete files:\n" msg += str(ex) logging.error(msg) raise WMExecutionFailure(60410, "DeleteError", msg) signal.alarm(0) # Add to report outputRef = getattr(self.report.data, self.stepName) if dontStage: outputRef.output.pfn = 'NotStaged' outputRef.output.location = 'NotStaged' outputRef.output.lfn = 'NotStaged' else: outputRef.output.pfn = tarInfo['PFN'] outputRef.output.location = tarInfo['SEName'] outputRef.output.lfn = tarInfo['LFN'] return
def __call__(self, task, wmbsJob): """ _operator(task)_ Load and run executors for all steps in Task, if an emulator is available for that step, use it instead. """ myThread = threading.currentThread try: myThread.watchdogMonitor.setupMonitors(task, wmbsJob) myThread.watchdogMonitor.notifyJobStart(task) except WMException: self.toTaskDirectory() raise except Exception as ex: msg = "Encountered unhandled exception while starting monitors:\n" msg += str(ex) + '\n' msg += str(traceback.format_exc()) + '\n' logging.error(msg) self.toTaskDirectory() raise WMExecutionFailure(msg) failureUpstream = False for step in task.steps().nodeIterator(): try: helper = WMStepHelper(step) stepType = helper.stepType() stepName = helper.name() if failureUpstream: # for chained steps, don't execute further steps if a # failure has already happened helper.addOverride("previousCmsRunFailure", True) executor = StepFactory.getStepExecutor(stepType) result = self.doExecution(executor, step, wmbsJob) logging.info("StepName: %s, StepType: %s, with result: %r", stepName, stepType, result) if result: # can be either None, or the step exit code failureUpstream = True except WMException as ex: msg = "Encountered error while running ExecuteMaster:\n" msg += str(ex) + "\n" logging.error(msg) self.toTaskDirectory() break except Exception as ex: msg = "Encountered error while running ExecuteMaster:\n" msg += str(ex) + "\n" msg += str(traceback.format_exc()) + "\n" self.toTaskDirectory() logging.error(msg) break try: myThread.watchdogMonitor.notifyJobEnd(task) except WMException: self.toTaskDirectory() except Exception as ex: msg = "Encountered unhandled exception while ending the job:\n" msg += str(ex) + '\n' msg += str(traceback.format_exc()) + '\n' logging.error(msg) self.toTaskDirectory() return
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    skipToStep = None

    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if skipToStep and skipToStep != stepName:
                # Then we continue until we get to the required step
                continue
            skipToStep = None  # Reset this when we get to the right step
            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            if result is not None:
                skipToStep = result
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return
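# Illustrative toy sketch (step names and results are made up): how the
# skipToStep value returned by doExecution() drives the loop above. A step may
# return the name of a later step, and everything in between is skipped.
def runSteps(steps, execute):
    """steps: list of names; execute: name -> None or name of step to skip to."""
    skipToStep = None
    executed = []
    for stepName in steps:
        if skipToStep and skipToStep != stepName:
            continue              # still skipping forward
        skipToStep = None         # reached the requested step
        executed.append(stepName)
        result = execute(stepName)
        if result is not None:
            skipToStep = result
    return executed

# runSteps(["cmsRun1", "stageOut1", "logArch1"],
#          lambda name: "logArch1" if name == "cmsRun1" else None)
# -> ["cmsRun1", "logArch1"]   (stageOut1 was skipped)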
def execute(self, emulator=None): """ _execute_ """ #Are we using emulators again? if emulator != None: return emulator.emulate(self.step, self.job) # Search through steps for analysis files for step in self.stepSpace.taskSpace.stepSpaces(): if step == self.stepName: #Don't try to parse your own report; it's not there yet continue stepLocation = os.path.join(self.stepSpace.taskSpace.location, step) logging.info("Beginning report processing for step %s" % step) reportLocation = os.path.join(stepLocation, 'Report.pkl') if not os.path.isfile(reportLocation): logging.error("Cannot find report for step %s in space %s" \ % (step, stepLocation)) continue # First, get everything from a file and 'unpersist' it stepReport = Report() stepReport.unpersist(reportLocation, step) # Don't upload nor stage out files from bad steps. if not stepReport.stepSuccessful(step): continue # Pulling out the analysis files from each step analysisFiles = stepReport.getAnalysisFilesFromStep(step) # make sure all conditions from this job get the same uuid uuid = makeUUID() files2copy = [] # Working on analysis files for analysisFile in analysisFiles: # only deal with sqlite files if analysisFile.FileClass == "ALCA": sqlitefile = analysisFile.fileName.replace( 'sqlite_file:', '', 1) filenamePrefix = "Run%d@%s@%s" % ( self.step.condition.runNumber, analysisFile.inputtag, uuid) filenameDB = filenamePrefix + ".db" filenameTXT = filenamePrefix + ".txt" shutil.copy2(os.path.join(stepLocation, sqlitefile), filenameDB) textoutput = "prepMetaData %s\n" % analysisFile.prepMetaData textoutput += "prodMetaData %s\n" % analysisFile.prodMetaData fout = open(filenameTXT, "w") fout.write(textoutput) fout.close() os.chmod( filenameDB, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) os.chmod( filenameTXT, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) files2copy.append(filenameDB) files2copy.append(filenameTXT) # check and create target directory if not os.path.isdir(self.step.condition.dir): msg = 'Conditions copy failed with response:\n' msg += 'The target dir %s does not exist or is not a directory\n' logging.error(msg) raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg) # copy files out and fake the job report logging.info("Copy out conditions files to %s" % self.step.condition.dir) for file2copy in files2copy: logging.info("==> copy %s" % file2copy) targetFile = os.path.join(self.step.condition.dir, file2copy) try: shutil.copy2(file2copy, targetFile) except Exception, ex: msg = 'Conditions copy failed with response:\n' msg += 'Error: %s\n' % str(ex) logging.error(msg) raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg) # add fake output file to job report stepReport.addOutputFile(self.step.condition.outLabel, file={ 'lfn': targetFile, 'pfn': targetFile, 'module_label': self.step.condition.outLabel }) if len(files2copy) == 0: # no output from AlcaHarvest is a valid result, can # happen if calibration algorithms produced no output # due to not enough statistics or other reasons # # add fake placeholder output file to job report logging.info( "==> no sqlite files from AlcaHarvest job, creating placeholder file record" ) stepReport.addOutputFile(self.step.condition.outLabel, file={ 'lfn': "/no/output", 'pfn': "/no/output", 'module_label': self.step.condition.outLabel }) # Am DONE with report # Persist it stepReport.persist(reportLocation)
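# Illustrative note: the long os.chmod flag combination used above is simply
# world read/write. The assertion below checks that the OR of those stat bits
# equals mode 0o666.
import stat

mode = (stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR | stat.S_IWUSR |
        stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
assert mode == 0o666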
def httpPost(self, filename): """ _httpPost_ perform an HTTP POST operation to a webserver """ args = {} # Preparing a checksum blockSize = 0x10000 def upd(m, data): m.update(data) return m with open(filename, 'rb') as fd: contents = iter(lambda: fd.read(blockSize), b'') m = reduce(upd, contents, md5()) args['checksum'] = 'md5:%s' % m.hexdigest() # args['checksum'] = 'md5:%s' % md5.new(filename).read()).hexdigest() args['size'] = os.path.getsize(filename) msg = "HTTP Upload is about to start:\n" msg += " => URL: %s\n" % self.step.upload.URL msg += " => Filename: %s\n" % filename logging.info(msg) try: logException = True for uploadURL in self.step.upload.URL.split(';'): (headers, data) = self.upload(uploadURL, args, filename) msg = ' Status code: %s\n' % headers.get( "Dqm-Status-Code", None) msg += ' Message: %s\n' % headers.get("Dqm-Status-Message", None) msg += ' Detail: %s\n' % headers.get("Dqm-Status-Detail", None) msg += ' Data: %s\n' % str(data) if int(headers.get("Dqm-Status-Code", 400)) >= 400: logException = False raise Exception(msg) else: msg = 'HTTP upload finished succesfully with response:\n' + msg logging.info(msg) except HTTPError as ex: msg = 'HTTP upload failed with response:\n' msg += ' Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None) msg += ' Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None) msg += ' Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None) msg += ' Error: %s\n' % str(ex) logging.exception(msg) raise WMExecutionFailure(70318, "DQMUploadFailure", msg) except Exception as ex: msg = 'HTTP upload failed! Error:\n%s' % str(ex) if logException: logging.exception(msg) else: logging.error(msg) raise WMExecutionFailure(70318, "DQMUploadFailure", msg) return
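# Illustrative sketch of the server-status check used above (header values in
# the usage comments are hypothetical): the DQM GUI reports its own status in
# Dqm-Status-* headers, so an upload can succeed at the HTTP level yet still
# be rejected by the server.
def dqmUploadOk(headers):
    """Return True when the Dqm-Status-Code header signals success (< 400)."""
    return int(headers.get("Dqm-Status-Code", 400)) < 400

# dqmUploadOk({"Dqm-Status-Code": "200"}) -> True
# dqmUploadOk({"Dqm-Status-Code": "400", "Dqm-Status-Message": "bad file"}) -> False
# dqmUploadOk({}) -> False   (missing header treated as failure, as above)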
def execute(self, emulator=None): """ _execute_ """ stepModule = "WMTaskSpace.%s" % self.stepName if emulator is not None: return emulator.emulate(self.step, self.job) # write the wrapper script to a temporary location # I don't pass it directly through os.system because I don't # trust that there won't be shell-escape shenanigans with # arbitrary input files scramSetup = self.step.application.setup.softwareEnvironment scramCommand = self.step.application.setup.scramCommand scramProject = self.step.application.setup.scramProject scramArch = self.step.application.setup.scramArch cmsswVersion = self.step.application.setup.cmsswVersion jobReportXML = self.step.output.jobReport cmsswCommand = self.step.application.command.executable cmsswConfig = self.step.application.command.configuration cmsswArguments = self.step.application.command.arguments userTarball = ','.join(self.step.user.inputSandboxes) userFiles = ','.join(self.step.user.userFiles) logging.info('User files are %s', userFiles) logging.info('User sandboxes are %s', userTarball) scramArch = getSingleScramArch(scramArch) multicoreSettings = self.step.application.multicore try: logging.info("CMSSW configured for %s cores and %s event streams", multicoreSettings.numberOfCores, multicoreSettings.eventStreams) except AttributeError: logging.info( "No value set for multicore numberOfCores or eventStreams") logging.info("Executing CMSSW step") # # set any global environment variables # try: os.environ['FRONTIER_ID'] = 'wmagent_%s' % ( self.report.data.workload) except Exception as ex: logging.error('Have critical error in setting FRONTIER_ID: %s', str(ex)) logging.error( 'Continuing, as this is not a critical function yet.') # # scram bootstrap # scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Runing SCRAM") try: projectOutcome = scram.project() except Exception as ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) if projectOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) runtimeOutcome = scram.runtime() if runtimeOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) # # pre scripts # logging.info("Running PRE scripts") for script in self.step.runtime.preScripts: # TODO: Exception handling and error handling & logging scriptProcess = subprocess.Popen( ["/bin/bash"], shell=True, cwd=self.step.builder.workingDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, ) # BADPYTHON scriptProcess.stdin.write( "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n") invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % ( sys.executable, stepModule, script) logging.info(" Invoking command: %s", invokeCommand) scriptProcess.stdin.write(invokeCommand) stdout, stderr = scriptProcess.communicate() retCode = scriptProcess.returncode if retCode > 0: msg = "Error running command\n%s\n" % invokeCommand msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr) logging.critical("Error running command") logging.critical(msg) raise WMExecutionFailure(50513, "PreScriptFailure", msg) # # pre scripts with scram # logging.info("RUNNING SCRAM SCRIPTS") for script 
in self.step.runtime.scramPreScripts: # invoke scripts with scram() runtimeDir = getattr(self.step.runtime, 'scramPreDir', None) invokeCommand = self.step.runtime.invokeCommand if hasattr(self.step.runtime, 'invokeCommand') else \ "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule) invokeCommand += " %s \n" % script retCode = scram(invokeCommand, runtimeDir=runtimeDir) if retCode > 0: msg = "Error running command\n%s\n" % invokeCommand msg += "%s\n " % retCode msg += scram.diagnostic() logging.critical(msg) raise WMExecutionFailure(50513, "PreScriptScramFailure", msg) configPath = "%s/%s-main.sh" % (self.step.builder.workingDir, self.stepName) handle = open(configPath, 'w') handle.write(CONFIG_BLOB) handle.close() # spawn this new process # the script looks for: # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG> # open the output files stdoutHandle = open(self.step.output.stdout, 'w') stderrHandle = open(self.step.output.stderr, 'w') args = [ '/bin/bash', configPath, scramSetup, scramArch, scramCommand, scramProject, cmsswVersion, jobReportXML, cmsswCommand, cmsswConfig, userTarball, userFiles, cmsswArguments ] logging.info("Executing CMSSW. args: %s", args) # possibly needed environment overrides for CMSSW call go here envOverride = {} # work around problem with GSI authentication plugin and EOS at CERN if socket.getfqdn().endswith("cern.ch"): envOverride['XRD_LOADBALANCERTTL'] = "86400" # some libraries linked with CMSSW need HOME in the environment if 'HOME' not in os.environ: envOverride['HOME'] = os.environ.get('PWD', "/") os.environ.update(envOverride) returncode = subprocess.call(args, stdout=stdoutHandle, stderr=stderrHandle) self.setCondorChirpAttrDelayed('Chirp_WMCore_cmsRun_ExitCode', returncode) self.setCondorChirpAttrDelayed( 'Chirp_WMCore_%s_ExitCode' % self.stepName, returncode) stdoutHandle.close() stderrHandle.close() self.step.execution.exitStatus = returncode argsDump = {'arguments': args} if returncode != 0: msg = "Error running cmsRun\n%s\n" % argsDump msg += "Return code: %s\n" % returncode logging.critical(msg) raise WMExecutionFailure(returncode, "CmsRunFailure", msg) try: self.report.parse(jobReportXML, stepName=self.stepName) except Exception as ex: # Catch it if something goes wrong raise WMExecutionFailure(50115, "BadJobReportXML", str(ex)) stepHelper = WMStepHelper(self.step) typeHelper = stepHelper.getTypeHelper() acquisitionEra = self.task.getAcquisitionEra() processingVer = self.task.getProcessingVersion() processingStr = self.task.getProcessingString() validStatus = self.workload.getValidStatus() inputPath = self.task.getInputDatasetPath() globalTag = typeHelper.getGlobalTag() prepID = self.task.getPrepID() campaign = self.workload.getCampaign() cacheUrl, cacheDB, configID = stepHelper.getConfigInfo() self.report.setValidStatus(validStatus=validStatus) self.report.setGlobalTag(globalTag=globalTag) self.report.setCampaign(campaign) self.report.setPrepID(prepID) self.report.setInputDataset(inputPath=inputPath) self.report.setAcquisitionProcessing(acquisitionEra=acquisitionEra, processingVer=processingVer, processingStr=processingStr) self.report.setConfigURL(configURL="%s;;%s;;%s" % (cacheUrl, cacheDB, configID)) # Attach info to files self.report.addInfoToOutputFilesForStep(stepName=self.stepName, step=self.step) self.report.checkForOutputFiles(stepName=self.stepName) self.report.checkForAdlerChecksum(stepName=self.stepName) self.report.checkForRunLumiInformation(stepName=self.stepName) if 
self.step.output.keep != True: self.report.killOutput() else: # Check that we only keep the desired output for module in stepHelper.getIgnoredOutputModules(): self.report.deleteOutputModuleForStep(stepName=self.stepName, moduleName=module) # Add stageout LFN to existing TFileService files reportAnalysisFiles = self.report.getAnalysisFilesFromStep( self.stepName) for reportAnalysisFile in reportAnalysisFiles: newLFN = analysisFileLFN(reportAnalysisFile.fileName, self.step.user.lfnBase, self.job) addAttributesToFile(reportAnalysisFile, pfn=reportAnalysisFile.fileName, lfn=newLFN, validate=False) # Add analysis file entries for additional files listed in workflow for fileName in stepHelper.listAnalysisFiles(): analysisFile = stepHelper.getAnalysisFile(fileName) if os.path.isfile(analysisFile.fileName): newLFN = analysisFileLFN(analysisFile.fileName, analysisFile.lfnBase, self.job) self.report.addAnalysisFile(analysisFile.fileName, lfn=newLFN, Source='UserDefined', pfn=os.path.join( os.getcwd(), analysisFile.fileName), validate=False) return
def execute(self, emulator = None, **overrides): """ _execute_ """ # Are we using emulators again? if (emulator != None): return emulator.emulate( self.step, self.job ) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Find alternate stageout location self.altLFN = overrides.get('altLFN', None) logging.info("Beginning Steps.Executors.LogArchive.Execute") logging.info("Using the following overrides: %s " % overrides) logging.info("Step is: %s" % self.step) # Wait timout for stageOut waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) matchFiles = [ ".log$", "FrameworkJobReport", "Report.pkl", "Report.pcl", "^PSet.py$", "^PSet.pkl$" ] #Okay, we need a stageOut Manager useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ (overrides.has_key('newStageOut') and overrides.get('newStageOut')): useNewStageOutCode = True if not useNewStageOutCode: # old style manager = StageOutMgr.StageOutMgr(**overrides) manager.numberOfRetries = self.step.retryCount manager.retryPauseTime = self.step.retryDelay else: # new style logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE") manager = WMCore.Storage.FileManager.StageOutMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount, **overrides) #Now we need to find all the reports logFilesForTransfer = [] #Look in the taskSpace first logFilesForTransfer.extend(self.findFilesInDirectory(self.stepSpace.taskSpace.location, matchFiles)) #What if it's empty? if len(logFilesForTransfer) == 0: msg = "Could find no log files in job" logging.error(msg) return logFilesForTransfer #Now that we've gone through all the steps, we have to tar it out tarName = 'logArchive.tar.gz' tarBallLocation = os.path.join(self.stepSpace.location, tarName) tarBall = tarfile.open(tarBallLocation, 'w:gz') for f in logFilesForTransfer: tarBall.add(name = f, arcname = f.replace(self.stepSpace.taskSpace.location, '', 1).lstrip('/')) tarBall.close() fileInfo = {'LFN': self.getLFN(tarName), 'PFN' : tarBallLocation, 'SEName' : None, 'GUID' : None } signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: manager(fileInfo) self.report.addOutputModule(moduleName = "logArchive") reportFile = {"lfn": fileInfo["LFN"], "pfn": fileInfo["PFN"], "location": fileInfo["SEName"], "module_label": "logArchive", "events": 0, "size": 0, "merged": False, "checksums": {'md5': BasicAlgos.getMD5(tarBallLocation), 'adler32': readAdler32(tarBallLocation), 'cksum': readCksum(tarBallLocation)}} self.report.addOutputFile(outputModule = "logArchive", file = reportFile) except Alarm: msg = "Indefinite hang during stageOut of logArchive" logging.error(msg) self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg) self.report.persist("Report.pkl") raise WMExecutionFailure(60404, "LogArchiveTimeout", msg) except WMException, ex: self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex)) self.report.setStepStatus(self.stepName, 0) self.report.persist("Report.pkl") raise ex
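# Illustrative sketch (hypothetical helper, not the real findFilesInDirectory):
# the matchFiles entries above are regular expressions tested against file
# names while walking the task area; directories such as WMCore/ or WMSandbox/
# can be pruned so the sandbox itself is never archived.
import os
import re


def findLogFiles(topDir, matchFiles, ignoredDirs=()):
    """Walk topDir and return paths whose basename matches any pattern."""
    patterns = [re.compile(p) for p in matchFiles]
    found = []
    for dirPath, dirNames, fileNames in os.walk(topDir):
        dirNames[:] = [d for d in dirNames if d not in ignoredDirs]  # prune
        for fileName in fileNames:
            if any(p.search(fileName) for p in patterns):
                found.append(os.path.join(dirPath, fileName))
    return found

# findLogFiles(".", [r"\.log$", r"^PSet\.py$"], ignoredDirs=["WMCore"])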
def execute(self, emulator=None): """ _execute_ """ # Are we using emulators again? if emulator is not None: return emulator.emulate(self.step, self.job) logging.info("Steps.Executors.%s.execute called", self.__class__.__name__) scramCommand = self.step.application.setup.scramCommand cmsswVersion = self.step.application.setup.cmsswVersion scramArch = getSingleScramArch(self.step.application.setup.scramArch) overrideCatalog = getattr(self.step.application, 'overrideCatalog', None) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) # hardcode CERN EOS T2_CH_CERN stageout parameters eosStageOutParams = {} eosStageOutParams['command'] = overrides.get('command', "xrdcp") eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery") eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN") eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms") try: eosStageOutMgr = StageOutMgr(**eosStageOutParams) stageInMgr = StageInMgr() deleteMgr = DeleteMgr() except Exception as ex: msg = "Unable to load StageOut/Delete Impl: %s" % str(ex) logging.error(msg) raise WMExecutionFailure(60312, "MgrImplementationError", msg) # prepare output tar file taskName = self.report.getTaskName().split('/')[-1] host = socket.gethostname().split('.')[0] tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"]) tarLocation = os.path.join(self.stepSpace.location, tarName) # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X # see: https://github.com/cms-sw/cmssw/issues/25292 useEdmCopyUtil = True if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"): pass elif scramArch.startswith('slc7_amd64_'): msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead" logging.warning(msg) cmsswVersion = "CMSSW_10_4_0" scramArch = "slc7_amd64_gcc820" elif scramArch.startswith('slc6_amd64_'): msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead" logging.warning(msg) cmsswVersion = "CMSSW_10_4_0" scramArch = "slc6_amd64_gcc700" else: useEdmCopyUtil = False logging.info("Using edmCopyUtil: %s", useEdmCopyUtil) # setup Scram needed to run edmCopyUtil if useEdmCopyUtil: scram = Scram( command=scramCommand, version=cmsswVersion, initialise=self.step.application.setup.softwareEnvironment, directory=self.step.builder.workingDir, architecture=scramArch, ) logging.info("Running scram") try: projectOutcome = scram.project() except Exception as ex: msg = "Exception raised while running scram.\n" msg += str(ex) logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) if projectOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) runtimeOutcome = scram.runtime() if runtimeOutcome > 0: msg = scram.diagnostic() logging.critical("Error running SCRAM") logging.critical(msg) raise WMExecutionFailure(50513, "ScramSetupFailure", msg) # iterate through input files localLogs = [] deleteLogArchives = [] if useEdmCopyUtil: numberOfFilesPerCopy = 10 else: numberOfFilesPerCopy = 1 for logs in grouper(self.job["input_files"], numberOfFilesPerCopy): copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % 
os.environ.get('X509_USER_PROXY', None) # specify TFC if necessary if overrideCatalog: copyCommand += " -c %s" % overrideCatalog for log in logs: copyCommand += " %s" % log['lfn'] copyCommand += " %s" % self.step.builder.workingDir # give up after timeout of 1 minute per input file signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60 * numberOfFilesPerCopy) filesCopied = False try: if useEdmCopyUtil: logging.info("Running edmCopyUtil") retval = scram(copyCommand) if retval == 0: filesCopied = True else: logging.info("Running stageIn") for log in logs: fileInfo = {"LFN": log['lfn']} logArchive = stageInMgr(**fileInfo) if logArchive: filesCopied = True except Alarm: logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives") except StageOutFailure: logging.error("Unable to stageIn logArchives") except Exception: raise signal.alarm(0) if filesCopied: for log in logs: localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn']))) deleteLogArchives.append(log) self.report.addInputFile(sourceName="logArchives", lfn=log['lfn']) else: logging.error("Unable to copy logArchives to local disk") for log in logs: self.report.addSkippedFile(log['lfn'], None) # create tarfile if any logArchive copied in if localLogs: with tarfile.open(tarLocation, 'w:') as tarFile: for log in localLogs: path = log.split('/') tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1])) os.remove(log) else: msg = "Unable to copy any logArchives to local disk" logging.error(msg) raise WMExecutionFailure(60312, "LogCollectError", msg) # now staging out the LogCollect tarfile logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)") now = datetime.datetime.now() lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent", self.report.data.workload, os.path.basename(tarLocation)) tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None} # perform mandatory stage out to CERN EOS signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: eosStageOutMgr(tarInfo) except Alarm: msg = "Indefinite hang during stageOut of LogCollect to EOS" logging.error(msg) raise WMExecutionFailure(60409, "LogCollectTimeout", msg) except Exception as ex: msg = "Unable to stageOut LogCollect to Castor:\n" msg += str(ex) logging.error(msg) raise WMExecutionFailure(60408, "LogCollectStageOutError", msg) signal.alarm(0) # add to job report self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo) outputRef = getattr(self.report.data, self.stepName) outputRef.output.pfn = tarInfo['PFN'] outputRef.output.location = tarInfo['PNN'] outputRef.output.lfn = tarInfo['LFN'] # we got this far, delete ALL input files assigned to this job for log in self.job["input_files"]: # give up after timeout of 1 minutes signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(60) try: fileToDelete = {'LFN': log['lfn'], 'PFN': None, 'PNN': None, 'StageOutCommand': None} deleteMgr(fileToDelete=fileToDelete) except Alarm: logging.error("Indefinite hang during delete of logArchive") except Exception as ex: logging.error("Unable to delete logArchive: %s", ex) signal.alarm(0) return
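# Illustrative sketch (assumption: WMCore's grouper behaves like the usual
# chunking recipe, yielding lists of at most n items): this is why edmCopyUtil
# above is handed up to ten logArchive LFNs per invocation.
def grouperSketch(iterable, n):
    """Yield successive chunks of at most n items from iterable."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == n:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

# list(grouperSketch(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]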
class MulticoreCMSSW(Executor):
    """
    _MulticoreCMSSW_

    Execute a CMSSW Step in a terrifying series of parallel universes!!!!1!1!
    """

    def pre(self, emulator=None):
        """
        _pre_

        Pre execution checks
        """
        if (emulator != None):
            return emulator.emulatePre(self.step)
        logging.info("Pre-executing CMSSW step")
        if hasattr(self.step.application.configuration, 'configCacheUrl'):
            # means we have a configuration & tweak in the sandbox
            psetFile = self.step.application.command.configuration
            psetTweak = self.step.application.command.psetTweak
            self.stepSpace.getFromSandbox(psetFile)
            if psetTweak:
                self.stepSpace.getFromSandbox(psetTweak)

        # Todo: Discover number of cores available to the job and set the number of cores
        self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore")
        self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
        return None

    def execute(self, emulator=None):
        """
        _execute_
        """
        stepModule = "WMTaskSpace.%s" % self.stepName

        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments

        logging.info("Executing MulticoreCMSSW step")
        multicoreSettings = self.step.application.multicore
        numberOfCores = multicoreSettings.numberOfCores
        logging.info("Multicore configured for %s cores" % numberOfCores)

        # Todo: Create input file list from the job, writing one LFN per line
        filelist = open(multicoreSettings.inputfilelist, 'w')
        for inputFile in self.job['input_files']:
            filelist.write("%s\n" % inputFile['lfn'])
        filelist.close()

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Running SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)

        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write(
                "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
                "$VO_CMS_SW_DIR/COMP/slc5_amd64_gcc434/external/openssl/0.9.7m/lib:"
                "$VO_CMS_SW_DIR/COMP/slc5_amd64_gcc434/external/bz2lib/1.0.5/lib\n")
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info(" Invoking command: %s" % invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(60514, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            # invoke scripts with scram()
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info(" Invoking command: %s" % invokeCommand)
            retCode = scram(invokeCommand)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(60515, "PreScriptScramFailure", msg)

        # write the main wrapper script; configBlob is the bash wrapper
        # template defined elsewhere in this module
        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir, self.stepName)
        handle = open(configPath, 'w')
        handle.write(configBlob)
        handle.close()

        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>

        # open the output files
        stdoutHandle = open(self.step.output.stdout, 'w')
        stderrHandle = open(self.step.output.stderr, 'w')
        applicationStart = time.time()
        args = ['/bin/bash', configPath, scramSetup, scramArch, scramCommand,
                scramProject, cmsswVersion, jobReportXML, cmsswCommand,
                cmsswConfig, cmsswArguments]
        logging.info("Executing CMSSW. args: %s" % args)
        spawnedChild = subprocess.Popen(args, 0, None, None, stdoutHandle, stderrHandle)

        # (stdoutData, stderrData) = spawnedChild.communicate()
        # the above line replaces the bottom block. I'm unsure of why
        # nobody used communicate(), but I'm leaving this just in case
        # AMM Jul 4th, 2010

        # loop and collect the data
        while True:
            (rdready, wrready, errready) = select.select(
                [stdoutHandle.fileno(), stderrHandle.fileno()], [], [])
            # see if the process is still running
            spawnedChild.poll()
            if (spawnedChild.returncode != None):
                break
            # give the process some time to fill a buffer
            select.select([], [], [], .1)

        spawnedChild.wait()
        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = spawnedChild.returncode
        argsDump = {'arguments': args}

        if spawnedChild.returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % spawnedChild.returncode
            logging.critical(msg)
            raise WMExecutionFailure(spawnedChild.returncode, "CmsRunFailure", msg)

        # //
        # // Now go through the reports and merge the outputs
        #//
        outputReports = readMultiJobReports(self.step.output.jobReport,
                                            self.stepName,
                                            self.step.builder.workingDir)
        self.report = None
        mergeBuckets = {}
        inputFiles = {}
        aggregator = ReportUtils.Aggregator()

        # //
        # // loop over the sub reports and combine the inputs for recording
        # // and outputs for merges
        #//
        for o in outputReports:
            # use one of the reports as the template for the master report
            if self.report == None:
                self.report = o
            aggregator.add(o.report.performance)
            # store list of all unique input files into the job, adding them
            # together if they are fragments of the same file
            for inp in o.getAllInputFiles():
                lfn = inp['lfn']
                if lfn not in inputFiles:
                    inputFiles[lfn] = inp
                else:
                    addFiles(inputFiles[lfn], inp)
            # //
            # // process the output modules from each subreport
            #//
            for f in o.getAllFiles():
                lfn = f['outputModule']
                if lfn not in mergeBuckets:
                    mergeBuckets[lfn] = MergeBucket(f['lfn'],
                                                    f['outputModule'],
                                                    self.stepName,
                                                    self.step.builder.workingDir)
                mergeBuckets[lfn].append(f)

        # clean up the master report from the template
        reportData = getattr(self.report.data, self.stepName)
        for src in reportData.input.listSections_():
            delattr(reportData.input, src)
        # iterate over a copy since the list is modified inside the loop
        for omod in list(reportData.outputModules):
            delattr(reportData.output, omod)
            reportData.outputModules.remove(omod)

        # //
        # // Add the reduced list of input files to the master report
        #//
        for f in inputFiles.values():
            self.report.addInputFile(f['module_label'], **f)

        # //
        # // Now roll through the merges, run the merge and edit the master
        # // job report with the outcome
        #//
        mergesStart = time.time()
        mergeTiming = []
        for b in mergeBuckets.values():
            # write the merge config file
            thisMergeStarts = time.time()
            b.writeConfig()
            # run the merge as a scram enabled command
            logging.info(" Invoking command: %s" % b.merge_command)
            logfile = "%s.log" % b.merge_pset_file
            retCode = scram(b.merge_command, False, logfile, self.step.builder.workingDir)
            if retCode > 0:
                msg = "Error running merge job:\n%s\n" % b.merge_command
                msg += "Merge Config:\n%s\n" % b.mergeConfig()
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(60666, "MulticoreMergeFailure", msg)
            # //
            # // add report from merge to master
            #// ToDo: try/except here in case report is missing
            b.editReport(self.report)
            thisMergeTime = time.time() - thisMergeStarts
            mergeTiming.append(thisMergeTime)

        mergesComplete = time.time()
        totalJobTime = mergesComplete - applicationStart
        self.report.report.performance = aggregator.aggregate()
        ReportUtils.updateMulticoreReport(self.report, len(mergeBuckets),
                                          mergesStart, mergesComplete,
                                          totalJobTime, *mergeTiming)
        return
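# --- Illustrative sketch (not part of the executor) --------------------------
# The select() polling loop above pre-dates the communicate() call that the
# in-code comment wonders about. Because stdout and stderr are already
# redirected to open file handles rather than pipes, a plain wait() (or
# communicate(), which returns (None, None) when no pipes are used) would be
# sufficient. The helper below is only a sketch of that simpler form; the
# function and file names are assumptions for illustration.
import subprocess


def _exampleRunWrapper(args, stdoutPath="wrapper.stdout", stderrPath="wrapper.stderr"):
    """Run a wrapper script, streaming its output straight to files."""
    with open(stdoutPath, "w") as out, open(stderrPath, "w") as err:
        child = subprocess.Popen(args, stdout=out, stderr=err)
        child.wait()  # blocks until the wrapper exits
    return child.returncode
# ------------------------------------------------------------------------------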
def execute(self, emulator=None):
    """
    _execute_
    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion

    # Are we using emulators again?
    if (emulator != None):
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get(
        'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "xrdcp")
    castorStageOutParams['option'] = overrides.get('option', "--cerncastor")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get(
        'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get(
        'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    # are we using the new stageout method ?
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
        ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    try:
        castorStageOutMgr = StageOutMgr(**castorStageOutParams)
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName,
                                        host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
    result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
    useEdmCopyUtil = False
    if result:
        try:
            if int(result.group(1)) >= 8:
                useEdmCopyUtil = True
        except ValueError:
            pass

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files, batching several logArchives per copy
    # when edmCopyUtil is available (see the grouper sketch after this method)
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
            'X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise
        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir,
                                              os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            if useEdmCopyUtil:
                with open('scramOutput.log', 'r') as f:
                    logging.error("Scram output: %s", f.read())
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log,
                        arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # then, perform best effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, delete input
    for log in deleteLogArchives:
        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)
        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)
        signal.alarm(0)

    return
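# --- Illustrative sketch (not part of the executor) --------------------------
# The loop above batches logArchive LFNs with a ``grouper`` helper so a single
# edmCopyUtil invocation copies several files at once. WMCore ships its own
# grouper; the version below is only a sketch of that chunking idea, with an
# assumed name and signature.
from itertools import islice


def _exampleGrouper(iterable, count):
    """Yield successive lists of at most `count` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, count))
        if not chunk:
            return
        yield chunk


# Example: list(_exampleGrouper(range(5), 2)) -> [[0, 1], [2, 3], [4]]
# ------------------------------------------------------------------------------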
            except StageOutFailure as ex:
                msg = "Unable to StageIn %s" % file['LFN']
                logging.error(msg)
                # Don't do anything other than record it
                self.report.addSkippedFile(file.get('PFN', None), file['LFN'])
            except Exception:
                raise
            signal.alarm(0)

        if len(readyFiles) == 0:
            # Then we have no output; all the files failed
            # Panic!
            msg = "No logs staged in during LogCollect step"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        now = datetime.datetime.now()
        tarPFN = self.createArchive(readyFiles)
        if userLogs:
            lfn = "%s/%s/logs/%s" % (lfnBase, self.report.data.workload,
                                     os.path.basename(tarPFN))
        else:
            lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                         self.report.data.workload,
                                                         os.path.basename(tarPFN))

        tarInfo = {'LFN': lfn,
                   'PFN': tarPFN,
                   'SEName': None,
                   'GUID': None}
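# --- Illustrative sketch (not part of the executor) --------------------------
# createArchive() above bundles the staged-in logArchives into a single tarball
# whose PFN is then staged out. The helper below is only a rough sketch of what
# such a step might do; the name, default path and flat layout are assumptions,
# not the real WMCore implementation.
import os
import tarfile


def _exampleCreateArchive(fileList, outputPath="LogCollect.tar"):
    """Pack local files into one uncompressed tar and return its path."""
    with tarfile.open(outputPath, "w:") as tar:
        for pfn in fileList:
            # store each file under its basename so the archive stays flat
            tar.add(pfn, arcname=os.path.basename(pfn))
    return outputPath
# ------------------------------------------------------------------------------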