def execute(self, emulator=None):
    """
    _execute_

    """
    scramCommand = self.step.application.setup.scramCommand
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion

    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
    castorStageOutParams = {}
    castorStageOutParams['command'] = overrides.get('command', "xrdcp")
    castorStageOutParams['option'] = overrides.get('option', "--cerncastor")
    castorStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    castorStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    # are we using the new stageout method ?
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    try:
        castorStageOutMgr = StageOutMgr(**castorStageOutParams)
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
    result = re.match(r"CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
    useEdmCopyUtil = False
    if result:
        try:
            if int(result.group(1)) >= 8:
                useEdmCopyUtil = True
        except ValueError:
            pass

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)
        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise
        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            if useEdmCopyUtil:
                with open('scramOutput.log', 'r') as f:
                    logging.error("Scram output: %s", f.read())
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        tarFile = tarfile.open(tarLocation, 'w:')
        for log in localLogs:
            path = log.split('/')
            tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
            os.remove(log)
        tarFile.close()
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to Castor and EOS")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))
    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN Castor
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        castorStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to Castor"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to Castor:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # then, perform best effort stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        logging.error("Indefinite hang during stageOut of LogCollect to EOS")
    except Exception as ex:
        logging.error("Unable to stageOut LogCollect to EOS: %s", ex)
    signal.alarm(0)

    # we got this far, delete input
    for log in deleteLogArchives:
        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)
        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)
        signal.alarm(0)

    return
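# --- Illustrative sketch, not part of the original source ---
# The timeout handling above assumes a module-level Alarm exception and an
# alarmHandler callback; their definitions are not shown here, so the minimal
# shape below is inferred from usage (names and signatures are assumptions).
import signal
import time

class Alarm(Exception):
    """Raised by the SIGALRM handler to abort a hung copy or stage-out."""
    pass

def alarmHandler(signum, frame):
    """Convert SIGALRM into a catchable Alarm exception."""
    raise Alarm()

# Usage demo mirroring the executor pattern (Unix only, like the real code):
signal.signal(signal.SIGALRM, alarmHandler)
signal.alarm(1)                  # give up after 1 second
try:
    time.sleep(5)                # stands in for a hung transfer
except Alarm:
    print("transfer timed out")
signal.alarm(0)                  # always cancel the pending alarm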
def execute(self, emulator=None):
    """
    _execute_

    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

    scramCommand = self.step.application.setup.scramCommand
    cmsswVersion = self.step.application.setup.cmsswVersion
    scramArch = getSingleScramArch(self.step.application.setup.scramArch)
    overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to over an hour
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    # hardcode CERN EOS T2_CH_CERN stageout parameters
    eosStageOutParams = {}
    eosStageOutParams['command'] = overrides.get('command', "xrdcp")
    eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
    eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
    eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

    try:
        eosStageOutMgr = StageOutMgr(**eosStageOutParams)
        stageInMgr = StageInMgr()
        deleteMgr = DeleteMgr()
    except Exception as ex:
        msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60312, "MgrImplementationError", msg)

    # prepare output tar file
    taskName = self.report.getTaskName().split('/')[-1]
    host = socket.gethostname().split('.')[0]
    tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
    tarLocation = os.path.join(self.stepSpace.location, tarName)

    # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
    # see: https://github.com/cms-sw/cmssw/issues/25292
    useEdmCopyUtil = True
    if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
        pass
    elif scramArch.startswith('slc7_amd64_'):
        msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
        logging.warning(msg)
        cmsswVersion = "CMSSW_10_4_0"
        scramArch = "slc7_amd64_gcc820"
    elif scramArch.startswith('slc6_amd64_'):
        msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
        logging.warning(msg)
        cmsswVersion = "CMSSW_10_4_0"
        scramArch = "slc6_amd64_gcc700"
    else:
        useEdmCopyUtil = False
    logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

    # setup Scram needed to run edmCopyUtil
    if useEdmCopyUtil:
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )
        logging.info("Running scram")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    # iterate through input files
    localLogs = []
    deleteLogArchives = []
    if useEdmCopyUtil:
        numberOfFilesPerCopy = 10
    else:
        numberOfFilesPerCopy = 1
    for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

        copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

        # specify TFC if necessary
        if overrideCatalog:
            copyCommand += " -c %s" % overrideCatalog

        for log in logs:
            copyCommand += " %s" % log['lfn']
        copyCommand += " %s" % self.step.builder.workingDir

        # give up after timeout of 1 minute per input file
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60 * numberOfFilesPerCopy)

        filesCopied = False
        try:
            if useEdmCopyUtil:
                logging.info("Running edmCopyUtil")
                retval = scram(copyCommand)
                if retval == 0:
                    filesCopied = True
            else:
                logging.info("Running stageIn")
                for log in logs:
                    fileInfo = {"LFN": log['lfn']}
                    logArchive = stageInMgr(**fileInfo)
                    if logArchive:
                        filesCopied = True
        except Alarm:
            logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
        except StageOutFailure:
            logging.error("Unable to stageIn logArchives")
        except Exception:
            raise
        signal.alarm(0)

        if filesCopied:
            for log in logs:
                localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                deleteLogArchives.append(log)
                self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
        else:
            logging.error("Unable to copy logArchives to local disk")
            for log in logs:
                self.report.addSkippedFile(log['lfn'], None)

    # create tarfile if any logArchive copied in
    if localLogs:
        with tarfile.open(tarLocation, 'w:') as tarFile:
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log, arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
    else:
        msg = "Unable to copy any logArchives to local disk"
        logging.error(msg)
        raise WMExecutionFailure(60312, "LogCollectError", msg)

    # now staging out the LogCollect tarfile
    logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
    now = datetime.datetime.now()
    lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                 self.report.data.workload,
                                                 os.path.basename(tarLocation))
    tarInfo = {'LFN': lfn,
               'PFN': tarLocation,
               'PNN': None,
               'GUID': None}

    # perform mandatory stage out to CERN EOS
    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        eosStageOutMgr(tarInfo)
    except Alarm:
        msg = "Indefinite hang during stageOut of LogCollect to EOS"
        logging.error(msg)
        raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
    except Exception as ex:
        msg = "Unable to stageOut LogCollect to EOS:\n"
        msg += str(ex)
        logging.error(msg)
        raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
    signal.alarm(0)

    # add to job report
    self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
    outputRef = getattr(self.report.data, self.stepName)
    outputRef.output.pfn = tarInfo['PFN']
    outputRef.output.location = tarInfo['PNN']
    outputRef.output.lfn = tarInfo['LFN']

    # we got this far, delete ALL input files assigned to this job
    for log in self.job["input_files"]:
        # give up after timeout of 1 minute
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(60)
        try:
            fileToDelete = {'LFN': log['lfn'],
                            'PFN': None,
                            'PNN': None,
                            'StageOutCommand': None}
            deleteMgr(fileToDelete=fileToDelete)
        except Alarm:
            logging.error("Indefinite hang during delete of logArchive")
        except Exception as ex:
            logging.error("Unable to delete logArchive: %s", ex)
        signal.alarm(0)

    return
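# --- Illustrative sketch, not part of the original source ---
# grouper(iterable, n), used above to batch logArchive LFNs into a single
# edmCopyUtil invocation, is imported from a shared utility module in the
# real code. A minimal stand-in with the behavior the loop relies on
# (successive chunks of at most n items, the last chunk possibly shorter):
from itertools import islice

def grouper(iterable, n):
    """Yield lists of up to n items until the iterable is exhausted."""
    it = iter(iterable)
    chunk = list(islice(it, n))
    while chunk:
        yield chunk
        chunk = list(islice(it, n))

# e.g. list(grouper(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]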
def execute(self, emulator=None):
    """
    _execute_

    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behind that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "phedex-node" in overrides \
            and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                numberOfRetries=self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for our steps

    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # have the results of that particular step in it,
        # so getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileName in files:
            if not hasattr(fileName, 'lfn') and hasattr(fileName, 'pfn'):
                # Then we're truly hosed on this file; ignore it
                msg = "Not a file: %s" % fileName
                logging.error(msg)
                continue

            # Support direct-to-merge
            # This requires pulling a bunch of stuff from everywhere
            # First check if it's needed
            if hasattr(self.step.output, 'minMergeSize') and hasattr(fileName, 'size') \
                    and not getattr(fileName, 'merged', False):
                # We need both of those to continue, and we don't
                # direct-to-merge
                if getattr(self.step.output, 'doNotDirectMerge', False):
                    # Then we've been told explicitly not to do direct-to-merge
                    continue

                if fileName.size >= self.step.output.minMergeSize:
                    # Then this goes direct to merge
                    try:
                        fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to size.\n")
                        logging.error(str(ex))
                        logging.debug(fileName)
                        logging.debug("minMergeSize: %s", self.step.output.minMergeSize)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))
            elif getattr(self.step.output, 'maxMergeEvents', None) is not None \
                    and getattr(fileName, 'events', None) is not None \
                    and not getattr(fileName, 'merged', False):
                # Then direct-to-merge due to events if
                # the file is large enough:
                if fileName.events >= self.step.output.maxMergeEvents:
                    # straight to merge
                    try:
                        fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                    except Exception as ex:
                        logging.error("Encountered error while handling LFN for merge due to events.\n")
                        logging.error(str(ex))
                        logging.debug(fileName)
                        logging.debug("maxMergeEvents: %s", self.step.output.maxMergeEvents)
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60402, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = getattr(fileName, 'lfn')
            fileSource = getattr(fileName, 'Source', None)
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileName, 'pfn'),
                               'PNN': None,
                               'StageOutCommand': None,
                               'Checksums': getattr(fileName, 'checksums', None)}
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.setStepStatus(self.stepName, 1)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist(reportLocation)
                raise
            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    # Done with all steps, and should have a list of
    # stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return
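# --- Illustrative sketch, not part of the original source ---
# The transfer loop above treats the manager as a callable that mutates the
# file dict in place, filling 'PNN', 'PFN' and 'StageOutCommand' on success
# and raising on failure. A hypothetical stub capturing that contract (node
# name and prefix are made up), handy for exercising the loop without real
# storage:
class StubStageOutMgr(object):
    """Pretend StageOutMgr: resolves LFNs against a fixed xrootd prefix."""

    def __call__(self, fileToStage):
        fileToStage['PNN'] = "T2_CH_CERN"   # assumed destination node
        fileToStage['PFN'] = "root://eoscms.cern.ch//eos/cms" + fileToStage['LFN']
        fileToStage['StageOutCommand'] = "xrdcp"
        return fileToStage

    def cleanSuccessfulStageOuts(self):
        """Nothing was staged out for real, so nothing to roll back."""
        pass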
def __init__(self, jobReport):
    self.report = jobReport
    self.mgr = StageOutMgr()
    self.failures = {}
def execute(self, emulator=None):
    """
    _execute_

    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behind that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "phedex-node" in overrides \
            and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                numberOfRetries=self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for our steps

    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # have the results of that particular step in it,
        # so getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileName in files:
            # make sure the file information is consistent
            if hasattr(fileName, 'pfn') and (not hasattr(fileName, 'lfn') or
                                             not hasattr(fileName, 'module_label')):
                msg = "Not a valid file: %s" % fileName
                logging.error(msg)
                continue

            # Figuring out if we should do straight to merge
            # - should we do straight to merge at all ?
            # - is straight to merge disabled for this output ?
            # - are we over the size threshold ?
            # - are we over the event threshold ?
            straightToMerge = False
            if not getattr(fileName, 'merged', False) and hasattr(self.step.output, 'minMergeSize'):
                if fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                    if getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                        straightToMerge = True
                    if getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                        straightToMerge = True

            if straightToMerge:
                try:
                    fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                except Exception as ex:
                    logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                    logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                    logging.error("Encountered error while handling LFN for merge %s", fileName)
                    logging.error(str(ex))
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = getattr(fileName, 'lfn')
            fileSource = getattr(fileName, 'Source', None)
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileName, 'pfn'),
                               'PNN': None,
                               'StageOutCommand': None,
                               'Checksums': getattr(fileName, 'checksums', None)}
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.setStepStatus(self.stepName, 1)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist(reportLocation)
                raise
            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    # Done with all steps, and should have a list of
    # stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return
def execute(self, emulator=None):
    """
    _execute_

    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behind that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "phedex-node" in overrides \
            and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                numberOfRetries=self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for our steps

    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # have the results of that particular step in it,
        # so getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step=step)
        for fileName in files:
            # make sure the file information is consistent
            if hasattr(fileName, 'pfn') and (not hasattr(fileName, 'lfn') or
                                             not hasattr(fileName, 'module_label')):
                msg = "Not a valid file: %s" % fileName
                logging.error(msg)
                continue

            # Figuring out if we should do straight to merge
            # - should we do straight to merge at all ?
            # - is straight to merge disabled for this output ?
            # - are we over the size threshold ?
            # - are we over the event threshold ?
            straightToMerge = False
            if not getattr(fileName, 'merged', False):
                if hasattr(fileName, 'dataset') and fileName.dataset.get('dataTier', "") in ["NANOAOD", "NANOAODSIM"]:
                    logging.info("NANOAOD and NANOAODSIM files never go straight to merge!")
                elif fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                    if hasattr(self.step.output, 'minMergeSize') and \
                            getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                        logging.info("Sending %s straight to merge due to minMergeSize", fileName.lfn)
                        straightToMerge = True
                    elif getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                        logging.info("Sending %s straight to merge due to maxMergeEvents", fileName.lfn)
                        straightToMerge = True

            if straightToMerge:
                try:
                    fileName = self.handleLFNForMerge(mergefile=fileName, step=step)
                except Exception as ex:
                    logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                    logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                    logging.error("Encountered error while handling LFN for merge %s", fileName)
                    logging.error(str(ex))
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = fileName.lfn
            fileSource = getattr(fileName, 'Source', None)
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileName, 'pfn'),
                               'PNN': None,
                               'StageOutCommand': None,
                               'Checksums': getattr(fileName, 'checksums', None)}
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.persist(reportLocation)
                raise
            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    # Done with all steps, and should have a list of
    # stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))

    return
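# --- Illustrative sketch, not part of the original source ---
# The straight-to-merge decision above can be restated as a pure predicate
# over (file, output config). Hypothetical helper for clarity; attribute
# names mirror those used in the executor:
import sys
from types import SimpleNamespace

def shouldGoStraightToMerge(aFile, output):
    """True if an unmerged file is over the size or event threshold and is
    not excluded by data tier or by forceUnmergedOutputs."""
    if getattr(aFile, 'merged', False):
        return False
    if getattr(aFile, 'dataset', {}).get('dataTier', "") in ["NANOAOD", "NANOAODSIM"]:
        return False  # NANOAOD(SIM) never goes straight to merge
    if aFile.module_label in getattr(output, 'forceUnmergedOutputs', []):
        return False
    if hasattr(output, 'minMergeSize') and getattr(aFile, 'size', 0) >= output.minMergeSize:
        return True
    return getattr(aFile, 'events', 0) >= getattr(output, 'maxMergeEvents', sys.maxsize)

# e.g. a 5 GB unmerged AOD file with a 4 GB threshold goes straight to merge:
aFile = SimpleNamespace(merged=False, module_label='RECOoutput', size=5e9,
                        events=100, dataset={'dataTier': 'AOD'})
print(shouldGoStraightToMerge(aFile, SimpleNamespace(minMergeSize=4e9)))  # True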