def makeThreadsStreamsTweak(self):
    """
    _makeThreadsStreamsTweak_

    Tweak the threads and streams parameters.
    """
    origCores = int(getattr(self.step.data.application.multicore, 'numberOfCores', 1))
    eventStreams = int(getattr(self.step.data.application.multicore, 'eventStreams', 0))
    resources = {'cores': origCores}
    resizeResources(resources)
    numCores = resources['cores']
    if numCores != origCores:
        self.logger.info("Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly.")
        eventStreams = 0

    tweak = PSetTweak()
    tweak.addParameter("process.options", "customTypeCms.untracked.PSet()")
    self.applyPsetTweak(tweak, skipIfSet=True)
    self.tweak.addParameter("process.options.numberOfThreads",
                            "customTypeCms.untracked.uint32(%s)" % numCores)
    self.tweak.addParameter("process.options.numberOfStreams",
                            "customTypeCms.untracked.uint32(%s)" % eventStreams)

    return
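# The function above relies on resizeResources() to rescale the requested
# resources to the slot the job actually landed on. The helper below is a
# minimal, self-contained sketch of that idea and NOT the real WMCore
# implementation: it assumes the granted core count is exposed through a
# hypothetical SLOT_CORES environment variable, whereas the real helper
# parses the HTCondor runtime information.
import os

def resizeResourcesSketch(resources):
    """Scale 'cores' (and 'memory', if present) in place to the slot size."""
    slotCores = int(os.environ.get("SLOT_CORES", resources['cores']))
    if slotCores != resources['cores']:
        if 'memory' in resources:
            # keep memory-per-core roughly constant when the core count changes
            resources['memory'] = int(resources['memory'] * slotCores / resources['cores'])
        resources['cores'] = slotCores
    return resources

# Example: with SLOT_CORES=8, {'cores': 4, 'memory': 8000} becomes
# {'cores': 8, 'memory': 16000}.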
def setupMonitors(self, task, wmbsJob):
    logging.info("In Watchdog.setupMonitors")
    if not hasattr(task.data, 'watchdog'):
        msg = "Could not find watchdog in spec"
        logging.error(msg)
        # I don't think this is necessarily fatal
        return
    if not hasattr(task.data.watchdog, 'monitors'):
        msg = "Watchdog has no monitors"
        logging.error(msg)
        # Probably not fatal either
        return
    if hasattr(task.data.watchdog, 'interval'):
        # Set the interval off the config
        self.setInterval(task.data.watchdog.interval)
    for monitor in task.data.watchdog.monitors:
        msg = "Initializing monitor %s" % monitor
        logging.info(msg)
        mon = self.loadMonitor(monitor)
        args = {}
        if hasattr(task.data.watchdog, monitor):
            # This should be a config section
            monitorArgs = getattr(task.data.watchdog, monitor)
            args = monitorArgs.dictionary_()
        if monitor == 'PerformanceMonitor' and args:
            # Apply tweaks to PerformanceMonitor only.
            # Scale resources according to the HTCondor runtime environment.
            origCores = 1
            for stepName in task.listAllStepNames():
                sh = task.getStepHelper(stepName)
                origCores = max(origCores, sh.getNumberOfCores())
            resources = {'cores': origCores}
            origMaxPSS = args.get('maxPSS', args.get('maxRSS'))
            if origMaxPSS:
                resources['memory'] = origMaxPSS
            # Actually parses the HTCondor runtime
            resizeResources(resources)
            # We decided to only touch Watchdog settings if the number of cores changed
            # (even if this means the watchdog memory is wrong for a slot this size).
            changedCores = origCores != resources['cores']
            # If we did base maxPSS off the memory in the HTCondor slot, subtract a bit
            # off the top so watchdog triggers before HTCondor does.
            # Add the new number of cores to the args such that PerformanceMonitor can see it
            args['cores'] = resources['cores']
            if changedCores:
                if origMaxPSS:
                    args['maxPSS'] = resources['memory'] - 50
            logging.info("Watchdog modified: %s. Final settings:", changedCores)
            for k, v in viewitems(args):
                logging.info(" %s: %r", k, v)
        # Actually initialize the monitor variables
        mon.initMonitor(task=task, job=wmbsJob, logPath=self.logPath, args=args)
        self._Monitors.append(mon)
    return
def setupMonitors(self, task, wmbsJob):
    logging.info("In Watchdog.setupMonitors")
    if not hasattr(task.data, 'watchdog'):
        msg = "Could not find watchdog in spec"
        logging.error(msg)
        # I don't think this is necessarily fatal
        return
    if not hasattr(task.data.watchdog, 'monitors'):
        msg = "Watchdog has no monitors"
        logging.error(msg)
        # Probably not fatal either
        return
    if hasattr(task.data.watchdog, 'interval'):
        # Set the interval off the config
        self.setInterval(task.data.watchdog.interval)
    for monitor in task.data.watchdog.monitors:
        msg = "Initializing monitor %s" % monitor
        logging.info(msg)
        mon = self.loadMonitor(monitor)
        args = {}
        if hasattr(task.data.watchdog, monitor):
            # This should be a config section
            monitorArgs = getattr(task.data.watchdog, monitor)
            args = monitorArgs.dictionary_()
        if monitor == 'PerformanceMonitor' and args:
            # Apply tweaks to PerformanceMonitor only.
            # Scale resources according to the HTCondor runtime environment.
            origCores = 1
            for stepName in task.listAllStepNames():
                sh = task.getStepHelper(stepName)
                origCores = max(origCores, sh.getNumberOfCores())
            resources = {'cores': origCores}
            origMaxPSS = args.get('maxPSS', args.get('maxRSS'))
            if origMaxPSS:
                resources['memory'] = origMaxPSS
            # Actually parses the HTCondor runtime
            resizeResources(resources)
            # We decided to only touch Watchdog settings if the number of cores changed
            # (even if this means the watchdog memory is wrong for a slot this size).
            changedCores = origCores != resources['cores']
            # If we did base maxPSS off the memory in the HTCondor slot, subtract a bit
            # off the top so watchdog triggers before HTCondor does.
            # Add the new number of cores to the args such that DashboardInterface can see it
            args['cores'] = resources['cores']
            if changedCores:
                if origMaxPSS:
                    args['maxPSS'] = resources['memory'] - 50
            logging.info("Watchdog modified: %s. Final settings:", changedCores)
            for k, v in args.iteritems():
                logging.info(" %s: %r", k, v)
        # Actually initialize the monitor variables
        mon.initMonitor(task=task, job=wmbsJob, logPath=self.logPath, args=args)
        self._Monitors.append(mon)
    return
def __call__(self):
    """
    _call_

    Examine the step configuration and construct a PSet from that.
    """
    self.logger.info("Executing SetupCMSSWPSet...")
    self.jobBag = self.job.getBaggage()

    scenario = getattr(self.step.data.application.configuration, "scenario", None)
    if scenario is not None and scenario != "":
        self.logger.info("Setting up job scenario/process")
        funcName = getattr(self.step.data.application.configuration, "function", None)
        if getattr(self.step.data.application.configuration, "pickledarguments", None) is not None:
            funcArgs = pickle.loads(self.step.data.application.configuration.pickledarguments)
        else:
            funcArgs = {}
        try:
            self.createProcess(scenario, funcName, funcArgs)
        except Exception as ex:
            self.logger.exception("Error creating process for Config/DataProcessing:")
            raise ex
        if funcName == "repack":
            self.handleRepackSettings()
        if funcName in ["merge", "alcaHarvesting"]:
            self.handleSingleCoreOverride()
        if socket.getfqdn().endswith("cern.ch"):
            self.handleSpecialCERNMergeSettings(funcName)
    else:
        try:
            self.loadPSet()
        except Exception as ex:
            self.logger.exception("Error loading PSet:")
            raise ex

    # Check process.source exists
    if getattr(self.process, "source", None) is None:
        msg = "Error in CMSSW PSet: process is missing attribute 'source'"
        msg += " or process.source is defined with None value."
        self.logger.error(msg)
        raise RuntimeError(msg)

    self.handleCondorStatusService()

    self.fixupProcess()

    # In case of CRAB3, the number of threads in the PSet should not be overridden
    if not self.crabPSet:
        try:
            origCores = int(getattr(self.step.data.application.multicore, 'numberOfCores', 1))
            eventStreams = int(getattr(self.step.data.application.multicore, 'eventStreams', 0))
            resources = {'cores': origCores}
            resizeResources(resources)
            numCores = resources['cores']
            if numCores != origCores:
                self.logger.info(
                    "Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly.")
                eventStreams = 0
            options = getattr(self.process, "options", None)
            if options is None:
                self.process.options = cms.untracked.PSet()
                options = getattr(self.process, "options")
            options.numberOfThreads = cms.untracked.uint32(numCores)
            options.numberOfStreams = cms.untracked.uint32(eventStreams)
        except AttributeError as ex:
            self.logger.error("Failed to override numberOfThreads: %s", str(ex))

    psetTweak = getattr(self.step.data.application.command, "psetTweak", None)
    if psetTweak is not None:
        self.applyPSetTweak(psetTweak, self.fixupDict)

    # Apply task level tweaks
    taskTweak = makeTaskTweak(self.step.data)
    applyTweak(self.process, taskTweak, self.fixupDict)

    # Check if chained processing is enabled.
    # If not - apply the per job tweaks.
    # If so - create an override TFC (like done in PA) and then modify the PSet accordingly.
    if hasattr(self.step.data.input, "chainedProcessing") and self.step.data.input.chainedProcessing:
        self.handleChainedProcessing()
    else:
        # Apply per job PSet Tweaks
        jobTweak = makeJobTweak(self.job)
        applyTweak(self.process, jobTweak, self.fixupDict)

    # check for pileup settings presence, pileup support implementation
    # and if enabled, process pileup configuration / settings
    if hasattr(self.step.data, "pileup"):
        self.handlePileup()

    # Apply per output module PSet Tweaks
    cmsswStep = self.step.getTypeHelper()
    for om in cmsswStep.listOutputModules():
        mod = cmsswStep.getOutputModule(om)
        outTweak = makeOutputTweak(mod, self.job)
        applyTweak(self.process, outTweak, self.fixupDict)

    # revlimiter for testing
    if getattr(self.step.data.application.command, "oneEventMode", False):
        self.process.maxEvents.input = 1

    # check for random seeds and the method of seeding which is in the job baggage
    self.handleSeeding()

    # make sure default parametersets for perf reports are installed
    self.handlePerformanceSettings()

    # check for event numbers in the producers
    self.handleProducersNumberOfEvents()

    # fixup the dqmFileSaver
    self.handleDQMFileSaver()

    # tweak for jobs reading LHE articles from CERN
    self.handleLHEInput()

    # tweak jobs for enforceGUIDInFileName
    self.handleEnforceGUIDInFileName()

    # Check if we accept skipping bad files
    if hasattr(self.step.data.application.configuration, "skipBadFiles"):
        self.process.source.skipBadFiles = \
            cms.untracked.bool(self.step.data.application.configuration.skipBadFiles)

    # Apply events per lumi section if available
    if hasattr(self.step.data.application.configuration, "eventsPerLumi"):
        self.process.source.numberEventsInLuminosityBlock = \
            cms.untracked.uint32(self.step.data.application.configuration.eventsPerLumi)

    # limit run time if desired
    if hasattr(self.step.data.application.configuration, "maxSecondsUntilRampdown"):
        self.process.maxSecondsUntilRampdown = cms.untracked.PSet(
            input=cms.untracked.int32(self.step.data.application.configuration.maxSecondsUntilRampdown))

    # accept an overridden TFC from the step
    if hasattr(self.step.data.application, 'overrideCatalog'):
        self.logger.info("Found a TFC override: %s", self.step.data.application.overrideCatalog)
        self.process.source.overrideCatalog = \
            cms.untracked.string(self.step.data.application.overrideCatalog)

    configFile = self.step.data.application.command.configuration
    configPickle = getattr(self.step.data.application.command, "configurationPickle", "PSet.pkl")
    workingDir = self.stepSpace.location
    try:
        with open("%s/%s" % (workingDir, configPickle), 'wb') as pHandle:
            pickle.dump(self.process, pHandle)
        with open("%s/%s" % (workingDir, configFile), 'w') as handle:
            handle.write("import FWCore.ParameterSet.Config as cms\n")
            handle.write("import pickle\n")
            handle.write("with open('%s', 'rb') as handle:\n" % configPickle)
            handle.write("    process = pickle.load(handle)\n")
    except Exception as ex:
        self.logger.exception("Error writing out PSet:")
        raise ex

    self.logger.info("CMSSW PSet setup completed!")
    return 0
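# For reference, the loader file written out by the try-block above (assuming
# the default configPickle name "PSet.pkl") ends up with the following
# content; cmsRun then executes it and the pickled process object is restored:
#
#     import FWCore.ParameterSet.Config as cms
#     import pickle
#     with open('PSet.pkl', 'rb') as handle:
#         process = pickle.load(handle)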
def __call__(self):
    """
    _call_

    Examine the step configuration and construct a PSet from that.
    """
    self.process = None

    scenario = getattr(self.step.data.application.configuration, "scenario", None)
    if scenario is not None and scenario != "":
        funcName = getattr(self.step.data.application.configuration, "function", None)
        if getattr(self.step.data.application.configuration, "pickledarguments", None) is not None:
            funcArgs = pickle.loads(self.step.data.application.configuration.pickledarguments)
        else:
            funcArgs = {}
        try:
            self.createProcess(scenario, funcName, funcArgs)
        except Exception as ex:
            logging.exception("Error creating process for Config/DataProcessing:")
            raise ex
        if funcName == "repack":
            self.handleRepackSettings()
        if funcName in ["merge", "alcaHarvesting"]:
            self.handleSingleCoreOverride()
        if socket.getfqdn().endswith("cern.ch"):
            self.handleSpecialCERNMergeSettings(funcName)
    else:
        try:
            self.loadPSet()
        except Exception as ex:
            logging.exception("Error loading PSet:")
            raise ex

    # Check process.source exists
    if getattr(self.process, "source", None) is None:
        msg = "Error in CMSSW PSet: process is missing attribute 'source'"
        msg += " or process.source is defined with None value."
        logging.error(msg)
        raise RuntimeError(msg)

    self.handleCondorStatusService()

    self.fixupProcess()

    # In case of CRAB3, the number of threads in the PSet should not be overridden
    if not self.crabPSet:
        try:
            origCores = int(getattr(self.step.data.application.multicore, 'numberOfCores', 1))
            eventStreams = int(getattr(self.step.data.application.multicore, 'eventStreams', 0))
            resources = {'cores': origCores}
            resizeResources(resources)
            numCores = resources['cores']
            if numCores != origCores:
                logging.info(
                    "Resizing a job with nStreams != nCores. Setting nStreams = nCores. This may end badly.")
                eventStreams = 0
            options = getattr(self.process, "options", None)
            if options is None:
                self.process.options = cms.untracked.PSet()
                options = getattr(self.process, "options")
            options.numberOfThreads = cms.untracked.uint32(numCores)
            options.numberOfStreams = cms.untracked.uint32(eventStreams)
        except AttributeError as ex:
            logging.error("Failed to override numberOfThreads: %s", str(ex))

    psetTweak = getattr(self.step.data.application.command, "psetTweak", None)
    if psetTweak is not None:
        self.applyPSetTweak(psetTweak, self.fixupDict)

    # Apply task level tweaks
    taskTweak = makeTaskTweak(self.step.data)
    applyTweak(self.process, taskTweak, self.fixupDict)

    # Check if chained processing is enabled.
    # If not - apply the per job tweaks.
    # If so - create an override TFC (like done in PA) and then modify the PSet accordingly.
    if hasattr(self.step.data.input, "chainedProcessing") and self.step.data.input.chainedProcessing:
        self.handleChainedProcessing()
    else:
        # Apply per job PSet Tweaks
        jobTweak = makeJobTweak(self.job)
        applyTweak(self.process, jobTweak, self.fixupDict)

    # check for pileup settings presence, pileup support implementation
    # and if enabled, process pileup configuration / settings
    if hasattr(self.step.data, "pileup"):
        self.handlePileup()

    # Apply per output module PSet Tweaks
    cmsswStep = self.step.getTypeHelper()
    for om in cmsswStep.listOutputModules():
        mod = cmsswStep.getOutputModule(om)
        outTweak = makeOutputTweak(mod, self.job)
        applyTweak(self.process, outTweak, self.fixupDict)

    # revlimiter for testing
    if getattr(self.step.data.application.command, "oneEventMode", False):
        self.process.maxEvents.input = 1

    # check for random seeds and the method of seeding which is in the job baggage
    self.handleSeeding()

    # make sure default parametersets for perf reports are installed
    self.handlePerformanceSettings()

    # check for event numbers in the producers
    self.handleProducersNumberOfEvents()

    # fixup the dqmFileSaver
    self.handleDQMFileSaver()

    # Check if we accept skipping bad files
    if hasattr(self.step.data.application.configuration, "skipBadFiles"):
        self.process.source.skipBadFiles = \
            cms.untracked.bool(self.step.data.application.configuration.skipBadFiles)

    # Apply events per lumi section if available
    if hasattr(self.step.data.application.configuration, "eventsPerLumi"):
        self.process.source.numberEventsInLuminosityBlock = \
            cms.untracked.uint32(self.step.data.application.configuration.eventsPerLumi)

    # limit run time if desired
    if hasattr(self.step.data.application.configuration, "maxSecondsUntilRampdown"):
        self.process.maxSecondsUntilRampdown = cms.untracked.PSet(
            input=cms.untracked.int32(self.step.data.application.configuration.maxSecondsUntilRampdown))

    # accept an overridden TFC from the step
    if hasattr(self.step.data.application, 'overrideCatalog'):
        logging.info("Found a TFC override: %s", self.step.data.application.overrideCatalog)
        self.process.source.overrideCatalog = \
            cms.untracked.string(self.step.data.application.overrideCatalog)

    configFile = self.step.data.application.command.configuration
    configPickle = getattr(self.step.data.application.command, "configurationPickle", "PSet.pkl")
    workingDir = self.stepSpace.location
    try:
        with open("%s/%s" % (workingDir, configPickle), 'wb') as pHandle:
            pickle.dump(self.process, pHandle)
        with open("%s/%s" % (workingDir, configFile), 'w') as handle:
            handle.write("import FWCore.ParameterSet.Config as cms\n")
            handle.write("import pickle\n")
            handle.write("with open('%s', 'rb') as handle:\n" % configPickle)
            handle.write("    process = pickle.load(handle)\n")
    except Exception as ex:
        logging.exception("Error writing out PSet:")
        raise ex

    return 0
def setupMonitors(self, task, wmbsJob):
    logging.info("In Watchdog.setupMonitors")
    if not hasattr(task.data, 'watchdog'):
        msg = "Could not find watchdog in spec"
        logging.error(msg)
        # I don't think this is necessarily fatal
        return
    if not hasattr(task.data.watchdog, 'monitors'):
        msg = "Watchdog has no monitors"
        logging.error(msg)
        # Probably not fatal either
        return
    if hasattr(task.data.watchdog, 'interval'):
        # Set the interval off the config
        self.setInterval(task.data.watchdog.interval)
    for monitor in task.data.watchdog.monitors:
        msg = "Initializing monitor %s" % monitor
        logging.info(msg)
        mon = self.loadMonitor(monitor)
        args = {}
        if hasattr(task.data.watchdog, monitor):
            # This should be a config section
            monitorArgs = getattr(task.data.watchdog, monitor)
            args = monitorArgs.dictionary_()
        if monitor == 'PerformanceMonitor' and args:
            # Apply tweaks to PerformanceMonitor only.
            # Scale resources according to the HTCondor runtime environment.
            origCores = 1
            for stepName in task.listAllStepNames():
                sh = task.getStepHelper(stepName)
                origCores = max(origCores, sh.getNumberOfCores())
            resources = {'cores': origCores}
            origMaxRSS = args.get('maxRSS')
            if origMaxRSS:
                origMaxRSS = int(origMaxRSS / 1024.)  # HTCondor expects MB; we get KB.
                resources['memory'] = origMaxRSS
            # Actually parses the HTCondor runtime
            resizeResources(resources)
            # We decided to only touch Watchdog settings if the number of cores changed
            # (even if this means the watchdog memory is wrong for a slot this size).
            changedCores = origCores != resources['cores']
            # HTCondor doesn't explicitly scale VSize; it's also not clear what
            # resources this manages (as we already watch the memory use) or how
            # it should relate to other resources (such as memory or cores used).
            # Hence, we simply remove it if we change anything about the memory.
            # If we did base maxRSS off the memory in the HTCondor slot, subtract a bit
            # off the top so watchdog triggers before HTCondor does.
            # Add the new number of cores to the args such that DashboardInterface can see it
            args['cores'] = resources['cores']
            if changedCores:
                if origMaxRSS:
                    args.pop('maxVSize', None)
                    args['maxRSS'] = 1024 * (resources['memory'] - 50)  # Convert back to KB
            logging.info("Watchdog modified: %s. Final settings:", changedCores)
            for k, v in args.iteritems():
                logging.info(" %s: %r", k, v)
        # Actually initialize the monitor variables
        mon.initMonitor(task=task, job=wmbsJob, logPath=self.logPath, args=args)
        self._Monitors.append(mon)
    return
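# A small illustration of the maxRSS bookkeeping done in the version above:
# HTCondor reports slot memory in MB while maxRSS is kept in KB, so the value
# is converted to MB before resizeResources() and back to KB afterwards, with
# ~50 MB shaved off so the Watchdog fires before HTCondor kills the job.
# Function name and defaults here are illustrative only.
def watchdogMaxRSSKB(slotMemoryMB, headroomMB=50):
    """Return a maxRSS value in KB derived from a slot memory size in MB."""
    return 1024 * (slotMemoryMB - headroomMB)

# e.g. a 4000 MB slot -> maxRSS of 1024 * 3950 = 4044800 KB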
def setupMonitors(self, task, wmbsJob):
    logging.info("In Watchdog.setupMonitors")
    if not hasattr(task.data, 'watchdog'):
        msg = "Could not find watchdog in spec"
        logging.error(msg)
        # I don't think this is necessarily fatal
        return
    if not hasattr(task.data.watchdog, 'monitors'):
        msg = "Watchdog has no monitors"
        logging.error(msg)
        # Probably not fatal either
        return
    if hasattr(task.data.watchdog, 'interval'):
        # Set the interval off the config
        self.setInterval(task.data.watchdog.interval)
    for monitor in task.data.watchdog.monitors:
        msg = "Initializing monitor %s" % monitor
        logging.info(msg)
        mon = self.loadMonitor(monitor)
        args = {}
        if hasattr(task.data.watchdog, monitor):
            # This should be a config section
            monitorArgs = getattr(task.data.watchdog, monitor)
            args = monitorArgs.dictionary_()
        if monitor == 'PerformanceMonitor' and args:
            # Apply tweaks to PerformanceMonitor only.
            # Scale resources according to the HTCondor runtime environment.
            origCores = 1
            for stepName in task.listAllStepNames():
                sh = task.getStepHelper(stepName)
                origCores = max(origCores, sh.getNumberOfCores())
            resources = {'cores': origCores}
            origMaxRSS = args.get('maxRSS')
            ### TODO: keep only the else clause after ~HG1805
            if origMaxRSS and origMaxRSS > 100 * 1000:
                # in case MaxRSS is in KB
                origMaxRSS = int(origMaxRSS / 1024.)  # HTCondor expects MB; we get KB.
                resources['memory'] = origMaxRSS
            elif origMaxRSS:
                resources['memory'] = origMaxRSS  # then it's already in MB
            # Actually parses the HTCondor runtime
            resizeResources(resources)
            # We decided to only touch Watchdog settings if the number of cores changed
            # (even if this means the watchdog memory is wrong for a slot this size).
            changedCores = origCores != resources['cores']
            # HTCondor doesn't explicitly scale VSize; it's also not clear what
            # resources this manages (as we already watch the memory use) or how
            # it should relate to other resources (such as memory or cores used).
            # Hence, we simply remove it if we change anything about the memory.
            # If we did base maxRSS off the memory in the HTCondor slot, subtract a bit
            # off the top so watchdog triggers before HTCondor does.
            # Add the new number of cores to the args such that DashboardInterface can see it
            args['cores'] = resources['cores']
            if changedCores:
                if origMaxRSS:
                    args.pop('maxVSize', None)
                    args['maxRSS'] = resources['memory'] - 50
            logging.info("Watchdog modified: %s. Final settings:", changedCores)
            for k, v in args.iteritems():
                logging.info(" %s: %r", k, v)
        # Actually initialize the monitor variables
        mon.initMonitor(task=task, job=wmbsJob, logPath=self.logPath, args=args)
        self._Monitors.append(mon)
    return