def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]
    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4
    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    # need to get username from classAd to setup for Rucio access
    task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
    username = task_ad['CRAB_UserHN']
    config.Services.Rucio_account = username

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
            sumEventsThr += throughput
            sumEventsSize += eventsize
            count += 1
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count
    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, crabserver=None)
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        parent = self.prefix if self.stage == 'tail' else None
        rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        creator = DagmanCreator(config, crabserver=None, rucioClient=rucioClient)
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                          getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
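
# A minimal, self-contained sketch (hypothetical helper, not part of PreDAG) of the
# event-target computation used above, assuming the same units as the code:
# targetMinutes in minutes, eventsThr in events/second, eventsSize in bytes/event,
# maxSize in bytes (5 GB default, as above). Names here are illustrative only.
def _sketchEventsPerJob(targetMinutes, eventsThr, eventsSize, maxSize=5 * 1000**3):
    """Return the events-per-job target, capped so the expected output stays below maxSize."""
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0
    events = int(targetMinutes * 60 * eventsThr)
    if maxEvents > 0 and events > maxEvents:
        events = int(maxEvents)
    return events

# Example: a 480-minute target at 2 evt/s and 1e6 bytes/event gives
# 480 * 60 * 2 = 57600 events, which the 5 GB cap (5e9 / 1e6 = 5000 events)
# reduces to 5000 events per job.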
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]
    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4
    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing'))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    sumEventsThr = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            sumEventsThr += float(fd.read())
            count += 1
    eventsThr = sumEventsThr / count
    self.logger.info("average throughput for %s jobs: %s", count, eventsThr)

    runtime = task['tm_split_args'].get('seconds_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimum', 45 * 60),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    events = int(target * eventsThr)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        config.TaskWorker.scratchDir = './scratchdir'  # XXX
        splitter = Splitter(config, server=None, resturi='')
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        parent = self.prefix if self.stage == 'tail' else None
        creator = DagmanCreator(config, server=None, resturi='')
        creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxPost', 20),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
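
# A minimal sketch (hypothetical helper, not part of PreDAG) of the tail-stage
# target-runtime rule used above in this seconds-based version: the tail target is
# the larger of a fixed floor (45 minutes, i.e. 2700 s) and a fraction (default 0.2)
# of the per-job runtime.
def _sketchTailTarget(runtimeSeconds, minimumSeconds=45 * 60, fraction=0.2):
    """Return the tail-stage target runtime in seconds."""
    return int(max(minimumSeconds, fraction * runtimeSeconds))

# Example: for runtimeSeconds=10000 the fractional part is 2000 s, below the
# 2700 s floor, so the target is 2700 s; for runtimeSeconds=28800 (8 h) it is
# 0.2 * 28800 = 5760 s.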
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]
    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4
    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, 'dummy', 'dummy', self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
            sumEventsThr += throughput
            sumEventsSize += eventsize
            count += 1
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count
    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, server=None, resturi='')
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        parent = self.prefix if self.stage == 'tail' else None
        creator = DagmanCreator(config, server=None, resturi='')
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxPost', 20),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]
    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4
    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    sumEventsThr = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            sumEventsThr += float(fd.read())
            count += 1
    eventsThr = sumEventsThr / count
    self.logger.info("average throughput for %s jobs: %s", count, eventsThr)

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        config.TaskWorker.scratchDir = './scratchdir'  # XXX
        splitter = Splitter(config, server=None, resturi='')
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        parent = self.prefix if self.stage == 'tail' else None
        creator = DagmanCreator(config, server=None, resturi='')
        creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxPost', 20),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
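
# A hypothetical driver sketch (not from CRABServer) illustrating the return codes
# of executeInternal above: 4 means the "completion" threshold is not reached yet
# (defer and retry later), 1 means splitting or DAG creation failed, and 0 means
# the subdag was created/submitted or there was nothing left to do. The function
# name and the `predag` argument are illustrative only.
def _sketchRunPreDag(predag, stage, completion, prefix):
    ret = predag.executeInternal(stage, completion, prefix)
    if ret == 4:
        predag.logger.info("completion threshold not reached yet, deferring")
    elif ret == 1:
        predag.logger.error("PreDAG failed")
    return ret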