def _getVerificationSubmitThrottle(self, submitCount):
    """Cap the number of jobs to submit according to the verification chunks.

    The current chunk is located by bisecting ``self._verifyChunks`` with the
    number of processed plus processing jobs. Returns 0 when the success goal
    of the current chunk can no longer be reached, otherwise ``submitCount``
    limited to what is left of the current (or next) chunk. When an index
    runs past the configured lists, ``self._verify`` is set to False and
    ``submitCount`` is returned unchanged.
    """
    nActive = self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING))
    nSuccess = self.jobDB.getJobsN(ClassSelector(JobClass.SUCCESS))
    nDone = self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSED))
    nTotal = nDone + nActive
    chunkIdx = bisect.bisect_left(self._verifyChunks, nTotal)
    try:
        chunkSize = self._verifyChunks[chunkIdx]
        ratio = nSuccess * 1.0 / chunkSize
        threshold = self._verifyThresh[chunkIdx]
        goal = chunkSize * threshold
        # Even if all outstanding jobs of this chunk succeed, the goal is missed
        if chunkSize - nDone + nSuccess < goal:
            if not self._unreachableGoal:
                self._log_user_time.warning(
                    'All remaining jobs are vetoed by an unachieveable verification goal!')
                self._log_user_time.info(
                    'Current goal: %d successful jobs out of %d', goal, chunkSize)
                self._unreachableGoal = True
            return 0
        if ratio < threshold:
            # Threshold not met yet - stay within the current chunk
            upperBound = chunkSize
        else:
            # Threshold met - allow filling up to the next chunk
            upperBound = self._verifyChunks[chunkIdx + 1]
        return min(submitCount, upperBound - nTotal)
    except IndexError:
        # No (further) chunk configured for the current job count
        self._log_user_time.debug('All verification chunks passed')
        self._log_user_time.debug('Verification submission throttle disabled')
        self._verify = False
        return submitCount
def check(self, wms):
    """Check the status of (a sample of) the processing jobs.

    Cancels jobs reported as timed out, processes the interventions of the
    task and triggers the shutdown once every job is in an end state.
    Returns False when the job check signalled an abort (change is None),
    otherwise whether anything changed.
    """
    processing = self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING))
    sampleSize = utils.QM(self._chunks_enabled, self._chunks_check, -1)
    checkList = self._sample(processing, sampleSize)

    # Check jobs in the list and get changes, timeouts and successfully reported jobs
    (changed, timedOut, reportedJobs) = self._checkJobList(wms, checkList)
    nUnreported = len(checkList) - len(reportedJobs)
    if nUnreported > 0:
        self._log_user_time.critical('%d job(s) did not report their status!', nUnreported)
    if changed is None:  # neither True nor False => abort
        return False

    # Cancel jobs which took too long
    if len(timedOut) > 0:
        changed = True
        self._log_user.warning('Timeout for the following jobs:')
        self.cancel(wms, timedOut, interactive=False, showJobs=True)

    # Process task interventions
    self._processIntervention(wms, self._task.getIntervention())

    # Quit when all jobs are finished
    nEndState = self.jobDB.getJobsN(ClassSelector(JobClass.ENDSTATE))
    if nEndState == len(self.jobDB):
        self._logDisabledJobs()
        self._eventhandler.onTaskFinish(len(self.jobDB))
        if self._task.canFinish():
            self._log_user_time.info('Task successfully completed. Quitting grid-control!')
            utils.abort(True)

    return changed
def display(self):
    """Update the bar with four job counts and write it to stdout.

    The counts are passed in the order SUCCESS, ATWMS, RUNNING_DONE, FAILING.
    """
    jobClassOrder = (JobClass.SUCCESS, JobClass.ATWMS,
                     JobClass.RUNNING_DONE, JobClass.FAILING)
    counts = []
    for jobClass in jobClassOrder:
        counts.append(len(self._jobDB.getJobs(ClassSelector(jobClass))))
    self._bar.update(*counts)
    rendered = str(self._bar)
    sys.stdout.write(rendered + '\n')
def _process_intervention(self, task, wms):
    """Apply the job state changes requested by the task module.

    ``task.get_intervention()`` yields ``(redo, disable, size_change)``.
    Jobs in ``redo`` are cancelled (if processing) and reset to ``Job.INIT``,
    jobs in ``disable`` are cancelled and set to ``Job.DISABLED``, and the job
    limit of the job database is adjusted if the maximum number of jobs
    differs from the current database size.

    Raises:
        JobError: if a listed job is in a state that can not be reset.
    """
    # Process changes of job states requested by task module
    resetable_state_list = [
        Job.INIT, Job.DISABLED, Job.ABORTED, Job.CANCELLED,
        Job.DONE, Job.FAILED, Job.SUCCESS,
    ]

    def _reset_state(jobnum_list, state_new):
        # Job numbers that still have to be reset
        jobnum_set_pending = set(jobnum_list)
        for jobnum in jobnum_list:
            if jobnum not in jobnum_set_pending:
                # Duplicate entry that was already reset; a second
                # set.remove() would raise KeyError
                continue
            job_obj = self.job_db.get_job_persistent(jobnum)
            if job_obj.state in resetable_state_list:
                self._update(task, job_obj, jobnum, state_new)
                jobnum_set_pending.remove(jobnum)
                job_obj.attempt = 0
        if jobnum_set_pending:
            # sorted() gives a deterministic error message
            raise JobError(
                'For the following jobs it was not possible to reset the state to %s:\n%s'
                % (Job.enum2str(state_new),
                   str.join(', ', imap(str, sorted(jobnum_set_pending)))))

    (redo, disable, size_change) = task.get_intervention()
    if (not redo) and (not disable) and (not size_change):
        return
    self._log.log_time(
        logging.INFO, 'The task module has requested changes to the job database')
    max_job_len_new = self._get_max_jobs(task)
    applied_change = False
    if max_job_len_new != len(self.job_db):
        self._log.log_time(logging.INFO, 'Number of jobs changed from %d to %d',
                           len(self.job_db), max_job_len_new)
        self.job_db.set_job_limit(max_job_len_new)
        applied_change = True
    if redo:
        # Jobs which are still processing have to be cancelled before the reset
        self._cancel(task, wms,
                     self.job_db.get_job_list(ClassSelector(JobClass.PROCESSING), redo),
                     interactive=False, show_jobs=True)
        _reset_state(redo, Job.INIT)
        applied_change = True
    if disable:
        self._cancel(task, wms,
                     self.job_db.get_job_list(ClassSelector(JobClass.PROCESSING), disable),
                     interactive=False, show_jobs=True)
        _reset_state(disable, Job.DISABLED)
        applied_change = True
    if applied_change:
        self._log.log_time(logging.INFO, 'All requested changes are applied')
def _processIntervention(self, wms, jobChanges):
    """Apply the job state changes requested by the task module.

    ``jobChanges`` is either falsy (nothing to do) or a tuple
    ``(redo, disable, sizeChange)``. Jobs in ``redo`` are cancelled (if
    processing) and reset to ``Job.INIT``, jobs in ``disable`` are cancelled
    and set to ``Job.DISABLED``, and the job limit of the job database is
    updated if it differs from the new maximum number of jobs.

    Raises:
        JobError: if a listed job is missing or in a state that can not be reset.
    """
    resetableStates = [
        Job.INIT, Job.DISABLED, Job.ABORTED, Job.CANCELLED,
        Job.DONE, Job.FAILED, Job.SUCCESS,
    ]

    def resetState(jobs, newState):
        # Job numbers that still have to be reset
        pendingJobs = set(jobs)
        for jobNum in jobs:
            if jobNum not in pendingJobs:
                # Duplicate entry that was already reset; a second
                # set.remove() would raise KeyError
                continue
            jobObj = self.jobDB.get(jobNum)
            if jobObj and jobObj.state in resetableStates:
                self._update(jobObj, jobNum, newState)
                pendingJobs.remove(jobNum)
                jobObj.attempt = 0
        if pendingJobs:
            # sorted() gives a deterministic error message
            output = (Job.enum2str(newState),
                      str.join(', ', imap(str, sorted(pendingJobs))))
            raise JobError(
                'For the following jobs it was not possible to reset the state to %s:\n%s'
                % output)

    if not jobChanges:
        return
    (redo, disable, sizeChange) = jobChanges
    # Truthiness test also covers empty tuples / None, not only [] and False
    if (not redo) and (not disable) and (not sizeChange):
        return
    self._log_user_time.info(
        'The task module has requested changes to the job database')
    newMaxJobs = self.getMaxJobs(self._task)
    applied_change = False
    if newMaxJobs != self.jobDB.jobLimit:
        self._log_user_time.info('Number of jobs changed from %d to %d',
                                 len(self.jobDB), newMaxJobs)
        self.jobDB.jobLimit = newMaxJobs
        applied_change = True
    if redo:
        # Jobs which are still processing have to be cancelled before the reset
        self.cancel(wms,
                    self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING), redo),
                    interactive=False, showJobs=True)
        resetState(redo, Job.INIT)
        applied_change = True
    if disable:
        self.cancel(wms,
                    self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING), disable),
                    interactive=False, showJobs=True)
        resetState(disable, Job.DISABLED)
        applied_change = True
    if applied_change:
        self._log_user_time.info('All requested changes are applied')
def delete(self, wms, select):
    """Interactively cancel the processing jobs matched by ``select``."""
    userSelector = JobSelector.create(select, task=self._task)
    processingSelector = ClassSelector(JobClass.PROCESSING)
    matched = self.jobDB.getJobs(AndJobSelector(processingSelector, userSelector))
    if not matched:
        return
    self._log_user.warning('Cancelling the following jobs:')
    self.cancel(wms, matched, interactive=True, showJobs=True)
def retrieve(self, wms):
    """Retrieve the output of (a sample of) the jobs in class DONE.

    The new job state is derived from the return code and the result of the
    output processor. Returns False as soon as ``utils.abort()`` is true,
    otherwise whether the state of any job changed.
    """
    anyChange = False
    doneJobs = self.jobDB.getJobs(ClassSelector(JobClass.DONE))
    sampleSize = utils.QM(self._chunks_enabled, self._chunks_retrieve, -1)
    retrieveList = self._sample(doneJobs, sampleSize)

    for (jobNum, retCode, data, outputdir) in wms.retrieveJobs(self._wmsArgs(retrieveList)):
        jobObj = self.jobDB.get(jobNum)
        if jobObj is not None:
            # set ABORTED instead of FAILED for errorcode 107
            if retCode == 0:
                newState = Job.SUCCESS
            else:
                newState = Job.ABORTED if retCode == 107 else Job.FAILED

            # A successful exit code only counts if the output can be processed
            if newState == Job.SUCCESS and not self._outputProcessor.process(outputdir):
                retCode = 108
                newState = Job.FAILED

            if newState != jobObj.state:
                anyChange = True
                jobObj.set('retcode', retCode)
                jobObj.set('runtime', data.get('TIME', -1))
                self._update(jobObj, jobNum, newState)
                self._eventhandler.onJobOutput(wms, jobObj, jobNum, retCode)

            if utils.abort():
                return False

    return anyChange
def delete(self, task, wms, select):
    """Cancel the processing jobs matched by ``select``.

    Whether the cancellation is interactive is taken from
    ``self._interactive_delete``.
    """
    user_selector = JobSelector.create(select, task=task)
    processing_selector = ClassSelector(JobClass.PROCESSING)
    jobnum_list = self.job_db.get_job_list(
        AndJobSelector(processing_selector, user_selector))
    if not jobnum_list:
        return
    self._log.warning('Cancelling the following jobs:')
    self.cancel(wms, jobnum_list,
                interactive=self._interactive_delete, show_jobs=True)
def _submit_get_jobs_throttled(self, job_len_submit):
    """Cap the number of jobs to submit according to the verification chunks.

    Returns 0 when the success goal of the current chunk can no longer be
    reached, otherwise ``job_len_submit`` limited to what is left of the
    current (or next) chunk. When an index runs past the configured lists,
    ``self._verify`` is set to False and ``job_len_submit`` is returned.
    """
    # Verification heuristic - check whether enough jobs have succeeded before submitting more
    n_active = self.job_db.get_job_len(ClassSelector(JobClass.PROCESSING))
    n_success = self.job_db.get_job_len(ClassSelector(JobClass.SUCCESS))
    n_done = self.job_db.get_job_len(ClassSelector(JobClass.PROCESSED))
    n_total = n_done + n_active
    chunk_idx = bisect.bisect_left(self._verify_chunk_list, n_total)
    try:
        chunk_size = self._verify_chunk_list[chunk_idx]
        ratio = n_success * 1.0 / chunk_size
        threshold = self._verify_threshold_list[chunk_idx]
        goal = chunk_size * threshold
        # Even if all outstanding jobs of this chunk succeed, the goal is missed
        if chunk_size - n_done + n_success < goal:
            if not self._unreachable_goal_flag:
                self._log.log_time(
                    logging.WARNING,
                    'All remaining jobs are vetoed by an unachieveable verification goal!')
                self._log.log_time(
                    logging.INFO, 'Current goal: %d successful jobs out of %d',
                    goal, chunk_size)
                self._unreachable_goal_flag = True
            return 0
        if ratio < threshold:
            # Threshold not met yet - stay within the current chunk
            upper_bound = chunk_size
        else:
            # Threshold met - allow filling up to the next chunk
            upper_bound = self._verify_chunk_list[chunk_idx + 1]
        return min(job_len_submit, upper_bound - n_total)
    except IndexError:
        # No (further) chunk configured for the current job count
        clear_current_exception()
        self._log.log_time(logging.DEBUG, 'All verification chunks passed')
        self._log.log_time(logging.DEBUG, 'Verification submission throttle disabled')
        self._verify = False
        return job_len_submit
def _getSubmissionJobs(self, maxsample):
    """Return the job numbers that should be submitted next.

    Ready jobs are filtered by the retry limit and by ``self._task.canSubmit``;
    the number of returned jobs is capped by the queue / in-flight limits and
    by ``maxsample`` (if chunking is enabled). The result is either a sample
    (shuffle mode) or the lowest job numbers in sorted order.
    """
    # Get list of submittable jobs
    candidates = self.jobDB.getJobs(ClassSelector(JobClass.READY))
    fallbackJob = Job()

    def hasRetriesLeft(jobNum):
        return self.jobDB.get(jobNum, fallbackJob).attempt - 1 < self._job_retries

    if self._job_retries >= 0:
        withinRetries = lfilter(hasRetriesLeft, candidates)
    else:
        withinRetries = candidates
    taskApproved = lfilter(self._task.canSubmit, candidates)
    submittable = set(withinRetries) & set(taskApproved)

    if self._showBlocker and candidates and not submittable:  # No submission but ready jobs
        reasons = []
        reasons += utils.QM((len(withinRetries) > 0) and (len(taskApproved) == 0),
                            [], ['have hit their maximum number of retries'])
        reasons += utils.QM((len(withinRetries) == 0) and (len(taskApproved) > 0),
                            [], ['are vetoed by the task module'])
        delimiter = utils.QM(withinRetries or taskApproved, ' or ', ' and ')
        self._log_user_time.warning('All remaining jobs %s!', str.join(delimiter, reasons))
    # Show the blocker message again only after the blocked situation has ended
    self._showBlocker = (len(candidates) <= 0) or (len(submittable) != 0)

    # Determine number of jobs to submit
    caps = [len(submittable)]
    if self._njobs_inqueue > 0:
        caps.append(self._njobs_inqueue
                    - self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS)))
    if self._njobs_inflight > 0:
        caps.append(self._njobs_inflight
                    - self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING)))
    if self._chunks_enabled and (maxsample > 0):
        caps.append(maxsample)
    nSubmit = max(min(caps), 0)

    if self._do_shuffle:
        return self._sample(submittable, nSubmit)
    return sorted(submittable)[:nSubmit]
def check(self, task, wms):
    """Check the status of (a sample of) the processing jobs.

    Cancels jobs reported as timed out, processes the interventions of the
    task and triggers the shutdown once every job is in an end state and the
    task can finish. Returns False when the job check signalled an abort
    (change is None), otherwise whether anything changed.
    """
    processing = self.job_db.get_job_list(ClassSelector(JobClass.PROCESSING))
    check_list = self._sample(processing, self._get_chunk_size(self._chunks_check))

    # Check jobs in the list and get changes, timeouts and successfully reported jobs
    (changed, timed_out, reported_jobs) = self._check_get_jobnum_list(task, wms, check_list)
    n_unreported = len(check_list) - len(reported_jobs)
    if n_unreported > 0:
        self._log.log_time(logging.CRITICAL,
                           '%d job(s) did not report their status!', n_unreported)
    if changed is None:  # neither True nor False => abort
        return False

    # Cancel jobs which took too long
    if len(timed_out) > 0:
        changed = True
        self._log.warning('Timeout for the following jobs:')
        self._cancel(task, wms, timed_out, interactive=False, show_jobs=True)

    # Process task interventions
    self._process_intervention(task, wms)

    # Quit when all jobs are finished
    n_end_state = self.job_db.get_job_len(ClassSelector(JobClass.ENDSTATE))
    if n_end_state == len(self.job_db):
        self._log_disabled_jobs()
        if task.can_finish():
            self._local_event_handler.on_task_finish(task, len(self.job_db))
            abort(True)

    return changed
def _submit_get_jobs(self, task):
    """Return the job numbers that should be submitted next.

    The submit candidates are filtered by ``self._get_enabled_jobs``; the
    number of returned jobs is capped by the queue / in-flight limits and by
    the submit chunk size (if chunking is enabled). The result is either a
    sample (shuffle mode) or the lowest job numbers in sorted order.

    When there are candidates but none can be submitted, a warning naming the
    blocking reason(s) is logged once per blocked period.
    """
    # Get list of submittable jobs
    jobnum_list_ready = self.job_db.get_job_list(
        ClassSelector(JobClass.SUBMIT_CANDIDATES))
    (n_mod_ok, n_retry_ok, jobnum_list) = self._get_enabled_jobs(task, jobnum_list_ready)

    if self._show_blocker and jobnum_list_ready and not jobnum_list:
        # No submission but ready jobs
        err_str_list = []
        # Retry limit is NOT a blocker only if retries are fine and the task vetoes everything
        if (n_retry_ok <= 0) or (n_mod_ok != 0):
            err_str_list.append('have hit their maximum number of retries')
        # Task veto is NOT a blocker only if the task accepts jobs and no job has retries left.
        # (The negation of that case needs 'or'; with 'and' the veto message was
        # dropped whenever both blockers applied.)
        if (n_retry_ok != 0) or (n_mod_ok <= 0):
            err_str_list.append('are vetoed by the task module')
        err_delim = ' and '
        if n_retry_ok or n_mod_ok:
            err_delim = ' or '
        self._log.log_time(logging.WARNING, 'All remaining jobs %s!',
                           str.join(err_delim, err_str_list))
    # Show the blocker message again only after the blocked situation has ended
    self._show_blocker = not (len(jobnum_list_ready) > 0 and len(jobnum_list) == 0)

    # Determine number of jobs to submit
    submit = len(jobnum_list)
    if self._njobs_inqueue > 0:
        submit = min(submit, self._njobs_inqueue
                     - self.job_db.get_job_len(ClassSelector(JobClass.ATWMS)))
    if self._njobs_inflight > 0:
        submit = min(submit, self._njobs_inflight
                     - self.job_db.get_job_len(ClassSelector(JobClass.PROCESSING)))
    if self._chunks_enabled and (self._chunks_submit > 0):
        submit = min(submit, self._chunks_submit)
    submit = max(submit, 0)

    if self._do_shuffle:
        return self._sample(jobnum_list, submit)
    return sorted(jobnum_list)[:submit]
def reset(self, wms, select):
    """Reset the jobs matched by ``select`` after user confirmation.

    Matching jobs that are still processing are cancelled first; afterwards
    every matched job is replaced by a fresh ``Job()`` in the job database.
    """
    selected = self.jobDB.getJobs(JobSelector.create(select, task=self._task))
    if not selected:
        return
    self._log_user.warning('Resetting the following jobs:')
    report = self._reportClass(self.jobDB, self._task, selected)
    report.display()
    confirmed = utils.getUserBool(
        'Are you sure you want to reset the state of these jobs?', False)
    if not confirmed:
        return
    stillProcessing = self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING), selected)
    self.cancel(wms, stillProcessing, False, False)
    for jobNum in selected:
        self.jobDB.commit(jobNum, Job())
def _logDisabledJobs(self):
    """Write the numbers of all disabled jobs to the disabled-jobs logfile.

    The file is (re)written even if there are no disabled jobs; a warning is
    logged only if at least one job is disabled.

    Raises:
        JobError: if the logfile could not be written.
    """
    disabled = self.jobDB.getJobs(ClassSelector(JobClass.DISABLED))
    try:
        fp = SafeFile(self._disabled_jobs_logfile, 'w')
        try:
            fp.write(str.join('\n', imap(str, disabled)))
        finally:
            # Close the handle even if the write fails
            fp.close()
    except Exception:
        raise JobError('Could not write disabled jobs to file %s!' %
                       self._disabled_jobs_logfile)
    if disabled:
        self._log_user_time.warning(
            'There are %d disabled jobs in this task!', len(disabled))
        self._log_user_time.debug(
            'Please refer to %s for a complete list of disabled jobs.',
            self._disabled_jobs_logfile)
def _log_disabled_jobs(self):
    """Write the numbers of all disabled jobs to the disabled-jobs logfile.

    The file is (re)written even if there are no disabled jobs; a warning is
    logged only if at least one job is disabled.

    Raises:
        JobError: if the logfile could not be written.
    """
    jobnum_list_disabled = self.job_db.get_job_list(ClassSelector(JobClass.DISABLED))
    logfile = self._disabled_jobs_logfile

    def _write_jobnum_list(fp):
        return fp.write(str.join('\n', imap(str, jobnum_list_disabled)))

    try:
        with_file(SafeFile(self._disabled_jobs_logfile, 'w'), _write_jobnum_list)
    except Exception:
        raise JobError('Could not write disabled jobs to file %s!' %
                       self._disabled_jobs_logfile)
    if not jobnum_list_disabled:
        return
    self._log.log_time(logging.WARNING, 'There are %d disabled jobs in this task!',
                       len(jobnum_list_disabled))
    self._log.log_time(
        logging.DEBUG,
        'Please refer to %s for a complete list of disabled jobs.', logfile)
def get_script_object(config_file, job_selector_str, only_success=False, require_task=False):
    """Create an object bundling configs, task and job database for scripts.

    The returned object exposes the attributes ``config`` (created with
    ``load_only_old_config=True``), ``new_config``, ``task`` and ``job_db``.
    With ``only_success`` the job selector is combined with a selector for
    jobs in class SUCCESS before the job database is created.
    """
    class ScriptObject(object):
        def __init__(self, config, new_config, task, job_db):
            self.config = config
            self.new_config = new_config
            self.task = task
            self.job_db = job_db

    old_config = gc_create_config(config_file=config_file, load_only_old_config=True)
    (task, selector) = _get_job_selector_and_task(old_config, job_selector_str, require_task)
    if only_success:
        success_selector = ClassSelector(JobClass.SUCCESS)
        selector = JobSelector.create_instance('AndJobSelector', success_selector, selector)
    fresh_config = gc_create_config(config_file=config_file)
    jobs_view = fresh_config.change_view(set_sections=['jobs'])
    plugin_kwargs = {'job_selector': selector}
    job_db = jobs_view.get_plugin('job database', 'TextFileJobDB', cls='JobDB',
                                  pkwargs=plugin_kwargs, on_change=None)
    return ScriptObject(old_config, fresh_config, task, job_db)
def reset(self, task, wms, select):
    """Reset the jobs matched by ``select``.

    Matching jobs that are still processing are cancelled first; afterwards
    every matched job is replaced by a fresh ``Job()`` in the job database.
    """
    selected = self.job_db.get_job_list(JobSelector.create(select, task=task))
    if not selected:
        return
    self._log.warning('Resetting the following jobs:')
    self._abort_report.show_report(self.job_db, selected)
    ask_user_msg = 'Are you sure you want to reset the state of these jobs?'
    # NOTE(review): the prompt is skipped when self._interactive_reset is true -
    # confirm that this is the intended meaning of the flag.
    confirmed = self._interactive_reset or self._uii.prompt_bool(ask_user_msg, False)
    if not confirmed:
        return
    still_processing = self.job_db.get_job_list(
        ClassSelector(JobClass.PROCESSING), selected)
    self.cancel(wms, still_processing, interactive=False, show_jobs=False)
    for jobnum in selected:
        self.job_db.commit(jobnum, Job())
def __init__(self, config, datasource_name):
    """Set up the scanner from the workflow of an external config file.

    Reads the config file named by the 'source config' option, creates its
    workflow with the 'NullWMS' backend and stores the sorted list of
    successful jobs matching the 'source job selector' option.
    """
    InfoScanner.__init__(self, config, datasource_name)
    ext_config_fn = config.get_fn('source config')
    ext_config_raw = create_config(ext_config_fn, load_only_old_config=True)
    ext_config = ext_config_raw.change_view(set_sections=['global'])
    self._ext_work_dn = ext_config.get_work_path()

    # Silence the root logger while the external workflow is created; restore
    # its previous state even if the creation fails
    root_logger = logging.getLogger()
    root_logger_disabled = root_logger.disabled
    root_logger.disabled = True
    try:
        ext_workflow = ext_config.get_plugin('workflow', 'Workflow:global',
                                             cls='Workflow',
                                             pkwargs={'backend': 'NullWMS'})
    finally:
        root_logger.disabled = root_logger_disabled

    self._ext_task = ext_workflow.task
    job_selector = JobSelector.create(config.get('source job selector', ''),
                                      task=self._ext_task)
    self._selected = sorted(
        ext_workflow.job_manager.job_db.get_job_list(
            AndJobSelector(ClassSelector(JobClass.SUCCESS), job_selector)))
def retrieve(self, task, wms):
    """Retrieve the output of (a sample of) the jobs in class DONE.

    The new job state is derived from the exit code and the result of the
    output processor. Returns False as soon as ``abort()`` is true, otherwise
    whether the state of any job changed.
    """
    any_change = False
    done_jobs = self.job_db.get_job_list(ClassSelector(JobClass.DONE))
    retrieve_list = self._sample(done_jobs, self._get_chunk_size(self._chunks_retrieve))

    for (jobnum, exit_code, data, outputdir) in wms.retrieve_jobs(
            self._get_wms_args(retrieve_list)):
        job_obj = self.job_db.get_job(jobnum)
        if job_obj is not None:
            # set ABORTED instead of FAILED for errorcode 107
            if exit_code == 0:
                state_new = Job.SUCCESS
            else:
                state_new = Job.ABORTED if exit_code == 107 else Job.FAILED

            # A successful exit code only counts if the output can be processed
            if state_new == Job.SUCCESS and not self._output_processor.process(outputdir, task):
                exit_code = 108
                state_new = Job.FAILED

            if state_new != job_obj.state:
                any_change = True
                job_obj.set('retcode', exit_code)
                job_obj.set('runtime', data.get('TIME', -1))
                self._update(task, job_obj, jobnum, state_new)
                self._local_event_handler.on_job_output(
                    task, wms, job_obj, jobnum, exit_code)

            if abort():
                return False

    return any_change
def display(self):
    """Update the bar with the number of successful jobs and write it to stdout."""
    successJobs = self._jobDB.getJobs(ClassSelector(JobClass.SUCCESS))
    self._bar.update(len(successJobs))
    rendered = str(self._bar)
    sys.stdout.write(rendered + '\n')