def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
    """
    Requeue jobs which are in a finalised state of some form
    (finished/failed/etc.) so that their finalisation step gets run.

    Args:
        requeue_jobs (list): Jobs which are to be requeued to be finalised
        finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
    """
    # FIX: this version used monitoring_component without bringing it into
    # scope; the sibling implementation imports it locally, so do the same.
    from Ganga.Core import monitoring_component

    # requeue existing completed job
    for j in requeue_jobs:
        # Already handed off to the finalisation queue: nothing to do.
        if j.been_queued:
            continue

        # Stop requeueing if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break

        if not configDirac['serializeBackend']:
            # Finalise asynchronously on the monitoring thread pool and mark
            # the job so it is not queued twice.
            getQueues()._monitoring_threadpool.add_function(
                DiracBase.job_finalisation,
                args=(j, finalised_statuses[j.backend.status]),
                priority=5, name="Job %s Finalizing" % j.fqid)
            j.been_queued = True
        else:
            # Serialized backend: finalise inline on this thread.
            DiracBase.job_finalisation(j, finalised_statuses[j.backend.status])
def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
    """
    Place jobs that DIRAC reports as finalised (finished/failed/etc.) back on
    the queue so their finalisation step gets run.

    Args:
        requeue_jobs (list): Jobs to be requeued for finalisation
        finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
    """
    from Ganga.Core import monitoring_component

    # requeue existing completed job
    for this_job in requeue_jobs:
        # Skip jobs already handed off to the queue previously.
        if this_job.been_queued:
            continue

        # Respect a shutdown request from the monitoring loop.
        if monitoring_component and monitoring_component.should_stop():
            break

        ganga_status = finalised_statuses[this_job.backend.status]
        if configDirac['serializeBackend']:
            # Serialized backend: run the finalisation inline on this thread.
            DiracBase.job_finalisation(this_job, ganga_status)
        else:
            # Hand the finalisation off to the monitoring thread pool and
            # mark the job so it is not queued a second time.
            getQueues()._monitoring_threadpool.add_function(
                DiracBase.job_finalisation,
                args=(this_job, ganga_status),
                priority=5,
                name="Job %s Finalizing" % this_job.fqid)
            this_job.been_queued = True
def updateMonitoringInformation(jobs):
    """Check the status of jobs and retrieve output sandboxes.

    Polls the DIRAC monitoring server for the status of every job in *jobs*,
    copies the returned fields onto each job's backend object and drives the
    Ganga status machine accordingly (including output-sandbox retrieval for
    completed jobs).

    Args:
        jobs (list): Ganga jobs whose DIRAC status should be refreshed.
    """
    from Ganga.Core import monitoring_component
    # Collect the DIRAC ids of all jobs for a single bulk status call.
    dirac_job_ids = []
    for j in jobs:
        dirac_job_ids.append(j.backend.id)
    global dirac_monitoring_server
    global dirac_monitoring_is_active
    # Without a valid proxy we cannot talk to DIRAC: warn once (only on the
    # active -> inactive transition) and bail out.
    if not dirac_monitoring_server.proxy.isValid():
        if dirac_monitoring_is_active:
            logger.warning('DIRAC monitoring inactive (no valid proxy '\
                           'found).')
        dirac_monitoring_is_active = False
        return
    else:
        dirac_monitoring_is_active = True
    # Bulk status query; the command string is executed by the DIRAC server.
    cmd = 'result = DiracCommands.status(%s)' % str(dirac_job_ids)
    result = dirac_monitoring_server.execute(cmd)
    if type(result) != type([]):
        logger.warning('DIRAC monitoring failed: %s' % str(result))
        return
    for i in range(0, len(jobs)):
        # Stop early if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break
        j = jobs[i]
        # result[i] is the status vector for jobs[i]:
        # [0]=statusInfo, [1]=DIRAC status, [2]=actual CE,
        # [3]=mapped Ganga status.
        j.backend.statusInfo = result[i][0]
        j.backend.status = result[i][1]
        j.backend.actualCE = result[i][2]
        # One extra round-trip per job to fetch the normalised CPU time.
        cmd = 'result = DiracCommands.normCPUTime(%d)' % j.backend.id
        j.backend.normCPUTime = dirac_monitoring_server.execute(cmd)
        # Non-'completed' transitions are applied directly ...
        if result[i][3] != 'completed' and result[i][3] != j.status:
            j.updateStatus(result[i][3])
        # ... while 'completed' goes via 'completing' during sandbox download.
        if result[i][3] == 'completed':
            j.updateStatus('completing')
            ok = j.backend._getOutputSandbox(dirac_monitoring_server)
            if ok and j.outputdata:
                j.backend._getOutputDataLFNs(dirac_monitoring_server, True)
            if not ok:
                j.updateStatus('failed')
            else:
                j.updateStatus('completed')
        if result[i][3] == 'failed':
            # NOTE(review): 'configBoss' looks like a leftover from a BOSS
            # port -- confirm this should not read configDirac instead.
            if configBoss['failed_sandbox_download']:
                j.backend._getOutputSandbox(dirac_monitoring_server)
    pass
def updateMonitoringInformation(jobs):
    """Check the status of jobs and retrieve output sandboxes.

    Polls the DIRAC monitoring server for the status of every job in *jobs*,
    copies the returned fields onto each job's backend object and drives the
    Ganga status machine accordingly (including output-sandbox retrieval for
    completed jobs).

    Args:
        jobs (list): Ganga jobs whose DIRAC status should be refreshed.
    """
    from Ganga.Core import monitoring_component
    # Collect the DIRAC ids of all jobs for a single bulk status call.
    dirac_job_ids = []
    for j in jobs:
        dirac_job_ids.append(j.backend.id)
    global dirac_monitoring_server
    global dirac_monitoring_is_active
    # Without a valid proxy we cannot talk to DIRAC: warn once (only on the
    # active -> inactive transition) and bail out.
    if not dirac_monitoring_server.proxy.isValid():
        if dirac_monitoring_is_active:
            logger.warning('DIRAC monitoring inactive (no valid proxy found).')
        dirac_monitoring_is_active = False
        return
    else:
        dirac_monitoring_is_active = True
    # Bulk status query; the command string is executed by the DIRAC server.
    cmd = 'result = DiracCommands.status(%s)' % str(dirac_job_ids)
    result = dirac_monitoring_server.execute(cmd)
    # Idiom fix: isinstance instead of comparing type objects.
    if not isinstance(result, list):
        logger.warning('DIRAC monitoring failed: %s' % str(result))
        return
    # Idiom fix: enumerate instead of range(0, len(jobs)); result is still
    # indexed so a short result raises IndexError exactly as before.
    for i, j in enumerate(jobs):
        # Stop early if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break
        # result[i] is the status vector for jobs[i]:
        # [0]=statusInfo, [1]=DIRAC status, [2]=actual CE,
        # [3]=mapped Ganga status.
        j.backend.statusInfo = result[i][0]
        j.backend.status = result[i][1]
        j.backend.actualCE = result[i][2]
        # One extra round-trip per job to fetch the normalised CPU time.
        cmd = 'result = DiracCommands.normCPUTime(%d)' % j.backend.id
        j.backend.normCPUTime = dirac_monitoring_server.execute(cmd)
        # Non-'completed' transitions are applied directly ...
        if result[i][3] != 'completed' and result[i][3] != j.status:
            j.updateStatus(result[i][3])
        # ... while 'completed' goes via 'completing' during sandbox download.
        if result[i][3] == 'completed':
            j.updateStatus('completing')
            ok = j.backend._getOutputSandbox(dirac_monitoring_server)
            if ok and j.outputdata:
                j.backend._getOutputDataLFNs(dirac_monitoring_server, True)
            if not ok:
                j.updateStatus('failed')
            else:
                j.updateStatus('completed')
        if result[i][3] == 'failed':
            # NOTE(review): 'configBoss' looks like a leftover from a BOSS
            # port -- confirm this should not read configDirac instead.
            if configBoss['failed_sandbox_download']:
                j.backend._getOutputSandbox(dirac_monitoring_server)
def updateMonitoringInformation(_jobs):
    """Check the status of jobs and retrieve output sandboxes.

    Only jobs in 'submitted'/'running' are passed in here for checking.  If
    they have already completed in Dirac they may have been put on the queue
    for processing last time; these are put straight back on the queue
    without querying Dirac again (their signature: Ganga status 'running'
    with job.backend.status already set to 'Done'/'Failed'/etc.).

    Args:
        _jobs (list): (possibly proxied) Ganga jobs to refresh from DIRAC.
    """
    jobs = [stripProxy(j) for j in _jobs]

    logger = getLogger()

    # make sure proxy is valid; warn only on the active -> inactive transition
    if not _proxyValid():
        if DiracBase.dirac_monitoring_is_active:
            logger.warning('DIRAC monitoring inactive (no valid proxy found).')
        DiracBase.dirac_monitoring_is_active = False
        return
    else:
        DiracBase.dirac_monitoring_is_active = True

    # Remove from consideration any jobs already in the queue.  Checking this
    # non-persisted attribute is better than querying the queue, as we cannot
    # tell whether a job has just been taken off the queue and is being
    # processed.  Not being persistent also means queued jobs from the last
    # session are automatically reconsidered for requeueing.
    interesting_jobs = [j for j in jobs if not j.been_queued]

    # Dirac statuses that correspond to a ganga 'completed' or 'failed'
    # (see DiracCommands.status(id)); jobs with these backend statuses go
    # straight onto the finalisation queue.
    requeue_dirac_status = {'Completed': 'completed',
                            'Done': 'completed',
                            'Failed': 'failed',
                            'Deleted': 'failed',
                            'Unknown: No status for Job': 'failed'}

    monitor_jobs = [j for j in interesting_jobs
                    if j.backend.status not in requeue_dirac_status]
    requeue_jobs = [j for j in interesting_jobs
                    if j.backend.status in requeue_dirac_status]

    logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
    logger.debug('Monitor jobs : ' + repr([j.fqid for j in monitor_jobs]))
    logger.debug('Requeue jobs : ' + repr([j.fqid for j in requeue_jobs]))

    from Ganga.GPI import queues

    # requeue existing completed job
    for j in requeue_jobs:
        queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                   args=(j, requeue_dirac_status[j.backend.status]),
                                                   priority=5, name="Job %s Finalizing" % j.fqid)
        j.been_queued = True

    # Jobs can sit in 'submitting' without a DIRAC id after a crashed session
    # during submit (see #104454): mark them failed instead of querying DIRAC.
    dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
    for d in dead_jobs:
        d.updateStatus('failed')
        if d.master is not None:
            d.master.updateMasterJobStatus()

    ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
    dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

    result = execute('status(%s)' % str(dirac_job_ids))

    if len(result) != len(ganga_job_status):
        # FIX: message previously read 'failed fro %s'
        logger.warning('Dirac monitoring failed for %s, result = %s' % (
            str(dirac_job_ids), str(result)))
        return

    from Ganga.Core import monitoring_component

    # 'completed'/'failed' are finalised on the monitoring thread pool; all
    # other transitions are applied inline below.
    thread_handled_states = ['completed', 'failed']
    for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
        # Stop early if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break

        # state: [0]=statusInfo, [1]=DIRAC status, [2]=actual CE,
        #        [3]=mapped Ganga status, [4]=extra info (may be absent).
        job.backend.statusInfo = state[0]
        job.backend.status = state[1]
        job.backend.actualCE = state[2]
        updated_dirac_status = state[3]
        try:
            job.backend.extraInfo = state[4]
        except Exception as err:
            # FIX: log message previously read "gxception" (siblings use
            # "gexception"); best-effort field, so swallow and continue.
            logger.debug("gexception: %s" % str(err))

        logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state))

        # The user changed the Ganga status under us: leave the job alone.
        if job.status != old_state:
            logger.warning('User changed Ganga job status from %s -> %s' % (str(old_state), job.status))
            continue

        # Nothing to do if the mapped status matches the current one.
        if updated_dirac_status == job.status:
            continue

        if updated_dirac_status in thread_handled_states:
            # Move through 'running' first (if needed), then queue the
            # finalisation on the thread pool.
            if job.status != 'running':
                DiracBase._getStateTime(job, 'running')
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                job.updateStatus('running')
                if job.master:
                    job.master.updateMasterJobStatus()
            queues._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                       args=(job, updated_dirac_status),
                                                       priority=5, name="Job %s Finalizing" % job.fqid)
            job.been_queued = True
        else:
            # Ordinary transition: apply the new status inline.
            DiracBase._getStateTime(job, updated_dirac_status)
            if job.status in ['removed', 'killed']:
                continue
            if (job.master and job.master.status in ['removed', 'killed']):
                continue  # user changed it under us
            job.updateStatus(updated_dirac_status)
            if job.master:
                job.master.updateMasterJobStatus()
def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses):
    """
    Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac

    Performs one bulk 'monitorJobs' call for all jobs that have a DIRAC id,
    copies the returned status fields onto each job's backend, batches the
    resulting Ganga status updates, and hands finalised jobs to
    DiracBase.requeue_dirac_finished_jobs.

    Args:
        monitor_jobs (list): Jobs which are to be monitored for their status change
        finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
    """
    # now that can submit in non_blocking mode, can see jobs in submitting
    # that have yet to be assigned an id so ignore them
    # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE
    # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT
    # dirac_job_ids = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ]
    # Correction this did become a problem for a crashed session during
    # submit, see #104454
    dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
    for d in dead_jobs:
        d.updateStatus('failed')
        if d.master is not None:
            d.master.updateMasterJobStatus()

    # Parallel lists: ganga_job_status[i] corresponds to dirac_job_ids[i].
    ganga_job_status = [ j.status for j in monitor_jobs if j.backend.id is not None ]
    dirac_job_ids = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ]

    logger.debug("GangaStatus: %s" % str(ganga_job_status))
    logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

    if not dirac_job_ids:
        ## Nothing to do here stop bugging DIRAC about it!
        ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop.
        return

    statusmapping = configDirac['statusmapping']

    # One bulk round-trip: per-job status vectors plus a bulk state result,
    # authenticated with the first job's credential requirements.
    result, bulk_state_result = execute(
        'monitorJobs(%s, %s)' % (repr(dirac_job_ids), repr(statusmapping)),
        cred_req=monitor_jobs[0].backend.credential_requirements)

    #result = results[0]
    #bulk_state_result = results[1]

    if len(result) != len(ganga_job_status):
        logger.warning('Dirac monitoring failed for %s, result = %s' % (str(dirac_job_ids), str(result)))
        logger.warning("Results: %s" % str(result))
        return

    # NOTE(review): monitoring_component is used below but no import is
    # visible in this version -- the sibling imports it from Ganga.Core;
    # confirm a module-level import exists.
    requeue_job_list = []      # jobs to hand to requeue_dirac_finished_jobs
    jobStateDict = {}          # DIRAC status -> jobs, for bulk state-time update
    jobs_to_update = {}        # Ganga status -> jobs, applied in one pass below
    master_jobs_to_update = [] # master jobs needing a status refresh
    thread_handled_states = ['completed', 'failed']
    for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
        # Stop early if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break
        if job.been_queued:
            continue

        # state: [0]=statusInfo, [1]=DIRAC status, [2]=actual CE,
        #        [3]=mapped Ganga status, [4]=extra info (may be absent).
        job.backend.statusInfo = state[0]
        job.backend.status = state[1]
        job.backend.actualCE = state[2]
        updated_dirac_status = state[3]
        try:
            job.backend.extraInfo = state[4]
        except Exception as err:
            logger.debug("gexception: %s" % str(err))
            pass

        logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state))

        if updated_dirac_status not in jobStateDict:
            jobStateDict[updated_dirac_status] = []
        jobStateDict[updated_dirac_status].append(job)

        if job.backend.status in finalised_statuses:
            # DIRAC says the job finished: move it through 'running' (batched
            # below) and queue it for finalisation.
            if job.status != 'running':
                if job.status in ['removed', 'killed']:
                    requeue_job_list.append(job)
                elif (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                else:
                    if 'running' not in jobs_to_update:
                        jobs_to_update['running'] = []
                    jobs_to_update['running'].append(job)
                    if job.master:
                        if job.master not in master_jobs_to_update:
                            master_jobs_to_update.append(job.master)
                    requeue_job_list.append(job)
        else:
            # Ordinary transition: batch the status update.
            if job.status in ['removed', 'killed']:
                continue
            if (job.master and job.master.status in ['removed', 'killed']):
                continue  # user changed it under us
            if job.status != updated_dirac_status:
                if updated_dirac_status not in jobs_to_update:
                    jobs_to_update[updated_dirac_status] = []
                jobs_to_update[updated_dirac_status].append(job)
                if job.master:
                    if job.master not in master_jobs_to_update:
                        master_jobs_to_update.append(job.master)

    DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result)

    # Apply the batched updates; masters are refreshed once, afterwards.
    for status in jobs_to_update:
        for job in jobs_to_update[status]:
            job.updateStatus(status, update_master=False)

    for j in master_jobs_to_update:
        j.updateMasterJobStatus()

    DiracBase.requeue_dirac_finished_jobs(requeue_job_list, finalised_statuses)
def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses):
    """
    Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac

    Performs one bulk 'monitorJobs' call for all jobs that have a DIRAC id,
    copies the returned status fields onto each job's backend, batches the
    resulting Ganga status updates, and hands finalised jobs to
    DiracBase.requeue_dirac_finished_jobs.

    Args:
        monitor_jobs (list): Jobs which are to be monitored for their status change
        finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running
    """
    # now that can submit in non_blocking mode, can see jobs in submitting
    # that have yet to be assigned an id so ignore them
    # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE
    # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT
    # Correction this did become a problem for a crashed session during
    # submit, see #104454
    dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
    for d in dead_jobs:
        d.updateStatus('failed')
        if d.master is not None:
            d.master.updateMasterJobStatus()

    # Parallel lists: ganga_job_status[i] corresponds to dirac_job_ids[i].
    ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
    dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

    logger.debug("GangaStatus: %s" % str(ganga_job_status))
    logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

    if not dirac_job_ids:
        ## Nothing to do here stop bugging DIRAC about it!
        ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop.
        return

    statusmapping = configDirac['statusmapping']

    # One bulk round-trip: per-job status vectors plus a bulk state result.
    result, bulk_state_result = execute('monitorJobs(%s, %s)' % (
        repr(dirac_job_ids), repr(statusmapping)))

    if not DiracBase.checkDiracProxy():
        return

    if len(result) != len(ganga_job_status):
        logger.warning('Dirac monitoring failed for %s, result = %s' % (str(dirac_job_ids), str(result)))
        # FIX: was str(results) -- 'results' is undefined here (the leftover
        # '#result = results[0]' comments showed the old name) and would have
        # raised a NameError on this error path.
        logger.warning("Results: %s" % str(result))
        return

    from Ganga.Core import monitoring_component

    requeue_job_list = []      # jobs to hand to requeue_dirac_finished_jobs
    jobStateDict = {}          # DIRAC status -> jobs, for bulk state-time update
    jobs_to_update = {}        # Ganga status -> jobs, applied in one pass below
    master_jobs_to_update = [] # master jobs needing a status refresh
    thread_handled_states = ['completed', 'failed']
    for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
        # Stop early if the monitoring loop has been asked to shut down.
        if monitoring_component:
            if monitoring_component.should_stop():
                break
        if job.been_queued:
            continue

        # state: [0]=statusInfo, [1]=DIRAC status, [2]=actual CE,
        #        [3]=mapped Ganga status, [4]=extra info (may be absent).
        job.backend.statusInfo = state[0]
        job.backend.status = state[1]
        job.backend.actualCE = state[2]
        updated_dirac_status = state[3]
        try:
            job.backend.extraInfo = state[4]
        except Exception as err:
            # Best-effort field: swallow and continue.
            logger.debug("gexception: %s" % str(err))

        logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state))

        if updated_dirac_status not in jobStateDict:
            jobStateDict[updated_dirac_status] = []
        jobStateDict[updated_dirac_status].append(job)

        if job.backend.status in finalised_statuses:
            # DIRAC says the job finished: move it through 'running' (batched
            # below) and queue it for finalisation.
            if job.status != 'running':
                if job.status in ['removed', 'killed']:
                    requeue_job_list.append(job)
                elif (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                else:
                    if 'running' not in jobs_to_update:
                        jobs_to_update['running'] = []
                    jobs_to_update['running'].append(job)
                    if job.master:
                        if job.master not in master_jobs_to_update:
                            master_jobs_to_update.append(job.master)
                    requeue_job_list.append(job)
        else:
            # Ordinary transition: batch the status update.
            if job.status in ['removed', 'killed']:
                continue
            if (job.master and job.master.status in ['removed', 'killed']):
                continue  # user changed it under us
            if job.status != updated_dirac_status:
                if updated_dirac_status not in jobs_to_update:
                    jobs_to_update[updated_dirac_status] = []
                jobs_to_update[updated_dirac_status].append(job)
                if job.master:
                    if job.master not in master_jobs_to_update:
                        master_jobs_to_update.append(job.master)

    DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result)

    # Apply the batched updates; masters are refreshed once, afterwards.
    for status in jobs_to_update:
        for job in jobs_to_update[status]:
            job.updateStatus(status, update_master=False)

    for j in master_jobs_to_update:
        j.updateMasterJobStatus()

    DiracBase.requeue_dirac_finished_jobs(requeue_job_list, finalised_statuses)