def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal ARC jobs.

    For every job that has a backend id and was submitted longer ago than
    ``config["ArcWaitTimeBeforeStartingMonitoring"]`` seconds, query the ARC
    backend for its status, propagate state changes to the Ganga job,
    download the output sandbox of finished jobs and finally purge the jobs
    whose output was fetched successfully.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    import datetime

    backenddict = {}  # actualCE -> job; keys form the CE list for arc_status
    jobdict = {}      # backend job id -> job
    for j in jobs:
        # Fix: use total_seconds() instead of .seconds -- .seconds wraps at
        # one day and ignores the day component of the timedelta.
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).total_seconds() > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            backenddict[j.backend.actualCE] = j

    if not jobdict:
        return

    jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # The backend id is a URL; the host part of its netloc is the CE.
        actual_ce = urlparse(jid)[1].split(":")[0]
        if job.backend.actualCE != actual_ce:
            job.backend.actualCE = actual_ce

        # no need to update the Ganga job if the backend status is unchanged
        # (this also makes the original, never-reachable doStatusUpdate=False
        # branch explicit)
        if job.backend.status == info['State']:
            continue

        # download the output sandbox if a final status is reached
        if info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:
            outw = job.getOutputWorkspace(create=True)
            if Grid.arc_get_output(job.backend.id, outw.getPath()):
                (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                job.backend.exitcode = app_exitcode
                jidListForPurge.append(job.backend.id)
            else:
                logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['State']
        if 'Exit Code' in info:
            try:
                job.backend.exitcode_arc = int(info['Exit Code'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by ARC: flag as failure
                job.backend.exitcode_arc = 1
        if 'Job Error' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['Job Error']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purge the jobs whose output has been fetched locally
    if jidListForPurge:
        if not Grid.arc_purgeMultiple(jidListForPurge):
            logger.warning("Failed to purge all ARC jobs.")
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal ARC jobs (credential-aware).

    Jobs are grouped by the credential requirement of their backend and one
    ``arc_status`` request is made per credential group (groups whose
    credential is missing or invalid are recorded in ``needed_credentials``
    and skipped).  Status changes are propagated to the Ganga jobs, output
    sandboxes of finished jobs are downloaded, and successfully fetched jobs
    are purged per credential group.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    import datetime

    ce_list = []  # type: List[str]
    jobdict = {}  # type: Mapping[str, Job]
    for j in jobs:
        # Fix: use total_seconds() instead of .seconds -- .seconds wraps at
        # one day and ignores the day component of the timedelta.
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).total_seconds() > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            # Fix: avoid listing (and therefore querying) the same CE twice
            if j.backend.actualCE not in ce_list:
                ce_list.append(j.backend.actualCE)

    if not jobdict:
        return

    # Group jobs by the backend's credential requirements
    cred_to_backend_id_list = defaultdict(list)  # type: Mapping[ICredentialRequirement, List[str]]
    for jid, job in jobdict.items():
        cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        # request the status of this credential group's jobs
        jobInfoDict.update(Grid.arc_status(job_ids, ce_list, cred_req))

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # The backend id is a URL; the host part of its netloc is the CE.
        actual_ce = urlparse(jid)[1].split(":")[0]
        if job.backend.actualCE != actual_ce:
            job.backend.actualCE = actual_ce

        # no need to update the Ganga job if the backend status is unchanged
        if job.backend.status == info['State']:
            continue

        # download the output sandbox if a final status is reached
        if info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:
            outw = job.getOutputWorkspace(create=True)
            if Grid.arc_get_output(job.backend.id, outw.getPath(), job.backend.credential_requirements):
                (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                job.backend.exitcode = app_exitcode
                jidListForPurge.append(job.backend.id)
            else:
                logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['State']
        if 'Exit Code' in info:
            try:
                job.backend.exitcode_arc = int(info['Exit Code'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by ARC: flag as failure
                job.backend.exitcode_arc = 1
        if 'Job Error' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['Job Error']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purge, per credential group, the jobs whose output was fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            to_purge = set(job_ids) & set(jidListForPurge)
            # Fix: skip groups with nothing to purge -- purging an empty set
            # only produced a spurious "Failed to purge" warning.
            if not to_purge:
                continue
            if not Grid.arc_purge_multiple(to_purge, cred_req):
                logger.warning("Failed to purge all ARC jobs.")
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal CREAM jobs (credential-aware).

    Jobs are grouped by the credential requirement of their backend and one
    ``cream_status`` request is made per credential group (groups whose
    credential is missing or invalid are recorded in ``needed_credentials``
    and skipped).  Status changes are propagated to the Ganga jobs, output
    sandboxes of DONE jobs are downloaded, and successfully fetched jobs
    are purged per credential group.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    jobdict = {job.backend.id: job for job in jobs if job.backend.id}

    if not jobdict:
        return

    # Group jobs by the backend's credential requirements.
    # Fix: iterate jobdict (jobs WITH a backend id) -- the original iterated
    # all of ``jobs`` and pushed empty/None ids into the status request.
    cred_to_backend_id_list = defaultdict(list)
    for jid, job in jobdict.items():
        cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        # request the status of this credential group's jobs
        jobInfoDict.update(Grid.cream_status(job_ids, cred_req))

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # Act only on a changed status whose exit code, if present, is numeric
        # (CREAM reports "W" while the code is not yet known).
        if job.backend.status == info['Current Status'] or ('ExitCode' in info and not info['ExitCode'].isdigit()):
            continue

        if 'Worker Node' in info:
            job.backend.workernode = info['Worker Node']
        if 'CREAM ISB URI' in info:
            job.backend.isbURI = info['CREAM ISB URI']
        if 'CREAM OSB URI' in info:
            job.backend.osbURI = info['CREAM OSB URI']

        # download the output sandbox if a final status is reached
        if info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:
            # resolve output sandbox URIs based on the JDL information
            osbURIList = __cream_resolveOSBList__(job, info['JDL'])
            logger.debug('OSB list:')
            for f in osbURIList:
                logger.debug(f)
            if osbURIList:
                outw = job.getOutputWorkspace(create=True)
                if Grid.cream_get_output(osbURIList, outw.getPath(), job.backend.credential_requirements):
                    (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                    job.backend.exitcode = app_exitcode
                    jidListForPurge.append(job.backend.id)
                else:
                    logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['Current Status']
        if 'ExitCode' in info and info['ExitCode'] != "W":
            try:
                job.backend.exitcode_cream = int(info['ExitCode'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by CREAM: flag as failure
                job.backend.exitcode_cream = 1
        if 'FailureReason' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['FailureReason']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purge, per credential group, the jobs whose output was fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            to_purge = set(job_ids) & set(jidListForPurge)
            # skip groups with nothing to purge
            if to_purge:
                Grid.cream_purge_multiple(to_purge, cred_req)
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal ARC jobs (credential-aware).

    Batches ``arc_status`` queries per backend credential requirement
    (missing/invalid credentials are recorded in ``needed_credentials`` and
    skipped), propagates status changes to the Ganga jobs, downloads the
    output sandbox of finished jobs, and purges the successfully fetched
    jobs per credential group.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    import datetime

    ce_list = []  # type: List[str]
    jobdict = {}  # type: Mapping[str, Job]
    for j in jobs:
        # Fix: total_seconds() instead of .seconds -- .seconds wraps at one
        # day, so the submission grace-period test was wrong for old jobs.
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).total_seconds() > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            # Fix: do not list (and hence query) the same CE more than once
            if j.backend.actualCE not in ce_list:
                ce_list.append(j.backend.actualCE)

    if not jobdict:
        return

    # Group jobs by the backend's credential requirements
    cred_to_backend_id_list = defaultdict(list)  # type: Mapping[ICredentialRequirement, List[str]]
    for jid, job in jobdict.items():
        cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        jobInfoDict.update(Grid.arc_status(job_ids, ce_list, cred_req))

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # The backend id is a URL; the host part of its netloc is the CE.
        actual_ce = urlparse(jid)[1].split(":")[0]
        if job.backend.actualCE != actual_ce:
            job.backend.actualCE = actual_ce

        # nothing to do for this job if the backend status is unchanged
        if job.backend.status == info['State']:
            continue

        # download the output sandbox if a final status is reached
        if info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:
            outw = job.getOutputWorkspace(create=True)
            if Grid.arc_get_output(job.backend.id, outw.getPath(), job.backend.credential_requirements):
                (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                job.backend.exitcode = app_exitcode
                jidListForPurge.append(job.backend.id)
            else:
                logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['State']
        if 'Exit Code' in info:
            try:
                job.backend.exitcode_arc = int(info['Exit Code'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by ARC: flag as failure
                job.backend.exitcode_arc = 1
        if 'Job Error' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['Job Error']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purge, per credential group, the jobs whose output was fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            to_purge = set(job_ids) & set(jidListForPurge)
            # Fix: avoid calling arc_purge_multiple with an empty set, which
            # only emitted a spurious "Failed to purge" warning.
            if not to_purge:
                continue
            if not Grid.arc_purge_multiple(to_purge, cred_req):
                logger.warning("Failed to purge all ARC jobs.")
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal CREAM jobs.

    Queries the CREAM backend for the status of every job with a backend id,
    propagates status changes to the Ganga jobs, downloads the output
    sandbox of DONE jobs and finally purges the jobs whose output was
    fetched successfully.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    # dict comprehension instead of dict([[k, v] ...]) literal-pair lists
    jobdict = {job.backend.id: job for job in jobs if job.backend.id}

    # nothing to monitor: avoid a pointless status request
    if not jobdict:
        return

    jobInfoDict = Grid.cream_status(jobdict.keys())

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # Act only on a changed status whose exit code, if present, is numeric
        # (CREAM reports "W" while the code is not yet known).
        if job.backend.status == info['Current Status'] or ('ExitCode' in info and not info['ExitCode'].isdigit()):
            continue

        if 'Worker Node' in info:
            job.backend.workernode = info['Worker Node']
        if 'CREAM ISB URI' in info:
            job.backend.isbURI = info['CREAM ISB URI']
        if 'CREAM OSB URI' in info:
            job.backend.osbURI = info['CREAM OSB URI']

        # download the output sandbox if a final status is reached
        if info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:
            # resolve output sandbox URIs based on the JDL information
            osbURIList = __cream_resolveOSBList__(job, info['JDL'])
            logger.debug('OSB list:')
            for f in osbURIList:
                logger.debug(f)
            if osbURIList:
                outw = job.getOutputWorkspace(create=True)
                if Grid.cream_get_output(osbURIList, outw.getPath()):
                    (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                    job.backend.exitcode = app_exitcode
                    jidListForPurge.append(job.backend.id)
                else:
                    logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['Current Status']
        if 'ExitCode' in info and info['ExitCode'] != "W":
            try:
                job.backend.exitcode_cream = int(info['ExitCode'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by CREAM: flag as failure
                job.backend.exitcode_cream = 1
        if 'FailureReason' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['FailureReason']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purging the jobs whose output has been fetched locally
    if jidListForPurge:
        Grid.cream_purgeMultiple(jidListForPurge)
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal ARC jobs.

    Queries the ARC backend for the status of every job submitted longer ago
    than ``config["ArcWaitTimeBeforeStartingMonitoring"]`` seconds, updates
    the Ganga backend state, downloads the output sandbox of finished jobs
    and purges the jobs whose output was fetched successfully.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    import datetime

    backenddict = {}  # actualCE -> job; keys form the CE list for arc_status
    jobdict = {}      # backend job id -> job
    for j in jobs:
        # Fix: total_seconds() instead of .seconds -- .seconds wraps at one
        # day, so the submission grace-period test was wrong for old jobs.
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).total_seconds() > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            backenddict[j.backend.actualCE] = j

    if not jobdict:
        return

    jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # The backend id is a URL; the host part of its netloc is the CE.
        actual_ce = urlparse(jid)[1].split(":")[0]
        if job.backend.actualCE != actual_ce:
            job.backend.actualCE = actual_ce

        # nothing to do for this job if the backend status is unchanged
        if job.backend.status == info['State']:
            continue

        # download the output sandbox if a final status is reached
        if info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:
            outw = job.getOutputWorkspace(create=True)
            if Grid.arc_get_output(job.backend.id, outw.getPath()):
                (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                job.backend.exitcode = app_exitcode
                jidListForPurge.append(job.backend.id)
            else:
                logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['State']
        if 'Exit Code' in info:
            try:
                job.backend.exitcode_arc = int(info['Exit Code'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by ARC: flag as failure
                job.backend.exitcode_arc = 1
        if 'Job Error' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['Job Error']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purging the jobs whose output has been fetched locally
    if jidListForPurge:
        if not Grid.arc_purgeMultiple(jidListForPurge):
            logger.warning("Failed to purge all ARC jobs.")
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal CREAM jobs (credential-aware).

    Batches ``cream_status`` queries per backend credential requirement
    (missing/invalid credentials are recorded in ``needed_credentials`` and
    skipped), propagates status changes to the Ganga jobs, downloads the
    output sandbox of DONE jobs, and purges the successfully fetched jobs
    per credential group.

    Args:
        jobs: iterable of Ganga Job objects to monitor.
    '''
    jobdict = {job.backend.id: job for job in jobs if job.backend.id}

    if not jobdict:
        return

    # Group jobs by the backend's credential requirements.
    # Fix: iterate jobdict (jobs WITH a backend id) -- the original iterated
    # all of ``jobs`` and pushed empty/None ids into the status request.
    cred_to_backend_id_list = defaultdict(list)
    for jid, job in jobdict.items():
        cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        jobInfoDict.update(Grid.cream_status(job_ids, cred_req))

    jidListForPurge = []  # backend ids whose output sandbox was downloaded

    # update job information for those available in jobInfoDict
    for jid, info in jobInfoDict.items():
        if not info:
            logger.warning('fail to retrieve job information: %s' % jobdict[jid].getFQID('.'))
            continue

        job = jobdict[jid]

        # Act only on a changed status whose exit code, if present, is numeric
        # (CREAM reports "W" while the code is not yet known).
        if job.backend.status == info['Current Status'] or ('ExitCode' in info and not info['ExitCode'].isdigit()):
            continue

        if 'Worker Node' in info:
            job.backend.workernode = info['Worker Node']
        if 'CREAM ISB URI' in info:
            job.backend.isbURI = info['CREAM ISB URI']
        if 'CREAM OSB URI' in info:
            job.backend.osbURI = info['CREAM OSB URI']

        # download the output sandbox if a final status is reached
        if info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:
            # resolve output sandbox URIs based on the JDL information
            osbURIList = __cream_resolveOSBList__(job, info['JDL'])
            logger.debug('OSB list:')
            for f in osbURIList:
                logger.debug(f)
            if osbURIList:
                outw = job.getOutputWorkspace(create=True)
                if Grid.cream_get_output(osbURIList, outw.getPath(), job.backend.credential_requirements):
                    (ick, app_exitcode) = Grid.__get_app_exitcode__(outw.getPath())
                    job.backend.exitcode = app_exitcode
                    jidListForPurge.append(job.backend.id)
                else:
                    logger.error('fail to download job output: %s' % jobdict[jid].getFQID('.'))

        job.backend.status = info['Current Status']
        if 'ExitCode' in info and info['ExitCode'] != "W":
            try:
                job.backend.exitcode_cream = int(info['ExitCode'])
            except (ValueError, TypeError):
                # non-numeric exit code reported by CREAM: flag as failure
                job.backend.exitcode_cream = 1
        if 'FailureReason' in info:
            # best effort -- a bad reason string must not abort monitoring
            try:
                job.backend.reason = info['FailureReason']
            except Exception:
                pass
        job.backend.updateGangaJobStatus()

    # purge, per credential group, the jobs whose output was fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            to_purge = set(job_ids) & set(jidListForPurge)
            # skip groups with nothing to purge
            if to_purge:
                Grid.cream_purge_multiple(to_purge, cred_req)