def kill_workers(self, workspec_list):
    """Ask condor to remove the batch jobs behind the given workers.

    The workers are grouped per submission host, one removal request is
    sent per host, and the per-job outcomes are merged into a single map.

    :param workspec_list: worker specs whose batch jobs should be removed
    :return: list with one entry per worker spec, in input order. Workers
             without a batchID get ``(True, <message>)``; jobs missing from
             the removal results get ``(False, <message>)``; otherwise the
             entry is whatever ``CondorJobManage.remove`` reported for the
             job (presumably a ``(bool, str)`` pair -- confirm there).
    """
    log = self.make_logger(baseLogger, method_name='kill_workers')
    log.debug('start')
    # Removal outcomes of all submission hosts, keyed by condor job id
    outcome_by_job = {}
    for host, batch_ids in six.iteritems(get_host_batchid_map(workspec_list)):
        manager = CondorJobManage(id=host)
        try:
            host_outcome = manager.remove(batch_ids)
        except Exception as exc:
            # A failing host must not prevent the other hosts from being handled
            host_outcome = {}
            log.error('Exception {0}: {1}'.format(exc.__class__.__name__, exc))
        outcome_by_job.update(host_outcome)
    # Build the per-worker answers in the same order as the input
    skipped = (True, 'worker withoug batchID; skipped')
    unfound = (False, 'batch job unfound in return map')
    ret_list = [
        skipped if one_spec.batchID is None
        else outcome_by_job.get(condor_job_id_from_workspec(one_spec), unfound)
        for one_spec in workspec_list
    ]
    log.debug('done')
    return ret_list
def check_workers(self, workspec_list):
    """Query condor for the given workers and work out their new status.

    Job ads are fetched once per submission host and merged into a single
    dict; every worker is then evaluated by ``_check_one_worker`` through
    a pool of ``self.nProcesses`` workers.

    :param workspec_list: worker specs to check
    :return: ``(True, results)`` where ``results`` holds the value returned
             by ``_check_one_worker`` for each worker spec, as produced by
             the pool's ``map``
    """
    log = self.make_logger(baseLogger, '{0}'.format('batch job query'), method_name='check_workers')
    log.debug('start')
    # Job ads of all submission hosts, keyed by batch job id
    job_ads_all_dict = {}
    for host, batch_ids in six.iteritems(get_host_batchid_map(workspec_list)):
        query = CondorJobQuery(
            cacheEnable=self.cacheEnable,
            cacheRefreshInterval=self.cacheRefreshInterval,
            useCondorHistory=self.useCondorHistory,
            id=host,
        )
        try:
            ads_of_host = query.get_all(batchIDs_list=batch_ids)
        except Exception as exc:
            # Keep going with the other hosts; this host contributes no ads
            ads_of_host = {}
            log.error('Exception {0}: {1}'.format(exc.__class__.__name__, exc))
        job_ads_all_dict.update(ads_of_host)

    def check_one(one_spec):
        # Evaluate a single worker against the merged job ads
        return _check_one_worker(
            one_spec,
            job_ads_all_dict,
            cancel_unknown=self.cancelUnknown,
            held_timeout=self.heldTimeout,
        )

    # Check all workers through the pool
    with Pool(self.nProcesses) as pool:
        ret_list = list(pool.map(check_one, workspec_list))
    log.debug('done')
    return True, ret_list
def kill_workers(self, workspec_list):
    """Remove the condor batch jobs that belong to the given workers.

    One removal request is issued per submission host; the results of all
    hosts are collected in one map that is then consulted per worker.

    :param workspec_list: worker specs whose batch jobs should be removed
    :return: list of per-worker results in input order. A worker with no
             batchID yields ``(True, <message>)``, a job absent from the
             removal results yields ``(False, <message>)``, and any other
             entry is taken as-is from ``CondorJobManage.remove``
             (presumably a ``(bool, str)`` pair -- confirm there).
    """
    log = self.make_logger(baseLogger, method_name='kill_workers')
    log.debug('start')
    # Send one removal request per submission host and merge the answers
    merged_results = {}
    host_to_batch_ids = get_host_batchid_map(workspec_list)
    for host, batch_ids in six.iteritems(host_to_batch_ids):
        try:
            results_of_host = CondorJobManage(id=host).remove(batch_ids)
        except Exception as exc:
            # Log and continue so that one broken host does not block the rest
            results_of_host = {}
            message = 'Exception {0}: {1}'.format(exc.__class__.__name__, exc)
            log.error(message)
        merged_results.update(results_of_host)
    # Translate the merged map into one answer per worker
    answers = []
    for one_spec in workspec_list:
        if one_spec.batchID is not None:
            job_id = condor_job_id_from_workspec(one_spec)
            answer = merged_results.get(job_id, (False, 'batch job unfound in return map'))
        else:
            answer = (True, 'worker withoug batchID; skipped')
        answers.append(answer)
    log.debug('done')
    return answers
def check_workers(self, workspec_list):
    """Collect condor job ads for the given workers and evaluate each worker.

    The job ads of every submission host are queried once and merged into
    one dict, which ``_check_one_worker`` then reads for each worker. The
    per-worker evaluation is dispatched through a pool sized by
    ``self.nProcesses``.

    :param workspec_list: worker specs to check
    :return: ``(True, results)``; ``results`` contains what
             ``_check_one_worker`` returned for each worker spec, as
             produced by the pool's ``map``
    """
    log = self.make_logger(baseLogger, '{0}'.format('batch job query'), method_name='check_workers')
    log.debug('start')
    # Settings shared by the job query of every submission host
    query_settings = dict(
        cacheEnable=self.cacheEnable,
        cacheRefreshInterval=self.cacheRefreshInterval,
        useCondorHistory=self.useCondorHistory,
    )
    # Query result of all hosts, with key = batchID
    job_ads_all_dict = {}
    host_to_batch_ids = get_host_batchid_map(workspec_list)
    for host, batch_ids in six.iteritems(host_to_batch_ids):
        job_query = CondorJobQuery(id=host, **query_settings)
        try:
            host_ads = job_query.get_all(batchIDs_list=batch_ids)
        except Exception as exc:
            # A failed query only leaves this host without job ads
            host_ads = {}
            message = 'Exception {0}: {1}'.format(exc.__class__.__name__, exc)
            log.error(message)
        job_ads_all_dict.update(host_ads)

    def evaluate(one_spec):
        # Per-worker evaluation handed to the pool
        return _check_one_worker(one_spec, job_ads_all_dict,
                                 cancel_unknown=self.cancelUnknown,
                                 held_timeout=self.heldTimeout)

    with Pool(self.nProcesses) as pool:
        results = list(pool.map(evaluate, workspec_list))
    log.debug('done')
    return True, results
def _check_one_worker(workspec, job_ads_all_dict, cancel_unknown=False, held_timeout=3600):
    """Work out the new status of one worker from its condor job ads.

    Besides returning the result, the function updates ``workspec`` in
    place: ``nativeStatus`` is always set, ``nativeExitCode`` is set for
    completed jobs with an exit code, and ``set_pilot_closed()`` is called
    when a held job is killed.

    :param workspec: worker spec to evaluate
    :param job_ads_all_dict: job ads of all queried jobs, looked up with
                             ``condor_job_id_from_workspec(workspec)``
    :param cancel_unknown: when true, a worker whose job (or JobStatus) is
                           not found is regarded as cancelled instead of
                           being skipped
    :param held_timeout: number of seconds a job may stay held before it is
                         killed and the worker cancelled
    :return: tuple ``(new_status, error_string)``; ``new_status`` is
             ``None`` when the worker is skipped
    """
    # Logger for this single worker
    log = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                 method_name='_check_one_worker')
    # Default to the current status and an empty error message
    new_status = workspec.status
    err_str = ''
    # Fetch the job ads of this worker
    has_ads = False
    try:
        job_ads = job_ads_all_dict[condor_job_id_from_workspec(workspec)]
    except KeyError:
        pass
    except Exception as exc:
        log.error('With error {0}'.format(exc))
    else:
        has_ads = True

    # No job ads at all: the native condor status is unknown
    if not has_ads:
        workspec.nativeStatus = 'unknown'
        if not cancel_unknown:
            err_str = 'condor job submissionHost={0} batchID={1} not found. Skipped'.format(
                workspec.submissionHost, workspec.batchID)
            log.warning(err_str)
            return (None, err_str)
        err_str = 'condor job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default'.format(
            workspec.submissionHost, workspec.batchID)
        log.error(err_str)
        new_status = WorkSpec.ST_cancelled
        log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format(
            workspec.submissionHost, workspec.batchID, '3', new_status))
        return (new_status, err_str)

    # Job ads without JobStatus: the native condor status is unknown as well
    try:
        batch_status = str(job_ads['JobStatus'])
    except KeyError:
        workspec.nativeStatus = 'unknown'
        if cancel_unknown:
            new_status = WorkSpec.ST_cancelled
            err_str = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Regard the worker as canceled'.format(
                workspec.submissionHost, workspec.batchID)
            log.error(err_str)
        else:
            new_status = None
            err_str = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Skipped'.format(
                workspec.submissionHost, workspec.batchID)
            log.warning(err_str)
        return (new_status, err_str)

    # Prefer LastJobStatus when it is a terminated status, to avoid a reversal in status
    reported_status = batch_status
    last_status = str(job_ads.get('LastJobStatus', ''))
    terminated = ('3', '4')
    if (last_status in terminated and batch_status not in terminated) \
            or (last_status == '4' and batch_status == '3'):
        batch_status = last_status
        log.warning('refer to LastJobStatus={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (Jobstatus={3})'.format(
            last_status, workspec.submissionHost, workspec.batchID, reported_status))
    # Propagate native condor job status
    workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get(batch_status, 'unexpected')

    if batch_status in ('2', '6'):
        # 2 running, 6 transferring output
        new_status = WorkSpec.ST_running
    elif batch_status in ('1', '7'):
        # 1 idle, 7 suspended
        new_status = WorkSpec.ST_idle if job_ads.get('JobStartDate') else WorkSpec.ST_submitted
    elif batch_status == '3':
        # 3 removed
        if not err_str:
            err_str = 'Condor HoldReason: {0} ; Condor RemoveReason: {1} '.format(
                job_ads.get('LastHoldReason'), job_ads.get('RemoveReason'))
        new_status = WorkSpec.ST_cancelled
    elif batch_status == '5':
        # 5 held
        hold_reason = job_ads.get('HoldReason')
        err_str = 'Condor HoldReason: {0} '.format(hold_reason)
        if hold_reason == 'Job not found' \
                or int(time.time()) - int(job_ads.get('EnteredCurrentStatus', 0)) > held_timeout:
            # Kill the job if held too long or other reasons
            log.debug('trying to kill job submissionHost={0} batchID={1} due to held too long or not found'.format(
                workspec.submissionHost, workspec.batchID))
            for _host, batch_ids in six.iteritems(get_host_batchid_map([workspec])):
                manager = CondorJobManage(id=workspec.submissionHost)
                try:
                    kill_results = manager.remove(batch_ids)
                except Exception as exc:
                    log.error('failed to kill job. Exception {0}: {1}'.format(exc.__class__.__name__, exc))
                else:
                    kill_result = kill_results.get(condor_job_id_from_workspec(workspec))
                    if kill_result and kill_result[0]:
                        log.info('killed held job submissionHost={0} batchID={1}'.format(
                            workspec.submissionHost, workspec.batchID))
                    else:
                        log.error('cannot kill held job submissionHost={0} batchID={1}'.format(
                            workspec.submissionHost, workspec.batchID))
            # The worker is cancelled whether or not the kill succeeded
            new_status = WorkSpec.ST_cancelled
            err_str += ' ; Worker canceled by harvester due to held too long or not found'
            # Mark the PanDA job as closed instead of failed
            workspec.set_pilot_closed()
            log.debug('Called workspec set_pilot_closed')
        else:
            new_status = WorkSpec.ST_idle if job_ads.get('JobStartDate') else WorkSpec.ST_submitted
    elif batch_status == '4':
        # 4 completed
        try:
            exit_code = str(job_ads['ExitCode'])
        except KeyError:
            err_str = 'cannot get ExitCode of job submissionHost={0} batchID={1}. Regard the worker as failed'.format(
                workspec.submissionHost, workspec.batchID)
            log.warning(err_str)
            new_status = WorkSpec.ST_failed
        else:
            # Propagate condor return code
            workspec.nativeExitCode = exit_code
            if exit_code == '0':
                # Payload should return 0 after successful run
                new_status = WorkSpec.ST_finished
            else:
                # Other return codes are considered failed
                new_status = WorkSpec.ST_failed
                err_str = 'Payload execution error: returned non-zero'
                log.debug(err_str)
            log.info('Payload return code = {0}'.format(exit_code))
    else:
        err_str = 'cannot get reasonable JobStatus of job submissionHost={0} batchID={1}. Regard the worker as failed by default'.format(
            workspec.submissionHost, workspec.batchID)
        log.error(err_str)
        new_status = WorkSpec.ST_failed
    log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format(
        workspec.submissionHost, workspec.batchID, batch_status, new_status))
    # Return
    return (new_status, err_str)
def _check_one_worker(workspec, job_ads_all_dict, cancel_unknown=False, held_timeout=3600):
    """Evaluate one worker against its condor job ads and record the outcome.

    The worker spec is updated in place: ``nativeStatus`` is always set,
    ``nativeExitCode`` is set for completed jobs with an exit code,
    ``set_pilot_closed()`` is called when a held job is killed, and
    ``set_supplemental_error`` is always called at the end with an error
    code chosen by whether an error message was produced.

    :param workspec: worker spec to evaluate
    :param job_ads_all_dict: job ads of all queried jobs, looked up with
                             ``condor_job_id_from_workspec(workspec)``
    :param cancel_unknown: when true, a worker whose job (or JobStatus) is
                           not found is regarded as cancelled instead of
                           being skipped
    :param held_timeout: number of seconds a job may stay held before it is
                         killed and the worker cancelled
    :return: tuple ``(new_status, error_string)``; ``new_status`` is
             ``None`` when the worker is skipped
    """
    # Logger for this single worker
    log = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                 method_name='_check_one_worker')
    # Default to the current status and an empty error message
    new_status = workspec.status
    err_str = ''
    # Fetch the job ads of this worker
    has_ads = False
    try:
        job_ads = job_ads_all_dict[condor_job_id_from_workspec(workspec)]
    except KeyError:
        pass
    except Exception as exc:
        log.error('With error {0}'.format(exc))
    else:
        has_ads = True

    if not has_ads:
        # No job ads at all: propagate native condor job status as unknown
        workspec.nativeStatus = 'unknown'
        if not cancel_unknown:
            err_str = 'condor job submissionHost={0} batchID={1} not found. Skipped'.format(
                workspec.submissionHost, workspec.batchID)
            log.warning(err_str)
            new_status = None
        else:
            err_str = 'condor job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default'.format(
                workspec.submissionHost, workspec.batchID)
            log.error(err_str)
            new_status = WorkSpec.ST_cancelled
            log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format(
                workspec.submissionHost, workspec.batchID, '3', new_status))
    else:
        # Parse job ads, starting with JobStatus
        try:
            batch_status = str(job_ads['JobStatus'])
        except KeyError:
            # Job ads without JobStatus: native condor status is unknown as well
            workspec.nativeStatus = 'unknown'
            if not cancel_unknown:
                new_status = None
                err_str = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Skipped'.format(
                    workspec.submissionHost, workspec.batchID)
                log.warning(err_str)
            else:
                new_status = WorkSpec.ST_cancelled
                err_str = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Regard the worker as canceled'.format(
                    workspec.submissionHost, workspec.batchID)
                log.error(err_str)
        else:
            # Prefer LastJobStatus when it is a terminated status, to avoid a reversal in status
            reported_status = batch_status
            last_status = str(job_ads.get('LastJobStatus', ''))
            terminated = ('3', '4')
            if (last_status in terminated and batch_status not in terminated) \
                    or (last_status == '4' and batch_status == '3'):
                batch_status = last_status
                log.warning('refer to LastJobStatus={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (Jobstatus={3})'.format(
                    last_status, workspec.submissionHost, workspec.batchID, reported_status))
            # Propagate native condor job status
            workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get(batch_status, 'unexpected')
            if batch_status in ('2', '6'):
                # 2 running, 6 transferring output
                new_status = WorkSpec.ST_running
            elif batch_status in ('1', '7'):
                # 1 idle, 7 suspended
                new_status = WorkSpec.ST_idle if job_ads.get('JobStartDate') else WorkSpec.ST_submitted
            elif batch_status == '3':
                # 3 removed
                if not err_str:
                    err_str = 'Condor HoldReason: {0} ; Condor RemoveReason: {1} '.format(
                        job_ads.get('LastHoldReason'), job_ads.get('RemoveReason'))
                new_status = WorkSpec.ST_cancelled
            elif batch_status == '5':
                # 5 held
                hold_reason = job_ads.get('HoldReason')
                err_str = 'Condor HoldReason: {0} '.format(hold_reason)
                if hold_reason == 'Job not found' \
                        or int(time.time()) - int(job_ads.get('EnteredCurrentStatus', 0)) > held_timeout:
                    # Kill the job if held too long or other reasons
                    log.debug('trying to kill job submissionHost={0} batchID={1} due to held too long or not found'.format(
                        workspec.submissionHost, workspec.batchID))
                    for _host, batch_ids in six.iteritems(get_host_batchid_map([workspec])):
                        manager = CondorJobManage(id=workspec.submissionHost)
                        try:
                            kill_results = manager.remove(batch_ids)
                        except Exception as exc:
                            log.error('failed to kill job. Exception {0}: {1}'.format(exc.__class__.__name__, exc))
                        else:
                            kill_result = kill_results.get(condor_job_id_from_workspec(workspec))
                            if kill_result and kill_result[0]:
                                log.info('killed held job submissionHost={0} batchID={1}'.format(
                                    workspec.submissionHost, workspec.batchID))
                            else:
                                log.error('cannot kill held job submissionHost={0} batchID={1}'.format(
                                    workspec.submissionHost, workspec.batchID))
                    # The worker is cancelled whether or not the kill succeeded
                    new_status = WorkSpec.ST_cancelled
                    err_str += ' ; Worker canceled by harvester due to held too long or not found'
                    # Mark the PanDA job as closed instead of failed
                    workspec.set_pilot_closed()
                    log.debug('Called workspec set_pilot_closed')
                else:
                    new_status = WorkSpec.ST_idle if job_ads.get('JobStartDate') else WorkSpec.ST_submitted
            elif batch_status == '4':
                # 4 completed
                try:
                    exit_code = str(job_ads['ExitCode'])
                except KeyError:
                    err_str = 'cannot get ExitCode of job submissionHost={0} batchID={1}. Regard the worker as failed'.format(
                        workspec.submissionHost, workspec.batchID)
                    log.warning(err_str)
                    new_status = WorkSpec.ST_failed
                else:
                    # Propagate condor return code
                    workspec.nativeExitCode = exit_code
                    if exit_code == '0':
                        # Payload should return 0 after successful run
                        new_status = WorkSpec.ST_finished
                    else:
                        # Other return codes are considered failed
                        new_status = WorkSpec.ST_failed
                        err_str = 'Payload execution error: returned non-zero'
                        log.debug(err_str)
                    log.info('Payload return code = {0}'.format(exit_code))
            else:
                err_str = 'cannot get reasonable JobStatus of job submissionHost={0} batchID={1}. Regard the worker as failed by default'.format(
                    workspec.submissionHost, workspec.batchID)
                log.error(err_str)
                new_status = WorkSpec.ST_failed
            log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format(
                workspec.submissionHost, workspec.batchID, batch_status, new_status))
    # Set supplemental error message: general error if any message was produced
    if err_str:
        error_code = WorkerErrors.error_codes.get('GENERAL_ERROR')
    else:
        error_code = WorkerErrors.error_codes.get('SUCCEEDED')
    workspec.set_supplemental_error(error_code=error_code, error_diag=err_str)
    # Return
    return (new_status, err_str)