Example No. 1
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.pluginFactory = PluginFactory()
     self.startTimestamp = time.time()
     self.monitor_fifo = MonitorFIFO()
     self.apfmon = Apfmon(self.queueConfigMapper)
Example No. 2
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.workerMaker = WorkerMaker()
     self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
     self.pluginFactory = PluginFactory()
     self.monitor_fifo = MonitorFIFO()
     self.apfmon = Apfmon(self.queueConfigMapper)
Example No. 3
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.pluginFactory = PluginFactory()
     self.startTimestamp = time.time()
     self.monitor_fifo = MonitorFIFO()
     if self.monitor_fifo.enabled:
         self.monitor_event_fifo = MonitorEventFIFO()
     else:
         self.monitor_event_fifo = None
     self.apfmon = Apfmon(self.queueConfigMapper)
     self.eventBasedMonCoreList = []
     if getattr(harvester_config.monitor, 'eventBasedEnable', False):
         for pluginConf in harvester_config.monitor.eventBasedPlugins:
             pluginFactory = PluginFactory()
             self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf))
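Example No. 3 reads the optional eventBasedEnable flag with getattr and a default value, while Example No. 6 below reads its optional fifo* settings through try/except AttributeError; when the attribute is simply missing, the two idioms give the same result. A minimal sketch of the equivalence, using a hypothetical Cfg stand-in rather than the real harvester_config:

# Hypothetical stand-in for a harvester_config section; not from the original code.
class Cfg:
    eventBasedEnable = True       # present attribute
    # fifoSleepTimeMilli is intentionally missing

cfg = Cfg()

# idiom used in Example No. 3: the default is applied when the attribute is absent
event_based = getattr(cfg, 'eventBasedEnable', False)      # -> True
fifo_sleep = getattr(cfg, 'fifoSleepTimeMilli', 5000)      # -> 5000

# equivalent idiom used in Example No. 6
try:
    fifo_sleep = cfg.fifoSleepTimeMilli
except AttributeError:
    fifo_sleep = 5000                                       # -> 5000

print(event_based, fifo_sleep)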
Example No. 4
import time
import random

from future.utils import iteritems

from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
from pandaharvester.harvestercore.job_spec import JobSpec
from pandaharvester.harvestercore.work_spec import WorkSpec
from pandaharvester.harvestercore.plugin_factory import PluginFactory

from pandaharvester.harvestercore.fifos import MonitorFIFO

# start test

mq = MonitorFIFO()

print('sleepTime', mq.config.sleepTime)


def single_thread_test(nObjects=3):
    time_point = time.time()
    print('clear')
    mq.fifo.clear()
    print('size', mq.size())
    time_consumed = time.time() - time_point
    print('Time consumed: ', time_consumed)

    time_point = time.time()
    for i in range(nObjects):
        workspec = WorkSpec()
Example No. 5
import time
import random

from future.utils import iteritems

from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
from pandaharvester.harvestercore.job_spec import JobSpec
from pandaharvester.harvestercore.work_spec import WorkSpec
from pandaharvester.harvestercore.plugin_factory import PluginFactory

from pandaharvester.harvestercore.fifos import MonitorFIFO


# start test

mq = MonitorFIFO()

print('sleepTime', mq.config.sleepTime)

def single_thread_test(nObjects=3, protective=False):
    time_point = time.time()
    print('clear')
    mq.fifo.clear()
    print('size', mq.size())
    time_consumed = time.time() - time_point
    print('Time consumed: ', time_consumed)

    time_point = time.time()
    for i in range(nObjects):
        workspec = WorkSpec()
        workspec.workerID = i
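Both test scripts above are cut off after building WorkSpec objects. The following is a minimal sketch, not part of the original tests, of how a put/get round trip could continue; it uses only the MonitorFIFO calls that appear in these examples (put with a score, size, get with timeout and protective dequeue, release) and assumes a Harvester installation with a FIFO backend configured for the monitor agent:

import time

from pandaharvester.harvestercore.work_spec import WorkSpec
from pandaharvester.harvestercore.fifos import MonitorFIFO

mq = MonitorFIFO()
mq.fifo.clear()

# enqueue a few single-worker chunks; the score is roughly "check again around this timestamp"
for i in range(3):
    workspec = WorkSpec()
    workspec.workerID = i
    score = 10 + time.time()   # as in Example No. 6: fifoCheckInterval + current timestamp
    mq.put(('test_queue', [[workspec]]), score)

print('size after put', mq.size())

# dequeue one chunk; protective=True keeps it reserved until it is explicitly released
obj_gotten = mq.get(timeout=1, protective=True)
if obj_gotten is not None:
    queueName, workSpecsList = obj_gotten.item
    print('got a chunk of {0} workers of {1}'.format(len(workSpecsList), queueName))
    mq.release(ids=[obj_gotten.id])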
Example No. 6
class Monitor(AgentBase):
    # fifos
    monitor_fifo = MonitorFIFO()

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        self.startTimestamp = time.time()

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.get_pid())
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        try:
            fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli
        except AttributeError:
            fifoSleepTimeMilli = 5000
        try:
            fifoCheckDuration = harvester_config.monitor.fifoCheckDuration
        except AttributeError:
            fifoCheckDuration = 30
        try:
            fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk
        except AttributeError:
            fifoMaxWorkersPerChunk = 500
        try:
            fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue
        except AttributeError:
            fifoProtectiveDequeue = True
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        sleepTime = (fifoSleepTimeMilli / 1000.0) \
                        if monitor_fifo.enabled else harvester_config.monitor.sleepTime
        adjusted_sleepTime = sleepTime
        if monitor_fifo.enabled:
            monitor_fifo.restore()
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('start a cycle')
            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \
                    not (monitor_fifo.enabled and self.singleMode):
                # run with workers from DB
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(
                        workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(
                            configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueue), score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object to FIFO: {0}'.
                                        format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueueToHead),
                                        score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object to FIFO head: {0}'
                                        .format(errStr))
                last_DB_cycle_timestamp = time.time()
                mainLog.debug('ended run with DB')
            elif monitor_fifo.enabled:
                # run with workers from FIFO
                sw = core_utils.get_stopwatch()
                n_loops = 0
                last_fifo_cycle_timestamp = time.time()
                to_break = False
                obj_dequeued_id_list = []
                obj_to_enqueue_dict = collections.defaultdict(
                    lambda: [[], 0, 0])
                obj_to_enqueue_to_head_dict = collections.defaultdict(
                    lambda: [[], 0, 0])
                remaining_obj_to_enqueue_dict = {}
                remaining_obj_to_enqueue_to_head_dict = {}
                n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0
                while n_loops == 0 or time.time() < last_fifo_cycle_timestamp + fifoCheckDuration:
                    sw.reset()
                    retVal, overhead_time = monitor_fifo.to_check_workers()
                    if overhead_time is not None:
                        n_chunk_peeked_stat += 1
                        sum_overhead_time_stat += overhead_time
                    if retVal:
                        # check fifo size
                        fifo_size = monitor_fifo.size()
                        mainLog.debug('FIFO size is {0}'.format(fifo_size))
                        mainLog.debug('starting run with FIFO')
                        try:
                            obj_gotten = monitor_fifo.get(
                                timeout=1, protective=fifoProtectiveDequeue)
                        except Exception as errStr:
                            mainLog.error(
                                'failed to get object from FIFO: {0}'.format(
                                    errStr))
                        else:
                            if obj_gotten is not None:
                                if fifoProtectiveDequeue:
                                    obj_dequeued_id_list.append(obj_gotten.id)
                                queueName, workSpecsList = obj_gotten.item
                                mainLog.debug(
                                    'got a chunk of {0} workers of {1} from FIFO'
                                    .format(len(workSpecsList), queueName) +
                                    sw.get_elapsed_time())
                                sw.reset()
                                configID = None
                                for workSpecs in workSpecsList:
                                    if configID is None and len(workSpecs) > 0:
                                        configID = workSpecs[0].configID
                                    for workSpec in workSpecs:
                                        if workSpec.pandaid_list is None:
                                            _jobspec_list = workSpec.get_jobspec_list()
                                            if _jobspec_list is not None:
                                                workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()]
                                            else:
                                                workSpec.pandaid_list = []
                                            workSpec.force_update('pandaid_list')
                                retVal = self.monitor_agent_core(
                                    lockedBy,
                                    queueName,
                                    workSpecsList,
                                    from_fifo=True,
                                    config_id=configID)
                                if retVal is not None:
                                    workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                    try:
                                        if len(obj_to_enqueue_dict[queueName]
                                               [0]) + len(
                                                   workSpecsToEnqueue
                                               ) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_dict[queueName][
                                                0].extend(workSpecsToEnqueue)
                                            obj_to_enqueue_dict[queueName][
                                                1] = max(
                                                    obj_to_enqueue_dict[
                                                        queueName][1],
                                                    timeNow_timestamp)
                                            obj_to_enqueue_dict[queueName][
                                                2] = max(
                                                    obj_to_enqueue_dict[
                                                        queueName][2],
                                                    fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_dict[
                                                queueName] = [
                                                    workSpecsToEnqueue,
                                                    timeNow_timestamp,
                                                    fifoCheckInterval
                                                ]
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to gather workers for FIFO: {0}'
                                            .format(errStr))
                                        to_break = True
                                    try:
                                        if len(obj_to_enqueue_to_head_dict[
                                                queueName][0]) + len(
                                                    workSpecsToEnqueueToHead
                                                ) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_to_head_dict[
                                                queueName][0].extend(
                                                    workSpecsToEnqueueToHead)
                                            obj_to_enqueue_to_head_dict[
                                                queueName][1] = max(
                                                    obj_to_enqueue_to_head_dict[
                                                        queueName][1],
                                                    timeNow_timestamp)
                                            obj_to_enqueue_to_head_dict[
                                                queueName][2] = max(
                                                    obj_to_enqueue_to_head_dict[
                                                        queueName][2],
                                                    fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_to_head_dict[
                                                queueName] = [
                                                    workSpecsToEnqueueToHead,
                                                    timeNow_timestamp,
                                                    fifoCheckInterval
                                                ]
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to gather workers for FIFO head: {0}'
                                            .format(errStr))
                                        to_break = True
                                    mainLog.debug(
                                        'checked {0} workers from FIFO'.format(
                                            len(workSpecsList)) +
                                        sw.get_elapsed_time())
                                    n_loops += 1
                                else:
                                    mainLog.debug(
                                        'monitor_agent_core returned None. Skipped putting to FIFO'
                                    )
                                if to_break:
                                    break
                            else:
                                mainLog.debug('got nothing in FIFO')
                    else:
                        mainLog.debug(
                            'workers in FIFO too young to check. Skipped')
                        if self.singleMode:
                            break
                        if overhead_time is not None:
                            time.sleep(
                                max(-overhead_time * random.uniform(0.1, 1),
                                    adjusted_sleepTime))
                        else:
                            time.sleep(
                                max(fifoCheckDuration * random.uniform(0.1, 1),
                                    adjusted_sleepTime))

                # enqueue to fifo
                sw.reset()
                n_chunk_put = 0
                mainLog.debug('putting worker chunks to FIFO')
                for _dct in (obj_to_enqueue_dict,
                             remaining_obj_to_enqueue_dict):
                    for queueName, obj_to_enqueue in iteritems(_dct):
                        try:
                            workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue
                            if workSpecsToEnqueue:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueue), score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueue), queueName,
                                            score))
                        except Exception as errStr:
                            mainLog.error(
                                'failed to put object to FIFO: {0}'.format(
                                    errStr))
                mainLog.debug('putting worker chunks to FIFO head')
                for _dct in (obj_to_enqueue_to_head_dict,
                             remaining_obj_to_enqueue_to_head_dict):
                    for queueName, obj_to_enqueue_to_head in iteritems(_dct):
                        try:
                            workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head
                            if workSpecsToEnqueueToHead:
                                score = fifoCheckInterval - timeNow_timestamp
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueueToHead),
                                    score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueueToHead),
                                            queueName, score))
                        except Exception as errStr:
                            mainLog.error(
                                'failed to put object to FIFO head: {0}'.
                                format(errStr))
                # release protective dequeued objects
                if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0:
                    monitor_fifo.release(ids=obj_dequeued_id_list)
                mainLog.debug(
                    'put {0} worker chunks into FIFO'.format(n_chunk_put) +
                    sw.get_elapsed_time())
                # adjust adjusted_sleepTime
                if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime:
                    speedup_factor = (sum_overhead_time_stat - sleepTime) / (
                        n_chunk_peeked_stat *
                        harvester_config.monitor.checkInterval)
                    speedup_factor = max(speedup_factor, 0)
                    adjusted_sleepTime = adjusted_sleepTime / (1. +
                                                               speedup_factor)
                elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0:
                    adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2
                mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(
                    adjusted_sleepTime))
                # end run with fifo
                mainLog.debug('ended run with FIFO')

            if sw_main.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                mainLog.warning(
                    'a single cycle was longer than lockInterval ' +
                    sw_main.get_elapsed_time())
            else:
                mainLog.debug('done a cycle' + sw_main.get_elapsed_time())

            # check if being terminated
            if self.terminated(adjusted_sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self,
                           lockedBy,
                           queueName,
                           workSpecsList,
                           from_fifo=False,
                           config_id=None):
        tmpQueLog = self.make_logger(_logger,
                                     'id={0} queue={1}'.format(
                                         lockedBy, queueName),
                                     method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return None
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue = []
        workSpecsToEnqueueToHead = []
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ and other fifo attributes
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except Exception:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        try:
            forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
        except AttributeError:
            forceEnqueueInterval = 3600
        try:
            fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval
        except AttributeError:
            fifoMaxPreemptInterval = 60
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                                queueConfig, tmpQueLog,
                                                from_fifo)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                filesToStageOut = dict()
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = []
                isCheckedList = []
                mapType = workSpecs[0].mapType
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger,
                                              'id={0} workerID={1}'.format(
                                                  lockedBy, workSpec.workerID),
                                              method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    isChecked = tmpOut['isChecked']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(
                        tmpStr.format(newStatus, monStatus, diagMessage,
                                      workSpec.is_post_processed(),
                                      str(filesToStageOut)))
                    iWorker += 1
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if isChecked:
                        workSpec.checkTime = datetime.datetime.utcnow()
                    isCheckedList.append(isChecked)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                     diagMessage)
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                            workSpec.workerID, None, only_running=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList.append(filesToStageOut)
                # lock workers for fifo
                if from_fifo:
                    # collect some attributes to be updated when workers are locked
                    worker_id_list = dict()
                    for workSpec, isChecked in zip(workSpecs, isCheckedList):
                        attrs = dict()
                        if isChecked:
                            attrs['checkTime'] = workSpec.checkTime
                            workSpec.force_not_update('checkTime')
                        if workSpec.has_updated_attributes():
                            attrs['lockedBy'] = lockedBy
                            workSpec.lockedBy = lockedBy
                            workSpec.force_not_update('lockedBy')
                        else:
                            attrs['lockedBy'] = None
                        worker_id_list[workSpec.workerID] = attrs
                    temRetLockWorker = self.dbProxy.lock_workers(
                        worker_id_list, harvester_config.monitor.lockInterval)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # update jobs and workers
                if jobSpecs is not None and len(jobSpecs) > 0:
                    tmpQueLog.debug(
                        'updating {0} jobs with {1} workers'.format(
                            len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(
                        mapType, jobSpecs, workSpecs, filesToStageOutList,
                        eventsToUpdateList)
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(
                    jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} workerID={1}'.format(
                                                      lockedBy,
                                                      workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info(
                                'failed to update the DB. Maybe locked by other thread running with DB'
                            )
                        else:
                            if workSpec.status in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled, WorkSpec.ST_missed
                            ]:
                                tmpLog.info(
                                    'worker already in final status. Skipped')
                            else:
                                tmpLog.error(
                                    'failed to update the DB. lockInterval may be too short'
                                )
                else:
                    if jobSpecs is not None:
                        for jobSpec in jobSpecs:
                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} PandaID={1}'.format(
                                    lockedBy, jobSpec.PandaID),
                                method_name='run')
                            tmpLog.debug(
                                'new status={0} subStatus={1} status_in_metadata={2}'
                                .format(
                                    jobSpec.status, jobSpec.subStatus,
                                    jobSpec.get_job_status_from_attributes()))
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        try:
                            messenger.acknowledge_events_files(workSpec)
                        except Exception:
                            core_utils.dump_error_message(tmpQueLog)
                            tmpQueLog.error(
                                'failed to send ACK to workerID={0}'.format(
                                    workSpec.workerID))
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \
                        and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                        and workSpec.workAttributes is not None:
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        _bool, lastCheckAt = workSpec.get_work_params(
                            'lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            last_check_period = forceEnqueueInterval + 1.0
                        if _bool and lastCheckAt is not None and last_check_period > harvester_config.monitor.checkInterval \
                            and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp:
                            tmpQueLog.warning(
                                'last check period of workerID={0} is {1} sec, longer than monitor checkInterval'
                                .format(workSpec.workerID, last_check_period))
                        if (from_fifo) \
                            or (not from_fifo
                                and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval):
                            if not from_fifo:
                                tmpQueLog.warning(
                                    'last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force'
                                    .format(workSpec.workerID,
                                            last_check_period))
                            workSpec.set_work_params(
                                {'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled
                            ]:
                                _bool, startFifoPreemptAt = workSpec.get_work_params(
                                    'startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        startFifoPreemptAt
                                    })
                                tmpQueLog.debug(
                                    'workerID={0} , startFifoPreemptAt: {1}'.
                                    format(workSpec.workerID,
                                           startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead.append(workSpecs)
                                else:
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        timeNow_timestamp
                                    })
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue.append(workSpecs)
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue.append(workSpecs)
        else:
            tmpQueLog.error('failed to check workers')
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log, from_fifo):
        # check timeout value
        try:
            checkTimeout = mon_core.checkTimeout
        except Exception:
            try:
                checkTimeout = harvester_config.monitor.checkTimeout
            except Exception:
                checkTimeout = None
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill,
                'isChecked': True
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
            if not tmpStat:
                tmp_log.error(
                    'failed to check workers with: {0}'.format(tmpOut))
            else:
                tmp_log.debug('checked')
                timeNow = datetime.datetime.utcnow()
                for workSpec, (newStatus,
                               diagMessage) in zip(workersToCheck, tmpOut):
                    workerID = workSpec.workerID
                    tmp_log.debug(
                        'Going to check workerID={0}'.format(workerID))
                    pandaIDs = []
                    if workerID in retMap:
                        # failed to check status
                        if newStatus is None:
                            tmp_log.error(
                                'Failed to check workerID={0} with {1}'.format(
                                    workerID, diagMessage))
                            retMap[workerID]['isChecked'] = False
                            # set status
                            if workSpec.checkTime is not None and checkTimeout is not None and \
                                    timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout):
                                # kill due to timeout
                                tmp_log.debug(
                                    'kill workerID={0} due to continuous check failure'
                                    .format(workerID))
                                self.dbProxy.kill_worker(workSpec.workerID)
                                newStatus = WorkSpec.ST_cancelled
                                diagMessage = 'killed due to continuous check failure. ' + diagMessage
                                workSpec.set_pilot_error(
                                    PilotErrors.ERR_FAILEDBYSERVER,
                                    diagMessage)
                            else:
                                # use original status
                                newStatus = workSpec.status
                        # request kill
                        if messenger.kill_requested(workSpec):
                            tmp_log.debug(
                                'kill workerID={0} as requested'.format(
                                    workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)

                        # expired heartbeat - only when requested in the configuration
                        try:
                            # check if the queue configuration requires checking for worker heartbeat
                            worker_heartbeat_limit = int(
                                queue_config.messenger['worker_heartbeat'])
                        except (AttributeError, KeyError):
                            worker_heartbeat_limit = None
                        tmp_log.debug(
                            'workerID={0} heartbeat limit is configured to {1}'
                            .format(workerID, worker_heartbeat_limit))
                        if worker_heartbeat_limit:
                            if messenger.is_alive(workSpec,
                                                  worker_heartbeat_limit):
                                tmp_log.debug(
                                    'heartbeat for workerID={0} is valid'.
                                    format(workerID))
                            else:
                                tmp_log.debug(
                                    'heartbeat for workerID={0} expired: sending kill request'
                                    .format(workerID))
                                self.dbProxy.kill_worker(workSpec.workerID)

                        # get work attributes
                        workAttributes = messenger.get_work_attributes(
                            workSpec)
                        retMap[workerID]['workAttributes'] = workAttributes
                        # get output files
                        filesToStageOut = messenger.get_files_to_stage_out(
                            workSpec)
                        retMap[workerID]['filesToStageOut'] = filesToStageOut
                        # get events to update
                        if workSpec.eventsRequest in [
                                WorkSpec.EV_useEvents,
                                WorkSpec.EV_requestEvents
                        ]:
                            eventsToUpdate = messenger.events_to_update(
                                workSpec)
                            retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                        # request events
                        if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                            eventsRequestParams = messenger.events_requested(
                                workSpec)
                            retMap[workerID][
                                'eventsRequestParams'] = eventsRequestParams
                        # get PandaIDs for pull model
                        if workSpec.mapType == WorkSpec.MT_NoJob:
                            pandaIDs = messenger.get_panda_ids(workSpec)
                        retMap[workerID]['pandaIDs'] = pandaIDs
                        # keep original new status
                        retMap[workerID]['monStatus'] = newStatus
                        # set running or idle while there are events to update or files to stage out
                        if newStatus in [
                                WorkSpec.ST_finished, WorkSpec.ST_failed,
                                WorkSpec.ST_cancelled
                        ]:
                            if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                            len(retMap[workerID]['eventsToUpdate']) > 0:
                                if workSpec.status == WorkSpec.ST_running:
                                    newStatus = WorkSpec.ST_running
                                else:
                                    newStatus = WorkSpec.ST_idle
                            elif not workSpec.is_post_processed():
                                if not queue_config.is_no_heartbeat_status(
                                        newStatus):
                                    # post processing unless heartbeat is suppressed
                                    jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                        workSpec.workerID,
                                        None,
                                        True,
                                        only_running=True)
                                    # post processing
                                    messenger.post_processing(
                                        workSpec, jobSpecs, workSpec.mapType)
                                workSpec.post_processed()
                                if workSpec.status == WorkSpec.ST_running:
                                    newStatus = WorkSpec.ST_running
                                else:
                                    newStatus = WorkSpec.ST_idle
                            # reset modification time to immediately trigger subsequent lookup
                            if not self.monitor_fifo.enabled:
                                workSpec.trigger_next_lookup()
                        retMap[workerID]['newStatus'] = newStatus
                        retMap[workerID]['diagMessage'] = diagMessage
                    else:
                        tmp_log.debug(
                            'workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
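Example No. 6 above enqueues regular worker chunks with score = fifoCheckInterval + timeNow_timestamp and preempted (final-status) chunks with score = fifoCheckInterval - timeNow_timestamp. A small illustration, not from the original source and assuming the FIFO hands back the lowest score first, of why the second form acts as a put to the head of the queue:

import time

fifoCheckInterval = 300                               # seconds until the chunk should be looked at again
timeNow_timestamp = time.time()

tail_score = fifoCheckInterval + timeNow_timestamp    # regular put: large positive value
head_score = fifoCheckInterval - timeNow_timestamp    # put to head: large negative value

print('tail score: {0:.0f}'.format(tail_score))
print('head score: {0:.0f}'.format(head_score))
# assuming lower scores are dequeued first, head entries always come out before tail entries
print('head before tail:', head_score < tail_score)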
Example No. 7
class Submitter(AgentBase):
    # fifos
    monitor_fifo = MonitorFIFO()

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        while True:
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(
                    commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} queue={1} rtype={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal[
                                    'nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(
                                        queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug(
                                        'skipped since no new worker is needed based on current stats'
                                    )
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(
                                    queueName)
                                workerMakerCore = self.workerMaker.get_plugin(
                                    queueConfig)
                                # check if resource is ready
                                if hasattr(
                                        workerMakerCore, 'dynamicSizing'
                                ) and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(
                                        queueConfig, resource_type,
                                        workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' %
                                                 numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore,
                                                   'staticWorkers'):
                                            nQRWorkers = tmpVal[
                                                'nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug(
                                                'staticWorkers: %s, nQRWorkers(Queue+Running): %s'
                                                %
                                                (workerMakerCore.staticWorkers,
                                                 nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug(
                                                    'No left static workers, skip'
                                                )
                                                continue
                                            else:
                                                nWorkers = min(
                                                    workerMakerCore.staticWorkers - nQRWorkers,
                                                    nWorkers)
                                                tmpLog.debug(
                                                    'staticWorkers: %s, nWorkers: %s' %
                                                    (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug(
                                                'skip since no resources are ready'
                                            )
                                            continue
                                    else:
                                        nWorkers = min(nWorkers,
                                                       numReadyResources)
                                # post action of worker maker
                                if hasattr(
                                        workerMakerCore, 'skipOnFail'
                                ) and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval, lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(
                                        nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady,
                                        nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval, lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(
                                        nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, None,
                                        nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval, lockedBy)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(
                                        queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(
                                    len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(
                                    jobChunks,
                                    queueConfig,
                                    nReady,
                                    resource_type,
                                    maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug(
                                        'successfully made {0} workers'.format(
                                            len(okChunks)))
                                else:
                                    tmpLog.debug(
                                        'made {0} workers, while {1} workers failed'
                                        .format(len(okChunks), len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(
                                                jobSpec, {
                                                    'lockedBy': lockedBy,
                                                    'subStatus': 'prepared'
                                                })
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                                workSpec.nJobsToReFill = None
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(
                                        queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'
                                            .format(queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(
                                        queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'
                                            .format(queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(
                                                workSpec,
                                                workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'
                                                    .format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'
                                                    .format(
                                                        workSpec.workerID,
                                                        tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(
                                        workSpecList, lockedBy)
                                    # submit
                                    tmpLog.info(
                                        'submitting {0} workers'.format(
                                            len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                        submitterCore, workSpecList)
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(
                                                WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(
                                                            jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(
                                                WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(
                                                WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({
                                                'lastCheckAt':
                                                timeNow_timestamp
                                            })
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(
                                            workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(
                                                            tmpStr.format(
                                                                workSpec.workerID,
                                                                jobSpec.PandaID,
                                                                workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(
                                                            tmpStr.format(
                                                                workSpec.workerID,
                                                                jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(
                                                        tmpStr.format(
                                                            jobSpec.PandaID,
                                                            workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue))
                                        mainLog.debug(
                                            'put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter,
                                         'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute(
                            'submitTime', newTime, site_name=siteName)
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
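
Note on the wrapper above: it only forwards workers that are not already ready or running to the plugin's submit_workers(), and it expects the plugin to return one (status, diagnostic message) pair per submitted worker, in the same order. A minimal sketch of a submitter plugin honoring that contract follows; the class name DummySubmitter and the always-succeed logic are illustrative assumptions, not part of Harvester itself.

# minimal sketch of a submitter plugin, assuming the (bool, message) contract used above
class DummySubmitter(object):
    # return one (status, diagnostic message) tuple per worker, in order
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:
            # a real plugin would submit to the batch system here and record its ID
            workSpec.batchID = 'dummy-{0}'.format(workSpec.workerID)
            retList.append((True, ''))
        return retList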
Exemplo n.º 8
0
class Monitor(AgentBase):
    # fifos
    monitor_fifo = MonitorFIFO()

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        self.startTimestamp = time.time()

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.ident)
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')

            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime:
                # run with workers from DB
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(
                        workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(
                            configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if self.monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
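                            # note: the score presumably acts as the earliest time to check again,
                            # so (fifoCheckInterval + timeNow_timestamp) defers the next check,
                            # while (fifoCheckInterval - timeNow_timestamp), used for the FIFO head
                            # below, sorts far in the past and is therefore fetched first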
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueue), score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object to FIFO: {0}'.
                                        format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueueToHead),
                                        score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object to FIFO head: {0}'
                                        .format(errStr))
                last_DB_cycle_timestamp = time.time()
                mainLog.debug('ended run with DB')
            elif self.monitor_fifo.enabled:
                # run with workers from FIFO
                if monitor_fifo.to_check_workers():
                    # check fifo size
                    fifo_size = monitor_fifo.size()
                    mainLog.debug('FIFO size is {0}'.format(fifo_size))
                    mainLog.debug('starting run with FIFO')
                    try:
                        obj_gotten = monitor_fifo.get(timeout=1)
                    except Exception as errStr:
                        mainLog.error(
                            'failed to get object from FIFO: {0}'.format(
                                errStr))
                    else:
                        if obj_gotten is not None:
                            queueName, workSpecsList = obj_gotten
                            mainLog.debug('got {0} workers of {1}'.format(
                                len(workSpecsList), queueName))
                            configID = workSpecsList[0][0].configID
                            for workSpecs in workSpecsList:
                                for workSpec in workSpecs:
                                    if workSpec.pandaid_list is None:
                                        _jobspec_list = workSpec.get_jobspec_list()
                                        if _jobspec_list is not None:
                                            workSpec.pandaid_list = [
                                                j.PandaID for j in _jobspec_list]
                                        else:
                                            workSpec.pandaid_list = []
                                        workSpec.force_update('pandaid_list')
                            retVal = self.monitor_agent_core(
                                lockedBy,
                                queueName,
                                workSpecsList,
                                from_fifo=True,
                                config_id=configID)
                            if retVal is not None:
                                workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                if workSpecsToEnqueue:
                                    mainLog.debug('putting workers to FIFO')
                                    try:
                                        score = fifoCheckInterval + timeNow_timestamp
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue),
                                            score)
                                        mainLog.info(
                                            'put workers of {0} to FIFO with score {1}'
                                            .format(queueName, score))
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to put object to FIFO: {0}'
                                            .format(errStr))
                                if workSpecsToEnqueueToHead:
                                    mainLog.debug(
                                        'putting workers to FIFO head')
                                    try:
                                        score = fifoCheckInterval - timeNow_timestamp
                                        monitor_fifo.put(
                                            (queueName,
                                             workSpecsToEnqueueToHead), score)
                                        mainLog.info(
                                            'put workers of {0} to FIFO with score {1}'
                                            .format(queueName, score))
                                    except Exception as errStr:
                                        mainLog.error(
                                            'failed to put object to FIFO head: {0}'
                                            .format(errStr))
                            else:
                                mainLog.debug(
                                    'monitor_agent_core returned None. Skipped putting to FIFO'
                                )
                        else:
                            mainLog.debug('got nothing in FIFO')
                    mainLog.debug('ended run with FIFO')
                else:
                    mainLog.debug(
                        'workers in FIFO too young to check. Skipped')

            if sw.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                mainLog.warning(
                    'a single cycle was longer than lockInterval ' +
                    sw.get_elapsed_time())
            else:
                mainLog.debug('done' + sw.get_elapsed_time())

            # check if being terminated
            sleepTime = (harvester_config.monitor.fifoSleepTimeMilli / 1000.0) \
                            if self.monitor_fifo.enabled else harvester_config.monitor.sleepTime
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self,
                           lockedBy,
                           queueName,
                           workSpecsList,
                           from_fifo=False,
                           config_id=None):
        tmpQueLog = self.make_logger(_logger,
                                     'id={0} queue={1}'.format(
                                         lockedBy, queueName),
                                     method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue = []
        workSpecsToEnqueueToHead = []
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except AttributeError:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                                queueConfig, tmpQueLog)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                filesToStageOut = dict()
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = []
                mapType = workSpecs[0].mapType
                # lock workers for fifo
                temRetLockWorker = None
                if from_fifo:
                    # lock workers
                    worker_id_list = [w.workerID for w in workSpecs]
                    temRetLockWorker = self.dbProxy.lock_workers(
                        worker_id_list, harvester_config.monitor.lockInterval,
                        lockedBy)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger,
                                              'id={0} workerID={1}'.format(
                                                  lockedBy, workSpec.workerID),
                                              method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(
                        tmpStr.format(newStatus, monStatus, diagMessage,
                                      workSpec.is_post_processed(),
                                      str(filesToStageOut)))
                    iWorker += 1
                    if from_fifo:
                        workSpec.lockedBy = lockedBy
                        workSpec.force_update('lockedBy')
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                     diagMessage)
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                            workSpec.workerID, None, only_running=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList.append(filesToStageOut)
                # update jobs and workers
                if jobSpecs is not None:
                    tmpQueLog.debug(
                        'updating {0} jobs with {1} workers'.format(
                            len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(
                        mapType, jobSpecs, workSpecs, filesToStageOutList,
                        eventsToUpdateList)
                    for jobSpec in jobSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} PandaID={1}'.format(
                                                      lockedBy,
                                                      jobSpec.PandaID),
                                                  method_name='run')
                        tmpLog.debug(
                            'new status={0} subStatus={1} status_in_metadata={2}'
                            .format(jobSpec.status, jobSpec.subStatus,
                                    jobSpec.get_job_status_from_attributes()))
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(
                    jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} workerID={1}'.format(
                                                      lockedBy,
                                                      workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info(
                                'failed to update the DB. Maybe locked by other thread running with DB'
                            )
                        else:
                            tmpLog.error(
                                'failed to update the DB. lockInterval may be too short'
                            )
                        sendWarning = True
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        messenger.acknowledge_events_files(workSpec)
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running] \
                        and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                        and workSpec.workAttributes is not None:
                        forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        _bool, lastCheckAt = workSpec.get_work_params(
                            'lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            last_check_period = forceEnqueueInterval + 1.0
                        if (from_fifo and tmpRet) \
                            or (not from_fifo and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval):
                            if not from_fifo and _bool and lastCheckAt is not None \
                                and last_check_period > harvester_config.monitor.checkInterval:
                                tmpQueLog.warning(
                                    'last check period of workerID={0} is {1} sec, longer than monitor checkInterval'
                                    .format(workSpec.workerID,
                                            last_check_period))
                            workSpec.set_work_params(
                                {'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled
                            ]:
                                _bool, startFifoPreemptAt = workSpec.get_work_params(
                                    'startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        startFifoPreemptAt
                                    })
                                tmpQueLog.debug(
                                    'workerID={0} , startFifoPreemptAt: {1}'.
                                    format(workSpec.workerID,
                                           startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < harvester_config.monitor.fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead.append(workSpecs)
                                else:
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        timeNow_timestamp
                                    })
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue.append(workSpecs)
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue.append(workSpecs)
        else:
            tmpQueLog.error('failed to check workers')
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log):
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add a default entry for this worker; it is updated below with the plugin results for checked workers
            retMap[workSpec.workerID] = {
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
            if not tmpStat:
                tmp_log.error(
                    'failed to check workers with: {0}'.format(tmpOut))
            else:
                tmp_log.debug('checked')
                for workSpec, (newStatus,
                               diagMessage) in zip(workersToCheck, tmpOut):
                    workerID = workSpec.workerID
                    tmp_log.debug(
                        'Going to check workerID={0}'.format(workerID))
                    pandaIDs = []
                    if workerID in retMap:
                        # request kill
                        if messenger.kill_requested(workSpec):
                            self.dbProxy.kill_worker(workSpec.workerID)

                        # expired heartbeat - only when requested in the configuration
                        try:
                            # check if the queue configuration requires checking for worker heartbeat
                            worker_heartbeat_limit = int(
                                queue_config.messenger['worker_heartbeat'])
                        except (AttributeError, KeyError):
                            worker_heartbeat_limit = None
                        tmp_log.debug(
                            'workerID={0} heartbeat limit is configured to {1}'
                            .format(workerID, worker_heartbeat_limit))
                        if worker_heartbeat_limit:
                            if messenger.is_alive(workSpec,
                                                  worker_heartbeat_limit):
                                tmp_log.debug(
                                    'heartbeat for workerID={0} is valid'.
                                    format(workerID))
                            else:
                                tmp_log.debug(
                                    'heartbeat for workerID={0} expired: sending kill request'
                                    .format(workerID))
                                self.dbProxy.kill_worker(workSpec.workerID)

                        # get work attributes
                        workAttributes = messenger.get_work_attributes(
                            workSpec)
                        retMap[workerID]['workAttributes'] = workAttributes
                        # get output files
                        filesToStageOut = messenger.get_files_to_stage_out(
                            workSpec)
                        retMap[workerID]['filesToStageOut'] = filesToStageOut
                        # get events to update
                        if workSpec.eventsRequest in [
                                WorkSpec.EV_useEvents,
                                WorkSpec.EV_requestEvents
                        ]:
                            eventsToUpdate = messenger.events_to_update(
                                workSpec)
                            retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                        # request events
                        if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                            eventsRequestParams = messenger.events_requested(
                                workSpec)
                            retMap[workerID][
                                'eventsRequestParams'] = eventsRequestParams
                        # get PandaIDs for pull model
                        if workSpec.mapType == WorkSpec.MT_NoJob:
                            pandaIDs = messenger.get_panda_ids(workSpec)
                        retMap[workerID]['pandaIDs'] = pandaIDs
                        # keep original new status
                        retMap[workerID]['monStatus'] = newStatus
                        # set running while there are events to update or files to stage out
                        if newStatus in [
                                WorkSpec.ST_finished, WorkSpec.ST_failed,
                                WorkSpec.ST_cancelled
                        ]:
                            if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                            len(retMap[workerID]['eventsToUpdate']) > 0:
                                newStatus = WorkSpec.ST_running
                            elif not workSpec.is_post_processed():
                                if not queue_config.is_no_heartbeat_status(
                                        newStatus):
                                    # post processing unless heartbeat is suppressed
                                    jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                        workSpec.workerID,
                                        None,
                                        True,
                                        only_running=True)
                                    # post processing
                                    messenger.post_processing(
                                        workSpec, jobSpecs, workSpec.mapType)
                                workSpec.post_processed()
                                newStatus = WorkSpec.ST_running
                            # reset modification time to immediately trigger subsequent lookup
                            workSpec.trigger_next_lookup()
                        retMap[workerID]['newStatus'] = newStatus
                        retMap[workerID]['diagMessage'] = diagMessage
                    else:
                        tmp_log.debug(
                            'workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
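
    # Note: check_workers above expects the monitor plugin to return a (status, output) pair,
    # where output holds one (newStatus, diagMessage) tuple per checked worker, in order, and
    # newStatus is one of the WorkSpec.ST_* states. A minimal plugin sketch is given below as a
    # comment; the class name and the fixed 'running' result are illustrative assumptions, not
    # part of Harvester itself:
    #
    #     class DummyMonitor(object):
    #         def check_workers(self, workspec_list):
    #             retList = []
    #             for workSpec in workspec_list:
    #                 # a real plugin would query the batch system with workSpec.batchID here
    #                 retList.append((WorkSpec.ST_running, ''))
    #             return True, retList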