Example #1
def kill_workers(arguments):
    status_in = ('ALL' if len(arguments.status) == 1 and arguments.status[0] == 'ALL'
                 else arguments.status)
    computingSite_in = ('ALL' if len(arguments.sites) == 1 and arguments.sites[0] == 'ALL'
                        else arguments.sites)
    computingElement_in = ('ALL' if len(arguments.ces) == 1 and arguments.ces[0] == 'ALL'
                           else arguments.ces)
    submissionHost_in = ('ALL' if len(arguments.submissionhosts) == 1 and arguments.submissionhosts[0] == 'ALL'
                         else arguments.submissionhosts)
    dbProxy = DBProxy()
    retVal = dbProxy.kill_workers_by_query({
        'status': status_in,
        'computingSite': computingSite_in,
        'computingElement': computingElement_in,
        'submissionHost': submissionHost_in
    })
    if retVal is not None:
        msg_temp = ('Sweeper will soon kill {n_workers} workers, with '
                    'status in {status_in}, '
                    'computingSite in {computingSite_in}, '
                    'computingElement in {computingElement_in}, '
                    'submissionHost in {submissionHost_in}')
        print(
            msg_temp.format(n_workers=retVal,
                            status_in=status_in,
                            computingSite_in=computingSite_in,
                            computingElement_in=computingElement_in,
                            submissionHost_in=submissionHost_in))
    else:
        mainLogger.critical('Failed to kill workers. See panda-db_proxy.log')
def qconf_purge(arguments):
    queueName = arguments.queue
    dbProxy = DBProxy()
    retVal = dbProxy.purge_pq(queueName)
    if retVal:
        print('Purged {0} from harvester DB'.format(queueName))
    else:
        mainLogger.critical('Failed to purge {0}. See panda-db_proxy.log'.format(queueName))
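
These CLI handlers expect an argparse-style namespace with status, sites, ces, submissionhosts and queue attributes. Below is a minimal sketch of how such a namespace could be wired up; the sub-command and option names are assumptions for illustration, not necessarily the real harvester-admin definitions.

import argparse

# hypothetical CLI wiring for illustration only
parser = argparse.ArgumentParser(prog='harvester-admin')
subparsers = parser.add_subparsers()

kill_parser = subparsers.add_parser('kill_workers')
kill_parser.add_argument('--status', nargs='+', default=['ALL'])
kill_parser.add_argument('--sites', nargs='+', default=['ALL'])
kill_parser.add_argument('--ces', nargs='+', default=['ALL'])
kill_parser.add_argument('--submissionhosts', nargs='+', default=['ALL'])
kill_parser.set_defaults(func=kill_workers)

purge_parser = subparsers.add_parser('qconf_purge')
purge_parser.add_argument('--queue', required=True)
purge_parser.set_defaults(func=qconf_purge)

arguments = parser.parse_args(['kill_workers', '--status', 'submitted', 'running'])
arguments.func(arguments)  # dispatches to kill_workers(arguments) defined above
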
Example #3
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.ident)
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec)
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
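
workSpec.eventsRequestParams, which get_event_ranges receives above, appears to be a per-PandaID dictionary built by the Submitter (see Example #23); a sketch of its shape with made-up IDs:

# made-up IDs; the keys mirror what the Submitter stores per PandaID
eventsRequestParams = {
    4658372190: {'pandaID': 4658372190,
                 'taskID': 21437650,
                 'jobsetID': 1,
                 'nRanges': 8},
}
# the EventFeeder hands this dictionary straight to communicator.get_event_ranges()
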
Example #4
 def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
     # initialize database and config
     self.singleMode = single_mode
     self.stopEvent = stop_event
     self.daemonMode = daemon_mode
     from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
     self.communicatorPool = CommunicatorPool()
     from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
     self.queueConfigMapper = QueueConfigMapper()
     from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
     dbProxy = DBProxy()
     dbProxy.make_tables(self.queueConfigMapper)
Example #6
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.workerMaker = WorkerMaker()
     self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
     self.pluginFactory = PluginFactory()
Example #7
File: aCTReport.py Project: manfuin/aCT
    def HarvesterReport(self):
        try:
            from distutils.sysconfig import get_python_lib # pylint: disable=import-error
            sys.path.append(get_python_lib()+'/pandacommon')

            os.environ['PANDA_HOME']=os.environ['VIRTUAL_ENV']

            from collections import defaultdict # pylint: disable=import-error
            from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy # pylint: disable=import-error

            self.dbProxy = DBProxy()

            workers = self.dbProxy.get_worker_stats_bulk(None)
            rep = defaultdict(dict)

            rtot = defaultdict(int)

            for site, prodsourcelabels in workers.items():
                for prodsourcelabel, resources in prodsourcelabels.items():
                    for resource, jobs in resources.items():
                        rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
                        for state, count in jobs.items():
                            rtot[state] += count
            self.log(f"All Harvester jobs: {sum(rtot.values())}       prodSourceLabel: submitted/running")
            for k in sorted(rep.keys()):
                log=f"{k:>28.28}:"
                for psl, jobs in rep[k].items():
                    log += f"{psl:>10}: {jobs['submitted']}/{jobs['running']}"
                self.log(log)
            log = f"{'Totals':>28}:  submitted: {rtot['submitted']}  running: {rtot['running']}"
            self.log(log+'\n\n')
        except:
            pass
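
get_worker_stats_bulk(None) apparently returns a nested mapping of site -> prodSourceLabel -> resource type -> per-state worker counts; that shape is inferred from the loops above rather than documented here. A self-contained sketch of the same aggregation on hand-made data:

from collections import defaultdict

# hypothetical stats in the shape the loops above expect
workers = {
    'CERN-PROD': {
        'managed': {'SCORE': {'submitted': 5, 'running': 20},
                    'MCORE': {'submitted': 2, 'running': 8}},
    },
    'BNL_PROD': {
        None: {'SCORE': {'submitted': 1, 'running': 3}},
    },
}

rep = defaultdict(dict)
rtot = defaultdict(int)
for site, prodsourcelabels in workers.items():
    for prodsourcelabel, resources in prodsourcelabels.items():
        for resource, jobs in resources.items():
            rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
            for state, count in jobs.items():
                rtot[state] += count

print(dict(rep))   # {'CERN-PROD-SCORE': {'managed': {...}}, ...}
print(dict(rtot))  # {'submitted': 8, 'running': 31}
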
Example #8
 def __init__(self, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.pluginFactory = PluginFactory()
     self.dbProxy = DBProxy()
     # get module and class names
     moduleNames = self.get_list(harvester_config.credmanager.moduleName)
     classNames = self.get_list(harvester_config.credmanager.className)
     # file names of original certificates
     if hasattr(harvester_config.credmanager, 'inCertFile'):
         inCertFiles = self.get_list(
             harvester_config.credmanager.inCertFile)
     else:
         inCertFiles = self.get_list(harvester_config.credmanager.certFile)
     # file names of certificates to be generated
     if hasattr(harvester_config.credmanager, 'outCertFile'):
         outCertFiles = self.get_list(
             harvester_config.credmanager.outCertFile)
     else:
         # use the file name of the certificate for panda connection as output name
         outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
     # VOMS
     vomses = self.get_list(harvester_config.credmanager.voms)
     # get plugin
     self.exeCores = []
     for moduleName, className, inCertFile, outCertFile, voms in \
             zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
         pluginPar = {}
         pluginPar['module'] = moduleName
         pluginPar['name'] = className
         pluginPar['inCertFile'] = inCertFile
         pluginPar['outCertFile'] = outCertFile
         pluginPar['voms'] = voms
         exeCore = self.pluginFactory.get_plugin(pluginPar)
         self.exeCores.append(exeCore)
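
The credential manager configuration is a set of parallel lists that get zipped into one plugin descriptor per position. With hypothetical config values (module, class and file names are made up for illustration), the resulting pluginPar dictionaries look like this:

moduleNames = ['pandaharvester.harvestercredmanager.no_voms_cred_manager']
classNames = ['NoVomsCredManager']
inCertFiles = ['/data/proxies/atlas.long.proxy']
outCertFiles = ['/data/proxies/atlas.production.proxy']
vomses = ['atlas:/atlas/Role=production']

for moduleName, className, inCertFile, outCertFile, voms in \
        zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
    pluginPar = {'module': moduleName, 'name': className,
                 'inCertFile': inCertFile, 'outCertFile': outCertFile,
                 'voms': voms}
    print(pluginPar)  # each descriptor is passed to PluginFactory().get_plugin(pluginPar)
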
Example #9
 def __init__(self, communicator, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.dbProxy = DBProxy()
     self.communicator = communicator
     self.queueConfigMapper = queue_config_mapper
     self._last_stats_update = None
     self._last_metrics_update = None
Example #10
 def __init__(self, communicator, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.db_proxy = DBProxy()
     self.communicator = communicator
     self.queueConfigMapper = queue_config_mapper
     self.nodeName = socket.gethostname()
     self.lastHeartbeat = None
Example #11
 def __init__(self, communicator, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.dbProxy = DBProxy()
     self.communicator = communicator
     self.nodeName = socket.gethostname()
     self.queueConfigMapper = queue_config_mapper
     self.pluginFactory = PluginFactory()
Example #13
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.pluginFactory = PluginFactory()
     self.startTimestamp = time.time()
     self.monitor_fifo = MonitorFIFO()
     self.apfmon = Apfmon(self.queueConfigMapper)
Example #14
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.workerMaker = WorkerMaker()
     self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
     self.pluginFactory = PluginFactory()
     self.monitor_fifo = MonitorFIFO()
     self.apfmon = Apfmon(self.queueConfigMapper)
Example #15
 def __init__(self, queue_config_mapper):
     self.queue_configMapper = queue_config_mapper
     self.pluginFactory = PluginFactory()
     self.dbProxy = DBProxy()
     self.throttlerMap = dict()
     self.apf_mon = Apfmon(self.queue_configMapper)
     try:
         self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
     except AttributeError:
         self.maxNewWorkers = None
Example #16
 def __init__(self, queue_config_mapper):
     self.queueConfigMapper = queue_config_mapper
     self.pluginFactory = PluginFactory()
     self.dbProxy = DBProxy()
     self.throttlerMap = dict()
     self.apf_mon = Apfmon(self.queueConfigMapper)
     try:
         self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
     except AttributeError:
         self.maxNewWorkers = None
Example #17
    def __init__(self, **kwarg):
        '''Set up DB connection and credentials'''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                              list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
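
For reference, the role-to-proxy dictionary built above keeps whatever follows '=' in each VOMS string; with hypothetical configuration values:

voms_list = ['atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']          # assumed values
out_cert_files = ['/data/proxies/production.proxy', '/data/proxies/pilot.proxy']  # assumed values

certs = dict(zip([r.split('=')[1] for r in voms_list], out_cert_files))
print(certs)
# {'production': '/data/proxies/production.proxy', 'pilot': '/data/proxies/pilot.proxy'}
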
Example #18
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queue_config_mapper = queue_config_mapper
     self.pluginFactory = PluginFactory()
     self.dbProxy = DBProxy()
     # plugin cores
     self.exeCores = []
     self.queue_exe_cores = []
     # get plugin from harvester config
     self.get_cores_from_harvester_config()
     # update plugin cores from queue config
     self.update_cores_from_queue_config()
Example #19
    def __init__(self, **kwarg):
        '''Set up DB connection and credentials'''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        self.certs = dict(
            zip([
                r.split('=')[1]
                for r in list(harvester_config.credmanager.voms)
            ], list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(
            arc.initializeCredentialsType.SkipCredentials)
Example #20
    def __init__(self, pid_file, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()

        if pid_file is not None:
            self.pid_file = pid_file
        else:
            try:
                self.pid_file = harvester_config.service_monitor.pidfile
            except Exception:
                self.pid_file = None

        self.pid = self.get_master_pid()
        self.master_process = psutil.Process(self.pid)
        self.children = self.master_process.children(recursive=True)

        self.cpu_count = multiprocessing.cpu_count()
Example #21
 def __init__(self, queue_config_mapper, single_mode=False):
     AgentBase.__init__(self, single_mode)
     self.queueConfigMapper = queue_config_mapper
     self.dbProxy = DBProxy()
     self.pluginFactory = PluginFactory()
     self.startTimestamp = time.time()
     self.monitor_fifo = MonitorFIFO()
     if self.monitor_fifo.enabled:
         self.monitor_event_fifo = MonitorEventFIFO()
     else:
         self.monitor_event_fifo = None
     self.apfmon = Apfmon(self.queueConfigMapper)
     self.eventBasedMonCoreList = []
     if getattr(harvester_config.monitor, 'eventBasedEnable', False):
         for pluginConf in harvester_config.monitor.eventBasedPlugins:
             pluginFactory = PluginFactory()
             self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf))
Example #23
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime)
            mainLog.debug('got {0} queues for site {1}'.format(
                len(curWorkers), siteName))
            # get commands
            if siteName is not None:
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][
                                    'nNewWorkers'] = tmpNewVal
            # define number of new workers
            if len(curWorkers) == 0:
                nWorkersPerQueue = dict()
            else:
                nWorkersPerQueue = self.workerAdjuster.define_num_workers(
                    curWorkers, siteName)
            if nWorkersPerQueue is None:
                mainLog.error(
                    'WorkerAdjuster failed to define the number of workers')
            elif len(nWorkersPerQueue) == 0:
                pass
            else:
                # loop over all queues
                for queueName, tmpVal in iteritems(nWorkersPerQueue):
                    tmpLog = core_utils.make_logger(
                        _logger,
                        'queue={0}'.format(queueName),
                        method_name='run')
                    tmpLog.debug('start')
                    nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                    nReady = tmpVal['nReady']
                    # check queue
                    if not self.queueConfigMapper.has_queue(queueName):
                        tmpLog.error('config not found')
                        continue
                    # no new workers
                    if nWorkers == 0:
                        tmpLog.debug(
                            'skipped since no new worker is needed based on current stats'
                        )
                        continue
                    # get queue
                    queueConfig = self.queueConfigMapper.get_queue(queueName)
                    # actions based on mapping type
                    if queueConfig.mapType == WorkSpec.MT_NoJob:
                        # workers without jobs
                        jobChunks = []
                        for i in range(nWorkers):
                            jobChunks.append([])
                    elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                        # one worker per one job
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, 1, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy)
                    elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                        # one worker for multiple jobs
                        nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                            queueConfig, nWorkers)
                        tmpLog.debug(
                            'nJobsPerWorker={0}'.format(nJobsPerWorker))
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, nJobsPerWorker, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy,
                            queueConfig.allowJobMixture)
                    elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                        # multiple workers for one job
                        nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                            queueConfig, nWorkers)
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, None, nWorkersPerJob,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval, lockedBy)
                    else:
                        tmpLog.error('unknown mapType={0}'.format(
                            queueConfig.mapType))
                        continue
                    tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                    if len(jobChunks) == 0:
                        continue
                    # make workers
                    okChunks, ngChunks = self.workerMaker.make_workers(
                        jobChunks, queueConfig, nReady)
                    if len(ngChunks) == 0:
                        tmpLog.debug('successfully made {0} workers'.format(
                            len(okChunks)))
                    else:
                        tmpLog.debug(
                            'made {0} workers, while {1} workers failed'.
                            format(len(okChunks), len(ngChunks)))
                    timeNow = datetime.datetime.utcnow()
                    # NG
                    for ngJobs in ngChunks:
                        for jobSpec in ngJobs:
                            jobSpec.status = 'failed'
                            jobSpec.subStatus = 'failedtomake'
                            jobSpec.stateChangeTime = timeNow
                            jobSpec.lockedBy = None
                            jobSpec.trigger_propagation()
                            self.dbProxy.update_job(jobSpec, {
                                'lockedBy': lockedBy,
                                'subStatus': 'prepared'
                            })
                    # OK
                    pandaIDs = set()
                    workSpecList = []
                    if len(okChunks) > 0:
                        for workSpec, okJobs in okChunks:
                            # has job
                            if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                    or queueConfig.mapType == WorkSpec.MT_NoJob:
                                workSpec.hasJob = 0
                            else:
                                workSpec.hasJob = 1
                                if workSpec.nJobsToReFill in [None, 0]:
                                    workSpec.set_jobspec_list(okJobs)
                                else:
                                    # refill free slots while the worker is running
                                    workSpec.set_jobspec_list(
                                        okJobs[:workSpec.nJobsToReFill])
                                    workSpec.nJobsToReFill = None
                                    for jobSpec in okJobs[workSpec.
                                                          nJobsToReFill:]:
                                        pandaIDs.add(jobSpec.PandaID)
                            # map type
                            workSpec.mapType = queueConfig.mapType
                            # queue name
                            workSpec.computingSite = queueConfig.queueName
                            # set access point
                            workSpec.accessPoint = queueConfig.messenger[
                                'accessPoint']
                            # events
                            if len(okJobs) > 0 and (
                                    'eventService' in okJobs[0].jobParams
                                    or 'cloneJob' in okJobs[0].jobParams):
                                workSpec.eventsRequest = WorkSpec.EV_useEvents
                            workSpecList.append(workSpec)
                    if len(workSpecList) > 0:
                        # get plugin for submitter
                        submitterCore = self.pluginFactory.get_plugin(
                            queueConfig.submitter)
                        if submitterCore is None:
                            # not found
                            tmpLog.error(
                                'submitter plugin for {0} not found'.format(
                                    jobSpec.computingSite))
                            continue
                        # get plugin for messenger
                        messenger = self.pluginFactory.get_plugin(
                            queueConfig.messenger)
                        if messenger is None:
                            # not found
                            tmpLog.error(
                                'messenger plugin for {0} not found'.format(
                                    jobSpec.computingSite))
                            continue
                        # setup access points
                        messenger.setup_access_points(workSpecList)
                        # feed jobs
                        for workSpec in workSpecList:
                            if workSpec.hasJob == 1:
                                tmpStat = messenger.feed_jobs(
                                    workSpec, workSpec.get_jobspec_list())
                                if tmpStat is False:
                                    tmpLog.error(
                                        'failed to send jobs to workerID={0}'.
                                        format(workSpec.workerID))
                                else:
                                    tmpLog.debug(
                                        'sent jobs to workerID={0} with {1}'.
                                        format(workSpec.workerID, tmpStat))
                        # submit
                        tmpLog.debug('submitting {0} workers'.format(
                            len(workSpecList)))
                        workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                            submitterCore, workSpecList)
                        for iWorker, (tmpRet, tmpStr) in enumerate(
                                zip(tmpRetList, tmpStrList)):
                            workSpec, jobList = okChunks[iWorker]
                            # use associated job list since it can be truncated for re-filling
                            jobList = workSpec.get_jobspec_list()
                            # set status
                            if not tmpRet:
                                # failed submission
                                tmpLog.error(
                                    'failed to submit a workerID={0} with {1}'.
                                    format(workSpec.workerID, tmpStr))
                                workSpec.set_status(WorkSpec.ST_missed)
                                jobList = []
                            elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                # directly go to running after feeding jobs for late binding
                                workSpec.set_status(WorkSpec.ST_running)
                            else:
                                # normal successful submission
                                workSpec.set_status(WorkSpec.ST_submitted)
                            workSpec.submitTime = timeNow
                            workSpec.modificationTime = timeNow
                            # prefetch events
                            if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                eventsRequestParams = dict()
                                for jobSpec in jobList:
                                    eventsRequestParams[jobSpec.PandaID] = {
                                        'pandaID': jobSpec.PandaID,
                                        'taskID': jobSpec.taskID,
                                        'jobsetID':
                                        jobSpec.jobParams['jobsetID'],
                                        'nRanges':
                                        jobSpec.jobParams['coreCount'],
                                    }
                                workSpec.eventsRequestParams = eventsRequestParams
                            # register worker
                            tmpStat = self.dbProxy.register_worker(
                                workSpec, jobList, lockedBy)
                            if jobList is not None:
                                for jobSpec in jobList:
                                    pandaIDs.add(jobSpec.PandaID)
                                    if tmpStat:
                                        tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                        tmpLog.info(
                                            tmpStr.format(
                                                workSpec.workerID,
                                                jobSpec.PandaID,
                                                workSpec.batchID))
                                    else:
                                        tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                        tmpLog.error(
                                            tmpStr.format(
                                                jobSpec.PandaID,
                                                workSpec.batchID))
                    # release jobs
                    self.dbProxy.release_jobs(pandaIDs, lockedBy)
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.submitter.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
Example #24
      if loggerName.split('.')[-1] in ['db_proxy']:
         continue
      stdoutHandler = logging.StreamHandler(sys.stdout)
      stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
      loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(stagerCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.stager = {}".format(initial_queueConfig_stager)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.stager = {}".format(modified_queueConfig_stager)
tmpLog.debug(msgStr)

scope = 'panda'

proxy = DBProxy()
communicator = CommunicatorPool()
cacher = Cacher(communicator, single_mode=True)
cacher.run()



# check if db lock exists
locked = stagerCore.dbInterface.get_object_lock('dummy_id_for_out_0', lock_interval=120)
if not locked:
   tmpLog.debug('DB Already locked by another thread')
# now unlock db
unlocked = stagerCore.dbInterface.release_object_lock('dummy_id_for_out_0')
if unlocked:
   tmpLog.debug('unlocked db')
else:
   tmpLog.debug('failed to unlock db')
Example #25
class WorkerAdjuster(object):
    # constructor
    def __init__(self, queue_config_mapper):
        self.queue_configMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queue_configMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmp_log = core_utils.make_logger(_logger,
                                         'site={0}'.format(site_name),
                                         method_name='define_num_workers')
        tmp_log.debug('start')
        tmp_log.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queue_stat = self.dbProxy.get_cache("panda_queues.json", None)
            if queue_stat is None:
                queue_stat = dict()
            else:
                queue_stat = queue_stat.data

            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data

            # define num of new workers
            for queue_name in static_num_workers:
                # get queue
                queue_config = self.queue_configMapper.get_queue(queue_name)
                worker_limits_dict = self.dbProxy.get_worker_limits(queue_name)
                max_workers = worker_limits_dict.get('maxWorkers', 0)
                n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0)
                n_queue_limit_per_rt = worker_limits_dict[
                    'nQueueLimitWorkerPerRT']
                n_queue_total, n_ready_total, n_running_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for job_type, jt_values in iteritems(
                        static_num_workers[queue_name]):
                    for resource_type, tmp_val in iteritems(jt_values):
                        tmp_log.debug(
                            'Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'
                            .format(queue_name, job_type, resource_type,
                                    tmp_val))

                        # set 0 to num of new workers when the queue is disabled
                        if queue_name in queue_stat and queue_stat[queue_name][
                                'status'] in [
                                    'offline', 'standby', 'maintenance'
                                ]:
                            dyn_num_workers[queue_name][job_type][
                                resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 since status={0}'.format(
                                queue_stat[queue_name]['status'])
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers since queue status = {0}'.format(
                                queue_stat[queue_name]['status'])
                            continue

                        # protection against not-up-to-date queue config
                        if queue_config is None:
                            dyn_num_workers[queue_name][job_type][
                                resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 due to missing queue_config'
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers because of missing queue_config'
                            continue

                        # get throttler
                        if queue_name not in self.throttlerMap:
                            if hasattr(queue_config, 'throttler'):
                                throttler = self.pluginFactory.get_plugin(
                                    queue_config.throttler)
                            else:
                                throttler = None
                            self.throttlerMap[queue_name] = throttler

                        # check throttler
                        throttler = self.throttlerMap[queue_name]
                        if throttler is not None:
                            to_throttle, tmp_msg = throttler.to_be_throttled(
                                queue_config)
                            if to_throttle:
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(
                                    throttler.__class__.__name__, tmp_msg)
                                tmp_log.debug(ret_msg)
                                continue

                        # check stats
                        n_queue = tmp_val['nQueue']
                        n_ready = tmp_val['nReady']
                        n_running = tmp_val['nRunning']
                        if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None:
                            n_queue_total += n_queue
                            n_ready_total += n_ready
                            n_running_total += n_running
                        if queue_config.runMode == 'slave':
                            n_new_workers_def = tmp_val['nNewWorkers']
                            if n_new_workers_def == 0:
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by panda in slave mode'
                                tmp_log.debug(ret_msg)
                                continue
                        else:
                            n_new_workers_def = None

                        # define num of new workers based on static site config
                        n_new_workers = 0
                        if n_queue >= n_queue_limit_per_rt > 0:
                            # enough queued workers
                            ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(
                                n_queue, n_queue_limit_per_rt)
                            tmp_log.debug(ret_msg)
                            pass
                        elif (n_queue + n_ready +
                              n_running) >= max_workers > 0:
                            # enough workers in the system
                            ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(
                                n_queue, n_ready, n_running)
                            ret_msg += '>= max_workers({0})'.format(
                                max_workers)
                            tmp_log.debug(ret_msg)
                            pass
                        else:

                            max_queued_workers = None

                            if n_queue_limit_per_rt > 0:  # there is a limit set for the queue
                                max_queued_workers = n_queue_limit_per_rt

                            # Reset max_queued_workers according to the particular case (slave-mode central limits or pull-mode activated jobs)
                            if n_new_workers_def is not None:  # don't surpass limits given centrally
                                maxQueuedWorkers_slave = n_new_workers_def + n_queue
                                if max_queued_workers is not None:
                                    max_queued_workers = min(
                                        maxQueuedWorkers_slave,
                                        max_queued_workers)
                                else:
                                    max_queued_workers = maxQueuedWorkers_slave

                            elif queue_config.mapType == 'NoJob':  # for pull mode, limit to activated jobs
                                # limit the queue to the number of activated jobs to avoid empty pilots
                                try:
                                    n_activated = max(
                                        job_stats[queue_name]['activated'],
                                        1)  # avoid no activity queues
                                    queue_limit = max_queued_workers
                                    max_queued_workers = min(
                                        n_activated, max_queued_workers)
                                    tmp_log.debug(
                                        'limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'
                                        .format(n_activated, queue_limit))
                                except KeyError:
                                    tmp_log.warning(
                                        'n_activated not defined, defaulting to configured queue limits'
                                    )
                                    pass

                            if max_queued_workers is None:  # no value found, use default value
                                max_queued_workers = 1

                            # new workers
                            n_new_workers = max(max_queued_workers - n_queue,
                                                0)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in max_queued_workers calculation'
                                .format(n_new_workers))
                            if max_workers > 0:
                                n_new_workers = min(
                                    n_new_workers,
                                    max(
                                        max_workers - n_queue - n_ready -
                                        n_running, 0))
                                tmp_log.debug(
                                    'setting n_new_workers to {0} to respect max_workers'
                                    .format(n_new_workers))
                        if queue_config.maxNewWorkersPerCycle > 0:
                            n_new_workers = min(
                                n_new_workers,
                                queue_config.maxNewWorkersPerCycle)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle'
                                .format(n_new_workers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            n_new_workers = min(n_new_workers,
                                                self.maxNewWorkers)
                            tmp_log.debug(
                                'setting n_new_workers to {0} in order to respect universal maxNewWorkers'
                                .format(n_new_workers))
                        dyn_num_workers[queue_name][job_type][resource_type][
                            'nNewWorkers'] = n_new_workers

                # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers
                if queue_config is None:
                    max_new_workers_per_cycle = 0
                    ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config'
                    tmp_log.debug(ret_msg)
                else:
                    max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
                if len(dyn_num_workers[queue_name]) > 1:
                    total_new_workers_rts = 0
                    for _jt in dyn_num_workers[queue_name]:
                        for _rt in dyn_num_workers[queue_name][_jt]:
                            if _jt != 'ANY' and _rt != 'ANY':
                                total_new_workers_rts = total_new_workers_rts + dyn_num_workers[
                                    queue_name][_jt][_rt]['nNewWorkers']
                    n_new_workers_max_agg = min(
                        max(n_queue_limit - n_queue_total, 0),
                        max(
                            max_workers - n_queue_total - n_ready_total -
                            n_running_total, 0))
                    if max_new_workers_per_cycle >= 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg,
                                                    max_new_workers_per_cycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg,
                                                    self.maxNewWorkers)

                    # exceeded max, to adjust
                    if total_new_workers_rts > n_new_workers_max_agg:
                        if n_new_workers_max_agg == 0:
                            for job_type in dyn_num_workers[queue_name]:
                                for resource_type in dyn_num_workers[
                                        queue_name][job_type]:
                                    dyn_num_workers[queue_name][job_type][
                                        resource_type]['nNewWorkers'] = 0
                            tmp_log.debug(
                                'No n_new_workers since n_new_workers_max_agg=0 for UCORE'
                            )
                        else:
                            tmp_log.debug(
                                'n_new_workers_max_agg={0} for UCORE'.format(
                                    n_new_workers_max_agg))
                            _d = dyn_num_workers[queue_name].copy()
                            del _d['ANY']

                            # TODO: needs to be recalculated
                            simple_rt_nw_list = []
                            for job_type in _d:  # jt: job type
                                for resource_type in _d[
                                        job_type]:  # rt: resource type
                                    simple_rt_nw_list.append([
                                        (resource_type, job_type),
                                        _d[job_type][resource_type].get(
                                            'nNewWorkers', 0), 0
                                    ])

                            _countdown = n_new_workers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                (resource_type,
                                 job_type), n_new_workers_orig, _r = _rt_list
                                n_new_workers, remainder = divmod(
                                    n_new_workers_orig * n_new_workers_max_agg,
                                    total_new_workers_rts)
                                dyn_num_workers[queue_name][
                                    job_type].setdefault(
                                        resource_type, {
                                            'nReady': 0,
                                            'nRunning': 0,
                                            'nQueue': 0,
                                            'nNewWorkers': 0
                                        })
                                dyn_num_workers[queue_name][job_type][
                                    resource_type][
                                        'nNewWorkers'] = n_new_workers
                                _rt_list[2] = remainder
                                _countdown -= n_new_workers
                            _s_list = sorted(simple_rt_nw_list,
                                             key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list,
                                                       key=(lambda x: x[2]),
                                                       reverse=True)
                            for (
                                    resource_type, job_type
                            ), n_new_workers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queue_name][job_type][
                                    resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for job_type in dyn_num_workers[queue_name]:
                            for resource_type in dyn_num_workers[queue_name][
                                    job_type]:
                                if job_type == 'ANY' or resource_type == 'ANY':
                                    continue
                                n_new_workers = dyn_num_workers[queue_name][
                                    job_type][resource_type]['nNewWorkers']
                                tmp_log.debug(
                                    'setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE'
                                    .format(n_new_workers, job_type,
                                            resource_type))

                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queue_name])

                self.apf_mon.update_label(queue_name, apf_msg, apf_data)

            # dump
            tmp_log.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            err_msg = core_utils.dump_error_message(tmp_log)
            return None
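
In the UCORE branch above, the per-(job type, resource type) requests are scaled down to an aggregate cap proportionally, with leftover slots handed out by largest remainder. A standalone sketch of that apportionment with made-up numbers (it mirrors the divmod/remainder loop, not the full worker bookkeeping):

# made-up per-(resource_type, job_type) requests and an aggregate cap
requests = {('SCORE', 'managed'): 7, ('MCORE', 'managed'): 3, ('SCORE', 'user'): 2}
cap = 8
total = sum(requests.values())  # 12 requested, only 8 allowed in aggregate

shares = []
countdown = cap
for key, n_orig in requests.items():
    n_new, remainder = divmod(n_orig * cap, total)  # proportional share, rounded down
    shares.append([key, n_orig, n_new, remainder])
    countdown -= n_new

# hand leftover slots to the largest remainders (ties broken by smaller original request)
ordered = sorted(sorted(shares, key=lambda x: x[1]), key=lambda x: x[3], reverse=True)
for entry in ordered:
    if countdown <= 0:
        break
    entry[2] += 1
    countdown -= 1

print({key: n_new for key, _, n_new, _ in shares})
# {('SCORE', 'managed'): 5, ('MCORE', 'managed'): 2, ('SCORE', 'user'): 1}
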
Example #26
class ServiceMonitor(AgentBase):
    # constructor
    def __init__(self, pid_file, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()

        if pid_file is not None:
            self.pid_file = pid_file
        else:
            try:
                self.pid_file = harvester_config.service_monitor.pidfile
            except Exception:
                self.pid_file = None

        self.pid = self.get_master_pid()
        self.master_process = psutil.Process(self.pid)
        self.children = self.master_process.children(recursive=True)

        self.cpu_count = multiprocessing.cpu_count()

    def get_master_pid(self):
        """
        Gets the master pid from the lock file
        :return:
        """
        try:
            fh = open(self.pid_file, 'r')
            pid = int(fh.readline())
            fh.close()
        except Exception:
            _logger.error('Could not read pidfile "{0}"'.format(self.pid_file))
            pid = None

        return pid

    def refresh_children_list(self, children):

        children_refreshed = []

        for child_current in children:
            pid_current = child_current.pid
            found = False
            for child_stored in self.children:
                pid_stored = child_stored.pid
                if pid_stored == pid_current:
                    found = True
                    break

            if found:
                children_refreshed.append(child_stored)
            else:
                children_refreshed.append(child_current)

        self.children = children_refreshed

        return children_refreshed

    def get_memory_n_cpu(self):
        """
        sum memory of whole process tree starting from master pid
        :return: rss in MiB
        """
        try:
            master_process = self.master_process
            rss = master_process.memory_info()[0]
            memory_pc = master_process.memory_percent()
            cpu_pc = master_process.cpu_percent()

            children = self.refresh_children_list(master_process.children(recursive=True))
            for child in children:
                rss += child.memory_info()[0]
                memory_pc += child.memory_percent()
                cpu_pc += child.cpu_percent()

            # convert bytes to MiB
            rss_mib = rss / float(2 ** 20)
            # normalize cpu percentage by cpu count
            cpu_pc = cpu_pc * 1.0 / self.cpu_count
        except Exception:
            _logger.error('Excepted with: {0}'.format(traceback.format_exc()))
            rss_mib = None
            memory_pc = None
            cpu_pc = None

        return rss_mib, memory_pc, cpu_pc

    def volume_use(self, volume_name):
        command = "df -Pkh /" + volume_name
        used_amount = 0
        tmp_array = command.split()
        output = subprocess.Popen(tmp_array, stdout=subprocess.PIPE).communicate()[0].decode("utf-8")

        for line in output.split('\n'):
            if re.search(volume_name, line):
                used_amount = re.search(r"(\d+)\%", line).group(1)

        try:
            used_amount_float = float(used_amount)
        except ValueError:
            used_amount_float = None
            _logger.error('Could not convert used amount {0} to float for volume {1}'.format(used_amount, volume_name))

        return used_amount_float

    # main loop
    def run(self):
        while True:
            _logger.debug('Running service monitor')

            service_metrics = {}

            # get memory usage
            rss_mib, memory_pc, cpu_pc = self.get_memory_n_cpu()
            service_metrics['rss_mib'] = rss_mib
            service_metrics['memory_pc'] = memory_pc
            service_metrics['cpu_pc'] = cpu_pc
            _logger.debug('Memory usage: {0} MiB/{1}%, CPU usage: {2}'.format(rss_mib, memory_pc, cpu_pc))

            # get volume usage
            try:
                volumes = harvester_config.service_monitor.disk_volumes.split(',')
            except Exception:
                volumes = []
            for volume in volumes:
                volume_use = self.volume_use(volume)
                _logger.debug('Disk usage of {0}: {1} %'.format(volume, volume_use))
                service_metrics['volume_{0}_pc'.format(volume)] = volume_use

            service_metrics_spec = ServiceMetricSpec(service_metrics)
            self.db_proxy.insert_service_metrics(service_metrics_spec)

            # check if being terminated
            try:
                sleep_time = harvester_config.service_monitor.sleepTime
            except Exception:
                sleep_time = 300

            if self.terminated(sleep_time, randomize=False):
                return
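
volume_use above shells out to df and regex-extracts the use percentage from the line that mentions the volume. The same parsing on a canned df output line (the output text is made up):

import re

# made-up `df -Pkh /var` output for illustration
output = """Filesystem      Size  Used Avail Use% Mounted on
/dev/vda2        50G   21G   30G  42% /var"""

volume_name = 'var'
used_amount = 0
for line in output.split('\n'):
    if re.search(volume_name, line):
        used_amount = re.search(r"(\d+)\%", line).group(1)

print(float(used_amount))  # 42.0
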
Example #27
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy,
                                                                                                   queueName,
                                                                                                   resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                             resource_type,
                                                                                             workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                # release the jobs that do not fit into the refilled slots
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                                workSpec.nJobsToReFill = None
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error('submitter plugin for {0} not found'.format(queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error('messenger plugin for {0} not found'.format(queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                    + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID,
                                                tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                                        getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                                harvester_config.monitor.checkInterval),
                                                        getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                                harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval)
                                    + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
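
The wrapper above only calls submit_workers(workspec_list) on the plugin and expects one (status, diagnostics) pair per worker; a minimal dummy plugin sketch under that assumption (illustrative only, not one of the real pandaharvester.harvestersubmitter plugins):

class DummySubmitter(object):
    # minimal stand-in for a submitter plugin as used by Submitter.submit_workers above
    def submit_workers(self, workspec_list):
        retlist = []
        for workspec in workspec_list:
            # pretend every submission succeeds and assign a placeholder batch ID
            workspec.batchID = 'dummy-{0}'.format(workspec.workerID)
            retlist.append((True, ''))
        return retlist
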
Example #28
0
import os
import sys
import json
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestermessenger import shared_file_messenger

workerID = int(sys.argv[1])

eventID = sys.argv[2]
status = sys.argv[3]

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)
jobSpec = proxy.get_jobs_with_worker_id(workerID, None)[0]

accessPoint = workSpec.get_access_point()

try:
    os.makedirs(accessPoint)
except OSError:
    # the access point directory may already exist
    pass

node = {}
node['eventRangeID'] = eventID
node['eventStatus'] = status

f = open(
    os.path.join(accessPoint, shared_file_messenger.jsonEventsUpdateFileName),
    'w')
json.dump([node], f)
f.close()
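
The script writes a one-element JSON list; a hedged sketch of reading the file back for a quick check (same path and file-name constant as above):

with open(os.path.join(accessPoint, shared_file_messenger.jsonEventsUpdateFileName)) as f:
    print(json.load(f))  # e.g. [{'eventRangeID': ..., 'eventStatus': ...}]
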
    def __init__(self, **kwarg):
        # logic type: AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()
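
A minimal sketch of how such a logicType flag could be evaluated over a list of boolean rule results (illustrative only; this is not the actual Throttler code):

def is_throttled(logic_type, rule_results):
    # AND: throttled only if every rule fires; OR: throttled if any rule fires
    if logic_type == 'AND':
        return all(rule_results)
    return any(rule_results)
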
Example #30
0
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(
    initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(
    modified_queueConfig_preparator)
tmpLog.debug(msgStr)

scope = 'panda'

proxy = DBProxy()
communicator = CommunicatorPool()
cacher = Cacher(communicator, single_mode=True)
cacher.run()

tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " %
             preparatorCore.basePath)

# get all jobs in table in a preparing substate
tmpLog.debug('try to get all jobs in a preparing substate')
jobSpec_list = proxy.get_jobs_in_sub_status('preparing', 2000, None, None,
                                            None, None, None, None)
tmpLog.debug('got {0} jobs'.format(len(jobSpec_list)))
# loop over all found jobs
if len(jobSpec_list) > 0:
Example #31
0
import os
import sys

from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
from pandaharvester.harvestermessenger import shared_file_messenger

workerID = int(sys.argv[1])

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)
jobSpec = proxy.get_jobs_with_worker_id(workerID, None)[0]

accessPoint = workSpec.get_access_point()

try:
    os.makedirs(accessPoint)
except OSError:
    # the access point directory may already exist
    pass

node = {}
node['pandaID'] = jobSpec.PandaID
node['jobsetID'] = jobSpec.jobParams['jobsetID']
node['taskID'] = jobSpec.taskID


a = CommunicatorPool()
tmpStat, tmpVal = a.getEventRanges(node)

mess = shared_file_messenger.SharedFileMessenger()
mess.feed_events(workSpec, tmpVal)
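
A small hedged variant of the last two calls, checking the communicator status before feeding events (assuming getEventRanges returns a (status, value) pair as unpacked above):

tmpStat, tmpVal = a.getEventRanges(node)
if tmpStat:
    mess = shared_file_messenger.SharedFileMessenger()
    mess.feed_events(workSpec, tmpVal)
else:
    print('getEventRanges failed: {0}'.format(tmpVal))
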
Example #32
0
import os
import sys

workerID = int(sys.argv[1])

from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)

accessPoint = workSpec.get_access_point()

try:
    os.makedirs(accessPoint)
except OSError:
    # the access point directory may already exist
    pass

from pandaharvester.harvestermessenger import shared_file_messenger

f = open(os.path.join(accessPoint, shared_file_messenger.jsonJobRequestFileName), 'w')
f.close()
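
The empty file simply flags a pending job request; a hedged sketch of checking for it afterwards (same path and file-name constant as above):

request_file = os.path.join(accessPoint, shared_file_messenger.jsonJobRequestFileName)
if os.path.exists(request_file):
    print('job request pending for workerID={0}'.format(workerID))
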
Example #33
0
except Exception:
    pass

for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

queueConfigMapper = QueueConfigMapper()

proxy = DBProxy()
proxy.make_tables(queueConfigMapper)

job = JobSpec()
job.PandaID = 1


job.modificationTime = datetime.datetime.now()
proxy.insert_jobs([job])

newJob = proxy.get_job(1)


a = CommunicatorPool()
a.get_jobs('siteName', 'nodeName', 'prodSourceLabel', 'computingElement', 1, {})
Example #34
0
class ARCSubmitter(PluginBase):
    '''Submitter for ARC CE'''

    def __init__(self, **kwarg):
        '''Set up DB connection and credentials'''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                              list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)


    def _run_submit(self, thr):
        '''Run a thread to do the submission'''

        try:
            thr.start()
        except Exception:
            # the thread may already have been started
            pass

        # Be careful to wait longer than submission timeout
        thr.join(thr.userconfig.Timeout() + 60.0)
        if thr.is_alive():
            # abort due to timeout and try again
            raise Exception("Submission timeout")
        if thr.job is None:
            raise Exception("Submission failed")

        return thr.job


    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit'''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.ogf.glue.emies.resourceinfo')]
            else:
                aris = 'ldap://'+aris.Host()+'/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris,
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.ldapng')]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info("Target {0} does not have ComputingService ID defined, skipping".format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug("Rejecting target interface {0} because not EMI-ES".format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug('Rejecting target host {0} as it does not match {1}'.format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug('Rejecting target queue {0} as it does not match {1}'.format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)


    def _set_logdir(self, site):
        date = time.strftime('%Y-%m-%d')
        return os.path.join(date, site)


    # submit workers
    def submit_workers(self, workspec_list):
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
                pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(jobspec.jobParams,
                                    jobspec.computingSite,
                                    pandaqueues[jobspec.computingSite],
                                    logfileurl,
                                    self.schedulerid,
                                    osmap,
                                    '/tmp', # tmpdir, TODO common tmp dir
                                    None, #jobSpec.eventranges, # TODO event ranges
                                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))
                
                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'
                    
                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    tmplog.error(traceback.format_exc())
                    result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
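
A hedged sketch of how a caller could consume the (ok, diagnostics) pairs returned by submit_workers above (the submitter and workspec_list names are illustrative):

retlist = submitter.submit_workers(workspec_list)
for ok, diag in retlist:
    if not ok:
        print('ARC submission failed: {0}'.format(diag))
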
Example #35
0
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            for queueName, workSpecs in iteritems(workersToKill):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start killing')
                    tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
            mainLog.debug('done kill')
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except AttributeError:
                keepMissed = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            for queueName, workSpecs in iteritems(workersForCleanup):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start cleanup')
                    tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                    if tmpStat:
                        # delete from DB
                        self.dbProxy.delete_worker(workSpec.workerID)
            mainLog.debug('done cleanup')
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
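
As used above, a sweeper plugin is expected to expose kill_worker and sweep_worker, each returning a (status, diagnostics) pair; a minimal dummy sketch under that assumption (illustrative only, not one of the real Harvester sweeper plugins):

class DummySweeper(object):
    def kill_worker(self, workspec):
        # pretend the batch system job was cancelled
        return True, 'killed batchID={0}'.format(workspec.batchID)

    def sweep_worker(self, workspec):
        # pretend the worker's working area was cleaned up
        return True, 'cleaned accessPoint={0}'.format(workspec.get_access_point())
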
Example #36
0
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
Example #37
0
    def __init__(self, **kwarg):
        # logic type: AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()

        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator)
tmpLog.debug(msgStr)

scope = 'panda'

proxy = DBProxy()
communicator = CommunicatorPool()
cacher = Cacher(communicator, single_mode=True)
cacher.run()

tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath)
 
# get all jobs in table in a preparing substate
#tmpLog.debug('try to get all jobs in a preparing substate')
#jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None)
# get all jobs 
if job_id > 0:
    tmpLog.debug('try to get job ID - {}'.format(job_id))
    jobSpec_list = [proxy.get_job(job_id)]
else:
Example #39
0
for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

pp = pprint.PrettyPrinter(indent=4)

queueConfigMapper = QueueConfigMapper()

proxy = DBProxy()

sqlJ = "SELECT * FROM job_table"

resultsJobcur = proxy.execute(sqlJ)
resultsJob = resultsJobcur.fetchall()
proxy.commit()

sqlF = "SELECT * FROM file_table"

resultsFilescur = proxy.execute(sqlF)
resultsFiles = resultsFilescur.fetchall()
proxy.commit()

print "job_table - "
print resultsJob[0].keys()
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator)
tmpLog.debug(msgStr)

scope = 'panda'

proxy = DBProxy()
communicator = CommunicatorPool()
cacher = Cacher(communicator, single_mode=True)
cacher.run()

tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath)
 
# get all jobs in table in a preparing substate
tmpLog.debug('try to get all jobs in a preparing substate')
jobSpec_list = proxy.get_jobs_in_sub_status('preparing', 2000, None, None, None, None, None, None)
tmpLog.debug('got {0} jobs'.format(len(jobSpec_list)))
# loop over all found jobs
if len(jobSpec_list) > 0:
    for jobSpec in jobSpec_list:
        tmpLog.debug(' PandaID = %d status = %s subStatus = %s lockedBy = %s' %
Example #41
0
class Monitor(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        self.startTimestamp = time.time()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.get_pid())
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        try:
            fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli
        except AttributeError:
            fifoSleepTimeMilli = 5000
        try:
            fifoCheckDuration = harvester_config.monitor.fifoCheckDuration
        except AttributeError:
            fifoCheckDuration = 30
        try:
            fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk
        except AttributeError:
            fifoMaxWorkersPerChunk = 500
        try:
            fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue
        except AttributeError:
            fifoProtectiveDequeue = True
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        sleepTime = (fifoSleepTimeMilli / 1000.0) \
                        if monitor_fifo.enabled else harvester_config.monitor.sleepTime
        adjusted_sleepTime = sleepTime
        if monitor_fifo.enabled:
            monitor_fifo.restore()
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('start a monitor cycle')
            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \
                    not (monitor_fifo.enabled and self.singleMode):
                # run with workers from DB
                sw_db = core_utils.get_stopwatch()
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                    harvester_config.monitor.maxWorkers,
                    harvester_config.monitor.checkInterval,
                    harvester_config.monitor.lockInterval, lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy,
                                                         queueName,
                                                         workSpecsList,
                                                         config_id=configID)
                        if monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueue), score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO: {0}'.
                                        format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put(
                                        (queueName, workSpecsToEnqueueToHead),
                                        score)
                                    mainLog.info(
                                        'put workers of {0} to FIFO with score {1}'
                                        .format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error(
                                        'failed to put object from FIFO head: {0}'
                                        .format(errStr))
                last_DB_cycle_timestamp = time.time()
                if sw_db.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                    mainLog.warning(
                        'a single DB cycle was longer than lockInterval ' +
                        sw_db.get_elapsed_time())
                else:
                    mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time())
                mainLog.debug('ended run with DB')
            elif monitor_fifo.enabled:
                # run with workers from FIFO
                sw = core_utils.get_stopwatch()
                n_loops = 0
                n_loops_hit = 0
                last_fifo_cycle_timestamp = time.time()
                to_break = False
                obj_dequeued_id_list = []
                obj_to_enqueue_dict = collections.defaultdict(lambda: [[], 0, 0])
                obj_to_enqueue_to_head_dict = collections.defaultdict(lambda: [[], 0, 0])
                remaining_obj_to_enqueue_dict = {}
                remaining_obj_to_enqueue_to_head_dict = {}
                n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0
                while time.time() < last_fifo_cycle_timestamp + fifoCheckDuration:
                    sw.reset()
                    n_loops += 1
                    retVal, overhead_time = monitor_fifo.to_check_workers()
                    if overhead_time is not None:
                        n_chunk_peeked_stat += 1
                        sum_overhead_time_stat += overhead_time
                    if retVal:
                        # check fifo size
                        fifo_size = monitor_fifo.size()
                        mainLog.debug('FIFO size is {0}'.format(fifo_size))
                        mainLog.debug('starting run with FIFO')
                        try:
                            obj_gotten = monitor_fifo.get(timeout=1, protective=fifoProtectiveDequeue)
                        except Exception as errStr:
                            mainLog.error('failed to get object from FIFO: {0}'.format(errStr))
                        else:
                            if obj_gotten is not None:
                                sw_fifo = core_utils.get_stopwatch()
                                if fifoProtectiveDequeue:
                                    obj_dequeued_id_list.append(obj_gotten.id)
                                queueName, workSpecsList = obj_gotten.item
                                mainLog.debug(
                                    'got a chunk of {0} workers of {1} from FIFO'
                                    .format(len(workSpecsList), queueName) +
                                    sw.get_elapsed_time())
                                sw.reset()
                                configID = None
                                for workSpecs in workSpecsList:
                                    if configID is None and len(workSpecs) > 0:
                                        configID = workSpecs[0].configID
                                    for workSpec in workSpecs:
                                        if workSpec.pandaid_list is None:
                                            _jobspec_list = workSpec.get_jobspec_list()
                                            if _jobspec_list is not None:
                                                workSpec.pandaid_list = [j.PandaID for j in _jobspec_list]
                                            else:
                                                workSpec.pandaid_list = []
                                            workSpec.force_update('pandaid_list')
                                retVal = self.monitor_agent_core(
                                    lockedBy,
                                    queueName,
                                    workSpecsList,
                                    from_fifo=True,
                                    config_id=configID)
                                if retVal is not None:
                                    workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                    try:
                                        if len(obj_to_enqueue_dict[queueName][0]) + len(workSpecsToEnqueue) \
                                                <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_dict[queueName][0].extend(workSpecsToEnqueue)
                                            obj_to_enqueue_dict[queueName][1] = max(
                                                obj_to_enqueue_dict[queueName][1], timeNow_timestamp)
                                            obj_to_enqueue_dict[queueName][2] = max(
                                                obj_to_enqueue_dict[queueName][2], fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_dict[queueName] = [
                                                workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval]
                                    except Exception as errStr:
                                        mainLog.error('failed to gather workers for FIFO: {0}'.format(errStr))
                                        to_break = True
                                    try:
                                        if len(obj_to_enqueue_to_head_dict[queueName][0]) + len(workSpecsToEnqueueToHead) \
                                                <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_to_head_dict[queueName][0].extend(workSpecsToEnqueueToHead)
                                            obj_to_enqueue_to_head_dict[queueName][1] = max(
                                                obj_to_enqueue_to_head_dict[queueName][1], timeNow_timestamp)
                                            obj_to_enqueue_to_head_dict[queueName][2] = max(
                                                obj_to_enqueue_to_head_dict[queueName][2], fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_to_head_dict[queueName] = [
                                                workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval]
                                    except Exception as errStr:
                                        mainLog.error('failed to gather workers for FIFO head: {0}'.format(errStr))
                                        to_break = True
                                    mainLog.debug(
                                        'checked {0} workers from FIFO'.format(
                                            len(workSpecsList)) +
                                        sw.get_elapsed_time())
                                else:
                                    mainLog.debug(
                                        'monitor_agent_core returned None. Skipped putting to FIFO'
                                    )
                                if sw_fifo.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                                    mainLog.warning('a single FIFO cycle was longer than lockInterval ' +
                                                    sw_fifo.get_elapsed_time())
                                else:
                                    mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time())
                                    n_loops_hit += 1
                                if to_break:
                                    break
                            else:
                                mainLog.debug('got nothing in FIFO')
                    else:
                        mainLog.debug(
                            'workers in FIFO too young to check. Skipped')
                        if self.singleMode:
                            break
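                        # nothing is due yet: sleep for at least adjusted_sleepTime before peeking the FIFO again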
                        if overhead_time is not None:
                            time.sleep(
                                max(-overhead_time * random.uniform(0.1, 1),
                                    adjusted_sleepTime))
                        else:
                            time.sleep(
                                max(fifoCheckDuration * random.uniform(0.1, 1),
                                    adjusted_sleepTime))
                mainLog.debug(
                    'run {0} loops, including {1} FIFO cycles'.format(
                        n_loops, n_loops_hit))

                # enqueue to fifo
                sw.reset()
                n_chunk_put = 0
                mainLog.debug('putting worker chunks to FIFO')
                for _dct in (obj_to_enqueue_dict,
                             remaining_obj_to_enqueue_dict):
                    for queueName, obj_to_enqueue in iteritems(_dct):
                        try:
                            workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue
                            if workSpecsToEnqueue:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueue), score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueue), queueName,
                                            score))
                        except Exception as errStr:
                            mainLog.error('failed to put object to FIFO: {0}'.format(errStr))
                mainLog.debug('putting worker chunks to FIFO head')
                for _dct in (obj_to_enqueue_to_head_dict,
                             remaining_obj_to_enqueue_to_head_dict):
                    for queueName, obj_to_enqueue_to_head in iteritems(_dct):
                        try:
                            workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head
                            if workSpecsToEnqueueToHead:
                                score = fifoCheckInterval + timeNow_timestamp - 2**32
                                monitor_fifo.put(
                                    (queueName, workSpecsToEnqueueToHead),
                                    score)
                                n_chunk_put += 1
                                mainLog.info(
                                    'put a chunk of {0} workers of {1} to FIFO with score {2}'
                                    .format(len(workSpecsToEnqueueToHead),
                                            queueName, score))
                        except Exception as errStr:
                            mainLog.error('failed to put object to FIFO head: {0}'.format(errStr))
                # release protective dequeued objects
                if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0:
                    monitor_fifo.release(ids=obj_dequeued_id_list)
                mainLog.debug(
                    'put {0} worker chunks into FIFO'.format(n_chunk_put) +
                    sw.get_elapsed_time())
                # adjust adjusted_sleepTime: shrink it when the accumulated FIFO peek overhead exceeds
                # sleepTime, otherwise pull it back towards the configured sleepTime
                if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime:
                    speedup_factor = (sum_overhead_time_stat - sleepTime) / (
                        n_chunk_peeked_stat * harvester_config.monitor.checkInterval)
                    speedup_factor = max(speedup_factor, 0)
                    adjusted_sleepTime = adjusted_sleepTime / (1. + speedup_factor)
                elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0:
                    adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2
                mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(adjusted_sleepTime))
                # end run with fifo
                mainLog.debug('ended run with FIFO')
            # time the cycle
            mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time())

            # check if being terminated
            if self.terminated(adjusted_sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self,
                           lockedBy,
                           queueName,
                           workSpecsList,
                           from_fifo=False,
                           config_id=None):
        tmpQueLog = self.make_logger(_logger,
                                     'id={0} queue={1}'.format(
                                         lockedBy, queueName),
                                     method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return None
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        try:
            apfmon_status_updates = self.queueConfigMapper.queueConfig[
                queueName].monitor['apfmon_status_updates']
        except Exception:
            apfmon_status_updates = False
        tmpQueLog.debug(
            'apfmon_status_updates: {0}'.format(apfmon_status_updates))
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue_dict = {}
        workSpecsToEnqueueToHead_dict = {}
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ and other fifo attributes
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except Exception:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        try:
            forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
        except AttributeError:
            forceEnqueueInterval = 3600
        try:
            fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval
        except AttributeError:
            fifoMaxPreemptInterval = 60
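        # fifoCheckInterval sets the re-check period used as the FIFO score offset; forceEnqueueInterval is the
        # fallback after which a DB cycle re-enqueues a stale worker by force; fifoMaxPreemptInterval bounds how
        # long a finishing worker keeps being pushed to the FIFO head before being demoted to a regular entry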
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                                queueConfig, tmpQueLog,
                                                from_fifo)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = dict()
                isCheckedList = []
                mapType = workSpecs[0].mapType
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger,
                                              'id={0} workerID={1}'.format(
                                                  lockedBy, workSpec.workerID),
                                              method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    oldStatus = tmpOut['oldStatus']
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    isChecked = tmpOut['isChecked']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(
                        tmpStr.format(newStatus, monStatus, diagMessage,
                                      workSpec.is_post_processed(),
                                      str(filesToStageOut)))
                    iWorker += 1
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if isChecked:
                        workSpec.checkTime = datetime.datetime.utcnow()
                    isCheckedList.append(isChecked)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                     diagMessage)
                    if monStatus in [
                            WorkSpec.ST_finished, WorkSpec.ST_failed,
                            WorkSpec.ST_cancelled
                    ]:
                        workSpec.set_work_params({'finalMonStatus': monStatus})
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                            workSpec.workerID,
                            None,
                            only_running=True,
                            slim=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList[
                            workSpec.workerID] = filesToStageOut
                    # apfmon status update
                    if apfmon_status_updates and newStatus != oldStatus:
                        tmpQueLog.debug(
                            'apfmon_status_updates: {0} newStatus: {1} monStatus: {2} oldStatus: {3} workSpecStatus: {4}'
                            .format(apfmon_status_updates, newStatus,
                                    monStatus, oldStatus, workSpec.status))
                        self.apfmon.update_worker(workSpec, monStatus)

                # lock workers for fifo
                if from_fifo:
                    # collect some attributes to be updated when workers are locked
                    worker_id_list = dict()
                    for workSpec, isChecked in zip(workSpecs, isCheckedList):
                        attrs = dict()
                        if isChecked:
                            attrs['checkTime'] = workSpec.checkTime
                            workSpec.force_not_update('checkTime')
                        if workSpec.has_updated_attributes():
                            attrs['lockedBy'] = lockedBy
                            workSpec.lockedBy = lockedBy
                            workSpec.force_not_update('lockedBy')
                        else:
                            attrs['lockedBy'] = None
                        worker_id_list[workSpec.workerID] = attrs
                    temRetLockWorker = self.dbProxy.lock_workers(
                        worker_id_list, harvester_config.monitor.lockInterval)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # update jobs and workers
                if jobSpecs is not None and len(jobSpecs) > 0:
                    tmpQueLog.debug(
                        'updating {0} jobs with {1} workers'.format(
                            len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(
                        mapType, jobSpecs, workSpecs, filesToStageOutList,
                        eventsToUpdateList)
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(
                    jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} workerID={1}'.format(
                                                      lockedBy,
                                                      workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info(
                                'failed to update the DB. Maybe locked by other thread running with DB'
                            )
                        else:
                            if workSpec.status in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled, WorkSpec.ST_missed
                            ]:
                                tmpLog.info(
                                    'worker already in final status. Skipped')
                            else:
                                tmpLog.error(
                                    'failed to update the DB. lockInterval may be too short'
                                )
                else:
                    if jobSpecs is not None:
                        for jobSpec in jobSpecs:
                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} PandaID={1}'.format(
                                    lockedBy, jobSpec.PandaID),
                                method_name='run')
                            tmpLog.debug(
                                'new status={0} subStatus={1} status_in_metadata={2}'
                                .format(
                                    jobSpec.status, jobSpec.subStatus,
                                    jobSpec.get_job_status_from_attributes()))
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        try:
                            messenger.acknowledge_events_files(workSpec)
                        except Exception:
                            core_utils.dump_error_message(tmpQueLog)
                            tmpQueLog.error(
                                'failed to send ACK to workerID={0}'.format(
                                    workSpec.workerID))
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \
                        and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                        and workSpec.workAttributes is not None:
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        # get lastCheckAt
                        _bool, lastCheckAt = workSpec.get_work_params(
                            'lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            last_check_period = forceEnqueueInterval + 1.0
                        # get lastForceEnqueueAt
                        _bool, lastForceEnqueueAt = workSpec.get_work_params(
                            'lastForceEnqueueAt')
                        if not (_bool and lastForceEnqueueAt is not None):
                            lastForceEnqueueAt = 0
                        # notification
                        intolerable_delay = max(
                            forceEnqueueInterval * 2,
                            harvester_config.monitor.checkInterval * 4)
                        if _bool and lastCheckAt is not None and last_check_period > harvester_config.monitor.checkInterval \
                            and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp:
                            if last_check_period > intolerable_delay:
                                tmpQueLog.error(
                                    'last check period of workerID={0} is {1} sec, intolerably longer than monitor '
                                    'checkInterval. Will NOT enqueue worker by force. Please check why monitor '
                                    'checks worker slowly'.format(workSpec.workerID, last_check_period))
                            else:
                                tmpQueLog.warning(
                                    'last check period of workerID={0} is {1} sec, longer than monitor checkInterval'
                                    .format(workSpec.workerID,
                                            last_check_period))
                        # preparation for enqueueing to fifo
                        if (from_fifo) \
                            or (not from_fifo
                                and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval
                                and last_check_period < intolerable_delay
                                and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval):
                            if not from_fifo:
                                # in DB cycle
                                tmpQueLog.warning(
                                    'last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force'
                                    .format(workSpec.workerID,
                                            last_check_period))
                                workSpec.set_work_params(
                                    {'lastForceEnqueueAt': timeNow_timestamp})
                            workSpec.set_work_params(
                                {'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [
                                    WorkSpec.ST_finished, WorkSpec.ST_failed,
                                    WorkSpec.ST_cancelled
                            ]:
                                # for post-processing
                                _bool, startFifoPreemptAt = workSpec.get_work_params(
                                    'startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        startFifoPreemptAt
                                    })
                                tmpQueLog.debug(
                                    'workerID={0}, startFifoPreemptAt: {1}'.format(
                                        workSpec.workerID, startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead_dict[
                                        workSpec.workerID] = workSpecs
                                else:
                                    workSpec.set_work_params({
                                        'startFifoPreemptAt':
                                        timeNow_timestamp
                                    })
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue_dict[
                                        workSpec.workerID] = workSpecs
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue_dict[
                                    workSpec.workerID] = workSpecs
        else:
            tmpQueLog.error('failed to check workers')
        workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values())
        workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values())
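        # hand back the worker chunks to (re-)enqueue, together with the timestamp and check interval the caller
        # uses to compute their FIFO scores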
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log, from_fifo):
        # check timeout value
        try:
            checkTimeout = mon_core.checkTimeout
        except Exception:
            try:
                checkTimeout = harvester_config.monitor.checkTimeout
            except Exception:
                checkTimeout = None
        try:
            workerQueueTimeLimit = harvester_config.monitor.workerQueueTimeLimit
        except AttributeError:
            workerQueueTimeLimit = 172800
        workersToCheck = []
        thingsToPostProcess = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            if workSpec.has_work_params('finalMonStatus'):
                # to post-process
                _bool, finalMonStatus = workSpec.get_work_params(
                    'finalMonStatus')
                _thing = (workSpec, (finalMonStatus, ''))
                thingsToPostProcess.append(_thing)
            else:
                # job-level late binding
                if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                    # check if job is requested
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        # set ready when job is requested
                        workStatus = WorkSpec.ST_ready
                    else:
                        workStatus = workSpec.status
                elif workSpec.nJobsToReFill in [0, None]:
                    # check if job is requested to refill free slots
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        nJobsToReFill = jobRequested
                    workersToCheck.append(workSpec)
                else:
                    workersToCheck.append(workSpec)
            # add a default check result for this worker; the plugin outcome overwrites these fields below
            retMap[workSpec.workerID] = {
                'oldStatus': workSpec.status,
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill,
                'isChecked': True
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            if workersToCheck:
                tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
                if not tmpStat:
                    tmp_log.error(
                        'failed to check workers with: {0}'.format(tmpOut))
                    workersToCheck = []
                    tmpOut = []
                else:
                    tmp_log.debug('checked')
            else:
                tmp_log.debug('Nothing to be checked with plugin')
                tmpOut = []
            timeNow = datetime.datetime.utcnow()
            for workSpec, (newStatus, diagMessage) in itertools.chain(
                    zip(workersToCheck, tmpOut), thingsToPostProcess):
                workerID = workSpec.workerID
                tmp_log.debug('Going to check workerID={0}'.format(workerID))
                pandaIDs = []
                if workerID in retMap:
                    # failed to check status
                    if newStatus is None:
                        tmp_log.warning(
                            'Failed to check workerID={0} with {1}'.format(
                                workerID, diagMessage))
                        retMap[workerID]['isChecked'] = False
                        # set status
                        if workSpec.checkTime is not None and checkTimeout is not None and \
                                timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout):
                            # kill due to timeout
                            tmp_log.debug(
                                'kill workerID={0} due to consecutive check failures'
                                .format(workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            newStatus = WorkSpec.ST_cancelled
                            diagMessage = 'Killed by Harvester due to consecutive worker check failures. ' + diagMessage
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                        else:
                            # use original status
                            newStatus = workSpec.status
                    # request kill
                    if messenger.kill_requested(workSpec):
                        tmp_log.debug(
                            'kill workerID={0} as requested'.format(workerID))
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # stuck queuing for too long
                    if workSpec.status == WorkSpec.ST_submitted \
                        and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit):
                        tmp_log.debug(
                            'kill workerID={0} due to queuing longer than {1} seconds'
                            .format(workerID, workerQueueTimeLimit))
                        self.dbProxy.kill_worker(workSpec.workerID)
                        diagMessage = 'Killed by Harvester due to worker queuing too long. ' + diagMessage
                        workSpec.set_pilot_error(
                            PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # expired heartbeat - only when requested in the configuration
                    try:
                        # check if the queue configuration requires checking for worker heartbeat
                        worker_heartbeat_limit = int(
                            queue_config.messenger['worker_heartbeat'])
                    except (AttributeError, KeyError):
                        worker_heartbeat_limit = None
                    tmp_log.debug(
                        'workerID={0} heartbeat limit is configured to {1}'.
                        format(workerID, worker_heartbeat_limit))
                    if worker_heartbeat_limit:
                        if messenger.is_alive(workSpec,
                                              worker_heartbeat_limit):
                            tmp_log.debug(
                                'heartbeat for workerID={0} is valid'.format(
                                    workerID))
                        else:
                            tmp_log.debug(
                                'heartbeat for workerID={0} expired: sending kill request'
                                .format(workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            diagMessage = 'Killed by Harvester due to worker heartbeat expired. ' + diagMessage
                            workSpec.set_pilot_error(
                                PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # get work attributes
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(
                        workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [
                            WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents
                    ]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(
                            workSpec)
                        retMap[workerID][
                            'eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if workSpec.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                    retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running or idle while there are events to update or files to stage out
                    if newStatus in [
                            WorkSpec.ST_finished, WorkSpec.ST_failed,
                            WorkSpec.ST_cancelled
                    ]:
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(
                                    newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                    workSpec.workerID,
                                    None,
                                    True,
                                    only_running=True,
                                    slim=True)
                                # post processing
                                messenger.post_processing(
                                    workSpec, jobSpecs, workSpec.mapType)
                            workSpec.post_processed()
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        # reset modification time to immediately trigger subsequent lookup
                        if not self.monitor_fifo.enabled:
                            workSpec.trigger_next_lookup()
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
                else:
                    tmp_log.debug(
                        'workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
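
The monitor code above relies on a simple scoring convention for its worker FIFO: a chunk is normally scored
with its check interval plus the current timestamp, while chunks that must be looked at first are given a much
smaller score (the FIFO cycle subtracts 2**32), so they dequeue before any regularly scheduled chunk, assuming
the FIFO pops the lowest score first. The snippet below is a minimal, self-contained sketch of that idea using
a plain min-heap as a stand-in FIFO; it is not Harvester code and all names are illustrative only.

import heapq
import time

fifo = []                      # (score, payload) min-heap standing in for the monitor FIFO
now = time.time()
check_interval = 300           # stand-in for fifoCheckInterval

# a regularly scheduled chunk becomes due check_interval seconds from now
heapq.heappush(fifo, (check_interval + now, 'regular chunk'))
# a chunk pushed to the head gets a -2**32 offset so it always sorts before regular chunks
heapq.heappush(fifo, (check_interval + now - 2**32, 'head chunk'))

print(heapq.heappop(fifo)[1])  # -> 'head chunk'
print(heapq.heappop(fifo)[1])  # -> 'regular chunk'
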
Example #42
class Propagator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self._last_stats_update = None
        self._last_metrics_update = None

    # main loop
    def run(self):
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            hbSuppressMap = dict()
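            # cache of no-heartbeat status lists per computingSite, so the queue config is read once per site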
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                              sw.get_elapsed_time()))
                # logging
                for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                        tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                                nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no workers
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                               PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                    tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                    tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                            tmpWorkSpec.status))
            mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers,
                                                                           sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName,
                                                                                                           tmpStr))

            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:

                # get active UPS queues. PanDA server needs to know about them and which harvester instance is taking
                # care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()

                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('updated worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name,
                                                                                                        tmp_str))

            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))

            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))

                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
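
Both the job loop and the worker loop in Propagator.run above use the same bulk-slicing pattern: walk the list
fetched from the DB in chunks of nJobsInBulk / nWorkersInBulk and hand each chunk to the communicator. Below is
a minimal, stand-alone sketch of that pattern with illustrative values only; the real code passes JobSpec or
WorkSpec objects and calls the communicator instead of printing.

job_specs = list(range(23))    # stand-in for the list returned by get_jobs_to_propagate
n_jobs = 10                    # stand-in for harvester_config.propagator.nJobsInBulk
i_jobs = 0
while i_jobs < len(job_specs):
    chunk = job_specs[i_jobs:i_jobs + n_jobs]
    i_jobs += n_jobs
    print('processing a chunk of {0} jobs'.format(len(chunk)))
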
Example #43
class CredManager(AgentBase):

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queue_config_mapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # plugin cores
        self.exeCores = []
        self.queue_exe_cores = []
        # get plugin from harvester config
        self.get_cores_from_harvester_config()
        # update plugin cores from queue config
        self.update_cores_from_queue_config()

    # get list
    def get_list(self, data):
        if isinstance(data, list):
            return data
        else:
            return [data]

    # get plugin cores from harvester config
    def get_cores_from_harvester_config(self):
        # get module and class names
        if hasattr(harvester_config.credmanager, 'moduleName'):
            moduleNames = self.get_list(
                harvester_config.credmanager.moduleName)
        else:
            moduleNames = []
        if hasattr(harvester_config.credmanager, 'className'):
            classNames = self.get_list(harvester_config.credmanager.className)
        else:
            classNames = []
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(
                harvester_config.credmanager.inCertFile)
        elif hasattr(harvester_config.credmanager, 'certFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        else:
            inCertFiles = []
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(
                harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        if hasattr(harvester_config.credmanager, 'voms'):
            vomses = self.get_list(harvester_config.credmanager.voms)
        else:
            vomses = []
        # direct and merged plugin configuration in json
        if hasattr(harvester_config.credmanager, 'pluginConfigs'):
            pluginConfigs = harvester_config.credmanager.pluginConfigs
        else:
            pluginConfigs = []
        # from traditional attributes
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            try:
                exeCore = self.pluginFactory.get_plugin(pluginPar)
                self.exeCores.append(exeCore)
            except Exception:
                _logger.error(
                    'failed to launch credmanager with traditional attributes for {0}'
                    .format(pluginPar))
                core_utils.dump_error_message(_logger)
        # from pluginConfigs
        for pc in pluginConfigs:
            try:
                setup_maps = pc['configs']
                for setup_name, setup_map in setup_maps.items():
                    try:
                        pluginPar = {}
                        pluginPar['module'] = pc['module']
                        pluginPar['name'] = pc['name']
                        pluginPar['setup_name'] = setup_name
                        pluginPar.update(setup_map)
                        exeCore = self.pluginFactory.get_plugin(pluginPar)
                        self.exeCores.append(exeCore)
                    except Exception:
                        _logger.error(
                            'failed to launch credmanager in pluginConfigs for {0}'
                            .format(pluginPar))
                        core_utils.dump_error_message(_logger)
            except Exception:
                _logger.error('failed to parse pluginConfigs {0}'.format(pc))
                core_utils.dump_error_message(_logger)

    # update plugin cores from queue config
    def update_cores_from_queue_config(self):
        self.queue_exe_cores = []
        for queue_name, queue_config in self.queue_config_mapper.get_all_queues(
        ).items():
            if queue_config.queueStatus == 'offline' \
                    or not hasattr(queue_config, 'credmanagers') \
                    or not isinstance(queue_config.credmanagers, list):
                continue
            for cm_setup in queue_config.credmanagers:
                try:
                    pluginPar = {}
                    pluginPar['module'] = cm_setup['module']
                    pluginPar['name'] = cm_setup['name']
                    pluginPar['setup_name'] = queue_name
                    for k, v in cm_setup.items():
                        if k in ('module', 'name'):
                            # module and name are already set above
                            continue
                        if isinstance(v, str) and '$' in v:
                            # replace placeholders
                            value = v
                            patts = re.findall(r'\$\{([a-zA-Z\d_.]+)\}', v)
                            for patt in patts:
                                tmp_ph = '${' + patt + '}'
                                tmp_val = None
                                if patt == 'harvesterID':
                                    tmp_val = harvester_config.master.harvester_id
                                elif patt == 'queueName':
                                    tmp_val = queue_name
                                elif patt.startswith('common.'):
                                    # values from common blocks
                                    attr = patt.replace('common.', '')
                                    if hasattr(
                                            queue_config, 'common'
                                    ) and attr in queue_config.common:
                                        tmp_val = queue_config.common[attr]
                                if tmp_val is not None:
                                    value = value.replace(tmp_ph, tmp_val)
                            # fill in
                            pluginPar[k] = value
                        else:
                            # fill in
                            pluginPar[k] = v
                    exe_core = self.pluginFactory.get_plugin(pluginPar)
                    self.queue_exe_cores.append(exe_core)
                except Exception:
                    _logger.error(
                        'failed to launch credmanager for queue={0} with {1}'.format(
                            queue_name, pluginPar))
                    core_utils.dump_error_message(_logger)

    # main loop
    def run(self):
        while True:
            # update plugin cores from queue config
            self.update_cores_from_queue_config()

            # execute
            self.execute()  # this is the main run

            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime,
                               randomize=False):
                return

    # main
    def execute(self):
        # get lock
        locked = self.dbProxy.get_process_lock(
            'credmanager', self.get_pid(),
            harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            credmanager_name = ''
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile,
                                                    exeCore.outCertFile)
            mainLog = self.make_logger(_logger,
                                       '{0} {1}'.format(
                                           exeCore.__class__.__name__,
                                           credmanager_name),
                                       method_name='execute')
            try:
                # check credential
                mainLog.debug('check credential')
                isValid = exeCore.check_credential()
                if isValid:
                    mainLog.debug('valid')
                else:
                    # invalid, so renew it
                    mainLog.debug('invalid')
                    mainLog.debug('renew credential')
                    tmpStat, tmpOut = exeCore.renew_credential()
                    if not tmpStat:
                        mainLog.error('failed : {0}'.format(tmpOut))
                        continue
            except Exception:
                core_utils.dump_error_message(mainLog)
            mainLog.debug('done')

    # monit main
    def execute_monit(self):
        self.update_cores_from_queue_config()

        metrics = {}
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue

            # make logger
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile,
                                                    exeCore.outCertFile)

            subLog = self.make_logger(_logger,
                                      '{0} {1}'.format(
                                          exeCore.__class__.__name__,
                                          credmanager_name),
                                      method_name='execute_monit')
            try:
                # check credential
                subLog.debug('check credential lifetime')
                lifetime = exeCore.check_credential_lifetime()
                if lifetime is not None:
                    metrics[exeCore.outCertFile] = lifetime
            except Exception:
                core_utils.dump_error_message(subLog)

            subLog.debug('done')

        return metrics
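The placeholder handling inside update_cores_from_queue_config is easier to see in isolation. A minimal sketch of the same substitution follows; the queue name, harvester ID and common block passed to it are hypothetical values, not taken from a real queue configuration.

import re

def resolve_placeholders(value, queue_name, harvester_id, common=None):
    # replace ${harvesterID}, ${queueName} and ${common.<attr>} in a credmanager parameter
    common = common or {}
    for patt in re.findall(r'\$\{([a-zA-Z\d_.]+)\}', value):
        if patt == 'harvesterID':
            tmp_val = harvester_id
        elif patt == 'queueName':
            tmp_val = queue_name
        elif patt.startswith('common.'):
            tmp_val = common.get(patt[len('common.'):])
        else:
            tmp_val = None
        if tmp_val is not None:
            value = value.replace('${' + patt + '}', tmp_val)
    return value

# resolve_placeholders('/cred/x509_${queueName}.proxy', 'SOME_QUEUE', 'harvester_dev')
# -> '/cred/x509_SOME_QUEUE.proxy'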
Example #44
class WorkerAdjuster(object):
    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data

            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data

            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))

                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby',
                                                                                     'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status'])
                        continue

                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue

                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler

                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue

                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None

                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue,
                                                                                                          nReady,
                                                                                                          nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:

                        maxQueuedWorkers = None

                        if nQueueLimitPerRT > 0:  # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT

                        # adjust maxQueuedWorkers according to the particular workflow (slave or pull mode)
                        if nNewWorkersDef is not None:  # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave

                        elif queueConfig.mapType == 'NoJob': # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1) # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')
                                pass

                        if maxQueuedWorkers is None:  # no value found, use default value
                            maxQueuedWorkers = 1

                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                    if queueConfig.maxNewWorkersPerCycle > 0:
                        nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                     .format(nNewWorkers))
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                        tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'
                                     .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers

                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    total_new_workers_rts = sum(dyn_num_workers[queueName][_rt]['nNewWorkers']
                                                if _rt != 'ANY' else 0
                                                for _rt in dyn_num_workers[queueName])
                    nNewWorkers_max_agg = min(max(nQueueLimit - nQueue_total, 0),
                                              max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0))
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            _d = dyn_num_workers[queueName].copy()
                            del _d['ANY']
                            simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ]
                            _countdown = nNewWorkers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for resource_type in dyn_num_workers[queueName]:
                            if resource_type == 'ANY':
                                continue
                            nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                            tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
                                         .format(nNewWorkers, resource_type))

                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])

                self.apf_mon.update_label(queueName, apf_msg, apf_data)

            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return None
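From the loops above, static_num_workers is a nested mapping of queue name to resource type to worker counters, and define_num_workers returns the same structure with nNewWorkers recomputed (or None on error). A hedged usage sketch with hypothetical queue, site and resource-type names:

static_num_workers = {
    'SOME_QUEUE': {
        'SCORE': {'nQueue': 5, 'nReady': 0, 'nRunning': 40, 'nNewWorkers': 0},
        'MCORE': {'nQueue': 2, 'nReady': 1, 'nRunning': 10, 'nNewWorkers': 0},
        'ANY':   {'nQueue': 7, 'nReady': 1, 'nRunning': 50, 'nNewWorkers': 0},
    }
}
# adjuster = WorkerAdjuster(queue_config_mapper)
# dyn_num_workers = adjuster.define_num_workers(static_num_workers, site_name='SOME_SITE')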
Example #45
try:
    os.remove(harvester_config.db.database_filename)
except Exception:
    pass

for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

queueConfigMapper = QueueConfigMapper()

proxy = DBProxy()
proxy.make_tables(queueConfigMapper)

job = JobSpec()
job.PandaID = 1

job.modificationTime = datetime.datetime.now()
proxy.insert_jobs([job])

newJob = proxy.get_job(1)

a = CommunicatorPool()
a.get_jobs('siteName', 'nodeName', 'prodSourceLabel', 'computingElement', 1,
           {})
Example #46
class CommandManager(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.nodeName = socket.gethostname()
        self.lastHeartbeat = None

    # set single mode
    def set_single_mode(self, single_mode):
        self.singleMode = single_mode

    def convert_to_command_specs(self, commands):
        """
        Generates a list of CommandSpec objects
        """
        command_specs = []
        for command in commands:
            command_spec = CommandSpec()
            command_spec.convert_command_json(command)
            for comStr, receiver in iteritems(CommandSpec.receiver_map):
                if command_spec.command.startswith(comStr):
                    command_spec.receiver = receiver
                    break
            if command_spec.receiver is not None:
                command_specs.append(command_spec)
        return command_specs

    def run(self):
        """
        main
        """
        main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                harvester_config.commandmanager.sleepTime)
        if locked:
            # send command list to be received
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {'command': CommandSpec.COM_reportWorkerStats,
                                   'computingSite': queueConfig.siteName,
                                   'resourceType': queueConfig.resourceType
                                   }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {'command': CommandSpec.COM_setNWorkers,
                               'computingSite': queueConfig.siteName,
                               'resourceType': queueConfig.resourceType
                               }
                commandList.append(commandItem)
            data = {'startTime': datetime.datetime.utcnow(),
                    'sw_version': panda_pkg_info.release_version,
                    'commit_stamp': commit_timestamp.timestamp}
            if len(commandList) > 0:
                main_log.debug('sending command list to receive')
                data['commands'] = commandList
            self.communicator.is_alive(data)

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                    harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:

                main_log.debug('polling commands loop')

                # send heartbeat
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive({})

                continuous_loop = True  # as long as there are commands, retrieve them

                while continuous_loop:

                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)

                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))

                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()

                    for shard in core_utils.create_shards(command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug('acknowledged {0} commands to panda server'.format(len(shard)))

                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)

                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()

                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False

            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
                main_log.debug('terminated')
                return
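The acknowledgement loop above relies on core_utils.create_shards to split the list of command IDs into chunks of bulk_size. Below is a local stand-in written only from that usage, so the chunking can be tried outside Harvester; the real helper may differ in detail.

def create_shards(items, shard_size):
    # yield successive chunks of at most shard_size elements
    for i in range(0, len(items), shard_size):
        yield items[i:i + shard_size]

command_ids_ack = list(range(250))
for shard in create_shards(command_ids_ack, 100):
    # in CommandManager: communicator.ack_commands(shard); db_proxy.clean_commands_by_id(shard)
    print(len(shard))  # 100, 100, 50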
Example #47
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval,
                                                                        lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams,
                                                                         scattered)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(len(eventList),
                                                                                                      pandaID,
                                                                                                      nRanges))
                        # disable multi workers
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
Example #48
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
Example #49
for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

pp = pprint.PrettyPrinter(indent=4)

queueConfigMapper = QueueConfigMapper()

proxy = DBProxy()

sqlJ = "SELECT * FROM job_table"

resultsJobcur = proxy.execute(sqlJ)
resultsJob = resultsJobcur.fetchall()
proxy.commit()

sqlF = "SELECT * FROM file_table"

resultsFilescur = proxy.execute(sqlF)
resultsFiles = resultsFilescur.fetchall()
proxy.commit()

print("job_table - ")
print(resultsJob[0].keys())
Example #50
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                            workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat, mc_tmpOut))
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
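Both the killing and the cleanup stages above use the same fallback pattern: call the sweeper plugin's bulk kill_workers if it exists, otherwise kill one worker at a time. A minimal sketch of that pattern; kill_all is an illustrative helper, not part of Harvester.

def kill_all(sweeper_core, workspec_list):
    try:
        # prefer the bulk method when the plugin provides one
        return sweeper_core.kill_workers(workspec_list)
    except AttributeError:
        # fall back to the per-worker method
        return [sweeper_core.kill_worker(workspec) for workspec in workspec_list]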
Example #51
class WorkerMaker(object):
    # constructor
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()

    # get plugin
    def get_plugin(self, queue_config):
        return self.pluginFactory.get_plugin(queue_config.workerMaker)

    # make workers
    def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
        tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                        method_name='make_workers')
        tmpLog.debug('start')
        try:
            # get plugin
            if maker is None:
                maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
            if maker is None:
                # not found
                tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
                return [], jobchunk_list
            # get ready workers
            readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
            # loop over all chunks
            okChunks = []
            ngChunks = []
            for iChunk, jobChunk in enumerate(jobchunk_list):
                # make a worker
                if iChunk >= n_ready:
                    workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
                else:
                    # use ready worker
                    if iChunk < len(readyWorkers):
                        workSpec = readyWorkers[iChunk]
                    else:
                        workSpec = None
                # failed
                if workSpec is None:
                    ngChunks.append(jobChunk)
                    continue
                # set workerID
                if workSpec.workerID is None:
                    workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                    workSpec.configID = queue_config.configID
                    workSpec.isNew = True
                okChunks.append((workSpec, jobChunk))
            # dump
            tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks),
                                                                           len(ngChunks)))
            return okChunks, ngChunks
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return [], jobchunk_list

    # get number of jobs per worker
    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_jobs_per_worker(n_workers)

    # get number of workers per job
    def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_workers_per_job(n_workers)

    # check number of ready resources
    def num_ready_resources(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.num_ready_resources()

    # get upper limit on the cumulative total of workers per job
    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_in_total()

    # get upper limit on the number of new workers per job in a cycle
    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_per_cycle()
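A hedged usage sketch for make_workers, written from the code above: the first n_ready chunks are served from workers already in ready state, the rest are created by the worker-maker plugin. The chunk and config names below are hypothetical.

# worker_maker = WorkerMaker()
# ok_chunks, ng_chunks = worker_maker.make_workers(jobchunk_list=[chunk_a, chunk_b],
#                                                  queue_config=queue_config,
#                                                  n_ready=0,
#                                                  resource_type='SCORE')
# ok_chunks: list of (workSpec, jobChunk) pairs, each workSpec given a fresh workerID
# ng_chunks: job chunks for which no worker could be made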
Example #52

    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()
Example #53
class SimpleThrottler(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()

    # check if to be throttled
    def to_be_throttled(self, queue_config):
        tmpLog = self.make_logger(baseLogger,
                                  'computingSite={0}'.format(
                                      queue_config.queueName),
                                  method_name='to_be_throttled')
        tmpLog.debug('start')
        # set default return value
        if self.logicType == 'OR':
            retVal = False, "no rule was satisfied"
        else:
            retVal = True, "all rules were satisfied"
        # loop over all rules
        criteriaList = []
        maxMissedList = []
        timeNow = datetime.datetime.utcnow()
        for rule in self.rulesForMissed:
            # convert rule to criteria
            if rule['level'] == 'site':
                criteria = dict()
                criteria['siteName'] = queue_config.siteName
                criteria['timeLimit'] = timeNow - datetime.timedelta(
                    minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'pq':
                criteria = dict()
                criteria['computingSite'] = queue_config.queueName
                criteria['timeLimit'] = timeNow - datetime.timedelta(
                    minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'ce':
                elmName = 'computingElements'
                if elmName not in queue_config.submitter:
                    tmpLog.debug(
                        'skipped since {0} is undefined in submitter config'.
                        format(elmName))
                    continue
                for ce in queue_config.submitter[elmName]:
                    criteria = dict()
                    criteria['computingElement'] = ce
                    criteria['timeLimit'] = timeNow - datetime.timedelta(
                        minutes=rule['timeWindow'])
                    criteriaList.append(criteria)
                    maxMissedList.append(rule['maxMissed'])
        # loop over all criteria
        for criteria, maxMissed in zip(criteriaList, maxMissedList):
            nMissed = self.dbProxy.get_num_missed_workers(
                queue_config.queueName, criteria)
            if nMissed > maxMissed:
                if self.logicType == 'OR':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} > maxMissed={1} for {2}'.format(
                        nMissed, maxMissed, str(criteria))
                    retVal = True, tmpMsg
                    break
            else:
                if self.logicType == 'AND':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} <= maxMissed={1} for {2}'.format(
                        nMissed, maxMissed, str(criteria))
                    retVal = False, tmpMsg
                    break
        tmpLog.debug('ret={0} : {1}'.format(*retVal))
        return retVal
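to_be_throttled above expects self.rulesForMissed, normally injected by the plugin factory from the queue configuration. From how the rules are read, each entry carries a level ('site', 'pq' or 'ce'), a timeWindow in minutes and a maxMissed threshold; the concrete numbers below are purely illustrative.

rules_for_missed = [
    {'level': 'site', 'timeWindow': 60, 'maxMissed': 50},
    {'level': 'pq',   'timeWindow': 30, 'maxMissed': 20},
    {'level': 'ce',   'timeWindow': 30, 'maxMissed': 10},
]
# in the queue config this would appear among the throttler plugin parameters, e.g.
# {"name": "SimpleThrottler", "module": "...", "logicType": "OR", "rulesForMissed": rules_for_missed}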
Example #54
class JobFetcher(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.nodeName = socket.gethostname()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        while True:
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(self.get_pid()),
                                       method_name='run')
            mainLog.debug('getting number of jobs to be fetched')
            # get number of jobs to be fetched
            nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(
                harvester_config.jobfetcher.nQueues,
                harvester_config.jobfetcher.lookupTime)
            mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
            # loop over all queues
            for queueName, nJobs in iteritems(nJobsPerQueue):
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    continue
                tmpLog = self.make_logger(_logger,
                                          'queueName={0}'.format(queueName),
                                          method_name='run')
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # upper limit
                if nJobs > harvester_config.jobfetcher.maxJobs:
                    nJobs = harvester_config.jobfetcher.maxJobs
                # get jobs
                default_prodSourceLabel = queueConfig.get_source_label()
                pdpm = getattr(queueConfig,
                               'prodSourceLabelRandomWeightsPermille', {})
                choice_list = core_utils.make_choice_list(
                    pdpm=pdpm, default=default_prodSourceLabel)
                prodSourceLabel = random.choice(choice_list)
                tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(
                    nJobs, prodSourceLabel))
                sw = core_utils.get_stopwatch()
                siteName = queueConfig.siteName
                jobs, errStr = self.communicator.get_jobs(
                    siteName, self.nodeName, prodSourceLabel, self.nodeName,
                    nJobs, queueConfig.getJobCriteria)
                tmpLog.info('got {0} jobs with {1} {2}'.format(
                    len(jobs), errStr, sw.get_elapsed_time()))
                # convert to JobSpec
                if len(jobs) > 0:
                    # get extractor plugin
                    if hasattr(queueConfig, 'extractor'):
                        extractorCore = self.pluginFactory.get_plugin(
                            queueConfig.extractor)
                    else:
                        extractorCore = None
                    jobSpecs = []
                    fileStatMap = dict()
                    sw_startconvert = core_utils.get_stopwatch()
                    for job in jobs:
                        timeNow = datetime.datetime.utcnow()
                        jobSpec = JobSpec()
                        jobSpec.convert_job_json(job)
                        jobSpec.computingSite = queueName
                        jobSpec.status = 'starting'
                        jobSpec.subStatus = 'fetched'
                        jobSpec.creationTime = timeNow
                        jobSpec.stateChangeTime = timeNow
                        jobSpec.configID = queueConfig.configID
                        jobSpec.set_one_attribute(
                            'schedulerID', 'harvester-{0}'.format(
                                harvester_config.master.harvester_id))
                        if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                            jobSpec.zipPerMB = queueConfig.zipPerMB
                        fileGroupDictList = [
                            jobSpec.get_input_file_attributes()
                        ]
                        if extractorCore is not None:
                            fileGroupDictList.append(
                                extractorCore.get_aux_inputs(jobSpec))
                        for fileGroupDict in fileGroupDictList:
                            for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                                # check file status
                                if tmpLFN not in fileStatMap:
                                    fileStatMap[
                                        tmpLFN] = self.dbProxy.get_file_status(
                                            tmpLFN, 'input',
                                            queueConfig.ddmEndpointIn,
                                            'starting')
                                # make file spec
                                fileSpec = FileSpec()
                                fileSpec.PandaID = jobSpec.PandaID
                                fileSpec.taskID = jobSpec.taskID
                                fileSpec.lfn = tmpLFN
                                fileSpec.endpoint = queueConfig.ddmEndpointIn
                                fileSpec.scope = fileAttrs['scope']
                                # set preparing to skip stage-in if the file is (being) taken care of by another job
                                if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \
                                        or 'to_prepare' in fileStatMap[tmpLFN]:
                                    fileSpec.status = 'preparing'
                                else:
                                    fileSpec.status = 'to_prepare'
                                if fileSpec.status not in fileStatMap[tmpLFN]:
                                    fileStatMap[tmpLFN][fileSpec.status] = 0
                                fileStatMap[tmpLFN][fileSpec.status] += 1
                                if 'INTERNAL_FileType' in fileAttrs:
                                    fileSpec.fileType = fileAttrs[
                                        'INTERNAL_FileType']
                                    jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                                else:
                                    fileSpec.fileType = 'input'
                                if 'INTERNAL_URL' in fileAttrs:
                                    fileSpec.url = fileAttrs['INTERNAL_URL']
                                jobSpec.add_in_file(fileSpec)
                        jobSpec.trigger_propagation()
                        jobSpecs.append(jobSpec)
                    # insert to DB
                    tmpLog.debug("Converting of {0} jobs {1}".format(
                        len(jobs), sw_startconvert.get_elapsed_time()))
                    sw_insertdb = core_utils.get_stopwatch()
                    self.dbProxy.insert_jobs(jobSpecs)
                    tmpLog.debug('Insert of {0} jobs {1}'.format(
                        len(jobSpecs), sw_insertdb.get_elapsed_time()))
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.jobfetcher.sleepTime):
                mainLog.debug('terminated')
                return
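The prodSourceLabel selection above draws a label at random from core_utils.make_choice_list, fed by the per-queue prodSourceLabelRandomWeightsPermille mapping. Below is a minimal sketch of how such a permille-weighted choice list could be built, assuming the default label fills whatever is left of the 1000-permille budget; it is an illustrative stand-in, not the actual core_utils implementation.

import random

def make_choice_list_sketch(pdpm, default):
    """Illustrative stand-in for core_utils.make_choice_list (assumption:
    weights are permille and the default label fills the remaining budget)."""
    choice_list = []
    used = 0
    for label, weight in pdpm.items():
        choice_list += [label] * weight
        used += weight
    choice_list += [default] * max(1000 - used, 0)
    return choice_list

# e.g. roughly 10% of fetch cycles would ask for 'rc_test' jobs, the rest 'managed'
labels = make_choice_list_sketch({'rc_test': 100}, 'managed')
print(random.choice(labels))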
Example #55
class EventFeeder(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval,
                                                                        lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams,
                                                                         scattered,
                                                                         workSpec.get_access_point())
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(len(eventList),
                                                                                                      pandaID,
                                                                                                      nRanges))
                        # disable multi workers
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
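For reference, the dump loop above treats the result of get_event_ranges as a dict keyed by PandaID with a list of event ranges per job, and reads nRanges back from eventsRequestParams to detect under-delivery. The following self-contained illustration mirrors that bookkeeping; the data shapes are assumptions inferred from the loop, not the communicator's documented contract.

# hypothetical shapes inferred from the loop above
events = {1234567: [{'eventRangeID': '1234567-1-0-1'},
                    {'eventRangeID': '1234567-1-0-2'}]}      # ranges actually delivered
events_request_params = {1234567: {'nRanges': 5}}            # ranges originally requested

for panda_id, event_list in events.items():
    n_ranges = events_request_params.get(panda_id, {}).get('nRanges')
    # an empty or short delivery is what triggers disable_multi_workers above
    under_delivered = len(event_list) == 0 or (n_ranges is not None and len(event_list) < n_ranges)
    print(panda_id, len(event_list), n_ranges, under_delivered)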
Example #56
class SimpleThrottler(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()

    # check if to be throttled
    def to_be_throttled(self, queue_config):
        tmpLog = self.make_logger(baseLogger, 'computingSite={0}'.format(queue_config.queueName),
                                  method_name='to_be_throttled')
        tmpLog.debug('start')
        # set default return value
        if self.logicType == 'OR':
            retVal = False, "no rule was satisfied"
        else:
            retVal = True, "all rules were satisfied"
        # loop over all rules
        criteriaList = []
        maxMissedList = []
        timeNow = datetime.datetime.utcnow()
        for rule in self.rulesForMissed:
            # convert rule to criteria
            if rule['level'] == 'site':
                criteria = dict()
                criteria['siteName'] = queue_config.siteName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'pq':
                criteria = dict()
                criteria['computingSite'] = queue_config.queueName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'ce':
                elmName = 'computingElements'
                if elmName not in queue_config.submitter:
                    tmpLog.debug('skipped since {0} is undefined in submitter config'.format(elmName))
                    continue
                for ce in queue_config.submitter[elmName]:
                    criteria = dict()
                    criteria['computingElement'] = ce
                    criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                    criteriaList.append(criteria)
                    maxMissedList.append(rule['maxMissed'])
        # loop over all criteria
        for criteria, maxMissed in zip(criteriaList, maxMissedList):
            nMissed = self.dbProxy.get_num_missed_workers(queue_config.queueName, criteria)
            if nMissed > maxMissed:
                if self.logicType == 'OR':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} > maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = True, tmpMsg
                    break
            else:
                if self.logicType == 'AND':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} <= maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = False, tmpMsg
                    break
        tmpLog.debug('ret={0} : {1}'.format(*retVal))
        return retVal
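The plugin expects rulesForMissed to be a list of dicts carrying a level ('site', 'pq', or 'ce'), a timeWindow in minutes, and a maxMissed threshold, while logicType switches between OR and AND evaluation. The sketch below shows parameters that would exercise the site and queue levels; only the keys read by to_be_throttled are grounded in the code above, and the surrounding structure and module path are assumptions for illustration.

# hypothetical throttler parameters
throttler_params = {
    'name': 'SimpleThrottler',
    'module': 'pandaharvester.harvesterthrottler.simple_throttler',  # assumed path
    'logicType': 'AND',   # throttle only when every rule is satisfied
    'rulesForMissed': [
        {'level': 'site', 'timeWindow': 30, 'maxMissed': 10},  # missed workers per site, last 30 min
        {'level': 'pq', 'timeWindow': 30, 'maxMissed': 5},     # missed workers per PanDA queue
    ],
}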
Example #57
class JobFetcher(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.nodeName = socket.gethostname()
        self.queueConfigMapper = queue_config_mapper

    # main loop
    def run(self):
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting number of jobs to be fetched')
            # get number of jobs to be fetched
            nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                               harvester_config.jobfetcher.lookupTime)
            mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
            # loop over all queues
            for queueName, nJobs in iteritems(nJobsPerQueue):
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    continue
                tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName),
                                          method_name='run')
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # upper limit
                if nJobs > harvester_config.jobfetcher.maxJobs:
                    nJobs = harvester_config.jobfetcher.maxJobs
                # get jobs
                tmpLog.debug('getting {0} jobs'.format(nJobs))
                sw = core_utils.get_stopwatch()
                siteName = queueConfig.siteName
                jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName,
                                                          queueConfig.get_source_label(),
                                                          self.nodeName, nJobs,
                                                          queueConfig.getJobCriteria)
                tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time()))
                # convert to JobSpec
                if len(jobs) > 0:
                    jobSpecs = []
                    fileStatMap = dict()
                    sw_startconvert = core_utils.get_stopwatch()
                    for job in jobs:
                        timeNow = datetime.datetime.utcnow()
                        jobSpec = JobSpec()
                        jobSpec.convert_job_json(job)
                        jobSpec.computingSite = queueName
                        jobSpec.status = 'starting'
                        jobSpec.subStatus = 'fetched'
                        jobSpec.creationTime = timeNow
                        jobSpec.stateChangeTime = timeNow
                        jobSpec.configID = queueConfig.configID
                        jobSpec.set_one_attribute('schedulerID',
                                                  'harvester-{0}'.format(harvester_config.master.harvester_id))
                        if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                            jobSpec.zipPerMB = queueConfig.zipPerMB
                        for tmpLFN, fileAttrs in iteritems(jobSpec.get_input_file_attributes()):
                            # check file status
                            if tmpLFN not in fileStatMap:
                                fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input',
                                                                                   queueConfig.ddmEndpointIn,
                                                                                   'starting')
                            # make file spec
                            fileSpec = FileSpec()
                            fileSpec.PandaID = jobSpec.PandaID
                            fileSpec.taskID = jobSpec.taskID
                            fileSpec.lfn = tmpLFN
                            fileSpec.endpoint = queueConfig.ddmEndpointIn
                            fileSpec.scope = fileAttrs['scope']
                            # set preparing to skip stage-in if the file is (being) taken care of by another job
                            if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \
                                    or 'to_prepare' in fileStatMap[tmpLFN]:
                                fileSpec.status = 'preparing'
                            else:
                                fileSpec.status = 'to_prepare'
                            if fileSpec.status not in fileStatMap[tmpLFN]:
                                fileStatMap[tmpLFN][fileSpec.status] = 0
                            fileStatMap[tmpLFN][fileSpec.status] += 1
                            fileSpec.fileType = 'input'
                            jobSpec.add_in_file(fileSpec)
                        jobSpec.trigger_propagation()
                        jobSpecs.append(jobSpec)
                    # insert to DB
                    tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs),sw_startconvert.get_elapsed_time()))
                    sw_insertdb =core_utils.get_stopwatch()
                    self.dbProxy.insert_jobs(jobSpecs)
                    tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time()))
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.jobfetcher.sleepTime):
                mainLog.debug('terminated')
                return
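The fileStatMap bookkeeping above decides whether a fetched input file needs its own stage-in ('to_prepare') or can wait for another job already handling the same LFN ('preparing'). Here is a compact sketch of that decision with the DB lookup stubbed out; the {status: count} shape of the lookup result is an assumption drawn from the loop above.

def choose_input_status(lfn, file_stat_map, lookup_file_status):
    # lookup_file_status stands in for DBProxy.get_file_status (stubbed here)
    if lfn not in file_stat_map:
        file_stat_map[lfn] = lookup_file_status(lfn)
    known = file_stat_map[lfn]
    # another job is (or will be) staging this LFN in, so just reuse its transfer
    status = 'preparing' if {'ready', 'preparing', 'to_prepare'} & set(known) else 'to_prepare'
    known[status] = known.get(status, 0) + 1
    return status

stat_map = {}
print(choose_input_status('EVNT.pool.root.1', stat_map, lambda lfn: {}))  # 'to_prepare'
print(choose_input_status('EVNT.pool.root.1', stat_map, lambda lfn: {}))  # 'preparing'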
Example #58
class Stager(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()


    # main loop
    def run(self):
        lockedBy = 'stager-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
            except Exception:
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                              harvester_config.stager.checkInterval,
                                                              harvester_config.stager.lockInterval,
                                                              lockedBy, 'transferring',
                                                              JobSpec.HO_hasTransfer,
                                                              max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # on-going
                        tmpLog.debug('try to check later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger stage-out
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                                harvester_config.stager.triggerInterval,
                                                                harvester_config.stager.lockInterval,
                                                                lockedBy, 'to_transfer',
                                                                JobSpec.HO_hasOutput,
                                                                JobSpec.HO_hasZipOutput,
                                                                max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
            # loop over all jobs
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to trigger stage-out')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger stage-out
                    tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.all_files_triggered_to_stage_out()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                        # update job
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # temporary error
                        tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to zip output
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
            except Exception:
                maxFilesPerJob = None
            try:
                zipInterval = harvester_config.stager.zipInterval
            except Exception:
                zipInterval = harvester_config.stager.triggerInterval
            jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                            zipInterval,
                                                            harvester_config.stager.lockInterval,
                                                            lockedBy, 'to_transfer',
                                                            JobSpec.HO_hasZipOutput,
                                                            JobSpec.HO_hasOutput,
                                                            max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
            # loop over all jobs
            for jobSpec in jobsToZip:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to zip output')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                    # succeeded
                    if tmpStat is True:
                        # update job
                        jobSpec.all_files_zipped()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                        tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                    else:
                        # failed
                        tmpLog.debug('failed to zip with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.stager.sleepTime):
                mainLog.debug('terminated')
                return
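The repeated try/except blocks above simply make several stager settings optional, with zipInterval falling back to triggerInterval when absent. An equivalent way to read them with getattr defaults is sketched below; the agent itself keeps the explicit try/except style, and only the attribute names read in the loop above are grounded.

# sketch only; harvester_config import as used by the agents in these examples
from pandaharvester.harvesterconfig import harvester_config

max_files_to_check = getattr(harvester_config.stager, 'maxFilesPerJobToCheck', None)
max_files_to_trigger = getattr(harvester_config.stager, 'maxFilesPerJobToTrigger', None)
max_files_to_zip = getattr(harvester_config.stager, 'maxFilesPerJobToZip', None)
zip_interval = getattr(harvester_config.stager, 'zipInterval',
                       harvester_config.stager.triggerInterval)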