Пример #1
0
 def doForWaitingJobs(self):
     """Activate or kill jobs which wait for a lib.tgz to be produced.

     The lib.tgz files for waiting jobs are retrieved through the task
     buffer and handled according to the status of each file:

     * ``failed``: the job that produced the file is peeked and handed to
       ``updateJobs``
     * ``finished``: the file metadata is registered with ``setGUIDs`` and
       an ``Activator`` is run on the lib dataset
     * anything else: the jobs keep waiting and only a message is logged
     """
     mainLog = MsgWrapper(logger, 'doForWaitingJobs label=user')
     # check every 60 min
     intervalInMin = 60
     libFiles = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo, self.prodSourceLabel, intervalInMin)
     mainLog.debug('got {0} lib.tgz files'.format(len(libFiles)))
     for userName, libDsName, libFile in libFiles:
         # dedicated logger carrying the task ID of the lib.tgz
         fileLog = MsgWrapper(logger, '< #ATM #KV doForWaitingJobs jediTaskID={0} label=user >'.format(libFile.jediTaskID))
         fileLog.debug('start')

         if libFile.status == 'failed':
             # look up the job which was supposed to produce the lib.tgz
             peeked = self.taskBufferIF.peekJobs([libFile.PandaID],
                                                 fromDefined=False,
                                                 fromActive=False,
                                                 fromWaiting=False)
             buildJob = peeked[0]
             if buildJob is None:
                 # the job could not be found
                 fileLog.error('  cannot find PandaJobSpec for user="******" with PandaID={1}'.format(userName,
                                                                                                  libFile.PandaID))
                 continue
             # kill
             self.taskBufferIF.updateJobs([buildJob], False)
             fileLog.debug('  action=killed_downstream_jobs for user="******" with libDS={1}'.format(userName, libDsName))
             continue

         if libFile.status != 'finished':
             # lib.tgz is not ready yet
             fileLog.debug('  keep waiting for user="******" libDS={1}'.format(userName, libDsName))
             continue

         # lib.tgz is available: register its metadata
         libMeta = {'guid': libFile.GUID,
                    'lfn': libFile.lfn,
                    'checksum': libFile.checksum,
                    'fsize': libFile.fsize,
                    'scope': libFile.scope,
                    }
         self.taskBufferIF.setGUIDs([libMeta])
         # look up the dataset which holds the lib.tgz
         libDataset = self.taskBufferIF.queryDatasetWithMap({'name': libDsName})
         if libDataset is None:
             fileLog.error('  cannot find datasetSpec for user="******" with libDS={1}'.format(userName, libDsName))
             continue
         # activate the jobs and wait for the activator to finish
         activator = Activator(self.taskBufferIF, libDataset)
         activator.start()
         activator.join()
         fileLog.debug('  action=activated_downstream_jobs for user="******" with libDS={1}'.format(userName, libDsName))
Пример #2
0
 def doCleanDataLocality(self):
     """Delete dataset locality records which are older than 24 hours.

     The process lock is requested first; when it is not obtained the
     method returns without deleting anything. Exceptions are logged
     together with the traceback and are not propagated.
     """
     tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
     tmpLog.debug('start')
     try:
         # only one process is supposed to do the cleanup
         locked = self.taskBufferIF.lockProcess_JEDI(
             vo=self.vo,
             prodSourceLabel='default',
             cloud=None,
             workqueue_id=None,
             resource_name=None,
             component='AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
             pid=self.pid,
             timeLimit=1440)
         if not locked:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # records older than the lifetime (in hours) are removed
         lifetime_hours = 24
         current_time = datetime.datetime.utcnow()
         cutoff_time = current_time - datetime.timedelta(hours=lifetime_hours)
         n_deleted = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(cutoff_time)
         tmpLog.info('cleaned up {0} records'.format(n_deleted))
         tmpLog.debug('done')
     except Exception as exc:
         tmpLog.error('failed with {0} {1} {2}'.format(type(exc), exc, traceback.format_exc()))
Пример #3
0
 def doAction(self):
     """Undo preassigned tasks, preassign tasks to sites and reassign jobs.

     Nothing is done when the lock cannot be obtained. Exceptions raised by
     the individual steps are logged with the traceback and swallowed.

     Returns:
         ``self.SC_SUCCEEDED`` in all cases.
     """
     # the logger is created before the try block so that the except handler
     # and the final message can always rely on it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # lock
         got_lock = self._get_lock()
         if not got_lock:
             origTmpLog.debug('locked by another process. Skipped')
             return self.SC_SUCCEEDED
         origTmpLog.debug('got lock')
         # undo preassigned tasks
         self.undo_preassign()
         # preassign tasks to sites
         ret_map = self.do_preassign()
         # NOTE: the lock is not released explicitly here
         # to-reassign map
         to_reassign_map = ret_map['to_reassign']
         if to_reassign_map:
             # wait so that preassigned tasks can be brokered, before reassigning jobs
             origTmpLog.debug('wait {0}s before reassigning jobs'.format(
                 reassign_jobs_wait_time))
             time.sleep(reassign_jobs_wait_time)
             # reassign jobs of preassigned tasks
             self.reassign_jobs(to_reassign_map)
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         err_str = traceback.format_exc()
         origTmpLog.error('failed with {0} {1} ; {2}'.format(
             errtype, errvalue, err_str))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #4
0
 def doAction(self):
     """Do nothing apart from logging 'start' and 'done'.

     Returns:
         ``self.SC_SUCCEEDED``
     """
     actionLog = MsgWrapper(logger)
     # there is no real action: only the two bracketing messages are emitted
     for message in ('start', 'done'):
         actionLog.debug(message)
     return self.SC_SUCCEEDED
Пример #5
0
 def doAction(self):
     """Run the watchdog actions one after another.

     The actions are: priority boost, reassign, throttled, high priority
     pending (once per (minPriority, timeoutVal) pair) and setting scout job
     data without scouts. An exception in any action is logged with the
     traceback and stops the remaining actions of this call.

     Returns:
         ``self.SC_SUCCEEDED`` in all cases.
     """
     # the logger is created before the try block so that the except handler
     # and the final message can always rely on it
     tmpLog = MsgWrapper(logger)
     try:
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority, timeoutVal in [
             (950, 10),
             (900, 30),
         ]:
             self.doActionForHighPrioPending(tmpLog, minPriority,
                                             timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
     except Exception:
         # Exception rather than a bare except, so that SystemExit and
         # KeyboardInterrupt are not swallowed
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(
             errtype.__name__, errvalue, traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #6
0
 def runImpl(self):
     """Post-process tasks taken from ``self.taskList`` until it is empty.

     Tasks are fetched in chunks of 10. For each task an implementation is
     instantiated through ``self.implFactory`` and its ``doPostProcess`` and
     ``doFinalProcedure`` methods are called. When no implementation is
     available or ``doPostProcess`` raises, the task is set to ``broken`` and
     updated through the task buffer. The method returns once an empty chunk
     is received; an exception in the outer loop is logged and the loop
     carries on with the next chunk.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more tasks
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for taskSpec in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 # get impl
                 impl = self.implFactory.instantiateImpl(taskSpec.vo,taskSpec.prodSourceLabel,None,
                                                         self.taskBufferIF,self.ddmIF)
                 if impl is None:
                     # post processor is undefined
                     tmpLog.error('post-processor is undefined for vo={0} sourceLabel={1}'.format(taskSpec.vo,taskSpec.prodSourceLabel))
                     tmpStat = Interaction.SC_FATAL
                 # execute
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('post-process with {0}'.format(impl.__class__.__name__))
                     try:
                         impl.doPostProcess(taskSpec,tmpLog)
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FATAL
                 # done
                 if tmpStat == Interaction.SC_FATAL:
                     # task is broken
                     tmpErrStr = 'post-process failed'
                     tmpLog.error(tmpErrStr)
                     taskSpec.status = 'broken'
                     taskSpec.setErrDiag(tmpErrStr)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                 elif tmpStat == Interaction.SC_FAILED:
                     # NOTE: nothing in this method sets SC_FAILED at the moment
                     tmpErrStr = 'post processing failed'
                     taskSpec.setOnHold()
                     taskSpec.setErrDiag(tmpErrStr,True)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                     tmpLog.info('set task_status={0} since {1}'.format(taskSpec.status,taskSpec.errorDialog))
                     continue
                 # final procedure; skipped when there is no implementation
                 # since calling it on None could only fail
                 if impl is not None:
                     try:
                         impl.doFinalProcedure(taskSpec,tmpLog)
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doFinalProcedure failed with {0}:{1}'.format(errtype.__name__,errvalue))
                 # done
                 tmpLog.info('done')
         except Exception:
             # NOTE(review): this uses the module-level logger rather than
             # self.logger - confirm that this is intended
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Пример #7
0
 def doCheck(self,taskSpecList):
     """Check with PanDA where the given tasks have been assigned.

     Args:
         taskSpecList: task specs to check; ``jediTaskID`` is read from each
             of them, as well as ``vo`` and ``useWorldCloud()`` for assigned
             tasks.

     Returns:
         A ``(status, map)`` tuple. ``(self.SC_FAILED, {})`` when
         ``PandaClient.seeCloudTask`` reports a non-zero status. Otherwise
         ``(self.SC_SUCCEEDED, retMap)`` where ``retMap`` has an entry per
         assigned task: the name returned by PanDA, or for tasks using the
         WORLD cloud a dict with the nucleus and the destination of each
         output/log dataset.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # return for failure
     retTmpError = self.SC_FAILED,{}
     # get list of jediTaskIDs
     taskIdList = []
     taskSpecMap = {}
     for taskSpec in taskSpecList:
         taskIdList.append(taskSpec.jediTaskID)
         taskSpecMap[taskSpec.jediTaskID] = taskSpec
     # check with panda
     tmpLog.debug('check with panda')
     tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # make return map
     retMap = {}
     # items() rather than iteritems() so that this runs on Python 2 and 3
     for tmpTaskID,tmpCoreName in cloudsInPanda.items():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
         # tasks without an assignment are left out of the return map
         if tmpCoreName not in ['NULL','',None]:
             taskSpec = taskSpecMap[tmpTaskID]
             if taskSpec.useWorldCloud():
                 # get destinations for WORLD cloud
                 ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                 # get site
                 siteSpec = self.siteMapper.getSite(tmpCoreName)
                 # get nucleus
                 nucleus = siteSpec.pandasite
                 # get output/log datasets
                 # NOTE(review): the returned status is not checked - confirm
                 # that the dataset list is always usable here
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                 # get destinations
                 retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                 for datasetSpec in tmpDatasetSpecs:
                     # skip distributed datasets
                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                         continue
                     # get token
                     token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                     # use default endpoint
                     if token is None:
                         token = siteSpec.ddm
                     # add original token
                     if datasetSpec.storageToken not in ['',None]:
                         token += '/{0}'.format(datasetSpec.storageToken)
                     retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                           'token':'dst:{0}'.format(token),
                                                           'destination':tmpCoreName})
             else:
                 retMap[tmpTaskID] = tmpCoreName
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED,retMap
Пример #8
0
 def start(self):
     """Run the post-processing main loop; this method does not return.

     In every cycle, for each vo and prodSourceLabel, tasks are prepared to
     be finished, the tasks to be finished are fetched, and they are handed
     to ``PostProcessorThread`` workers through a shared locked list. The
     loop is paced to one cycle per 60 seconds. Exceptions raised in a cycle
     are logged and the loop goes on.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # prepare tasks to be finished
                     tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo,prodSourceLabel))
                     tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo,prodSourceLabel,
                                                                              jedi_config.postprocessor.nTasks,
                                                                              pid=self.pid)
                     if tmpRet is None:
                         # failed; tasks are still fetched below
                         tmpLog.error('failed to prepare tasks')
                     # get tasks to be finished
                     tmpLog.info('getting tasks to be finished')
                     tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo,prodSourceLabel,self.pid,
                                                                           jedi_config.postprocessor.nTasks)
                     if tmpList is None:
                         # failed
                         tmpLog.error('failed to get tasks to be finished')
                     else:
                         tmpLog.info('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # make workers
                         nWorker = jedi_config.postprocessor.nWorkers
                         for iWorker in range(nWorker):
                             thr = PostProcessorThread(taskList,threadPool,
                                                       self.taskBufferIF,
                                                       self.ddmIF,
                                                       self)
                             thr.start()
                         # join
                         threadPool.join()
             tmpLog.info('done')
         except Exception:
             # Exception rather than a bare except, so that SystemExit and
             # KeyboardInterrupt can still stop the loop
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
         # sleep if needed
         loopCycle = 60
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole duration; the seconds attribute
         # alone leaves out the days part of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
Пример #9
0
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   resourceType):
     """Decide on throttling from the ``throttled`` flag of the work queue.

     Args:
         vo, prodSourceLabel, cloudName: only used in the log message.
         workQueue: its ``queue_name`` is logged and ``throttled`` is tested.
         resourceType: not used.

     Returns:
         ``self.retThrottled`` when ``workQueue.throttled`` is true,
         ``self.retUnThrottled`` otherwise.
     """
     log = MsgWrapper(logger)
     startMsg = 'start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name)
     log.debug(startMsg)
     if workQueue.throttled:
         log.debug("  done : SKIP")
         return self.retThrottled
     # the work queue is not flagged as throttled
     log.debug("  done : unthrottled since throttled is False")
     return self.retUnThrottled
Пример #10
0
 def start(self):
     """Run the task refinement main loop; this method does not return.

     In every cycle, for each vo and prodSourceLabel, the tasks to refine
     are fetched and handed to ``TaskRefinerThread`` workers through a shared
     locked list. The loop is paced by ``jedi_config.taskrefine.loopCycle``
     followed by a random sleep. Exceptions raised in a cycle are logged with
     the traceback and the loop goes on.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to refine
                     tmpList = self.taskBufferIF.getTasksToRefine_JEDI(
                         vo, prodSourceLabel)
                     if tmpList is None:
                         # failed
                         tmpLog.error(
                             'failed to get the list of tasks to refine')
                     else:
                         tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # get work queue mapper
                         workQueueMapper = self.taskBufferIF.getWorkQueueMap(
                         )
                         # make workers
                         nWorker = jedi_config.taskrefine.nWorkers
                         for iWorker in range(nWorker):
                             thr = TaskRefinerThread(
                                 taskList, threadPool, self.taskBufferIF,
                                 self.ddmIF, self, workQueueMapper)
                             thr.start()
                         # join
                         threadPool.join()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
             tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
         # sleep if needed
         loopCycle = jedi_config.taskrefine.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole duration; the seconds attribute
         # alone leaves out the days part of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)
Пример #11
0
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   jobStat):
     """Decide on throttling from the ``queue_share`` of the work queue.

     Args:
         vo, prodSourceLabel, cloudName: only used in the log message.
         workQueue: its ``queue_name`` is logged and ``queue_share`` is tested.
         jobStat: not used.

     Returns:
         ``self.retUnThrottled`` when ``workQueue.queue_share`` is None,
         ``self.retThrottled`` otherwise.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name))
     # check if unthrottled; identity test since None is a singleton
     if workQueue.queue_share is None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
Пример #12
0
 def start(self):
     """Run the watchdog main loop; this method does not return.

     In every cycle, for each vo and prodSourceLabel, the implementation
     returned by ``self.getImpl`` (if any) gets its ``pre_action`` and
     ``doAction`` methods called and the outcome is logged. The cycle length
     is ``self.period`` when set, ``jedi_config.watchdog.loopCycle``
     otherwise, and is followed by a random sleep. Exceptions raised in a
     cycle are logged and the loop goes on.
     """
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # vo/prodSourceLabel specific action
                     impl = self.getImpl(vo,
                                         prodSourceLabel,
                                         subType=self.subStr)
                     if impl is not None:
                         plugin_name = impl.__class__.__name__
                         tmpLog.info(
                             'pre-action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         impl.pre_action(tmpLog, vo, prodSourceLabel,
                                         self.pid)
                         tmpLog.info(
                             'do action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         tmpStat = impl.doAction()
                         if tmpStat != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to run special action for vo={} label={} cls={}'
                                 .format(vo, prodSourceLabel, plugin_name))
                         else:
                             tmpLog.info(
                                 'done for vo={} label={} cls={}'.format(
                                     vo, prodSourceLabel, plugin_name))
             tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole duration; the seconds attribute
         # alone leaves out the days part of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)
Пример #13
0
 def start(self):
     """Run the task command main loop; this method does not return.

     In every cycle, for each vo and prodSourceLabel, the tasks with a
     command to execute are fetched and handed to ``TaskCommandoThread``
     workers through a shared locked list. The loop is paced by
     ``jedi_config.tcommando.loopCycle`` followed by a random sleep.
     Exceptions raised in a cycle are logged and the loop goes on.
     """
     # start base classes
     JediKnight.start(self)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to exec command
                     tmpList = self.taskBufferIF.getTasksToExecCommand_JEDI(
                         vo, prodSourceLabel)
                     if tmpList is None:
                         # failed
                         tmpLog.error(
                             'failed to get the task list for vo={0} label={1}'
                             .format(vo, prodSourceLabel))
                     else:
                         tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # make workers
                         # NOTE(review): the worker count is read from the
                         # taskrefine section while the loop cycle below comes
                         # from tcommando - confirm that this is intended
                         nWorker = jedi_config.taskrefine.nWorkers
                         for iWorker in range(nWorker):
                             thr = TaskCommandoThread(
                                 taskList, threadPool, self.taskBufferIF,
                                 self.ddmIF, self.pid)
                             thr.start()
                         # join
                         threadPool.join()
             tmpLog.debug('done')
         except Exception:
             # Exception rather than a bare except, so that SystemExit and
             # KeyboardInterrupt can still stop the loop
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.tcommando.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         # total_seconds() covers the whole duration; the seconds attribute
         # alone leaves out the days part of the timedelta
         sleepPeriod = loopCycle - timeDelta.total_seconds()
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep()
Пример #14
0
 def doForPreStaging(self):
     """Throttle tasks of users with too large prestaging and release the rest.

     After taking the process lock, the size of the dispatch datasets of each
     user is compared against the configured PRESTAGE_LIMIT. Tasks of users
     above the limit are throttled for 120 min unless they are throttled
     already; throttled tasks of all other users are released. Exceptions are
     logged with the traceback and are not propagated.
     """
     # the logger is created before the try block so that the except handler
     # can always rely on it
     tmpLog = MsgWrapper(logger,'doForPreStaging')
     try:
         # lock
         flagLocked = self.taskBufferIF.lockProcess_JEDI(self.vo,self.prodSourceLabel,
                                                         self.cronActions['forPrestage'],
                                                         0,self.pid,timeLimit=5)
         if not flagLocked:
             return
         tmpLog.debug('start')
         # get throttled users
         thrUserTasks = self.taskBufferIF.getThrottledUsersTasks_JEDI(self.vo,self.prodSourceLabel)
         # get dispatch datasets
         dispUserTasks = self.taskBufferIF.getDispatchDatasetsPerUser(self.vo,self.prodSourceLabel,True,True)
         # max size of prestaging requests in MB
         maxPrestaging = self.taskBufferIF.getConfigValue('anal_watchdog', 'PRESTAGE_LIMIT', 'jedi', 'atlas')
         if maxPrestaging is None:
             maxPrestaging = 1
         maxPrestaging *= 1024*1024
         # throttle interval
         thrInterval = 120
         # loop over all users; items() works on Python 2 and 3
         for userName,userDict in dispUserTasks.items():
             tmpLog.debug('{0} {1} GB'.format(userName, userDict['size']/1024))
             # too large
             if userDict['size'] > maxPrestaging:
                 tmpLog.debug('{0} has too large prestaging {1}>{2} GB'.format(userName,
                                                                               userDict['size']/1024,
                                                                               maxPrestaging/1024))
                 # throttle tasks which are not throttled yet
                 for taskID in userDict['tasks']:
                     if userName not in thrUserTasks or taskID not in thrUserTasks[userName]:
                         tmpLog.debug('thottle jediTaskID={0}'.format(taskID))
                         errDiag = 'throttled for {0} min due to too large prestaging from TAPE'.format(thrInterval)
                         self.taskBufferIF.throttleTask_JEDI(taskID,thrInterval,errDiag)
                 # remove the user from the list so that the tasks are not released below
                 if userName in thrUserTasks:
                     del thrUserTasks[userName]
         # release users
         for userName,taskIDs in thrUserTasks.items():
             tmpLog.debug('{0} release throttled tasks'.format(userName))
             # unthrottle tasks
             for taskID in taskIDs:
                 tmpLog.debug('unthottle jediTaskID={0}'.format(taskID))
                 self.taskBufferIF.releaseThrottledTask_JEDI(taskID)
         tmpLog.debug('done')
     except Exception:
         # Exception rather than a bare except, so that SystemExit and
         # KeyboardInterrupt are not swallowed
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype,errvalue,traceback.format_exc()))
Пример #15
0
 def doAction(self):
     """Run ``do_for_data_locality`` and log any failure.

     Exceptions are logged with the traceback and swallowed.

     Returns:
         ``self.SC_SUCCEEDED`` in all cases.
     """
     # the logger is created before the try block so that the except handler
     # and the final message can always rely on it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # make tasks pending under certain conditions
         self.do_for_data_locality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         err_str = traceback.format_exc()
         origTmpLog.error('failed with {0} {1} ; {2}'.format(
             errtype, errvalue, err_str))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #16
0
 def doAction(self):
     """Clean up and then update the data locality records.

     An exception escaping from one of the two steps is logged and swallowed;
     the second step is not run when the first one raises.

     Returns:
         ``self.SC_SUCCEEDED`` in all cases.
     """
     # the logger is created before the try block so that the except handler
     # and the final message can always rely on it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # clean up data locality
         self.doCleanDataLocality()
         # update data locality
         self.doUpdateDataLocality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #17
0
 def reassign_jobs(self, to_reassign_map):
     """Reassign jobs of the preassigned tasks listed in the given map.

     Args:
         to_reassign_map: maps a jediTaskID to a dict providing ``site`` and
             ``n_jobs_to_fill``.

     For each task, a third of ``n_jobs_to_fill`` (truncated to an integer)
     is passed to the task buffer as the number of jobs to close, and the
     outcome is logged.
     """
     log = MsgWrapper(logger, 'reassign_jobs')
     for task_id, task_info in to_reassign_map.items():
         target_site = task_info['site']
         fill_count = task_info['n_jobs_to_fill']
         # number of jobs to close, derived from the number of jobs to fill
         close_count = int(fill_count / 3)
         closed_count = self.taskBufferIF.reassignJobsInPreassignedTask_JEDI(
             task_id, target_site, close_count)
         if closed_count is not None:
             log.debug('jediTaskID={0} to {1} , closed {2} jobs'.format(
                 task_id, target_site, closed_count))
             continue
         # None is reported as a skipped task
         log.debug(
             'jediTaskID={0} no longer ready/running or not assigned to {1} , skipped'
             .format(task_id, target_site))
Пример #18
0
 def doAction(self):
     """Run the analysis watchdog steps one after another.

     The steps are: waiting jobs, prestaging throttle, priority massage and
     redo of stalled jobs. An exception escaping from a step is logged and
     stops the remaining steps of this call.

     Returns:
         ``self.SC_SUCCEEDED`` in all cases.
     """
     # the logger is created before the try block so that the except handler
     # and the final message can always rely on it
     origTmpLog = MsgWrapper(logger)
     try:
         origTmpLog.debug('start')
         # handle waiting jobs
         self.doForWaitingJobs()
         # throttle tasks if so many prestaging requests
         self.doForPreStaging()
         # priority massage
         self.doForPriorityMassage()
         # redo stalled analysis jobs
         self.doForRedoStalledJobs()
         # NOTE: throttling of WAN data access (doForThrottleWAN) is disabled
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #19
0
    def doAction(self):
        """Run the watchdog actions one after another.

        The actions are: priority boost, reassign, throttled, high priority
        pending (once per (minPriority, timeoutVal) pair), setting scout job
        data without scouts, throttling jobs in paused tasks, and the jumbo
        watchdog. An exception in any action is logged with the traceback and
        stops the remaining actions of this call.

        Returns:
            ``self.SC_SUCCEEDED`` in all cases.
        """
        # the logger is created before the try block so that the except
        # handler and the final message can always rely on it
        tmpLog = MsgWrapper(logger)
        try:
            tmpLog.debug('start')

            # action for priority boost
            self.doActionForPriorityBoost(tmpLog)

            # action for reassign
            self.doActionForReassign(tmpLog)

            # action for throttled
            self.doActionForThrottled(tmpLog)

            # action for high prio pending
            for minPriority, timeoutVal in [(950, 10),
                                            (900, 30),
                                            ]:
                self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)

            # action to set scout job data w/o scouts
            self.doActionToSetScoutJobData(tmpLog)

            # action to throttle jobs in paused tasks
            self.doActionToThrottleJobInPausedTasks(tmpLog)

            # action for jumbo
            jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
            jumbo.run()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue, traceback.format_exc()))
        # return
        tmpLog.debug('done')
        return self.SC_SUCCEEDED
Пример #20
0
 def doUpdateDataLocality(self):
     """Update data locality records with a pool of four worker threads.

     The process lock is requested first; when it is not obtained the
     method returns without doing anything. Otherwise the list of datasets
     is fetched and shared among ``DataLocalityUpdaterThread`` workers, and
     the method waits for the thread pool to finish. Exceptions are logged
     together with the traceback and are not propagated.
     """
     tmpLog = MsgWrapper(logger, ' #ATM #KV doUpdateDataLocality')
     tmpLog.debug('start')
     try:
         # only one process is supposed to do the update
         locked = self.taskBufferIF.lockProcess_JEDI(
             vo=self.vo,
             prodSourceLabel='default',
             cloud=None,
             workqueue_id=None,
             resource_name=None,
             component='AtlasDataLocalityUpdaterWatchDog.doUpdateDataLocality',
             pid=self.pid,
             timeLimit=240)
         if not locked:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # datasets to work on
         pending_datasets = self.get_datasets_list()
         tmpLog.debug('got {0} datasets to update'.format(len(pending_datasets)))
         # all workers share the same pool and dataset list
         pool = ThreadPool()
         worker_count = 4
         worker_args = dict(taskDsList=pending_datasets,
                            threadPool=pool,
                            taskbufferIF=self.taskBufferIF,
                            ddmIF=self.ddmIF,
                            pid=self.pid,
                            loggerObj=tmpLog)
         for _worker_no in range(worker_count):
             worker = DataLocalityUpdaterThread(**worker_args)
             worker.start()
         tmpLog.debug('started {0} updater workers'.format(worker_count))
         # wait for the workers
         pool.join()
         tmpLog.debug('done')
     except Exception as exc:
         tmpLog.error('failed with {0} {1} {2}'.format(type(exc), exc, traceback.format_exc()))
Пример #21
0
    def start(self):
        """Run the TaskBroker main loop; the loop has no exit condition.

        Every cycle walks over all (vo, prodSourceLabel, work queue, resource
        type) combinations and runs two passes with a pool of worker threads:

        1. TaskCheckerThread workers over the tasks returned by
           getTasksToCheckAssignment_JEDI, and
        2. TaskBrokerThread workers over the tasks returned by
           getTasksToAssign_JEDI.

        An exception raised inside a cycle is logged and the loop continues
        with the next cycle. After each cycle the method sleeps for the rest
        of jedi_config.taskbroker.loopCycle and then calls randomSleep().
        """
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)

        def runWorkers(tmpList, threadClass, *extraArgs):
            # feed tmpList to nWorkers threads of threadClass through a locked
            # list and block until the whole pool has finished
            taskList = ListWithLock(tmpList)
            threadPool = ThreadPool()
            for _ in range(jedi_config.taskbroker.nWorkers):
                thr = threadClass(taskList, threadPool,
                                  self.taskBufferIF, self.ddmIF,
                                  self, *extraArgs)
                thr.start()
            threadPool.join()

        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.debug('start TaskBroker')
                # get work queue mapper
                workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                resource_types = self.taskBufferIF.load_resource_types()

                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # loop over all work queues
                        for workQueue in workQueueMapper.getAlignedQueueList(vo, prodSourceLabel):
                            for resource_type in resource_types:
                                # blanks in the queue name are replaced to keep the label a single token
                                wq_name = '_'.join(workQueue.queue_name.split(' '))
                                resource_name = resource_type.resource_name
                                msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.\
                                    format(vo, prodSourceLabel, wq_name, resource_name)
                                tmpLog.debug(msgLabel + 'start')
                                # get the list of tasks to check
                                tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                                    vo, prodSourceLabel, workQueue, resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(msgLabel + 'failed to get the list of tasks to check')
                                else:
                                    tmpLog.debug(msgLabel + 'got tasks_to_check={0}'.format(len(tmpList)))
                                    runWorkers(tmpList, TaskCheckerThread, vo, prodSourceLabel)
                                # get the list of tasks to assign
                                tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                                    vo, prodSourceLabel, workQueue, resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(msgLabel + 'failed to get the list of tasks to assign')
                                else:
                                    tmpLog.debug(msgLabel + 'got tasks_to_assign={0}'.format(len(tmpList)))
                                    runWorkers(tmpList, TaskBrokerThread, vo, prodSourceLabel,
                                               workQueue, resource_name)
                                tmpLog.debug(msgLabel + 'done')
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.debug('done')
            # sleep if needed
            loopCycle = jedi_config.taskbroker.loopCycle
            # total_seconds() covers the whole elapsed time; the .seconds
            # attribute alone ignores the days component of the timedelta
            elapsed = (datetime.datetime.utcnow() - startTime).total_seconds()
            sleepPeriod = loopCycle - elapsed
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep(max_val=loopCycle)
Пример #22
0
 def runImpl(self):
     """Worker loop of the broker thread.

     Each pass asks self.taskList for 100 items, collects the input chunks
     of every item with getTasksToBeProcessed_JEDI and passes them to
     doBrokerage() of the implementation obtained from self.implFactory for
     self.vo / self.prodSourceLabel. A failure in any step only fails the
     current batch. The method returns once the list yields no more items;
     unexpected exceptions are logged and the loop goes on.
     """
     while True:
         try:
             # take the next batch from the shared list
             batchSize = 100
             taskBatch = self.taskList.get(batchSize)
             nTotal, nTaken = self.taskList.stat()
             # stop when the list is exhausted
             if len(taskBatch) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # logger for this batch
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(nTaken, nTotal, taskBatch))
             status = Interaction.SC_SUCCEEDED
             # gather the input chunks; the first failure aborts the batch
             chunksToAssign = []
             for taskItem in taskBatch:
                 chunks = self.taskBufferIF.getTasksToBeProcessed_JEDI(None, None, None, None, None,
                                                                       simTasks=[taskItem],
                                                                       readMinFiles=True)
                 if chunks is None:
                     tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(taskItem))
                     status = Interaction.SC_FAILED
                     break
                 chunksToAssign.extend(chunks)
             # look up the brokerage implementation
             if status == Interaction.SC_SUCCEEDED:
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                     if impl is None:
                         # no task broker is configured for this vo/label
                         tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                             self.vo, self.prodSourceLabel))
                         status = Interaction.SC_FAILED
                 except Exception as exc:
                     tmpLog.error('getImpl failed with {0}:{1}'.format(type(exc).__name__, exc))
                     status = Interaction.SC_FAILED
             # run the brokerage
             if status == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(impl.__class__.__name__,
                                                                       len(chunksToAssign)))
                 try:
                     status = impl.doBrokerage(chunksToAssign, self.vo, self.prodSourceLabel,
                                               self.workQueue, self.resource_name)
                 except Exception as exc:
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(type(exc).__name__, exc))
                     status = Interaction.SC_FAILED
             # report the outcome of the batch
             if status != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')
         except Exception as exc:
             # keep the worker alive; only log what went wrong
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                        type(exc).__name__, exc))
Пример #23
0
 def runImpl(self):
     """Worker loop of the checker thread.

     Each pass asks self.taskList for 100 task IDs, loads their task specs
     with getTaskWithID_JEDI and passes the ones that could be loaded to
     doCheck() of the implementation obtained from self.implFactory. On
     success the map returned by doCheck() is stored with
     setCloudToTasks_JEDI. The method returns once the list yields no more
     items; unexpected exceptions are logged and the loop goes on.
     """
     while True:
         try:
             # take the next batch from the shared list
             batchSize = 100
             taskBatch = self.taskList.get(batchSize)
             nTotal, nTaken = self.taskList.stat()
             # stop when the list is exhausted
             if len(taskBatch) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # logger for this batch
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskCheckerThread {0}/{1} for jediTaskID={2}'.format(nTaken, nTotal, taskBatch))
             status = Interaction.SC_SUCCEEDED
             # load the task specs; tasks that cannot be loaded are only reported
             loadedSpecs = []
             for jediTaskID in taskBatch:
                 gotSpec, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False)
                 if not gotSpec or taskSpec is None:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                     continue
                 loadedSpecs.append(taskSpec)
             # nothing to check in this batch
             if not loadedSpecs:
                 continue
             # look up the brokerage implementation
             if status == Interaction.SC_SUCCEEDED:
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                     if impl is None:
                         # no task broker is configured for this vo/label
                         tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                             self.vo, self.prodSourceLabel))
                         status = Interaction.SC_FAILED
                 except Exception as exc:
                     tmpLog.error('getImpl failed with {0}:{1}'.format(type(exc).__name__, exc))
                     status = Interaction.SC_FAILED
             # run the check
             if status == Interaction.SC_SUCCEEDED:
                 tmpLog.info('check with {0}'.format(impl.__class__.__name__))
                 try:
                     status, taskCloudMap = impl.doCheck(loadedSpecs)
                 except Exception as exc:
                     tmpLog.error('doCheck failed with {0}:{1}'.format(type(exc).__name__, exc))
                     status = Interaction.SC_FAILED
             # store the result of a successful check
             if status != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed to check assignment')
             else:
                 updateResult = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
                 tmpLog.info('done with {0} for {1}'.format(updateResult, str(taskCloudMap)))
         except Exception as exc:
             # keep the worker alive; only log what went wrong
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                        type(exc).__name__, exc))
Пример #24
0
    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        """Register the datasets/containers of a task in DDM and mark them as registered.

        The method works in three stages:

        1. For every datasetID in datasetToRegister the dataset and its container
           are created in DDM when listDatasets() does not find them, optionally
           followed by location rules, and the dataset is added to its container.
           The dataset status is then set to 'registered' in the DB.
        2. When taskSpec.registerEsFiles() is true, the ES dataset of the task is
           registered together with a 'type=DATADISK' rule.
        3. For 'managed' and 'test' tasks the destination datasets of all
           output/log files of pandaJobs are opened and their lifetime is unset.

        :param taskSpec: task specification; read for jediTaskID, prodSourceLabel,
                         taskType, vo, cloud, site, campaign, workingGroup, userName
        :param datasetToRegister: list of datasetIDs to register. For 'user' tasks
                                  the list is extended in place with the datasetIDs
                                  of output/log files found in pandaJobs
        :param pandaJobs: job specs whose Files are scanned for output/log files
        :return: self.SC_SUCCEEDED on success. self.SC_FATAL when a DB or DDM call
                 reports a failure or when any exception is raised; in the latter
                 case the task error diag is set from tmpLog.uploadLog()
        """
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED  # defined but never returned by this method
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            # user tasks always enter this branch, even with an empty datasetToRegister
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    # NOTE: this appends to the caller's datasetToRegister list
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                # avDatasetList: names registered by this call, so they are not checked in DDM again
                # cnDatasetMap: container name -> list of constituent datasets (filled lazily)
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
                            # names starting with 'panda' get a lifetime in days (365 for trn_log of
                            # managed tasks, 14 otherwise); other names get no lifetime
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            # an empty list means the name is not yet known to DDM
                            if tmpList == []:
                                # get location
                                # location: passed to registerNewDataset
                                # locForRule: passed to registerDatasetLocation; falls back to location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        # no site: derive the location from the storage token or the cloud
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                # only datasets (not containers) of managed/test tasks carry metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                # (also entered for any label when the storage token has a distributed destination)
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        # user dataset with a site: the rule is owned by the working group
                                        # when one is set, otherwise by the user
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        # distributed destination: no owner, grouping 'NONE'
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        # only for output datasets of user tasks whose destination equals the site
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

                                                # the site can opt out with 'skip_2nd_copy' in its catchall
                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            # NOTE(review): this formats the object returned by getSite(),
                                                            # not a site name -- confirm the log output is as intended
                                                            tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site))
                                                            second_copy = False
                                                except Exception:
                                                    # NOTE(review): any failure of the site lookup is silently
                                                    # ignored and the second copy is made -- presumably deliberate
                                                    second_copy = True

                                                if second_copy:
                                                    # presumably a DDM location expression selecting SCRATCHDISK
                                                    # endpoints -- TODO confirm the exact semantics
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        # NOTE(review): 'copylocation' in the message below lacks a blank
                                                        tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble,
                                                                                                                               targetName))
                                                        return retFatal
                                # remember the name; only names registered here are cached, so names
                                # that already existed in DDM are looked up again on the next occurrence
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            # keep the cached container content in sync
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                # the dataset itself is registered without a location; the rule below places it
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                           targetName,
                                                                                           activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                # the return values of openDataset/setDatasetMetadata are not checked
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            # any unexpected error is fatal for the task; the uploaded log is attached as error diag
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
Пример #25
0
    def doActionForReassign(self, gTmpLog):
        """Process the tasks returned by getTasksToReassign_JEDI.

        For each task the cloud (non-WORLD) or nucleus (WORLD cloud)
        assignment is recorded, a subscription is registered for each of the
        task's output/log datasets at the T1/nucleus site, and the task is
        put back to its previous status ('ready' when the previous status was
        'assigning', 'exhausted' or None). A WORLD-cloud task without a
        nucleus is instead set to 'assigning' to trigger a new brokerage.
        When a step fails, the failure is logged and the remaining steps for
        that task are skipped.

        :param gTmpLog: logger that receives the per-cycle summary message
        """
        ddm_client = self.ddmIF.getInterface(self.vo)
        site_mapper = self.taskBufferIF.getSiteMapper()
        tasks = self.taskBufferIF.getTasksToReassign_JEDI(self.vo, self.prodSourceLabel)
        gTmpLog.debug('got {0} tasks to reassign'.format(len(tasks)))

        for task in tasks:
            log = MsgWrapper(logger, '< jediTaskID={0} >'.format(task.jediTaskID))
            log.debug('start to reassign')
            backend = task.getDdmBackEnd()
            # output and log datasets of the task
            fetched, datasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(task.jediTaskID,
                                                                                  ['output', 'log'])
            if fetched is not True:
                log.error('failed to get datasets')
                continue

            # record the new assignment in the DB
            if task.useWorldCloud():
                if task.nucleus in [None, '']:
                    # no nucleus yet: hand the task back to task brokerage
                    task.status = 'assigning'
                    task.oldStatus = None
                    task.setToRegisterDatasets()
                    self.taskBufferIF.updateTask_JEDI(task, {'jediTaskID': task.jediTaskID},
                                                      setOldModTime=True)
                    log.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.
                              format(task.status))
                    continue

                nucleus = site_mapper.getNucleus(task.nucleus)
                if nucleus is None:
                    log.error("nucleus={0} doesn't exist".format(task.nucleus))
                    continue

                # store the nucleus for the task (the return value is not inspected)
                nucleus_map = {task.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleus, datasets)}
                self.taskBufferIF.setCloudToTasks_JEDI(nucleus_map)
            else:
                # update cloudtasks first, then make sure the cloud is known
                cloud_status = self.taskBufferIF.setCloudTaskByUser('jedi', task.jediTaskID,
                                                                    task.cloud, 'assigned', True)
                if cloud_status != 'SUCCEEDED':
                    log.error('failed to update CloudTasks')
                    continue
                if not site_mapper.checkCloud(task.cloud):
                    log.error("cloud={0} doesn't exist".format(task.cloud))
                    continue

            # PanDA site of the nucleus (WORLD cloud) or of the cloud's T1
            if task.useWorldCloud():
                dest_site_name = nucleus.getOnePandaSite()
            else:
                dest_site_name = site_mapper.getCloud(task.cloud)['dest']
            dest_site = site_mapper.getSite(dest_site_name)

            # subscribe every non-distributed dataset to the destination;
            # the else clause runs only when no subscription failed
            for dataset in datasets:
                log.debug('dataset={0}'.format(dataset.datasetName))
                if DataServiceUtils.getDistributedDestination(dataset.storageToken) is not None:
                    log.debug('skip {0} is distributed'.format(dataset.datasetName))
                    continue
                endpoint = site_mapper.getDdmEndpoint(dest_site.sitename, dataset.storageToken,
                                                      task.prodSourceLabel,
                                                      JobUtils.translate_tasktype_to_jobtype(task.taskType))
                try:
                    log.debug('registering subscription to {0} with backend={1}'.format(endpoint,
                                                                                        backend))
                    subscribed = ddm_client.registerDatasetSubscription(dataset.datasetName, endpoint,
                                                                        'Production Output',
                                                                        asynchronous=True)
                    if subscribed is not True:
                        log.error("failed to make subscription")
                        break
                except Exception as exc:
                    log.warning('failed to make subscription with {0}:{1}'.format(type(exc).__name__, exc))
                    break
            else:
                # all subscriptions were made: activate the task
                if task.oldStatus in ['assigning', 'exhausted', None]:
                    task.status = 'ready'
                else:
                    task.status = task.oldStatus
                task.oldStatus = None
                self.taskBufferIF.updateTask_JEDI(task, {'jediTaskID': task.jediTaskID},
                                                  setOldModTime=True)
                log.debug('finished to reassign')
Пример #26
0
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
     """Build one JobSpec per task and pass them to the task assigner.

     For every (taskSpec, cloudName, inputChunk) entry a JobSpec is filled
     with the task's priorities, processing type, working group, disk
     requirement and input files. RW values obtained from the task buffer
     are packed, together with the priority and processing-type maps, into
     each job's metadata string. The jobs are then given to
     PandaClient.runTaskAssignment in bunches of at most 100.

     :param inputList: iterable of (jediTaskID, list of
                       (taskSpec, cloudName, inputChunk)) pairs
     :param vo: passed to calculateRWwithPrio_JEDI
     :param prodSourceLabel: passed to calculateRWwithPrio_JEDI
     :param workQueue: work queue; its queue_name is logged and the object
                       is passed to calculateRWwithPrio_JEDI
     :return: self.SC_FAILED when an RW calculation returns None,
              otherwise self.SC_SUCCEEDED
     """
     # number of tasks given to the task assigner per call
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retTmpError = self.SC_FAILED
     tmpLog.debug('vo={0} label={1} queue={2}'.format(
         vo, prodSourceLabel, workQueue.queue_name))
     # loop over all tasks
     allRwMap = {}
     prioMap = {}
     tt2Map = {}
     expRWs = {}
     jobSpecList = []
     for _jediTaskID, tmpInputList in inputList:
         for taskSpec, _cloudName, inputChunk in tmpInputList:
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = 'managed'
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
             # NOTE(review): '/' is true division under Python 3, so this
             # may become a float there - confirm what consumers expect
             jobSpec.maxDiskCount = (
                 taskSpec.getOutDiskSize() +
                 taskSpec.getWorkDiskSize()) / 1024 / 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(
                         datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # no master dataset: fall back to the name of the last dataset
             if not setProdDBlock and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority, once per distinct priority
             # ('in' replaces dict.has_key, which Python 3 removed)
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue,
                     jobSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error(
                         'failed to calculate RW with prio={0}'.format(
                             jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(
                 jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error(
                     'failed to calculate RW for jediTaskID={0}'.format(
                         jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # get fullRWs
     fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
         vo, prodSourceLabel, None, None)
     if fullRWs is None:
         tmpLog.error('failed to calculate full RW')
         return retTmpError
     # set metadata; the map strings are identical for every job, so they
     # are rendered once instead of once per job
     strExpRWs = str(expRWs)
     strPrioMap = str(prioMap)
     strFullRWs = str(fullRWs)
     strTt2Map = str(tt2Map)
     for jobSpec in jobSpecList:
         rwValues = allRwMap[jobSpec.currentPriority]
         jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
             jobSpec.metadata, str(rwValues), strExpRWs, strPrioMap,
             strFullRWs, strTt2Map)
     tmpLog.debug('run task assigner for {0} tasks'.format(
         len(jobSpecList)))
     for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
         # get a bunch
         jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
         strIDs = 'jediTaskID=' + ','.join(
             '{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
         tmpLog.debug(strIDs)
         # run task brokerage; the outcome is only logged
         stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
         tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Пример #27
0
 def findMissingFiles(self, jediTaskID, cloudName):
     """Flag input files of a task that are not available in a cloud.

     For each master input dataset of the task, the files reported by
     ddmIF.getAvailableFiles for the cloud's source site and the sites
     listed under 't2' for that cloud are compared by LFN with the
     dataset's files. The fileIDs of the files that are not found are
     passed to setMissingFiles_JEDI.

     :param jediTaskID: identifier of the task whose input is checked
     :param cloudName: name of the cloud in which the data must be available
     :return: self.SC_FAILED when a lookup or the DB update fails or the
              dataset is unavailable in the cloud, otherwise
              self.SC_SUCCEEDED
     """
     tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
     tmpLog.debug('start findMissingFiles')
     # return for failure
     retError = self.SC_FAILED
     # get datasets
     tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
         jediTaskID, ['input'], True)
     if not tmpSt:
         tmpLog.error('failed to get the list of datasets')
         return retError
     # loop over all datasets
     for datasetSpec in datasetSpecList:
         # check only master dataset
         if not datasetSpec.isMaster():
             continue
         tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
         # get ddmIF
         ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
         if ddmIF is None:
             tmpLog.error('failed to get DDM I/F for vo={0}'.format(
                 datasetSpec.vo))
             return retError
         # get the list of sites where data is available
         tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(
             self.siteMapper, ddmIF, datasetSpec.datasetName)
         if tmpSt != self.SC_SUCCEEDED:
             tmpLog.error(
                 'failed to get the list of sites where {0} is available, since {1}'
                 .format(datasetSpec.datasetName, tmpRet))
             return retError
         dataSiteMap = tmpRet
         # data is unavailable in cloud
         # ('in' replaces dict.has_key, which Python 3 removed)
         if cloudName not in dataSiteMap:
             tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(
                 datasetSpec.datasetName, cloudName, str(dataSiteMap)))
             return retError
         # mapping between sites and storage endpoints
         checkedSites = [self.siteMapper.getCloud(cloudName)['source']
                         ] + dataSiteMap[cloudName]['t2']
         siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
             checkedSites, self.siteMapper)
         # get available files per site/endpoint
         tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                                siteStorageEP,
                                                self.siteMapper,
                                                ngGroup=[1],
                                                checkLFC=True)
         if tmpAvFileMap is None:
             tmpLog.error(
                 'failed to get available file list for {0}'.format(
                     datasetSpec.datasetName))
             return retError
         # collect the LFNs available at any site/storage type once, so
         # each file needs a single lookup instead of a full rescan
         # (assumes LFNs are hashable, e.g. strings - confirm)
         availableLFNs = set()
         for availableFilesMap in tmpAvFileMap.values():
             for availableFiles in availableFilesMap.values():
                 for availableFile in availableFiles:
                     availableLFNs.add(availableFile.lfn)
         # check availability
         missingFiles = []
         for fileSpec in datasetSpec.Files:
             if fileSpec.lfn not in availableLFNs:
                 missingFiles.append(fileSpec.fileID)
                 tmpLog.debug('{0} missing'.format(fileSpec.lfn))
         # update contents
         if missingFiles:
             tmpSt = self.taskBufferIF.setMissingFiles_JEDI(
                 jediTaskID, datasetSpec.datasetID, missingFiles)
             if not tmpSt:
                 tmpLog.error('failed to set missing files in {0}'.format(
                     datasetSpec.datasetName))
                 return retError
     tmpLog.debug('done findMissingFiles')
     return self.SC_SUCCEEDED
Пример #28
0
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(
                             taskSpec.jediTaskID,
                             datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get primary site candidates
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(
         None, taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName, tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites
     for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff', 'test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug(
                 '  skip site=%s due to status=%s criteria=-status' %
                 (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(
             len(scanSiteList), useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search(
                 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList,
                 cmtConfig=taskSpec.architecture,
                 onlyCmtConfig=True)
         else:
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
             transHome = re.sub('_', '-', transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     caches=transHome,
                     cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     releases=transUses,
                     cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList, releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
             len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
             taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug(
                     '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                     .format(tmpSiteName, site_maxmemory, minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug(
                     '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                     .format(tmpSiteName, site_minmemory, minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize = inputChunk.getMaxAtomSize()
     tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug(
         'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
         .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                 tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
         minDiskCountS, minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                     .format(tmpSiteName, tmpSiteSpec.maxwdir,
                             minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(
             tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug(
                     '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                     .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug(
                     '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                     .format(tmpSiteName, tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug(
                 '  skip site=%s due to no pilot criteria=-nopilot' %
                 tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
             tmpLog.debug(
                 '  skip site={0} excluded criteria=-excluded'.format(
                     tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(
                 tmpSiteName, includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug(
                     '  skip site={0} not included criteria=-notincluded'.
                     format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug(
                     '  skip site={0} limited access criteria=-limitedaccess'
                     .format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
             tmpLog.debug(
                 '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                 format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug(
                 '  skip site={0} not included criteria=-notincluded'.
                 format(tmpSiteName))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug(
                     'getting the list of sites where {0} is available'.
                     format(datasetName))
                 tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                     scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                 if tmpSt in [
                         Interaction.JEDITemporaryError,
                         Interaction.JEDITimeoutError
                 ]:
                     tmpLog.error(
                         'temporary failed to get the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error(
                         'fatal error when getting the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(
                         len(tmpRet), str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(
                                 datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(
                                     datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error(
                     '{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (
                     sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                     tmpDiskSiteList,
                     self.taskBufferIF,
                     self.siteMapper,
                     nSites=50,
                     protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems(
             ):
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0, None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(
                         tmpWeightSrcMap['weight']) / wRemote
                 else:
                     dataWeight[tmpSiteName] += float(
                         tmpWeightSrcMap['weight']) / wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][
                     datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(
                 ):
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(
                 datasetName, len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                 datasetName, len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug(
                 'data is unavailable locally or remotely at preassigned site {0}'
                 .format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(
             taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
         taskSpec.jediTaskID, timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'starting',
                                                 None, None)
         nFailed = 0
         nClosed = 0
         nFinished = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed + nClosed > 2 * nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(
                 jobStatPrioMap, tmpSiteName, 'throttled', None, None)
             weight /= float(nThrottled + 1)
         # noramize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
             tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
             nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
             candidateSpecList, problematicSites, sitesUsedByTask,
             preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(
                         tmpSiteName
                 ) and remoteSourceList[tmpSiteName].has_key(
                         datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][
                             datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                 fileScanSiteList, self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 ngGroup=[2],
                 checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' %
                          (errtype.__name__, errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug(
                 '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                 .format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []:
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName, availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[
                     tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][
                         tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[
                             tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles += availableFiles[
                         tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[
                         tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(
                         availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                     tmpMsg = '{0} is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(
                         tmpMsg.format(
                             tmpDatasetName,
                             tmpSiteName,
                             len(tmpDatasetSpec.Files),
                             len(availableFiles[tmpSiteName]['localdisk']),
                             len(availableFiles[tmpSiteName]['cache']),
                             len(availableFiles[tmpSiteName]['localtape']),
                         ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug(
                 '  skip site={0} file unavailable criteria=-fileunavailable'
                 .format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug(
             '  use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'
             .format(
                 siteCandidateSpec.siteName,
                 siteCandidateSpec.weight,
                 len(siteCandidateSpec.localDiskFiles),
                 len(siteCandidateSpec.localTapeFiles),
                 len(siteCandidateSpec.cacheFiles),
                 len(siteCandidateSpec.remoteFiles),
             ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
Пример #29
0
 def runImpl(self):
     """Execute queued task commands until the shared task list is empty.

     Repeatedly takes up to 10 ``(jediTaskID, commandMap)`` items from
     ``self.taskList`` and handles each one according to
     ``commandMap['command']``:

     * ``kill`` / ``finish`` / ``reassign``: look up the PandaIDs of the
       task.  If there are none, the task record is updated (final status
       and, for ``reassign``, the new cloud/site taken from the comment).
       Otherwise the jobs are killed, or left alone when the comment
       contains ``soft finish``.  The lookup is done at most twice so the
       task can be updated right away when the kill took effect at once.
     * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
       are first patched with the JSON-encoded changes carried in the
       comment, then the task is retried.
     * anything else is logged as an unknown command.

     Returns None once ``self.taskList.get`` yields no more items.  An
     exception escaping a batch is logged with its traceback and the loop
     proceeds with the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID, commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 if commandStr in ['kill', 'finish', 'reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                         if pandaIDs is None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             # nothing can be done for this task without the job list,
                             # so stop instead of querying a second time
                             break
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpMsg = 'completed cleaning jobs'
                             tmpLog.sendMsg(tmpMsg, self.msgType)
                             tmpLog.info(tmpMsg)
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr == 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 # extract cloud or site
                                 if commentStr is not None:
                                     # NOTE(review): the comment is indexed as
                                     # '<cloud|site>:<name>:<y|n>'; a shorter comment raises
                                     # IndexError here -- confirm the format with the caller
                                     tmpItems = commentStr.split(':')
                                     if tmpItems[0] == 'cloud':
                                         tmpTaskSpec.cloud = tmpItems[1]
                                     else:
                                         tmpTaskSpec.site = tmpItems[1]
                                     tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                     tmpLog.sendMsg(tmpMsg, self.msgType)
                                     tmpLog.info(tmpMsg)
                                     # back to oldStatus if necessary
                                     if tmpItems[2] == 'y':
                                         tmpTaskSpec.status = oldStatus
                                         tmpTaskSpec.forceUpdate('oldStatus')
                                         updateTaskStatus = False
                                 tmpTaskSpec.forceUpdate('errorDialog')
                             else:
                                 # reset oldStatus
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                             tmpLog.sendMsg(tmpMsg, self.msgType)
                             tmpLog.info(tmpMsg)
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                             tmpLog.info('done with {0}'.format(str(tmpRet)))
                             break
                         # kill only in the first loop
                         if iLoop > 0:
                             break
                         # wait or kill jobs; the comment may be absent, in which
                         # case it cannot be a soft finish
                         if commentStr is not None and 'soft finish' in commentStr:
                             tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                             tmpLog.info(tmpMsg)
                             tmpRet = True
                             tmpLog.info('done with {0}'.format(str(tmpRet)))
                             break
                         tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         if commandStr in ['reassign', 'finish']:
                             # force kill
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                         else:
                             # normal kill
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry', 'incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles', 'fixedSandbox']:
                                 taskParamMap.pop(newKey, None)
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey, newVal in newParamMap.items():
                                 if newVal is None:
                                     # delete
                                     taskParamMap.pop(newKey, None)
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 fixedSandbox = taskParamMap['fixedSandbox']
                                 sandboxOpt = '-a {0}'.format(fixedSandbox)
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and \
                                             re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                         tmpParam['value'] = sandboxOpt
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = fixedSandbox
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+', sandboxOpt, taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except Exception:
                             errtype, errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                             continue
                     # retry failed files
                     tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception:
             # keep the worker alive: report the failure and go on with the next batch
             errtype, errvalue = sys.exc_info()[:2]
             errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
Пример #30
0
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        """Decide whether job generation for a work queue must be throttled.

        The decision is based on the central configuration values
        (queue limit, queue cap, running cap), on the job statistics returned
        by ``__prepareJobStats`` and on the priorities of waiting tasks
        compared to the jobs already known to the task buffer.

        Work queues listed in the module-level ``non_rt_wqs`` are evaluated
        with the ``*_gs`` counters, all other queues with the ``*_rt``
        counters and ``resource_name``.

        Side effects: calls ``self.setMaxNumJobs`` and, depending on the
        outcome, ``self.setMinPriority`` and ``self.notEnoughJobsQueued``.

        :param vo: virtual organization, used in log headers and DB queries
        :param prodSourceLabel: production source label, used in log headers
                                and for the total walltime query
        :param cloudName: cloud name passed to the task buffer queries
        :param workQueue: work queue object; ``getID()``, ``queue_name`` and
                          ``throttled`` are read
        :param resource_name: resource type name
        :return: ``self.retUnThrottled`` when submission may proceed,
                 ``self.retTmpError`` when the priority of waiting tasks
                 cannot be obtained, ``self.retMergeUnThr`` when one of the
                 SKIP conditions applies
        """
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        # spaces in the queue name are replaced to keep the log header compact
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        def skip(msgBody):
            # log the reason locally, forward it via sendMsg and give back
            # the return code shared by all SKIP outcomes
            fullMsg = "{0} {1}".format(msgHeader, msgBody)
            tmpLog.warning(fullMsg)
            tmpLog.sendMsg(fullMsg,
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        # not used below; the lookup is kept so a missing key still fails here
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # queues in non_rt_wqs are judged on the *_gs counters and are
        # queried without the resource type
        isNonRT = workQueue.queue_name in non_rt_wqs
        rtArgs = () if isNonRT else (resource_name,)

        # check if higher prio tasks are waiting
        # find highest priority of currently defined jobs
        # NOTE(review): tmpStat is never checked - confirm whether a failed
        # query can reach the dictionary lookups below
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
            'managed', cloudName, workQueue, *rtArgs)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
            vo, workQueue, 'managed', cloudName, *rtArgs)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting: either strictly higher priority
        # than anything queued, or same priority with less than one minimal
        # bunch of not-yet-running jobs
        highPrioQueued = bool(
            highestPrioWaiting > highestPrioInPandaDB
            or (highestPrioWaiting == highestPrioInPandaDB
                and nNotRunHighestPrio < nJobsInBunchMin))
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))

        # set maximum number of jobs to be submitted
        if isNonRT:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # queued = not running + defined, counted for the queue limit
        nQueued_queuelimit = nNotRun_queuelimit + nDefine_queuelimit

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > nQueued_queuelimit:
                tmpRemainingSlot = nQueueLimit - nQueued_queuelimit
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        # floor division keeps the job count integral on Python 3 as well
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch // nParallel)
        else:
            self.setMaxNumJobs(configQueueCap // nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        nQueued_gs = nNotRun_gs + nDefine_gs
        nQueued_rt = nNotRun_rt + nDefine_rt

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nQueued_gs, nDefine_gs, nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nQueued_rt, nDefine_rt, nRunning_rt,
                    totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        # Only the first matching condition is acted upon. When high priority
        # tasks are waiting the SKIP is suppressed and submission is instead
        # restricted by priority (limitPriority).
        limitPriority = False
        if not isNonRT \
                and nRunning_rt == 0 and nQueued_queuelimit > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return skip(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRT \
                and nRunning_gs == 0 and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return skip(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif not isNonRT and nRunning_rt != 0 \
                and float(nQueued_rt) / float(nRunning_rt) > threshold \
                and nQueued_queuelimit > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return skip(
                    "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                        nQueued_rt, nRunning_rt, threshold,
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRT and nRunning_gs != 0 \
                and float(nQueued_gs) / float(nRunning_gs) > threshold \
                and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return skip(
                    "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                        nQueued_gs, nRunning_gs, threshold,
                        nQueued_queuelimit, nQueueLimit))

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                return skip(
                    "SKIP too many nDefined_queuelimit({0})>{1}".format(
                        nDefine_queuelimit, nQueueLimit))

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                return skip(
                    "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                        nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                        nWaitingBunchLimit))

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running: applied even when high priority tasks wait
            return skip(
                "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                    nRunning_runningcap, configRunningCap))

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                return skip(
                    "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                        nNotRun_queuecap + nDefine_queuecap, configQueueCap))

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            # only tasks at least as important as the waiting ones may pass
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        elif (nQueued_queuelimit < nQueueLimit * 0.9) \
                or (isNonRT and nQueued_gs < nRunning_gs) \
                or (not isNonRT and nQueued_rt < nRunning_rt):
            # not enough jobs are queued
            tmpLog.debug(msgHeader + " not enough jobs queued")
            if not isNonRT:
                self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit // 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled