def main(tbuf=None, **kwargs):
    # logger
    tmpLog = LogWrapper(_logger)

    tmpLog.debug("================= start ==================")
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config,'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas','atlas:/atlas/Role=production','atlas:/atlas/Role=pilot']
    # get users
    sql = 'SELECT DISTINCT DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
    tmpStat,tmpRes = taskBuffer.querySQLS(sql,varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = CoreUtils.get_bare_dn(realDN, keep_digits=False)
        name = taskBuffer.cleanUserID(realDN)
        # check proxy
        tmpLog.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role, name=name)
    tmpLog.debug("done")
Example #2
def run(inFile,v_onlyTA,v_firstSubmission):
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    try:
        # read Jobs from file
        f = open(inFile, 'rb')
        jobs = pickle.load(f)
        f.close()
    except Exception as e:
        print("run() : %s %s" % (str(e), traceback.format_exc()))
        return
    # password
    from pandaserver.config import panda_config
    # initialize cx_Oracle using dummy connection
    from pandaserver.taskbuffer.Initializer import initializer
    initializer.init()
    # instantiate TB
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    # run Setupper
    from pandaserver.dataservice.Setupper import Setupper
    thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,firstSubmission=v_firstSubmission)
    thr.start()
    thr.join()
    return
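A sketch of producing the input file that run() unpickles, assuming jobs is a list of PanDA JobSpec objects (empty here for illustration):

import pickle

jobs = []  # e.g. JobSpec objects to be processed by Setupper
with open('/tmp/setupper_jobs.pickle', 'wb') as f:
    pickle.dump(jobs, f)

run('/tmp/setupper_jobs.pickle', v_onlyTA=False, v_firstSubmission=True)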
Example #3
    def __init__(self, site, cloud, nJobs):
        """Initialize class with parameters
        """
        self.__site = site
        self.__cloud = cloud
        self.__nJobs = nJobs
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
Example #4
def main(tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf
    # run
    WorkerSync(tbuf=taskBuffer).run()
Example #5
def main(argv=tuple(), tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf
    # dbif session
    session = dbif.get_session()

    # If no argument, call the basic configurator
    if len(argv) == 1:
        _logger = logger_utils.make_logger(base_logger, 'Configurator')
        t1 = time.time()
        configurator = Configurator(session=session)
        if not configurator.run():
            _logger.critical('Configurator loop FAILED')
        t2 = time.time()
        _logger.debug('Configurator run took {0}s'.format(t2 - t1))

    # If --network argument, call the network configurator
    elif len(argv) == 2 and argv[1].lower() == '--network':
        _logger = logger_utils.make_logger(base_logger, 'NetworkConfigurator')
        t1 = time.time()
        network_configurator = NetworkConfigurator(taskBuffer=taskBuffer,
                                                   session=session)
        if not network_configurator.run():
            _logger.critical('NetworkConfigurator loop FAILED')
        t2 = time.time()
        _logger.debug('NetworkConfigurator run took {0}s'.format(t2 - t1))

    # If --json_dump
    elif len(argv) == 2 and argv[1].lower() == '--json_dump':
        _logger = logger_utils.make_logger(base_logger, 'JsonDumper')
        t1 = time.time()
        json_dumper = JsonDumper(taskBuffer=taskBuffer, session=session)
        out_msg = json_dumper.run()
        _logger.debug('JsonDumper finished with {0}'.format(out_msg))
        t2 = time.time()
        _logger.debug('JsonDumper run took {0}s'.format(t2 - t1))
    else:
        # _logger is only defined in the branches above, so make one here
        _logger = logger_utils.make_logger(base_logger, 'Configurator')
        _logger.error(
            'Configurator called with wrong arguments. Use no arguments, --network, or --json_dump'
        )

    # dbif session close
    session.close()
    dbif.engine_dispose()
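How the argv dispatch above is typically exercised; argv[0] is only a placeholder script name:

main(argv=('configurator.py',))                # basic configurator
main(argv=('configurator.py', '--network'))    # network configurator
main(argv=('configurator.py', '--json_dump'))  # JSON dumper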
Example #6
    def __init__(self):
        """
        Initialization and configuration
        """
        threading.Thread.__init__(self)
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

        if hasattr(panda_config, 'AGIS_URL_SCHEDCONFIG'):
            self.AGIS_URL_SCHEDCONFIG = panda_config.AGIS_URL_SCHEDCONFIG
        else:
            self.AGIS_URL_SCHEDCONFIG = 'http://atlas-agis-api.cern.ch/request/pandaqueue/query/list/?json&preset=schedconf.all&vo_name=atlas&state=ACTIVE'

        _logger.debug('Getting schedconfig dump...')
        self.schedconfig_dump = aux.get_dump(self.AGIS_URL_SCHEDCONFIG)
        _logger.debug('Done')
Example #7
    def __init__(self):
        threading.Thread.__init__(self)

        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

        if hasattr(panda_config, 'NWS_URL'):
            self.NWS_URL = panda_config.NWS_URL
        else:
            self.NWS_URL = 'http://atlas-adc-netmetrics-lb.cern.ch/metrics/latest.json'
        _logger.debug('Getting NWS dump...')
        self.nws_dump = aux.get_dump(self.NWS_URL)
        _logger.debug('Done')

        if hasattr(panda_config, 'AGIS_URL_CM'):
            self.AGIS_URL_CM = panda_config.AGIS_URL_CM
        else:
            self.AGIS_URL_CM = 'http://atlas-agis-api.cern.ch/request/site/query/list_links/?json'
        _logger.debug('Getting AGIS cost matrix dump...')
        self.agis_cm_dump = aux.get_dump(self.AGIS_URL_CM)
        _logger.debug('Done')
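The hasattr/else blocks above can be condensed with a getattr default; an equivalent sketch for the NWS URL inside the same __init__:

self.NWS_URL = getattr(panda_config, 'NWS_URL',
                       'http://atlas-adc-netmetrics-lb.cern.ch/metrics/latest.json')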
Example #8
def main(tbuf=None, **kwargs):

    _logger.debug("===================== start =====================")

    # overall timeout value
    overallTimeout = 300
    # prefix of evp files
    prefixEVP = 'evp.'
    # file pattern of evp files
    evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*'

    # # kill old process
    # try:
    #     # time limit
    #     timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
    #     # get process list
    #     scriptName = sys.argv[0]
    #     out = commands_get_status_output('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)[-1]
    #     for line in out.split('\n'):
    #         items = line.split()
    #         # owned process
    #         if items[0] not in ['sm','atlpan','pansrv','root']: # ['os.getlogin()']: doesn't work in cron
    #             continue
    #         # look for python
    #         if re.search('python',line) is None:
    #             continue
    #         # PID
    #         pid = items[1]
    #         # start time
    #         timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
    #         startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
    #         # kill old process
    #         if startTime < timeLimit:
    #             _logger.debug("old process : %s %s" % (pid,startTime))
    #             _logger.debug(line)
    #             commands_get_status_output('kill -9 %s' % pid)
    # except Exception:
    #     type, value, traceBack = sys.exc_info()
    #     _logger.error("kill process : %s %s" % (type,value))

    # instantiate PD2P
    # if tbuf is None:
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    # else:
    #     taskBuffer = tbuf
    siteMapper = SiteMapper.SiteMapper(taskBuffer)


    # thread pool
    class ThreadPool:
        def __init__(self):
            self.lock = threading.Lock()
            self.list = []

        def add(self,obj):
            self.lock.acquire()
            self.list.append(obj)
            self.lock.release()

        def remove(self,obj):
            self.lock.acquire()
            self.list.remove(obj)
            self.lock.release()

        def join(self):
            self.lock.acquire()
            thrlist = tuple(self.list)
            self.lock.release()
            for thr in thrlist:
                thr.join()


    # thread to ev-pd2p
    class EvpThr (threading.Thread):
        def __init__(self,lock,pool,aTaskBuffer,aSiteMapper,fileName,ignoreError):
            threading.Thread.__init__(self)
            self.lock       = lock
            self.pool       = pool
            self.fileName   = fileName
            self.evp        = EventPicker(aTaskBuffer,aSiteMapper,fileName,ignoreError)
            self.pool.add(self)

        def run(self):
            self.lock.acquire()
            retRun = self.evp.run()
            _logger.debug("%s : %s" % (retRun,self.fileName))
            self.pool.remove(self)
            self.lock.release()


    # get files
    _logger.debug("EVP session")
    timeNow = datetime.datetime.utcnow()
    timeInt = datetime.datetime.utcnow()
    fileList = glob.glob(evpFilePatt)
    fileList.sort()

    # create thread pool and semaphore
    adderLock = threading.Semaphore(1)
    adderThreadPool = ThreadPool()

    # add
    while len(fileList) != 0:
        # time limit to avoid too many copyArchive processes running at the same time
        if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout):
            _logger.debug("time over in EVP session")
            break
        # try to get Semaphore
        adderLock.acquire()
        # get fileList
        if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15):
            timeInt = datetime.datetime.utcnow()
            # get file
            fileList = glob.glob(evpFilePatt)
            fileList.sort()
        # choose a file
        fileName = fileList.pop(0)
        # release lock
        adderLock.release()
        if not os.path.exists(fileName):
            continue
        try:
            # gmtime()[6] is the weekday, not microseconds, so take only 6 fields
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(hours=24):
                # last chance
                _logger.debug("Last event picking : %s" % fileName)
                thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,False)
                thr.start()
            elif (timeInt - modTime) > datetime.timedelta(minutes=1):
                # try
                _logger.debug("event picking : %s" % fileName)
                thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,True)
                thr.start()
            else:
                _logger.debug("%s : %s" % ((timeInt - modTime),fileName))
        except Exception:
            errType,errValue = sys.exc_info()[:2]
            _logger.error("%s %s" % (errType,errValue))

    # join all threads
    adderThreadPool.join()

    _logger.debug("===================== end =====================")
Example #9
import sys

try:
    from urllib import urlencode, urlopen
    from urllib2 import Request
except ImportError:
    from urllib.parse import urlencode
    from urllib.request import urlopen, Request
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandaserver.config import panda_config
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

d = taskBuffer.queryDatasetWithMap({'name': sys.argv[1]})
node = {}
node['vuid'] = d.vuid
node['site'] = sys.argv[2]
url = 'https://localhost:25443/server/panda/datasetCompleted'
rdata = urlencode(node)
req = Request(url)
# POST data must be bytes under Python 3
fd = urlopen(req, rdata.encode())
data = fd.read()

print(data)
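Invocation sketch for the callback script above (values illustrative):

# python datasetCompleted.py user.somebody.test.dataset_tid12345_00 CERN-PROD_DATADISK
# sys.argv[1] -> dataset name looked up via queryDatasetWithMap
# sys.argv[2] -> site reported in the callback payload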
Example #10
def main(backGround=False):
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir

        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
        ssl_opts = {
            'use_ssl': True,
            'ssl_version': ssl.PROTOCOL_TLSv1,
            'ssl_cert_file': certName,
            'ssl_key_file': keyName
        }
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set listener
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer-' + socket.getfqdn()
                _logger.debug('setting listener %s' % clientid)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)],
                                        **ssl_opts)
                connList.append(conn)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" %
                              (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener(
                            'FileCallbackListener',
                            FileCallbackListener(conn, taskBuffer, siteMapper,
                                                 subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue,
                                       id=subscription_id,
                                       ack='client-individual')
                        _logger.debug('listener %s is up and running' %
                                      clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.error("failed to set listener on %s : %s %s" %
                                  (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
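FileCallbackListener is assumed to follow the stomp.py listener interface; a minimal sketch under the pre-5.0 API that this snippet (conn.start()) implies:

import stomp

class MinimalListener(stomp.ConnectionListener):
    # stomp.py < 5.0 passes headers and body separately;
    # newer versions pass a single frame object instead
    def on_message(self, headers, message):
        print(headers.get('message-id'), message)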
Example #11
def main(argv=tuple(), tbuf=None, **kwargs):

    try:
        long
    except NameError:
        long = int

    prelock_pid = GenericThread().get_pid()
    tmpLog = LogWrapper(_logger, "<pid={}>".format(prelock_pid))

    tmpLog.debug("===================== start =====================")

    # return value, true to run main again in next daemon loop
    ret_val = True

    # grace period
    try:
        gracePeriod = int(argv[1])
    except Exception:
        gracePeriod = 1

    # lock interval in minutes
    lock_interval = 10

    # retry interval in minutes
    retry_interval = 3

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # thread for adder
    class AdderThread(GenericThread):
        def __init__(self, taskBuffer, aSiteMapper, job_output_reports):
            GenericThread.__init__(self)
            self.taskBuffer = taskBuffer
            self.aSiteMapper = aSiteMapper
            self.job_output_reports = job_output_reports

        # main loop
        def run(self):
            # initialize
            taskBuffer = self.taskBuffer
            aSiteMapper = self.aSiteMapper
            # get file list
            timeNow = datetime.datetime.utcnow()
            timeInt = datetime.datetime.utcnow()
            # unique pid
            GenericThread.__init__(self)
            uniq_pid = self.get_pid()
            # log pid
            tmpLog.debug("pid={0} : run".format(uniq_pid))
            # stats
            n_processed = 0
            # loop
            while True:
                # get report
                one_jor = self.job_output_reports.pop()
                if not one_jor:
                    break
                # lock
                panda_id, job_status, attempt_nr, time_stamp = one_jor
                got_lock = taskBuffer.lockJobOutputReport(
                    panda_id=panda_id,
                    attempt_nr=attempt_nr,
                    pid=uniq_pid,
                    time_limit=lock_interval)
                if not got_lock:
                    continue
                # add
                try:
                    modTime = time_stamp
                    if (timeNow - modTime) > datetime.timedelta(hours=24):
                        # last add
                        tmpLog.debug(
                            "pid={0} : last add job={1}.{2} st={3}".format(
                                uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = False
                    else:
                        # usual add
                        tmpLog.debug("pid={0} : add job={1}.{2} st={3}".format(
                            uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = True
                    # get adder
                    adder_gen = AdderGen(taskBuffer,
                                         panda_id,
                                         job_status,
                                         attempt_nr,
                                         ignoreTmpError=ignoreTmpError,
                                         siteMapper=aSiteMapper,
                                         pid=uniq_pid,
                                         prelock_pid=uniq_pid,
                                         lock_offset=lock_interval -
                                         retry_interval)
                    n_processed += 1
                    # execute
                    adder_gen.run()
                    del adder_gen
                except Exception as e:
                    tmpLog.error("pid={} : failed to run with {} {}".format(
                        uniq_pid, str(e), traceback.format_exc()))
            # stats
            tmpLog.debug("pid={} : processed {}".format(uniq_pid, n_processed))

        # launcher, run with multiprocessing
        def proc_launch(self):
            # run
            self.process = multiprocessing.Process(target=self.run)
            self.process.start()

        # join of multiprocessing
        def proc_join(self):
            self.process.join()

    # TaskBuffer with more connections behind TaskBufferInterface
    tmpLog.debug("setup taskBufferIF")
    n_connections = 4
    _tbuf = TaskBuffer()
    _tbuf.init(panda_config.dbhost,
               panda_config.dbpasswd,
               nDBConnection=n_connections)
    taskBufferIF = TaskBufferInterface()
    taskBufferIF.launch(_tbuf)

    # add files
    tmpLog.debug("run Adder")

    interval = 10
    nLoop = 10
    for iLoop in range(nLoop):
        tmpLog.debug('start iLoop={}/{}'.format(iLoop, nLoop))
        start_time = datetime.datetime.utcnow()
        adderThrList = []
        nThr = 10

        n_jors_per_batch = 1000

        jor_lists = WeightedLists(multiprocessing.Lock())

        # get some job output reports
        jor_list_others = taskBuffer.listJobOutputReport(
            only_unlocked=True,
            time_limit=lock_interval,
            limit=n_jors_per_batch * nThr,
            grace_period=gracePeriod,
            anti_labels=['user'])
        jor_lists.add(3, jor_list_others)
        jor_list_user = taskBuffer.listJobOutputReport(
            only_unlocked=True,
            time_limit=lock_interval,
            limit=n_jors_per_batch * nThr,
            grace_period=gracePeriod,
            labels=['user'])
        jor_lists.add(7, jor_list_user)

        # adder consumer processes
        _n_thr_with_tbuf = 0
        tbuf_list = []
        tmpLog.debug("got {} job reports".format(len(jor_lists)))
        for i in range(nThr):
            if i < _n_thr_with_tbuf:
                tbuf = TaskBuffer()
                tbuf_list.append(tbuf)
                tbuf.init(panda_config.dbhost,
                          panda_config.dbpasswd,
                          nDBConnection=1)
                thr = AdderThread(tbuf, aSiteMapper, jor_lists)
            else:
                thr = AdderThread(taskBufferIF.getInterface(), aSiteMapper,
                                  jor_lists)
            adderThrList.append(thr)
        # start all threads
        for thr in adderThrList:
            # thr.start()
            thr.proc_launch()
            time.sleep(0.25)

        # join all threads
        for thr in adderThrList:
            # thr.join()
            thr.proc_join()
        for tbuf in tbuf_list:
            tbuf.cleanup()
        end_time = datetime.datetime.utcnow()
        sleep_time = interval - (end_time - start_time).seconds
        if sleep_time > 0 and iLoop + 1 < nLoop:
            sleep_time = random.randint(1, sleep_time)
            tmpLog.debug("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)

    # stop TaskBuffer IF
    taskBufferIF.stop()

    tmpLog.debug("===================== end =====================")

    # return
    return ret_val
Example #12
def main(tbuf=None, **kwargs):
    _logger.debug("===================== start =====================")

    # overall timeout value
    overallTimeout = 300
    # prefix of the files
    if 'target' in kwargs and kwargs['target']:
        evpFilePatt = kwargs['target']
    else:
        prefixEVP = '/workflow.'
        # file pattern of evp files
        evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*'

    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost,
                    panda_config.dbpasswd,
                    nDBConnection=1)

    test_mode = kwargs.get('test_mode', False)
    dump_workflow = kwargs.get('dump_workflow', False)

    # thread pool
    class ThreadPool:
        def __init__(self):
            self.lock = threading.Lock()
            self.list = []

        def add(self, obj):
            self.lock.acquire()
            self.list.append(obj)
            self.lock.release()

        def remove(self, obj):
            self.lock.acquire()
            self.list.remove(obj)
            self.lock.release()

        def join(self):
            self.lock.acquire()
            thrlist = tuple(self.list)
            self.lock.release()
            for thr in thrlist:
                thr.join()

    # thread
    class EvpThr(threading.Thread):
        def __init__(self, task_buffer, lock, pool, file_name, to_delete,
                     get_log):
            threading.Thread.__init__(self)
            self.lock = lock
            self.pool = pool
            self.fileName = file_name
            self.to_delete = to_delete
            self.get_log = get_log
            self.pool.add(self)
            self.processor = WorkflowProcessor(task_buffer=task_buffer,
                                               log_stream=_logger)

        def run(self):
            self.lock.acquire()
            try:
                self.processor.process(self.fileName, self.to_delete,
                                       test_mode, self.get_log, dump_workflow)
            except Exception as e:
                _logger.error("{} {}".format(str(e), traceback.format_exc()))
            self.pool.remove(self)
            self.lock.release()

    # get files
    timeNow = datetime.datetime.utcnow()
    timeInt = datetime.datetime.utcnow()
    fileList = glob.glob(evpFilePatt)
    fileList.sort()

    # create thread pool and semaphore
    adderLock = threading.Semaphore(1)
    adderThreadPool = ThreadPool()

    # add
    while len(fileList) != 0:
        # time limit to avoid too many copyArchive processes running at the same time
        if (datetime.datetime.utcnow() -
                timeNow) > datetime.timedelta(minutes=overallTimeout):
            _logger.debug("time over in main session")
            break
        # try to get Semaphore
        adderLock.acquire()
        # get fileList
        if (datetime.datetime.utcnow() -
                timeInt) > datetime.timedelta(minutes=15):
            timeInt = datetime.datetime.utcnow()
            # get file
            fileList = glob.glob(evpFilePatt)
            fileList.sort()
        # choose a file
        fileName = fileList.pop(0)
        # release lock
        adderLock.release()
        if not os.path.exists(fileName):
            continue
        try:
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(fileName))[:6]))
            to_go = True
            if test_mode:
                _logger.debug("Testing : %s" % fileName)
                to_delete = False
            elif (timeNow - modTime) > datetime.timedelta(hours=2):
                # last chance
                _logger.debug("Last attempt : %s" % fileName)
                to_delete = True
            elif (timeInt - modTime) > datetime.timedelta(seconds=5):
                # try
                _logger.debug("Normal attempt : %s" % fileName)
                to_delete = False
            else:
                _logger.debug("Wait %s : %s" % ((timeInt - modTime), fileName))
                to_go = False
            if to_go:
                thr = EvpThr(taskBuffer, adderLock, adderThreadPool, fileName,
                             to_delete, False)
                thr.start()
        except Exception as e:
            _logger.error("{} {}".format(str(e), traceback.format_exc()))

    # join all threads
    adderThreadPool.join()

    _logger.debug("===================== end =====================")
Example #13
def daemon_loop(dem_config, msg_queue, pipe_conn, worker_lifetime, tbuf=None):
    # pid of the worker
    my_pid = os.getpid()
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0], os.getpgrp(), my_pid)
    # logger to log in file
    base_logger = logger_utils.setup_logger('daemons')
    tmp_log = logger_utils.make_logger(base_logger, 'worker_pid={pid}'.format(pid=my_pid))
    tmp_log.info('daemon worker start')
    # signal handler
    def got_end_sig(sig, frame):
        tmp_log.warning('(got signal {sig})'.format(sig=sig))
    for sig in END_SIGNALS:
        signal.signal(sig, got_end_sig)
    # dict of all daemons and their script module object
    module_map = {}
    # package of daemon scripts
    mod_package = getattr(daemon_config, 'package')
    # start timestamp
    start_ts = time.time()
    # expiry time
    expiry_ts = start_ts + worker_lifetime
    # create taskBuffer object if not given
    if tbuf is None:
        # initialize cx_Oracle using dummy connection
        try:
            from pandaserver.taskbuffer.Initializer import initializer
            initializer.init()
        except Exception as e:
            tmp_log.error('failed to launch initializer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
        # taskBuffer object
        try:
            from pandaserver.taskbuffer.TaskBuffer import taskBuffer as tbuf
            tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
            tmp_log.debug('taskBuffer initialized')
        except Exception as e:
            tmp_log.error('failed to initialize taskBuffer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
    # import module of all daemons
    for dem_name, attrs in dem_config.items():
        mod_name = attrs['module']
        try:
            the_module = importlib.import_module('.{mod}'.format(mod=mod_name), mod_package)
        except Exception as e:
            tmp_log.warning('for daemon {dem}, failed to import {mod} with {err} ; skipped it'.format(
                                dem=dem_name, mod=mod_name, err='{0}: {1}'.format(e.__class__.__name__, e)))
        else:
            module_map[dem_name] = the_module
    tmp_log.debug('initialized, running')
    # loop
    while True:
        # stop the worker when it reaches its lifetime
        if time.time() > expiry_ts:
            tmp_log.info('worker reached its lifetime, stop this worker')
            break
        # get command from pipe
        if pipe_conn.poll():
            cmd = pipe_conn.recv()
            if cmd == CMD_STOP:
                # got stop command, stop the process
                tmp_log.info('got stop command, stop this worker')
                break
            else:
                tmp_log.debug('got invalid command "{cmd}" ; skipped it'.format(cmd=cmd))
        # clean up memory
        gc.collect()
        # get a message from queue
        tmp_log.debug('waiting for message...')
        keep_going = True
        one_msg = None
        while True:
            try:
                one_msg = msg_queue.get(timeout=5)
                break
            except queue.Empty:
                # timeout to get from queue, check whether to keep going
                if time.time() > expiry_ts:
                    # worker expired, do not keep going
                    keep_going = False
                    break
        # keep going
        if not keep_going:
            continue
        # process message
        if one_msg in module_map and one_msg is not None:
            # got a daemon name, get the module object and corresponding attributes
            dem_name = one_msg
            tmp_log.debug('got message of {dem}'.format(dem=dem_name))
            the_module = module_map[dem_name]
            attrs = dem_config[dem_name]
            mod_args = attrs['arguments']
            mod_argv = tuple([__file__] + mod_args)
            dem_period = attrs['period']
            dem_period_in_minute = dem_period/60.
            is_sync = attrs['sync']
            is_loop = attrs['loop']
            # initialize variables
            to_run_daemon = False
            has_run = False
            last_run_start_ts = 0
            last_run_end_ts = 0
            # component name in lock table
            component = 'pandaD.{dem}'.format(dem=dem_name)
            # whether the daemon should be synchronized among nodes
            if is_sync:
                # synchronized daemon, check process lock in DB
                ret_val, locked_time = tbuf.checkProcessLock_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                if ret_val:
                    # locked by some process on other nodes
                    last_run_start_ts = int((locked_time - EPOCH).total_seconds())
                    tmp_log.debug('found {dem} is locked by other process ; skipped it'.format(dem=dem_name))
                else:
                    # try to get the lock
                    got_lock = tbuf.lockProcess_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                    if got_lock:
                        # got the lock
                        to_run_daemon = True
                        tmp_log.debug('got lock of {dem}'.format(dem=dem_name))
                    else:
                        # did not get lock, skip
                        last_run_start_ts = int(time.time())
                        tmp_log.debug('did not get lock of {dem} ; skipped it'.format(dem=dem_name))
            else:
                to_run_daemon = True
            # run daemon
            if to_run_daemon:
                last_run_start_ts = int(time.time())
                try:
                    if is_loop:
                        # go looping the script until reaching daemon period
                        tmp_log.info('{dem} start looping'.format(dem=dem_name))
                        start_ts = time.time()
                        while True:
                            ret_val = the_module.main(argv=mod_argv, tbuf=tbuf)
                            now_ts = time.time()
                            if not ret_val:
                                # daemon main function says stop the loop
                                break
                            if now_ts > start_ts + dem_period:
                                # longer than the period, stop the loop
                                break
                        tmp_log.info('{dem} finish looping'.format(dem=dem_name))
                    else:
                        # execute the module script with arguments
                        tmp_log.info('{dem} start'.format(dem=dem_name))
                        the_module.main(argv=mod_argv, tbuf=tbuf)
                        tmp_log.info('{dem} finish'.format(dem=dem_name))
                except Exception as e:
                    # with error
                    tb = traceback.format_exc()
                    tmp_log.error('failed to run daemon {dem} with {err} ; stop this worker'.format(
                                    dem=dem_name, err='{0}: {1}\n{2}\n'.format(e.__class__.__name__, e, tb)))
                    # daemon has run but failed
                    last_run_end_ts = int(time.time())
                    has_run = True
                    # send daemon status back to master
                    status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
                    pipe_conn.send(status_tuple)
                    # stop the worker
                    break
                else:
                    # daemon has run
                    last_run_end_ts = int(time.time())
                    has_run = True
            # send daemon status back to master
            status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
            pipe_conn.send(status_tuple)
            # FIXME: stop and spawn worker in every run for now since some script breaks the worker without exception
            # tmp_log.info('as script done, stop this worker')
            # break
        else:
            # got invalid message
            tmp_log.warning('got invalid message "{msg}", skipped it'.format(msg=one_msg))
        # sleep
        time.sleep(2**-5)
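A sketch of the master side that feeds this worker, assuming dem_config and CMD_STOP match the names used above:

import multiprocessing

msg_queue = multiprocessing.Queue()
parent_conn, child_conn = multiprocessing.Pipe()
worker = multiprocessing.Process(target=daemon_loop,
                                 args=(dem_config, msg_queue, child_conn, 3600))
worker.start()
msg_queue.put('dummy_daemon')        # name of a daemon in dem_config
if parent_conn.poll(60):
    print(parent_conn.recv())        # (dem_name, has_run, start_ts, end_ts)
parent_conn.send(CMD_STOP)           # ask the worker to exit
worker.join()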
Example #14
"""

import datetime
import types

# config file
from pandaserver.config import panda_config

# initialize cx_Oracle using dummy connection
from pandaserver.taskbuffer.Initializer import initializer
initializer.init()

# initialize TaskBuffer
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True)

# initialize JobDispatcher
from pandaserver.jobdispatcher.JobDispatcher import jobDispatcher
if panda_config.nDBConnection != 0:
    jobDispatcher.init(taskBuffer)

# initialize DataService
from pandaserver.dataservice.DataService import dataService
if panda_config.nDBConnection != 0:
    dataService.init(taskBuffer)

# initialize UserIF
from pandaserver.userinterface.UserIF import userIF
if panda_config.nDBConnection != 0:
    userIF.init(taskBuffer)
Example #15
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None):
    # options
    parser = argparse.ArgumentParser()
    if taskBuffer:
        parser.add_argument('--ds',action='store',dest='ds',default=None,
                            help='dataset name')
    else:
        parser.add_argument('--ds',action='store',dest='ds',default=None,required=True,
                            help='dataset name')
    parser.add_argument('--files',action='store',dest='files',default=None,
                        help='comma-separated list of lost file names. The list is deduced if this option is omitted')
    parser.add_argument('--noChildRetry',action='store_const',const=True,dest='noChildRetry',default=False,
                        help='not retry child tasks')
    parser.add_argument('--resurrectDS',action='store_const',const=True,dest='resurrectDS',default=False,
                        help='resurrect output and log datasets if they were already deleted')
    parser.add_argument('--dryRun',action='store_const',const=True,dest='dryRun',default=False,
                        help='dry run')
    parser.add_argument('--force', action='store_const', const=True, dest='force', default=False,
                        help='force retry even if no lost files')
    parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent',
                        default=False, help='reproduce the input files from which the lost files were produced. '
                        'Typically useful to recover merged files when unmerged files were already deleted')
    # parse options
    if args_list:
        options = parser.parse_args(args_list)
    elif taskBuffer:
        options, unknown = parser.parse_known_args()
    else:
        options = parser.parse_args()

    # executed via command-line
    givenTaskID = None
    dn = None
    if taskBuffer is None:
        # instantiate TB
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)

    else:
        # set options from dict
        if exec_options is None:
            exec_options = {}
        keys = set(vars(options).keys())
        for k in exec_options:
            if k in keys:
                setattr(options, k, exec_options[k])
        if 'jediTaskID' in exec_options:
            givenTaskID = exec_options['jediTaskID']
        if 'userName' in exec_options:
            dn = exec_options['userName']

    ds_files = {}
    if options.files is not None:
        files = options.files.split(',')
        ds_files[options.ds] = files
    else:
        # look for lost files
        if not givenTaskID:
            # get files from rucio
            st, files_rucio = get_files_from_rucio(options.ds, log_stream)
            if st is not True:
                return st, files_rucio
            # get files from panda
            dsName = options.ds.split(':')[-1]
            fd, fo = taskBuffer.querySQLS(
                'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName})
            for tmpLFN, in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(options.ds, [])
                    ds_files[options.ds].append(tmpLFN)
            # get taskID
            td, to = taskBuffer.querySQLS(
                        'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets '
                        'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ',
                        {':t1': 'output', ':t2': 'log', ':datasetName': dsName})
            jediTaskID, = to[0]
        else:
            # get dataset names
            dd, do = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            # get files from rucio
            files_rucio = set()
            for tmpDS, in do:
                st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream)
                if st is None:
                    return st, tmp_files_rucio
                # ignore unknown dataset
                if st:
                    files_rucio = files_rucio.union(tmp_files_rucio)
            # get files from panda
            fd, fo = taskBuffer.querySQLS(
                'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            for tmpDS, tmpLFN in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(tmpDS, [])
                    ds_files[tmpDS].append(tmpLFN)
        for tmpDS in ds_files:
            files = ds_files[tmpDS]
            msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files))
            if log_stream:
                log_stream.info(msgStr)
            else:
                print(msgStr)

    # no lost files
    if not ds_files and not options.force:
        return True, "No lost files. Use --force to ignore this check"

    # reset file status
    s = False
    for tmpDS in ds_files:
        files = ds_files[tmpDS]
        if dn:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS,
                                                                              files, options.reproduceParent,
                                                                              options.dryRun)
        else:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS,
                                                                              files, options.reproduceParent,
                                                                              options.dryRun)
        msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID)
        if log_stream:
            log_stream.info(msgStr)
        else:
            print(msgStr)
        s |= ts
        # recover parent
        if options.reproduceParent:
            # reproduce the input of every lost dataset
            for lostDS in lostInputFiles:
                com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS']
                if options.dryRun:
                    com_args.append('--dryRun')
                com_args += ['--files', ','.join(lostInputFiles[lostDS])]
                main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args)

    # go ahead
    if options.dryRun:
        return True, 'Done in the dry-run mode with {}'.format(s)
    if s or options.force:
        if options.resurrectDS:
            sd,so = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
                {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
            rc = RucioClient()
            for datasetName, in so:
                for i in range(3):
                    try:
                        scope, name = rucioAPI.extract_scope(datasetName)
                        rc.get_did(scope, name)
                        break
                    except DataIdentifierNotFound:
                        print('resurrect {0}'.format(datasetName))
                        rc.resurrect([{'scope': scope, 'name': name}])
                        try:
                            rc.set_metadata(scope, name, 'lifetime', None)
                        except Exception:
                            pass
        if not options.reproduceParent:
            msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1]
        else:
            msgStr = Client.reloadInput(jediTaskID)[-1][-1]
        if log_stream:
            log_stream.info("Retried task with {}".format(msgStr))
            log_stream.info("Done")
        else:
            print("Retried task: done with {}".format(msgStr))
        return True, msgStr
    else:
        msgStr = 'failed'
        if log_stream:
            log_stream.error(msgStr)
        else:
            print(msgStr)
        return False, msgStr
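A sketch of invoking this recovery entry point programmatically, mirroring the exec_options handling above (IDs and names illustrative):

ok, msg = main(taskBuffer=taskBuffer,
               exec_options={'jediTaskID': 12345678, 'userName': 'Some User'},
               log_stream=None,
               args_list=['--dryRun'])
print(ok, msg)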
Example #16
def main(tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf
    # pid
    my_pid = os.getpid()
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0],
                                       os.getpgrp(), my_pid)
    # go
    if DRY_RUN:
        # dry run, regardless of lock, not update DB
        fetcher = FetchData(taskBuffer)
        # loop over all fetch data methods to run and update to DB
        for metric_name, update_type, period in metric_list:
            main_logger.debug('(dry-run) start {metric_name}'.format(
                metric_name=metric_name))
            # fetch data and update DB
            the_method = getattr(fetcher, metric_name)
            fetched_data = the_method()
            if fetched_data is None:
                main_logger.warning(
                    '(dry-run) {metric_name} got no valid data'.format(
                        metric_name=metric_name))
                continue
            main_logger.debug(
                '(dry-run) done {metric_name}'.format(metric_name=metric_name))
    else:
        # real run, will update DB
        # instantiate
        mdb = MetricsDB(taskBuffer)
        fetcher = FetchData(taskBuffer)
        # loop over all fetch data methods to run and update to DB
        for metric_name, update_type, period in metric_list:
            # metric lock
            lock_component_name = 'pandaMetr.{0:.30}.{1:0x}'.format(
                metric_name, adler32(metric_name.encode('utf-8')))
            # try to get lock
            got_lock = taskBuffer.lockProcess_PANDA(
                component=lock_component_name,
                pid=my_full_pid,
                time_limit=period)
            if got_lock:
                main_logger.debug('got lock of {metric_name}'.format(
                    metric_name=metric_name))
            else:
                main_logger.debug(
                    '{metric_name} locked by other process; skipped...'.format(
                        metric_name=metric_name))
                continue
            main_logger.debug(
                'start {metric_name}'.format(metric_name=metric_name))
            # fetch data and update DB
            the_method = getattr(fetcher, metric_name)
            fetched_data = the_method()
            if fetched_data is None:
                main_logger.warning('{metric_name} got no valid data'.format(
                    metric_name=metric_name))
                continue
            mdb.update(metric=metric_name,
                       update_type=update_type,
                       entity_dict=fetched_data)
            main_logger.debug(
                'done {metric_name}'.format(metric_name=metric_name))
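The per-metric lock name above is the metric name truncated to 30 characters plus its adler32 checksum in hex; a standalone sketch (assuming adler32 comes from zlib):

from zlib import adler32

metric_name = 'analysis_jobs_wait_time'   # illustrative
lock_component_name = 'pandaMetr.{0:.30}.{1:0x}'.format(
    metric_name, adler32(metric_name.encode('utf-8')))
print(lock_component_name)  # pandaMetr.analysis_jobs_wait_time.<hex checksum>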
Example #17
def main(argv=tuple(), tbuf=None, **kwargs):

    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
               (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (
                dispLogName,
                datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += r'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)

                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        # integer division, so the host selection also works under Python 3
        if (currentMinute // panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' %
                                 str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob],
                                    51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")