예제 #1
0
def run(inFile,v_onlyTA,v_firstSubmission):
    """Load pickled jobs from ``inFile`` and run a Setupper thread over them.

    Args:
        inFile: path of a file holding a pickled jobs object.
        v_onlyTA: forwarded to ``Setupper`` as ``onlyTA``.
        v_firstSubmission: forwarded to ``Setupper`` as ``firstSubmission``.

    Returns:
        None. When the file cannot be opened or unpickled, the error is
        printed and the function returns before any database initialization.
    """
    # local import so the error path below never fails on a missing name
    import traceback
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    try:
        # read Jobs from file; the with-block closes the handle even if
        # unpickling raises
        # NOTE(review): unpickling can execute arbitrary code - inFile must
        # come from a trusted producer
        with open(inFile, 'rb') as f:
            jobs = pickle.load(f)
    except Exception as e:
        print("run() : %s %s" % (str(e), traceback.format_exc()))
        return
    # password
    from pandaserver.config import panda_config
    # initialize cx_Oracle using dummy connection
    from pandaserver.taskbuffer.Initializer import initializer
    initializer.init()
    # instantiate TB
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    # run Setupper and wait for it to finish
    from pandaserver.dataservice.Setupper import Setupper
    thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,firstSubmission=v_firstSubmission)
    thr.start()
    thr.join()
    return
예제 #2
0
#!/usr/bin/python

"""
Entry point.

Importing this module has side effects: it initializes the shared
``taskBuffer`` object and, when ``panda_config.nDBConnection`` is non-zero,
the shared ``jobDispatcher`` and ``dataService`` objects as well.
"""

import datetime
import types

# config file
from pandaserver.config import panda_config

# initialize cx_Oracle using dummy connection
from pandaserver.taskbuffer.Initializer import initializer
initializer.init()

# initialize TaskBuffer with the configured number of DB connections
# NOTE(review): the meaning of the trailing positional True is not visible
# here - confirm against TaskBuffer.init
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True)

# initialize JobDispatcher (skipped when nDBConnection is 0)
from pandaserver.jobdispatcher.JobDispatcher import jobDispatcher
if panda_config.nDBConnection != 0:
    jobDispatcher.init(taskBuffer)

# initialize DataService (skipped when nDBConnection is 0)
from pandaserver.dataservice.DataService import dataService
if panda_config.nDBConnection != 0:
    dataService.init(taskBuffer)
예제 #3
0
def main(backGround=False):
    """Fork a child that keeps STOMP listeners subscribed to the callback queue.

    The calling process installs ``catch_sig`` for SIGINT/SIGHUP/SIGTERM/SIGALRM,
    schedules a SIGALRM after ``overallTimeout`` seconds and forks. The parent
    then only waits for the child. The child initializes the task buffer and
    site mapper, creates one connection per address that ``atlas-mb.cern.ch``
    resolves to, and loops forever, (re)connecting and subscribing every
    connection that reports it is not connected.

    Args:
        backGround: kept for interface compatibility; not used in this function.
    """
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir

        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            # only logged; the listener still tries to start with this certificate
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
        # NOTE(review): ssl.PROTOCOL_TLSv1 is deprecated in the ssl module -
        # confirm what the brokers accept before changing it
        ssl_opts = {
            'use_ssl': True,
            'ssl_version': ssl.PROTOCOL_TLSv1,
            'ssl_cert_file': certName,
            'ssl_key_file': keyName
        }
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # the local host name is the same for every broker, so look it up once
        fqdn = socket.getfqdn()
        subscription_id = 'panda-server-consumer-' + fqdn
        # set listener; each entry keeps the connection together with its own
        # client-id and broker so that the reconnect loop below never reuses
        # the values of whichever broker happened to be processed last
        connList = []
        for tmpBroker in brokerList:
            clientid = 'PANDA-' + fqdn + '-' + tmpBroker
            try:
                _logger.debug('setting listener %s' % clientid)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)],
                                        **ssl_opts)
                connList.append((conn, clientid, tmpBroker))
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" %
                              (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn, clientid, tmpBroker in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener(
                            'FileCallbackListener',
                            FileCallbackListener(conn, taskBuffer, siteMapper,
                                                 subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue,
                                       id=subscription_id,
                                       ack='client-individual')
                        _logger.debug('listener %s is up and running' %
                                      clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.error("failed to set listener on %s : %s %s" %
                                  (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            # poll the connection state every 5 seconds
            time.sleep(5)
예제 #4
0
def daemon_loop(dem_config, msg_queue, pipe_conn, worker_lifetime, tbuf=None):
    """Worker main loop: fetch daemon names from a queue and run their scripts.

    Args:
        dem_config: mapping of daemon name to an attribute dict; the keys read
            here are 'module', 'arguments', 'period', 'sync' and 'loop'.
        msg_queue: queue from which daemon names are fetched.
        pipe_conn: connection polled for commands (``CMD_STOP``) and used to
            send back ``(dem_name, has_run, last_run_start_ts, last_run_end_ts)``
            status tuples.
        worker_lifetime: number of seconds after start at which the worker
            stops itself.
        tbuf: taskBuffer object to hand to the daemon scripts; when None, the
            shared taskBuffer is imported and initialized here.

    Returns:
        None. The function returns when the worker expires, receives
        ``CMD_STOP``, fails to initialize, or a daemon script raises.
    """
    # pid of the worker
    my_pid = os.getpid()
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0], os.getpgrp(), my_pid)
    # logger to log in file
    base_logger = logger_utils.setup_logger('daemons')
    tmp_log = logger_utils.make_logger(base_logger, 'worker_pid={pid}'.format(pid=my_pid))
    tmp_log.info('daemon worker start')
    # signal handler; it only logs the signal and does not stop the worker
    def got_end_sig(sig, frame):
        tmp_log.warning('(got signal {sig})'.format(sig=sig))
    for sig in END_SIGNALS:
        signal.signal(sig, got_end_sig)
    # dict of all daemons and their script module object
    module_map = {}
    # package of daemon scripts
    mod_package = getattr(daemon_config, 'package')
    # start timestamp
    start_ts = time.time()
    # expiry time
    expiry_ts = start_ts + worker_lifetime
    # create taskBuffer object if not given
    if tbuf is None:
        # initialize cx_Oracle using dummy connection
        try:
            from pandaserver.taskbuffer.Initializer import initializer
            initializer.init()
        except Exception as e:
            tmp_log.error('failed to launch initializer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
        # taskBuffer object
        try:
            from pandaserver.taskbuffer.TaskBuffer import taskBuffer as tbuf
            tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
            tmp_log.debug('taskBuffer initialized')
        except Exception as e:
            tmp_log.error('failed to initialize taskBuffer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
    # import module of all daemons; a daemon whose module fails to import is
    # left out of module_map and its messages are later reported as invalid
    for dem_name, attrs in dem_config.items():
        mod_name = attrs['module']
        try:
            the_module = importlib.import_module('.{mod}'.format(mod=mod_name), mod_package)
        except Exception as e:
            tmp_log.warning('for daemon {dem}, failed to import {mod} with {err} ; skipped it'.format(
                                dem=dem_name, mod=mod_name, err='{0}: {1}'.format(e.__class__.__name__, e)))
        else:
            module_map[dem_name] = the_module
    tmp_log.debug('initialized, running')
    # loop
    while True:
        # stop the worker when it reaches its lifetime
        if time.time() > expiry_ts:
            tmp_log.info('worker reached its lifetime, stop this worker')
            break
        # get command from pipe
        if pipe_conn.poll():
            cmd = pipe_conn.recv()
            if cmd == CMD_STOP:
                # got stop command, stop the process
                tmp_log.info('got stop command, stop this worker')
                break
            else:
                tmp_log.debug('got invalid command "{cmd}" ; skipped it'.format(cmd=cmd))
        # clean up memory
        gc.collect()
        # get a message from queue
        tmp_log.debug('waiting for message...')
        keep_going = True
        one_msg = None
        # NOTE(review): the pipe is not polled while waiting here, so a stop
        # command is only seen once a message arrives or the worker expires
        while True:
            try:
                one_msg = msg_queue.get(timeout=5)
                break
            except queue.Empty:
                # timeout to get from queue, check whether to keep going
                if time.time() > expiry_ts:
                    # worker expired, do not keep going
                    keep_going = False
                    break
        # expired while waiting: go back to the top, where the lifetime check ends the loop
        if not keep_going:
            continue
        # process message
        if one_msg is not None and one_msg in module_map:
            # got a daemon name, get the module object and corresponding attributes
            dem_name = one_msg
            tmp_log.debug('got message of {dem}'.format(dem=dem_name))
            the_module = module_map[dem_name]
            attrs = dem_config[dem_name]
            mod_args = attrs['arguments']
            mod_argv = tuple([__file__] + mod_args)
            # period is compared against time.time() differences, i.e. seconds
            dem_period = attrs['period']
            dem_period_in_minute = dem_period/60.
            is_sync = attrs['sync']
            is_loop = attrs['loop']
            # initialize variables
            to_run_daemon = False
            has_run = False
            # set when the daemon script raised; the worker stops after reporting
            stop_worker = False
            last_run_start_ts = 0
            last_run_end_ts = 0
            # component name in lock table
            component = 'pandaD.{dem}'.format(dem=dem_name)
            # whether the daemon should be synchronized among nodes
            if is_sync:
                # synchronized daemon, check process lock in DB
                ret_val, locked_time = tbuf.checkProcessLock_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                if ret_val:
                    # locked by some process on other nodes
                    last_run_start_ts = int((locked_time - EPOCH).total_seconds())
                    tmp_log.debug('found {dem} is locked by other process ; skipped it'.format(dem=dem_name))
                else:
                    # try to get the lock
                    got_lock = tbuf.lockProcess_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                    if got_lock:
                        # got the lock
                        to_run_daemon = True
                        tmp_log.debug('got lock of {dem}'.format(dem=dem_name))
                    else:
                        # did not get lock, skip
                        last_run_start_ts = int(time.time())
                        tmp_log.debug('did not get lock of {dem} ; skipped it'.format(dem=dem_name))
            else:
                to_run_daemon = True
            # run daemon
            if to_run_daemon:
                last_run_start_ts = int(time.time())
                try:
                    if is_loop:
                        # go looping the script until reaching daemon period
                        tmp_log.info('{dem} start looping'.format(dem=dem_name))
                        # separate name so the worker's own start_ts is not overwritten
                        loop_start_ts = time.time()
                        while True:
                            ret_val = the_module.main(argv=mod_argv, tbuf=tbuf)
                            now_ts = time.time()
                            if not ret_val:
                                # daemon main function says stop the loop
                                break
                            if now_ts > loop_start_ts + dem_period:
                                # longer than the period, stop the loop
                                break
                        tmp_log.info('{dem} finish looping'.format(dem=dem_name))
                    else:
                        # execute the module script with arguments
                        tmp_log.info('{dem} start'.format(dem=dem_name))
                        the_module.main(argv=mod_argv, tbuf=tbuf)
                        tmp_log.info('{dem} finish'.format(dem=dem_name))
                except Exception as e:
                    # with error
                    tb = traceback.format_exc()
                    tmp_log.error('failed to run daemon {dem} with {err} ; stop this worker'.format(
                                    dem=dem_name, err='{0}: {1}\n{2}\n'.format(e.__class__.__name__, e, tb)))
                    stop_worker = True
                # daemon has run, whether it finished or failed
                last_run_end_ts = int(time.time())
                has_run = True
            # send daemon status back to master
            status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
            pipe_conn.send(status_tuple)
            if stop_worker:
                # the daemon script failed; stop the worker after reporting
                break
            # FIXME: stop and spawn worker in every run for now since some script breaks the worker without exception
            # tmp_log.info('as script done, stop this worker')
            # break
        else:
            # got invalid message
            tmp_log.warning('got invalid message "{msg}", skipped it'.format(msg=one_msg))
        # sleep
        time.sleep(2**-5)