示例#1
0
    def proc():
        """Drive a heartbeat monitor for `dur` seconds, then idle.

        A `ru.Heartbeat` named 'foo' is created with the counter callable
        obtained from `get_counter()` as its beat callback, and a local
        termination callback which sanity-checks timing and beat count.
        The function beats as fast as it can until `dur` seconds have passed
        and then sleeps forever; it never returns on its own.

        `dur`, `ival`, `tout` and `get_counter` are taken from the enclosing
        scope.
        """
        counter = get_counter()

        def term_cb(uid):
            """Check that termination came late enough and beats look sane."""
            # termination must not be signalled before the beating period ended
            t_now = time.time()
            assert (t_now >= t_start + dur), \
                '%.1f > %.1f + %.1f' % (t_now, t_start, dur)

            # compare the expected number of beat callbacks (one per interval)
            # with the value reported by the counter, allowing +/- 20%
            # NOTE(review): `counter(test=True)` presumably returns the current
            #               count without incrementing it - confirm
            expected = dur / ival
            observed = counter(test=True)
            lower = expected * 0.8
            upper = expected * 1.2
            assert (lower < observed < upper), [expected, observed]

        hb_kwargs = {
            'timeout': tout,
            'interval': ival,
            'beat_cb': counter,
            'term_cb': term_cb,
        }
        hb = ru.Heartbeat('foo', **hb_kwargs)

        # the reference time is taken after construction, right before start
        t_start = time.time()
        hb.start()

        # keep the heartbeat alive (busy loop) until the deadline passes
        t_end = t_start + dur
        while time.time() < t_end:
            hb.beat()

        # stop beating and idle indefinitely
        while True:
            time.sleep(0.1)
示例#2
0
    def __init__(self, cfg):
        """Set up the manager's config, logging and heartbeat infrastructure.

        The constructor
          - wraps `cfg` in a `ru.Config` ('radical.pilot.cmgr'),
          - generates a manager uid in the session's ID namespace,
          - creates a profiler and a logger writing to `cfg.path`,
          - starts a 'heartbeat' pubsub bridge and records its publisher and
            subscriber addresses in `self._cfg.heartbeat`,
          - creates a heartbeat monitor plus a publisher and subscriber on
            that channel,
          - starts the monitor and waits for startup before returning.

        Args:
            cfg: configuration passed on to `ru.Config`.  The resulting config
                 is read for `sid`, `path`, `heartbeat.timeout` and
                 `heartbeat.interval`; `heartbeat.addr_pub` and
                 `heartbeat.addr_sub` are written by this constructor.
        """

        self._cfg = ru.Config('radical.pilot.cmgr', cfg=cfg)
        self._sid = self._cfg.sid
        self._uid = ru.generate_id('cmgr', ns=self._sid)
        self._uids = [self._uid]  # uids to track heartbeats for (incl. own)

        self._prof = ru.Profiler(self._uid,
                                 ns='radical.pilot',
                                 path=self._cfg.path)
        self._log = ru.Logger(self._uid,
                              ns='radical.pilot',
                              path=self._cfg.path)

        self._prof.prof('init2', uid=self._uid, msg=self._cfg.path)

        # Every ComponentManager runs a HB pubsub bridge in a separate thread.
        # That HB channel should be used by all components and bridges created
        # under this CMGR.
        bcfg = ru.Config(
            cfg={
                'channel': 'heartbeat',
                'type': 'pubsub',
                'uid': self._uid + '.hb',
                'stall_hwm': 1,
                'bulk_size': 0,
                'path': self._cfg.path
            })
        self._hb_bridge = ru.zmq.PubSub(bcfg)
        self._hb_bridge.start()

        # publish the bridge endpoints via the config; the addresses are read
        # only after the bridge was started
        self._cfg.heartbeat.addr_pub = str(self._hb_bridge.addr_pub)
        self._cfg.heartbeat.addr_sub = str(self._hb_bridge.addr_sub)

        # runs a HB monitor on that channel
        # NOTE(review): this uses `self.uid`, not `self._uid` - presumably a
        #               property defined elsewhere in the class; confirm
        self._hb = ru.Heartbeat(
            uid=self.uid,
            timeout=self._cfg.heartbeat.timeout,
            interval=self._cfg.heartbeat.interval,
            beat_cb=self._hb_beat_cb,  # on every heartbeat
            term_cb=self._hb_term_cb,  # on termination
            log=self._log)

        # publisher and subscriber endpoints on the heartbeat channel; messages
        # received on topic 'heartbeat' are handed to `self._hb_sub_cb`
        self._hb_pub = ru.zmq.Publisher('heartbeat',
                                        self._cfg.heartbeat.addr_pub,
                                        log=self._log,
                                        prof=self._prof)
        self._hb_sub = ru.zmq.Subscriber('heartbeat',
                                         self._cfg.heartbeat.addr_sub,
                                         topic='heartbeat',
                                         cb=self._hb_sub_cb,
                                         log=self._log,
                                         prof=self._prof)

        # confirm the bridge being usable by listening to our own heartbeat
        # (the wait is bounded by the configured heartbeat timeout)
        self._hb.start()
        self._hb.wait_startup(self._uid, self._cfg.heartbeat.timeout)
        self._log.info('heartbeat system up')
示例#3
0
    def __init__(self, cfg, session):
        """Bring up the agent: DB, resource manager, components, heartbeat.

        Startup order matters: the resource manager is configured before any
        component is started, the component manager's heartbeat settings are
        copied into the agent config before bridges, components and sub-agents
        are started, and the `rpu.Worker` base initializer is only called once
        all of that is in place.

        Args:
            cfg:     agent configuration; `pid`, `pmgr` and `pilot_sandbox` are
                     read from it, and `heartbeat` is overwritten with the
                     component manager's heartbeat config.
            session: session object; its `_log` is used as the agent logger,
                     and it is passed on to `rpu.Worker.__init__`.
        """

        self._cfg = cfg
        self._pid = cfg.pid
        self._pmgr = cfg.pmgr
        self._pwd = cfg.pilot_sandbox
        self._session = session
        self._log = session._log

        self._starttime = time.time()
        self._final_cause = None

        # this is the earliest point to sync bootstrap and agent profiles
        # (this profiler instance is local and not kept on `self`)
        prof = ru.Profiler(ns='radical.pilot', name='agent.0')
        prof.prof('sync_rel', uid=cfg.pid, msg='agent.0')
        prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())

        # connect to MongoDB for state push/pull
        self._connect_db()

        # configure ResourceManager before component startup, as components need
        # ResourceManager information for function (scheduler, executor)
        self._configure_rm()

        # ensure that app communication channels are visible to workload
        self._configure_app_comm()

        # expose heartbeat channel to sub-agents, bridges and components,
        # and start those
        self._cmgr = rpu.ComponentManager(self._cfg)
        self._cfg.heartbeat = self._cmgr.cfg.heartbeat

        self._cmgr.start_bridges()
        self._cmgr.start_components()

        # create the sub-agent configs and start the sub agents
        self._write_sa_configs()
        self._start_sub_agents()  # TODO: move to cmgr?

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and components.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, self._cfg, session)

        # run our own slow-paced heartbeat monitor to watch pmgr heartbeats
        self._hb = ru.Heartbeat(
            uid=self._pid,
            timeout=10.0,  # FIXME:  configurable
            interval=1.0,  # FIXME:  configurable
            beat_cb=self._hb_check,  # no own heartbeat(pmgr pulls)
            term_cb=self._hb_term_cb,
            log=self._log)
        self._hb.start()

        # register pmgr heartbeat: an initial beat is recorded for the pmgr uid
        self._log.info('hb init for %s', self._pmgr)
        self._hb.beat(uid=self._pmgr)
示例#4
0
    def proc():
        """Beat for two uids with different lifetimes until terminated.

        A `ru.Heartbeat` named 'test' (timeout 0.1s, interval 0.01s) is
        started.  The loop then refreshes the heartbeat for uid 'short' during
        the first 3 seconds and for uid 'long' during the first 5 seconds,
        sleeping 0.05s between rounds.

        The loop has no exit condition: the function only ends when it is
        interrupted from outside (an exception raised into it, or the process
        being terminated).
        NOTE(review): presumably the heartbeat monitor is expected to end this
                      process shortly after the 'short' beats stop - confirm
                      against the `ru.Heartbeat` termination semantics.

        Raises:
            SystemExit: with code -1, if the loop is left by an exception more
                        than 3.2 seconds after the start time was taken.
        """

        hb = ru.Heartbeat('test', timeout=0.1, interval=0.01)
        t0 = time.time()

        hb.start()

        try:
            # The original code had a second `while True` loop after this one;
            # it could never be reached (no `break` here) and was removed.
            while True:
                if time.time() < t0 + 3:
                    hb.beat('short')
                if time.time() < t0 + 5:
                    hb.beat('long')
                time.sleep(0.05)

        finally:
            # leaving later than 3.2s after start is flagged via the exit code
            if time.time() > t0 + 3.2:
                sys.exit(-1)