def proc():
    """
    Heartbeat test process: beat continuously for `dur` seconds, then idle.

    A counter callback obtained from `get_counter()` is installed as the
    heartbeat's `beat_cb`; the nested `term_cb` is installed as termination
    callback and checks, when invoked, that

      - termination is not reported earlier than `dur` seconds after start;
      - the value returned by the counter (called with `test=True`) lies
        strictly within +/- 20% of `dur / ival`.

    `tout`, `ival`, `dur` and `get_counter` are taken from the enclosing
    scope.  The function never returns on its own.
    """

    counter = get_counter()

    def term_cb(uid):
        # `t0` is bound further down in `proc`, before the heartbeat starts
        stamp = time.time()
        assert (stamp >= t0 + dur), '%.1f > %.1f + %.1f' % (stamp, t0, dur)

        # compare the expected number of intervals with what was counted
        expected = dur / ival
        observed = counter(test=True)

        lower = expected * 0.8
        upper = expected * 1.2
        assert (lower < observed < upper), [expected, observed]

    hb = ru.Heartbeat('foo',
                      timeout=tout,
                      interval=ival,
                      beat_cb=counter,
                      term_cb=term_cb)

    t0 = time.time()
    hb.start()

    # phase 1: keep beating, without any pause, until `dur` has elapsed
    while time.time() < t0 + dur:
        hb.beat()

    # phase 2: stop beating, but stay alive indefinitely
    while True:
        time.sleep(0.1)
def __init__(self, cfg):
    """
    Set up the manager's identity, logging, profiling and heartbeat channel.

    The given `cfg` is wrapped into a 'radical.pilot.cmgr' config.  A
    heartbeat pubsub bridge is then started, its addresses are stored in
    `self._cfg.heartbeat`, and a heartbeat monitor, publisher and subscriber
    are created for that channel.  Finally the monitor is started and we
    wait for our own uid to show up.
    """

    self._cfg = ru.Config('radical.pilot.cmgr', cfg=cfg)
    self._sid = self._cfg.sid
    self._uid = ru.generate_id('cmgr', ns=self._sid)

    # uids to track heartbeats for (this includes our own uid)
    self._uids = [self._uid]

    uid = self._uid
    path = self._cfg.path

    self._prof = ru.Profiler(uid, ns='radical.pilot', path=path)
    self._log = ru.Logger(uid, ns='radical.pilot', path=path)

    self._prof.prof('init2', uid=uid, msg=path)

    # Every ComponentManager runs a HB pubsub bridge in a separate thread.
    # That HB channel should be used by all components and bridges created
    # under this CMGR.
    bridge_spec = {
        'channel': 'heartbeat',
        'type': 'pubsub',
        'uid': uid + '.hb',
        'stall_hwm': 1,
        'bulk_size': 0,
        'path': path,
    }
    bcfg = ru.Config(cfg=bridge_spec)

    self._hb_bridge = ru.zmq.PubSub(bcfg)
    self._hb_bridge.start()

    # make the bridge endpoints available via the config
    self._cfg.heartbeat.addr_pub = str(self._hb_bridge.addr_pub)
    self._cfg.heartbeat.addr_sub = str(self._hb_bridge.addr_sub)

    # run a HB monitor on that channel:
    #   - beat_cb is invoked on every heartbeat
    #   - term_cb is invoked on termination
    self._hb = ru.Heartbeat(uid=self.uid,
                            timeout=self._cfg.heartbeat.timeout,
                            interval=self._cfg.heartbeat.interval,
                            beat_cb=self._hb_beat_cb,
                            term_cb=self._hb_term_cb,
                            log=self._log)

    # publisher and subscriber share the same logger and profiler
    shared = {'log': self._log, 'prof': self._prof}

    self._hb_pub = ru.zmq.Publisher('heartbeat',
                                    self._cfg.heartbeat.addr_pub,
                                    **shared)
    self._hb_sub = ru.zmq.Subscriber('heartbeat',
                                     self._cfg.heartbeat.addr_sub,
                                     topic='heartbeat',
                                     cb=self._hb_sub_cb,
                                     **shared)

    # confirm that the bridge is usable by listening to our own heartbeat
    self._hb.start()
    self._hb.wait_startup(self._uid, self._cfg.heartbeat.timeout)
    self._log.info('heartbeat system up')
def __init__(self, cfg, session):
    """
    Bootstrap the agent.

    In order: store basic settings from `cfg` and `session`, record early
    profile events, connect to the database, configure the resource manager
    and the application communication channels, start bridges and
    components through a `ComponentManager`, write sub-agent configs and
    start the sub-agents, initialize the `rpu.Worker` base, and finally
    start a heartbeat monitor on which the pmgr uid is registered.

    The order of these steps matters and must not be changed.
    """

    pid = cfg.pid

    self._cfg = cfg
    self._pid = pid
    self._pmgr = cfg.pmgr
    self._pwd = cfg.pilot_sandbox
    self._session = session
    self._log = session._log
    self._starttime = time.time()
    self._final_cause = None

    # earliest possible point to sync bootstrap and agent profiles
    prof = ru.Profiler(ns='radical.pilot', name='agent.0')
    prof.prof('sync_rel', uid=pid, msg='agent.0')
    prof.prof('hostname', uid=pid, msg=ru.get_hostname())

    # connect to MongoDB for state push/pull
    self._connect_db()

    # the ResourceManager must be configured before any component starts:
    # components (scheduler, executor) need its information to function
    self._configure_rm()

    # ensure that app communication channels are visible to the workload
    self._configure_app_comm()

    # expose the heartbeat channel to sub-agents, bridges and components,
    # and start those
    self._cmgr = rpu.ComponentManager(self._cfg)
    self._cfg.heartbeat = self._cmgr.cfg.heartbeat

    self._cmgr.start_bridges()
    self._cmgr.start_components()

    # create the sub-agent configs and start the sub-agents
    # TODO: move to cmgr?
    self._write_sa_configs()
    self._start_sub_agents()

    # At this point the session is up and connected, and it should have
    # brought up all communication bridges and components.  We are ready to
    # rumble!
    rpu.Worker.__init__(self, self._cfg, session)

    # Run our own slow-paced heartbeat monitor to watch pmgr heartbeats.
    # NOTE: attributes are read from `self` only now, i.e. after the base
    #       class initializer ran.
    monitor_args = {
        'uid': self._pid,
        'timeout': 10.0,               # FIXME: configurable
        'interval': 1.0,               # FIXME: configurable
        'beat_cb': self._hb_check,     # no own heartbeat (pmgr pulls)
        'term_cb': self._hb_term_cb,
        'log': self._log,
    }
    self._hb = ru.Heartbeat(**monitor_args)
    self._hb.start()

    # register the pmgr heartbeat
    self._log.info('hb init for %s', self._pmgr)
    self._hb.beat(uid=self._pmgr)
def proc():
    """
    Heartbeat test process: beat for two uids, of which one stops early.

    A heartbeat monitor 'test' (timeout 0.1s, interval 0.01s) is started.
    Every 0.05s the loop below beats for uid 'short' during the first 3
    seconds, and for uid 'long' during the first 5 seconds.

    The loop has no exit condition of its own: this function is only left
    when an exception is raised into it or when the process is terminated
    from elsewhere.  (Presumably the heartbeat monitor ends the process once
    'short' stops beating - confirm against `ru.Heartbeat`.)

    When the `finally` clause runs more than 3.2 seconds after start, it
    calls `sys.exit(-1)`.  Raising `SystemExit` from within `finally`
    replaces any exception which was propagating at that point.
    """

    hb = ru.Heartbeat('test', timeout=0.1, interval=0.01)
    t0 = time.time()

    hb.start()

    try:
        # NOTE: an idle loop used to follow this one, but it could never be
        #       reached as this loop is never left regularly - it has been
        #       removed as dead code.
        while True:
            if time.time() < t0 + 3:
                hb.beat('short')
            if time.time() < t0 + 5:
                hb.beat('long')
            time.sleep(0.05)

    finally:
        # signal via exit code that we were still around after 3.2 seconds
        if time.time() > t0 + 3.2:
            sys.exit(-1)