import psutil
from collections import Counter


def proc(cnx, arg):
    """Collect per-process metrics, split between "me" (our own
    scripts) and "other" (everything else on the box)."""
    # Counters add/subtract key-wise with += / -=, unlike plain dicts
    procc = Counter()
    sum_file = Counter()
    sum_connection = Counter()
    percent_mem = Counter()
    all_proc = list(psutil.process_iter())
    interesting = {"ping.py", "proc.py", "master.py", "tracker.py",
                   "cpu.py", "rrd.py", "csvw.py"}
    for x in all_proc:
        try:
            key = "me" if set(x.cmdline()) & interesting else "other"
            procc += Counter({key: 1})
            sum_file += Counter({key: x.num_fds()})
            sum_connection += Counter({key: sum(x.num_ctx_switches())})
            percent_mem += Counter({key: x.memory_percent()})
        except psutil.Error:
            # the process vanished or denied access: skip it
            pass
    # "me"/"other" share, guarded against division by zero, capped at 1
    ratio = lambda d: min(1, 1.0 * d.get("me", 0) / max(.0001, d.get("other", 0)))
    absol = lambda d: d.get("me", 0) + d.get("other", 0)
    res = dict(sum_file=sum_file, percent_mem=percent_mem, all_proc=len(all_proc))
    res["data"] = [ratio(d) for d in (sum_file, sum_connection, percent_mem, procc)]
    res["data"] += [absol(d) for d in (sum_file, sum_connection, percent_mem, procc)]
    return res
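# The aggregation above leans entirely on Counter arithmetic: unlike
# plain dicts, Counters add key-wise. Illustration (values made up):
#
#     >>> from collections import Counter
#     >>> fds = Counter({"me": 12}) + Counter({"other": 48})
#     >>> fds
#     Counter({'other': 48, 'me': 12})
#     >>> min(1, 1.0 * fds.get("me", 0) / max(.0001, fds.get("other", 0)))
#     0.25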
def proc(ctx, payload, msg):
    """Same collector as above, extended to also report the memory
    share of a few well-known binaries to Carbon."""
    procc = Counter()
    sum_file = Counter()
    sum_connection = Counter()
    percent_mem = Counter()
    carbon_measure = Counter()
    all_proc = list(psutil.process_iter())
    interesting = {'/usr/lib/firefox/firefox', '/opt/google/chrome/chrome',
                   'mysqld', 'mongod', "ping.py", "clock.py", "orchester.py",
                   "proc.py", "master.py", "tracker.py", "cpu.py", "rrd.py",
                   "csvw.py"}
    for x in all_proc:
        try:
            intersect = interesting & set(x.cmdline())
            key = "me" if intersect else "other"
            carbon_key = None
            if intersect:
                # a command line should match at most one known binary
                assert len(intersect) == 1
                carbon_key = intersect.pop()
            procc += Counter({key: 1})
            sum_file += Counter({key: x.num_fds()})
            sum_connection += Counter({key: sum(x.num_ctx_switches())})
            mem = x.memory_percent()
            percent_mem += Counter({key: mem})
            if carbon_key:
                carbon_measure += Counter({carbon_key: mem})
        except psutil.NoSuchProcess:
            # the process died while we were inspecting it
            pass
        except psutil.Error:
            pass
    try:
        send(carbon_measure)
    except Exception as e:
        ctx.log.error("Carbon did not like that: %r" % e)
    ratio = lambda d: min(1, 1.0 * d.get("me", 0) / max(.0001, d.get("other", 0)))
    absol = lambda d: d.get("me", 0) + d.get("other", 0)
    res = dict(sum_file=sum_file, percent_mem=percent_mem, all_proc=len(all_proc))
    res["data"] = [ratio(d) for d in (sum_file, sum_connection, percent_mem, procc)]
    res["data"] += [absol(d) for d in (sum_file, sum_connection, percent_mem, procc)]
    return res
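# send() is defined elsewhere; for context, a minimal sketch of what
# pushing such a Counter to Carbon's plaintext listener could look like.
# The host, port, and metric prefix below are assumptions, not values
# taken from the real configuration.
def _send_sketch(measures, host="127.0.0.1", port=2003, prefix="tracker.mem"):
    import socket
    from time import time as _time
    now = int(_time())
    # Carbon's plaintext protocol: one "<metric> <value> <timestamp>"
    # line per data point; strip paths so ".../chrome/chrome" -> "chrome"
    lines = ["%s.%s %f %d" % (prefix, name.rsplit("/", 1)[-1], value, now)
             for name, value in measures.items()]
    sock = socket.create_connection((host, port), timeout=1)
    try:
        sock.sendall(("\n".join(lines) + "\n").encode("ascii"))
    finally:
        sock.close()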
def event_listener(CNX, config):
    """Processlet responsible for routing and reacting on status changes."""
    D("event listener")
    global error_per_stage, busy_per_stage, message_per_stage
    cnx = CNX
    propagated_to = Counter()

    def on_failure(new):
        log.critical("KO for <%(job_id)r-%(task_id)r> %(arg)r" % new)

    def on_end(new):
        # D("JOB PROCESSED @ STEP %(step)s" % new)
        cleanUp(new)

    def cleanUp(new):
        # q_madd(timer_q, "cancel", new)
        global busy_per_stage
        busy_per_stage -= Counter({new["step"]: 1})

    def on_begin(new):
        global busy_per_stage
        busy_per_stage += Counter({new["step"]: 1})

    def on_timeout(new):
        __on_error(new)

    def on_error(new):
        __on_error(new)

    def __on_error(new):
        global error_per_stage
        cleanUp(new)
        error_per_stage += Counter({new["step"]: 1})
        D("OE %s" % _f(new))
        #### WHY did I put that?
        if int(new.get("retry", 0)) >= config["max_retry"]:
            D("retry %(retry)s for <%(job_id)s-%(task_id)s>" % new)
            new["event"] = "FAILURE"
            on_failure(new)
        else:
            log.critical("unhandled failure for %r" % new)
            ### could also restart failing processes here

    def on_send(new):
        pass

    def on_init(new):
        pass

    def on_propagate(new):
        nonlocal propagated_to
        D("just passing %s" % _f(new))
        propagated_to += Counter({new["next"]: 1})

    D("waiting")
    # events mapped to False are deliberately ignored
    action = dict(
        INIT=False,
        BOUNCE=False,
        SEND=False,
        HAPPY_END=False,
        ACK=False,
        END=on_end,
        BEGIN=on_begin,
        PROPAGATE=on_propagate,
        ERROR=on_error,
        TIMEOUT=on_error,
        O_TIMEOUT=on_failure,
    )
    print("Waiting for socket to be readable (cf. the 100% CPU zmq bug)")
    sleep(1)
    local_in_sox = cnx["tracker_in"]
    print(LOCAL_INFO)
    CLOCK_M = int(config.get("tracker_clock_every", 1000))
    ignore = 0
    check_delay = config.get("check_delay", 3) * 2
    now = time()
    while True:
        new = parse_event(local_in_sox)
        ignore += 1
        ignore %= CLOCK_M
        if not ignore:
            # log only every CLOCK_M-th message to keep the noise down
            log.info("RCV %s" % _f(new))
        if abs(time() - now) > check_delay:
            q_madd(watchdog_q, busy_per_stage)
            log.info("zombie count %r" % busy_per_stage)
            log.info("MPS %r" % message_per_stage)
            log.info("EPS %r" % error_per_stage)
            now = time()
        if new["where"] != LOCAL_INFO["where"]:
            D("NOT FOR ME %s" % _f(new))
            continue
        try:
            message_per_stage += Counter({new["step"]: 1})
            # only one message at a time is treated; not even sure it is needed
            task_id = new["task_id"]
            job_id = new["job_id"]
            if int(new["seq"]) > config.get("max_seq", 50):
                log.warning("<%r> was bounced <%r> times" % (new, new["seq"]))
                continue
            if action[new["event"]]:
                action[new["event"]](new)
        except Exception as e:
            D("MON new is %r" % new)
            log.exception("MON %s" % e)
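# For reference, judging by the keys the listener reads, an incoming
# event is a dict shaped roughly like this (values illustrative;
# parse_event is assumed to deserialize it from the zmq socket):
#
#     {
#         "event": "BEGIN",     # one of the keys of the action table
#         "where": "node-1",    # routing: must match LOCAL_INFO["where"]
#         "step": "cpu.py",     # stage name keyed by the per-stage counters
#         "job_id": 42,
#         "task_id": 7,
#         "seq": 3,             # bounce count, dropped above max_seq
#         "retry": 0,           # retry count, compared to max_retry
#         "arg": None,          # payload, only surfaced in log messages
#         "next": "rrd.py",     # destination stage, used by PROPAGATE
#     }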
def q_madd(q, what):
    q.put(dumps(what))


def q_mget(q):
    return loads(q.get())


D("Started tracker")
CNX = get_connection(CONFIG, LOCAL_INFO)

from pprint import PrettyPrinter as PP
pp = PP(indent=4)
P = pp.pprint
P(dict(CNX))

watchdog_q = Queue()


def watcher(watchdog_q, config):
    """Drain the watchdog queue and hand the busy counters to the
    process tracker at most once per check_delay seconds."""
    proc_tracker = ProcTracker(config)
    delay = config.get("check_delay", 3)
    now = time()
    while True:
        busy_per_stage = q_mget(watchdog_q)
        if abs(time() - now) > delay:
            proc_tracker.watch(busy_per_stage)
            now = time()


ProcWatcher = Process(target=watcher, args=(watchdog_q, CONFIG,))
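# dumps/loads are imported elsewhere; assuming they are pickle's, the
# queue helpers round-trip Counters intact (plain JSON would flatten
# them to dicts):
#
#     >>> from pickle import dumps, loads
#     >>> from collections import Counter
#     >>> loads(dumps(Counter({"cpu.py": 2})))
#     Counter({'cpu.py': 2})
#
# note that multiprocessing.Queue already pickles its items, so the
# explicit dumps/loads mainly pins down the wire format.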
ProcWatcher.start()

# process-wide counters, mutated by event_listener and reported
# periodically to the watchdog queue
busy_per_stage = Counter()
message_per_stage = Counter()
error_per_stage = Counter()