def run(self):
    """Mail on unexpected process exits and on stderr output, forever.

    Waits for supervisor events on ``self.stdin``/``self.stdout`` and
    acknowledges every event it receives.  An unexpected
    ``PROCESS_STATE_EXITED`` event is mailed immediately; a
    ``PROCESS_LOG_STDERR`` event is mailed at most once every 30 seconds
    per process name.  All other events are acknowledged and ignored.
    """
    last_sent = {}  # process name -> time.time() of the last stderr mail
    handled = ('PROCESS_STATE_EXITED', 'PROCESS_LOG_STDERR')
    while True:
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        if headers['eventname'] not in handled:
            childutils.listener.ok(self.stdout)
            continue

        if headers['eventname'] == 'PROCESS_STATE_EXITED':
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if not int(pheaders['expected']):
                msg = ('Process %(processname)s in group %(groupname)s exited '
                       'unexpectedly (pid %(pid)s) from state %(from_state)s'
                       % pheaders)
                subject = ' %s crashed at %s' % (pheaders['processname'],
                                                 childutils.get_asctime())
                self.mail(subject, msg)
            childutils.listener.ok(self.stdout)
            continue

        # Remaining case: PROCESS_LOG_STDERR.
        pheaders, pdata = childutils.eventdata(payload)
        name = pheaders['processname']
        now = time.time()
        if not now - last_sent.get(name, 0) < 30:
            last_sent[name] = now
            subject = (
                'Process %(processname)s in group %(groupname)s wrote to stderr'
                % pheaders)
            self.mail(subject, pdata.strip())
        childutils.listener.ok(self.stdout)
def runforever(self, test=False):
    """Dispatch supervisor events to the node-manager handlers, forever.

    Each iteration sleeps one second via gevent, waits for an event on
    ``self.stdin``/``self.stdout`` (rather than ``sys.*``, so the loop can
    be unit tested), routes it by event-name prefix and acknowledges it.
    The ``test`` argument is not used in this block.
    """
    def event_is(hdrs, prefix):
        # Read the name from the headers on every check, as the handlers
        # receive the same headers object.
        return hdrs['eventname'].startswith(prefix)

    prev_current_time = int(time.time())
    while True:
        gevent.sleep(1)
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if event_is(headers, "PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # Flag value change events.
        if event_is(headers, "PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if event_is(headers, "TICK_60"):
            self.database_periodic()
            prev_current_time = self.event_tick_60(prev_current_time)

        self.listener_nodemgr.ok(self.stdout)
def run(self):
    """Publish every supervisor event on ``self.socket`` until a failure.

    Each event read from ``sys.stdin`` is parsed, logged at debug level,
    sent as ``dumps((headers, pheaders, pdata))`` and then acknowledged
    on ``sys.stdout``.  Any ordinary exception is logged and ends the
    loop; the socket is closed and the context terminated in all cases.

    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed: they
    propagate to the caller after the cleanup has run.
    """
    try:
        while True:
            # Get the next supervisor event.
            headers, payload = childutils.listener.wait(sys.stdin, sys.stdout)
            pheaders, pdata = childutils.eventdata(payload + '\n')
            logging.debug((repr(headers), repr(pheaders), repr(pdata)))

            # Publish the event.
            self.socket.send(dumps((headers, pheaders, pdata)))

            # Tell supervisor the event has been handled.
            childutils.listener.ok(sys.stdout)
    except Exception:
        # Was a bare ``except:``, which also trapped interpreter-exit
        # signals and reported them as a listener failure.
        logging.exception('Event listener blew up')
    finally:
        self.socket.close()
        self.context.term()
def get_process_state_change_msg(self, headers, payload):
    """Report an unexpected process exit to New Relic.

    Pheader example:
    {'from_state': 'RUNNING', 'processname': 'testdaemon1_02',
     'pid': '22042', 'expected': '0', 'groupname': 'testdaemon1'}

    :param headers: supervisor event headers (not used here).
    :param payload: raw event payload; parsed with ``childutils.eventdata``.
    :returns: always ``None``; the event is pushed as a side effect.  Expected
        exits are skipped.  Failures to post are written to syslog, never
        raised.
    """
    pheaders, pdata = childutils.eventdata(payload + '\n')

    # Expected exit codes are not reported.
    if int(pheaders['expected']):
        return None

    # Build the record pushed to New Relic.
    newrelicdata = copy.copy(pheaders)
    newrelicdata['hostname'] = self.hostname
    newrelicdata['hostname_processname'] = '{}_{}'.format(
        self.hostname, pheaders['processname'])
    newrelicdata['eventType'] = self.newrelic_event_type

    try:
        r = requests.post(self.newrelic_url,
                          data=json.dumps(newrelicdata),
                          headers=self.newrelic_headers,
                          timeout=2)
        syslog.syslog('Status code for newrelic: {}'.format(r.status_code))
        if not r.status_code == requests.codes.ok:
            r.raise_for_status()
    except Exception:
        # Was a bare ``except:``; reporting stays best-effort, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        syslog.syslog(
            'Cant send data to newrelic: {}'.format(newrelicdata))
    return None
def process_state_fatal(payload):
    """Log the group named in the event payload and restart that group."""
    event_fields, _data = childutils.eventdata(payload + "\n")
    group = event_fields["groupname"]
    # Restart the whole group, not just the process in the event.
    err("fatal " + group)
    supervisor_group_restart(group)
def runforever(self):
    '''Handle supervisor events in an endless loop.

    A loop is not strictly required: the listener could be configured with
    autorestart=true, exit after one event, and be restarted by supervisord
    to handle the next one.

    Only unexpected PROCESS_STATE_EXITED events raise an alert; every
    event is acknowledged.
    '''
    while 1:
        # wait() announces READY on stdout and blocks until an event
        # arrives; headers and payload are the event's header and body.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        # Events we are not interested in are acknowledged and skipped.
        if not headers['eventname'] == 'PROCESS_STATE_EXITED':
            childutils.listener.ok(self.stdout)
            continue

        pheaders, pdata = childutils.eventdata(payload + '\n')
        # "expected" is 1 for an expected exit and 0 otherwise; expected
        # exits are filtered out here.
        if int(pheaders['expected']):
            childutils.listener.ok(self.stdout)
            continue

        ip = self.get_ip('eth0')
        # Build the alert text.
        msg = "[Host:%s][Process:%s][pid:%s][exited unexpectedly fromstate:%s]" % (
            ip, pheaders['processname'], pheaders['pid'], pheaders['from_state'])
        # Call the alerting tool.  It is an in-house script; replace it with
        # your own.  The message is passed as a separate argument instead of
        # being spliced into a shell string, so quotes or shell
        # metacharacters in it can neither break nor alter the command.
        subprocess.call(["/usr/local/bin/alert.py", "-m", msg])
        # Acknowledge the event and go around again.
        childutils.listener.ok(self.stdout)
def Run(self, test):
    """Forward supervisor events to the registered event handlers, forever.

    Reads from ``self._stdin``/``self._stdout`` (instead of ``sys.*``) so
    the loop can be unit tested.  The ``test`` argument is not used in this
    block.

    Sample PROCESS_STATE payloads:
        eventname:PROCESS_STATE_STARTING processname:cat groupname:cat from_state:STOPPED tries:0
        eventname:PROCESS_STATE_RUNNING processname:cat groupname:cat from_state:STARTING pid:2766
        eventname:PROCESS_STATE_BACKOFF processname:cat groupname:cat from_state:STOPPED tries:0
        eventname:PROCESS_STATE_STOPPING processname:cat groupname:cat from_state:STARTING pid:2766
        eventname:PROCESS_STATE_EXITED processname:cat groupname:cat from_state:RUNNING expected:0 pid:2766
        eventname:PROCESS_STATE_STOPPED processname:cat groupname:cat from_state:STOPPING pid:2766
        eventname:PROCESS_STATE_FATAL processname:cat groupname:cat from_state:BACKOFF
        eventname:PROCESS_STATE_UNKNOWN processname:cat groupname:cat from_state:BACKOFF
    """
    while True:
        headers, payload = self._event_listener.wait(
            self._stdin, self._stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if headers['eventname'].startswith("PROCESS_STATE"):
            process_info = {
                'name': pheaders['processname'],
                'group': pheaders['groupname'],
                'state': headers['eventname'],
            }
            # pid and expected are only present for some states.
            if 'pid' in pheaders:
                process_info['pid'] = pheaders['pid']
            if 'expected' in pheaders:
                process_info['expected'] = int(pheaders['expected'])
            self._event_handlers['PROCESS_STATE'](process_info)
            if self._update_process_list:
                self._event_handlers['PROCESS_LIST_UPDATE']()

        # Flag value change events.
        if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
            # NOTE(review): the key is spelled 'PRCOESS_COMMUNICATION'; it
            # must match whatever key the handlers were registered under.
            self._event_handlers['PRCOESS_COMMUNICATION'](pdata)

        self._event_listener.ok(self._stdout)
def handle_event(payload):
    '''
    Execute the post script when the monitored events happen.

    The group name in the payload must have the form
    ``service--cluster--job``; other groups are ignored.  The script run is
    ``<app_root>/<service>/<cluster>/<job>/post.sh``, where ``app_root``
    comes from the ``[rpcinterface:deployment]`` section of
    ``supervisord.conf`` and falls back to ``DEFAULT_APP_ROOT`` when the
    file, section or option is missing.

    Returns None in every case.
    '''
    pheaders, pdata = childutils.eventdata(payload + '\n')

    name_list = pheaders['groupname'].split('--')
    if len(name_list) != 3:
        return None
    service, cluster, job = name_list

    childutils.pcomm.stderr(childutils.get_asctime() + (
        ' Process %(processname)s '
        'in group %(groupname)s exited from state %(from_state)s. '
        'Now execute the post script.\n' % pheaders))

    base_dir = os.path.dirname(__file__)
    supervisor_config_path = '%s/../supervisord.conf' % base_dir
    if not os.path.exists(supervisor_config_path):
        childutils.pcomm.stderr('Cannot find the config file: supervisord.conf.\n')

    parser = ConfigParser.SafeConfigParser()
    parser.read([supervisor_config_path])

    # Make the deployment package importable, without appending the same
    # entry to sys.path again on every event.
    deployment_dir = '%s/../deployment' % base_dir
    if deployment_dir not in sys.path:
        sys.path.append(deployment_dir)
    from rpcinterface import DEFAULT_APP_ROOT

    # The third positional argument of parser.get() is ``raw``, not a
    # default value, so the intended fallback never took effect and a
    # missing section/option raised.  ``raw=True`` keeps the way an existing
    # value was read before (the old argument was truthy).
    try:
        app_root = parser.get('rpcinterface:deployment', 'app_root', raw=True)
    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
        app_root = DEFAULT_APP_ROOT

    service_root = '%s/%s/%s/%s' % (app_root, service, cluster, job)
    if not os.path.exists('%s/post.sh' % service_root):
        childutils.pcomm.stderr('No post.sh for %s found.\n' % service)
        return None

    cmd = ['/bin/bash', '%s/post.sh' % service_root]
    subprocess.call(cmd)
def runforever(self, test=False):
    """Report the state of watched programs over HTTP, forever.

    Every event is logged to ``self.stderr`` and acknowledged on
    ``self.stdout`` (these are used instead of ``sys.*`` so the loop can be
    unit tested).  For PROCESS* events whose process name is in
    ``self.programs``, the last component of the event name is sent through
    ``self.httpreport``.  Errors while handling an event are logged and do
    not stop the loop.  The ``test`` argument is not used in this block.
    """
    # Endless loop: keep handling events instead of exiting after one.
    while 1:
        headers, payload = childutils.listener.wait(
            self.stdin, self.stdout)
        self.stderr.write("HEADERS: {}\n".format(str(headers)))
        self.stderr.write("PAYLOAD: {}\n".format(str(payload)))

        if headers['eventname'].startswith('PROCESS'):
            self.stderr.write("EVENT: {}\n".format(headers['eventname']))
            try:
                pheaders, pdata = childutils.eventdata(payload + '\n')
                self.stderr.write("PHEADERS: {}\n".format(pheaders))
                if pheaders['processname'] in self.programs:
                    key = "{0}/{1}/{2}/HTTPSTATUS".format(
                        KEY, self.hostname, pheaders['processname'])
                    value = headers['eventname'].split('_')[-1]
                    d = self.httpreport(key, value)
                    self.stderr.write("REPORT STATUS:{} {} {} \n".format(
                        pheaders['processname'], value, str(d)))
            except Exception as e:
                # Terminate the line so the next log entry does not get
                # glued onto the error text.
                self.stderr.write("ERROR: " + str(e) + "\n")

        # Flush on every path (previously skipped on the early exits), then
        # acknowledge the event exactly once.
        self.stderr.flush()
        childutils.listener.ok(self.stdout)
def get_process_state_change_msg(self, headers, payload):
    """Format a one-line summary of a process state change event.

    The result is ``group:process;from_state;eventname;`` followed by
    every payload header rendered as ``key:value`` plus a trailing space.
    """
    pheaders, pdata = childutils.eventdata(payload + '\n')
    rendered_headers = "".join(
        key + ":" + value + " " for key, value in pheaders.items())
    template = "{groupname}:{processname};{from_state};{event};{pheaders_all}"
    return template.format(
        event=headers['eventname'], pheaders_all=rendered_headers, **pheaders)
def process_state_fatal(payload):
    """Log the group named in the event payload and restart that group."""
    event_fields, _data = eventdata(payload + "\n")
    group = event_fields["groupname"]
    # Restart the whole group, not just the process in the event.
    err(f"fatal {group}")
    supervisor_group_restart(group)
def run_forever(self):
    """Record process start/exit events and enforce maximum runtimes.

    For every PROCESS_STATE_RUNNING or PROCESS_STATE_EXITED event an
    ``Event`` row named STARTED, FINISHED (expected exit) or FAILED is
    added and committed.  When a started process is listed in
    ``self.programs``, a timer is armed that sends SIGTERM to its pid after
    the configured duration.  Every event is acknowledged.
    """
    recorded = ('PROCESS_STATE_RUNNING', 'PROCESS_STATE_EXITED')
    while True:
        # self.stdin / self.stdout are used instead of sys.* so this code
        # can be unit tested.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)

        if headers['eventname'] in recorded:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            pid = int(pheaders['pid'])
            event = Event(id=str(uuid4()),
                          ts=int(time.time()),
                          pid=pid,
                          groupname=pheaders['groupname'],
                          processname=pheaders['processname'])

            if headers['eventname'] == 'PROCESS_STATE_RUNNING':
                event.eventname = 'STARTED'
                name = pheaders['processname']
                # Arm a timer to terminate the process if it has a maximum
                # runtime configured.
                if name in self.programs:
                    max_runtime = self.programs[name].total_seconds()
                    Timer(max_runtime, os.kill, [pid, signal.SIGTERM]).start()
            elif int(pheaders['expected']):
                event.eventname = 'FINISHED'
            else:
                event.eventname = 'FAILED'

            # NOTE(review): the session is never closed here - confirm
            # whether Session() needs an explicit close().
            session = Session()
            session.add(event)
            session.commit()

        childutils.listener.ok(self.stdout)
        sys.stderr.flush()
def runforever(self, test=False):
    """Dispatch supervisor events to the node-manager handlers, forever.

    ``self.stdin``/``self.stdout`` are used instead of ``sys.*`` so the
    loop can be unit tested.  The ``test`` argument is not used in this
    block.
    """
    def event_is(hdrs, prefix):
        # Read the name from the headers on every check, as the handlers
        # receive the same headers object.
        return hdrs['eventname'].startswith(prefix)

    self.prev_current_time = int(time.time())
    while True:
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if event_is(headers, "PROCESS_STATE"):
            self.event_process_state(pheaders, headers)
            # Processes can be added to / removed from the node on demand
            # (e.g. the Tor Agent), so refresh the current process list.
            self.update_current_process()

        # Flag value change events.
        if event_is(headers, "PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if event_is(headers, "TICK_60"):
            self.event_tick_60()
            # Loadbalancer processing.
            self.lb_stats.send_loadbalancer_stats()

        self.listener_nodemgr.ok(self.stdout)
def runforever(self, test=False):
    """Dispatch supervisor events to the node-manager handlers, forever.

    Before looping, the previous-Cassandra-status structures are reset.
    ``self.stdin``/``self.stdout`` are used instead of ``sys.*`` so the
    loop can be unit tested.  The ``test`` argument is not used in this
    block.
    """
    def event_is(hdrs, prefix):
        # Read the name from the headers on every check, as the handlers
        # receive the same headers object.
        return hdrs['eventname'].startswith(prefix)

    self.prev_current_time = int(time.time())

    # Initialise the tpstat structures.
    self.cassandra_status_old = CassandraStatusData()
    self.cassandra_status_old.cassandra_compaction_task = CassandraCompactionTask()
    self.cassandra_status_old.thread_pool_stats = []

    while True:
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if event_is(headers, "PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # Flag value change events.
        if event_is(headers, "PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if event_is(headers, "TICK_60"):
            self.cassandra_mgr.database_periodic(self)
            self.event_tick_60()

        self.listener_nodemgr.ok(self.stdout)
def run(self):
    """Publish every supervisor event on ``self.socket`` until a failure.

    Each event read from ``sys.stdin`` is parsed, logged at debug level,
    sent as ``dumps((headers, pheaders, pdata))`` and then acknowledged
    on ``sys.stdout``.  Any ordinary exception is logged and ends the
    loop; the socket is closed and the context terminated in all cases.

    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed: they
    propagate to the caller after the cleanup has run.
    """
    try:
        while True:
            # Get the next supervisor event.
            headers, payload = childutils.listener.wait(
                sys.stdin, sys.stdout)
            pheaders, pdata = childutils.eventdata(payload + '\n')
            logging.debug((repr(headers), repr(pheaders), repr(pdata)))

            # Publish the event.
            self.socket.send(dumps((headers, pheaders, pdata)))

            # Tell supervisor the event has been handled.
            childutils.listener.ok(sys.stdout)
    except Exception:
        # Was a bare ``except:``, which also trapped interpreter-exit
        # signals and reported them as a listener failure.
        logging.exception('Event listener blew up')
    finally:
        self.socket.close()
        self.context.term()
def runforever(self, test=False):
    """Dispatch supervisor events to the node-manager handlers, forever.

    Before looping, the previous-Cassandra-status structures are reset.
    ``self.stdin``/``self.stdout`` are used instead of ``sys.*`` so the
    loop can be unit tested.  The ``test`` argument is not used in this
    block.
    """
    def event_is(hdrs, prefix):
        # Read the name from the headers on every check, as the handlers
        # receive the same headers object.
        return hdrs['eventname'].startswith(prefix)

    self.prev_current_time = int(time.time())

    # Initialise the tpstat structures.
    self.cassandra_status_old = CassandraStatusData()
    self.cassandra_status_old.cassandra_compaction_task = CassandraCompactionTask()
    self.cassandra_status_old.thread_pool_stats = []

    while True:
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if event_is(headers, "PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # Flag value change events.
        if event_is(headers, "PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if event_is(headers, "TICK_60"):
            self.database_periodic()
            self.event_tick_60()

        self.listener_nodemgr.ok(self.stdout)
def runforever(self, test=False):
    """Dispatch supervisor events to the node-manager handlers, forever.

    ``self.stdin``/``self.stdout`` are used instead of ``sys.*`` so the
    loop can be unit tested.  The ``test`` argument is not used in this
    block.
    """
    def event_is(hdrs, prefix):
        # Read the name from the headers on every check, as the handlers
        # receive the same headers object.
        return hdrs['eventname'].startswith(prefix)

    prev_current_time = int(time.time())
    while True:
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if event_is(headers, "PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # Flag value change events.
        if event_is(headers, "PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if event_is(headers, "TICK_60"):
            self.database_periodic()
            prev_current_time = self.event_tick_60(prev_current_time)

            # Perform nodetool repair every cassandra_repair_interval hours.
            repair_period = 60 * self.cassandra_repair_interval
            if self.tick_count % repair_period == 0:
                self.cassandra_repair()

        self.listener_nodemgr.ok(self.stdout)
def runforever(self):
    """Mail a notice for EXITED/FATAL transitions of selected processes.

    An event is mailed when it is PROCESS_STATE_EXITED or
    PROCESS_STATE_FATAL, or when its ``from_state`` is EXITED, provided the
    process is not in ``self.excluded`` and is either in ``self.programs``
    or ``self.any`` is set.  Every event is acknowledged.
    """
    while True:
        # self.stdin / self.stdout / self.stderr are used instead of sys.*
        # so this code can be unit tested.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')
        # Expose the target state (last part of the event name) to the
        # message templates below.
        pheaders['eventname'] = headers['eventname'].split('_')[-1]
        self.stderr.write(str(self.excluded))

        relevant = (headers['eventname'] == 'PROCESS_STATE_EXITED'
                    or pheaders['from_state'] == 'EXITED'
                    or headers['eventname'] == 'PROCESS_STATE_FATAL')
        if not relevant:
            # Not a transition we report.
            childutils.listener.ok(self.stdout)
            continue

        if pheaders['processname'] in self.excluded:
            # Excluded processes are never reported.
            childutils.listener.ok(self.stdout)
            continue

        if not self.any and pheaders['processname'] not in self.programs:
            # Only explicitly requested processes are reported.
            childutils.listener.ok(self.stdout)
            continue

        msg = ('Process %(processname)s, in group %(groupname)s, '
               ' moved to %(eventname)s from state %(from_state)s' % pheaders)
        subject = ' %s %s at %s' % (pheaders['processname'],
                                    pheaders['eventname'],
                                    childutils.get_asctime())
        if self.optionalheader:
            subject = self.optionalheader + ':' + subject

        self.mail(subject, msg)
        childutils.listener.ok(self.stdout)
def runforever(self, test=False):
    """Emit ``self.metric`` for every unexpected process exit.

    Every event is acknowledged.  With ``test`` true the loop handles one
    event and stops, first writing a marker line to ``self.stderr`` when
    the event was ignored.
    """
    while True:
        # self.stdin / self.stdout / self.stderr are used instead of sys.*
        # so this code can be unit tested.
        headers, payload = childutils.listener.wait(
            self.stdin, self.stdout)

        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            # Only exit events are of interest.
            childutils.listener.ok(self.stdout)
            skipped = 'non-exited event\n'
        else:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if int(pheaders['expected']):
                childutils.listener.ok(self.stdout)
                skipped = 'expected exit\n'
            else:
                self.stderr.write('unexpected exit, emitting cloudwatch metric\n')
                self.stderr.flush()
                self.emit_metric(self.metric)
                childutils.listener.ok(self.stdout)
                skipped = None

        if test:
            if skipped is not None:
                self.stderr.write(skipped)
                self.stderr.flush()
            break
def main():
    """Send an SMS to every configured mobile on PROCESS_STATE_FATAL.

    Exits with status 1 when ``SUPERVISOR_SERVER_URL`` is not in the
    environment.  Every event is acknowledged.
    """
    if 'SUPERVISOR_SERVER_URL' not in os.environ:
        write_stderr(
            "fpmonitor listener can only be started by supervisord.\n")
        sys.exit(1)

    while True:
        headers, payload = childutils.listener.wait(sys.stdin, sys.stdout)
        # Only PROCESS_STATE_FATAL events trigger a notification.
        if headers['eventname'] == 'PROCESS_STATE_FATAL':
            pheaders, pdata = childutils.eventdata(payload + '\n')
            for mobile in sfg.MONITOR_ALERT_MOBILE:
                SendSMS(
                    "filepicker",
                    "[filepicker监控]".decode("utf8"),
                    mobile,
                    "[fpmonitor listener]fpmonitor crash.fromstate %s"
                    % pheaders['from_state'].decode("utf8"),
                )
        childutils.listener.ok(sys.stdout)
def main():
    """Send SIGTERM to the parent process once the 'tests' process exits.

    Every event is acknowledged; the loop never returns on its own.
    """
    while True:
        headers, payload = childutils.listener.wait()
        exited = headers['eventname'] == 'PROCESS_STATE_EXITED'
        if exited:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if pheaders['processname'] == 'tests':
                parent_pid = os.getppid()
                os.kill(parent_pid, signal.SIGTERM)
        childutils.listener.ok()
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is ``[group:process](pid) exited unexpectedly`` followed
    by the timestamp formatted from ``self.now``.
    """
    fields, _data = childutils.eventdata(payload + "\n")
    exit_was_expected = int(fields["expected"])
    if exit_was_expected:
        return None

    description = "[%(groupname)s:%(processname)s](%(pid)s) exited unexpectedly" % fields
    timestamp = childutils.get_asctime(self.now)
    return "%s %s" % (description, timestamp)
def get_process_state_change_msg(self, headers, payload):
    """Build the 'stopped unexpectedly' message for a process event.

    The parsed headers, the data and the resulting text are logged at
    debug level along the way.
    """
    logging.debug('at the start of the get_process_state_change_msg.')
    pheaders, pdata = childutils.eventdata(payload + '\n')
    for label, value in (('pheaders = %s', pheaders), ('pdata = %s', pdata)):
        logging.debug(label, value)

    template = ('Process %(groupname)s:%(processname)s (pid %(pid)s) '
                'stopped unexpectedly with a state of %(from_state)s')
    txt = template % pheaders
    logging.debug('The text = %s', txt)
    return txt
def get_process_state_change_msg(self, headers, payload):
    """Return ``[hostname] group:process - eventname`` for an event."""
    pheaders, pdata = childutils.eventdata(payload + '\n')
    template = "[{0}] {groupname}:{processname} - {event}"
    return template.format(
        self.hostname,
        event=headers['eventname'],
        processname=pheaders['processname'],
        groupname=pheaders['groupname'],
    )
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is the timestamp formatted from ``self.now``, `` -- ``,
    then the process description.
    """
    fields, _data = childutils.eventdata(payload + '\n')
    exit_was_expected = int(fields['expected'])
    if exit_was_expected:
        return None

    description = ('Process %(groupname)s:%(processname)s (pid %(pid)s) died '
                   'unexpectedly') % fields
    timestamp = childutils.get_asctime(self.now)
    return '%s -- %s' % (timestamp, description)
def main():
    """Answer every PROCESS_COMMUNICATION event on the sender's stdin.

    Each such event makes the listener send ``'Got it yo\\n'`` to the
    originating process through supervisor's RPC interface.  Every event
    is acknowledged; the loop never returns on its own.
    """
    rpcinterface = childutils.getRPCInterface(os.environ)
    while 1:
        headers, payload = childutils.listener.wait()
        if headers['eventname'].startswith('PROCESS_COMMUNICATION'):
            pheaders, pdata = childutils.eventdata(payload)
            # Supervisor namespecs are "group:process".  The two parts were
            # previously swapped, which only worked when both names were
            # identical.
            pname = '%s:%s' % (pheaders['groupname'], pheaders['processname'])
            rpcinterface.supervisor.sendProcessStdin(pname, 'Got it yo\n')
        childutils.listener.ok()
def runforever(self, test=False):
    """Mail a crash report, with log tails, for unexpected process exits.

    Every event is acknowledged.  With ``test`` true the loop handles one
    event and stops, first writing a marker line to ``self.stderr`` when
    the event was ignored.
    """
    while True:
        # self.stdin / self.stdout / self.stderr are used instead of sys.*
        # so this code can be unit tested.
        headers, payload = childutils.listener.wait(
            self.stdin, self.stdout)

        skipped = None
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            # Only exit events are of interest.
            childutils.listener.ok(self.stdout)
            skipped = 'non-exited event\n'
        else:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if int(pheaders['expected']):
                childutils.listener.ok(self.stdout)
                skipped = 'expected exit\n'
            else:
                msg = ('Process %(processname)s in group %(groupname)s exited '
                       'unexpectedly (pid %(pid)s) from state %(from_state)s'
                       % pheaders)

                # Supervisor injects SUPERVISOR_SERVER_URL into each
                # supervised process, holding the address (tcp, unix
                # socket...) of the supervisor RPC server.
                if os.environ.get('SUPERVISOR_SERVER_URL'):
                    # Append the last lines from both stdout and stderr.
                    group = pheaders['groupname']
                    process = pheaders['processname']
                    stdout_tail = self.tail(group, process, 'stdout', self.logtail)
                    stderr_tail = self.tail(group, process, 'stderr', self.logtail)
                    msg = '%s\n\nLast lines from stdout:\n%s\n\nLast lines from stderr:\n%s' % (
                        msg, stdout_tail, stderr_tail)

                subject = ' %s crashed at %s' % (pheaders['processname'],
                                                 childutils.get_asctime())
                if self.optionalheader:
                    subject = self.optionalheader + ':' + subject

                self.stderr.write('unexpected exit, mailing\n')
                self.stderr.flush()

                self.mail(self.email, subject, msg)
                childutils.listener.ok(self.stdout)

        if test:
            if skipped is not None:
                self.stderr.write(skipped)
                self.stderr.flush()
            break
def handle_process_exited(headers, payload):
    """Print a notice when the exit described by ``payload`` was unexpected.

    ``headers`` is not used.  Each print call is given a single string, so
    the output is the same whether ``print`` is a statement or a function.
    """
    print("handling exited process...")
    fields, _data = childutils.eventdata(payload + '\n')
    unexpected = not int(fields['expected'])
    if unexpected:
        print("process [%s] with pid [%s] exited unexpectedly" % (
            fields['processname'], fields['pid']))
    print("exited process handled")
def main():
    """Answer every PROCESS_COMMUNICATION event on the sender's stdin.

    Each such event makes the listener send ``"Got it yo\\n"`` to the
    originating process through supervisor's RPC interface.  Every event
    is acknowledged; the loop never returns on its own.
    """
    rpcinterface = childutils.getRPCInterface(os.environ)
    while 1:
        headers, payload = childutils.listener.wait()
        if headers["eventname"].startswith("PROCESS_COMMUNICATION"):
            pheaders, pdata = childutils.eventdata(payload)
            # Supervisor namespecs are "group:process".  The two parts were
            # previously swapped, which only worked when both names were
            # identical.
            pname = "%s:%s" % (pheaders["groupname"], pheaders["processname"])
            rpcinterface.supervisor.sendProcessStdin(pname, "Got it yo\n")
        childutils.listener.ok()
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is ``[group:process](pid) exited unexpectedly`` followed
    by the timestamp formatted from ``self.now``.
    """
    fields, _data = childutils.eventdata(payload + '\n')
    exit_was_expected = int(fields['expected'])
    if exit_was_expected:
        return None

    description = '[%(groupname)s:%(processname)s](%(pid)s) exited unexpectedly' % fields
    timestamp = childutils.get_asctime(self.now)
    return '%s %s' % (description, timestamp)
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is ``[group:process](pid) exited unexpectedly`` followed
    by the timestamp formatted from ``self.now``.
    """
    fields, _data = childutils.eventdata(payload + '\n')
    exit_was_expected = int(fields['expected'])
    if exit_was_expected:
        return None

    description = '[%(groupname)s:%(processname)s](%(pid)s) exited unexpectedly' % fields
    timestamp = childutils.get_asctime(self.now)
    return '%s %s' % (description, timestamp)
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is the timestamp formatted from ``self.now``, `` -- ``,
    then the process description.
    """
    fields, _data = childutils.eventdata(payload + '\n')
    exit_was_expected = int(fields['expected'])
    if exit_was_expected:
        return None

    description = ('Process %(groupname)s:%(processname)s (pid %(pid)s) died '
                   'unexpectedly') % fields
    timestamp = childutils.get_asctime(self.now)
    return '%s -- %s' % (timestamp, description)
def main():
    """Start 'jon' once the 'postgresql' process is running, then return.

    All events are acknowledged, including the one that triggers the
    start.  That event was previously left unacknowledged when the loop
    exited.
    """
    while 1:
        headers, payload = childutils.listener.wait()
        if headers['eventname'].startswith('PROCESS_STATE_RUNNING'):
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if pheaders['processname'] == "postgresql":
                os.system("supervisorctl start jon")
                # Acknowledge before leaving so the listener does not end
                # with an event still pending.
                childutils.listener.ok()
                break
        childutils.listener.ok()
def main():
    """Run /root/firstboot.sh once 'postgresql' is running, then return.

    All events are acknowledged, including the one that triggers the
    script.  That event was previously left unacknowledged when the loop
    exited.
    """
    while 1:
        headers, payload = childutils.listener.wait()
        if headers['eventname'].startswith('PROCESS_STATE_RUNNING'):
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if pheaders['processname'] == "postgresql":
                os.system("sh /root/firstboot.sh")
                # Acknowledge before leaving so the listener does not end
                # with an event still pending.
                childutils.listener.ok()
                break
        childutils.listener.ok()
def get_process_state_change_msg(self, headers, payload):
    """Format a code-fenced state-transition notice ending in an emoji.

    The emoji is chosen by ``self.get_emoji`` from the event name.
    """
    pheaders, pdata = childutils.eventdata(payload + '\n')
    target_state = headers['eventname']
    emoji = self.get_emoji(target_state)
    template = ('```Host : [{0}]\nProcess : {processname}\nGroupname : {groupname}\nStatus : '
                '{from_state} => {to_state}``` {emoji}')
    return template.format(
        self.hostname,
        to_state=headers['eventname'],
        emoji=emoji,
        **pheaders)
def runforever(self, test=False):
    """Mail a notice for selected process state changes.

    Reported states are UNKNOWN, RUNNING, BACKOFF, STOPPED, EXITED and
    FATAL; STARTING and STOPPING are deliberately not reported.  Every
    event is acknowledged.  With ``test`` true the loop stops after one
    event.
    """
    # Event name -> (message template, subject template).
    templates = {
        'PROCESS_STATE_UNKNOWN': (
            'Process %(processname)s in group %(groupname)s UNKNOWN from state %(from_state)s',
            ' %s UNKNOWN at %s'),
        'PROCESS_STATE_RUNNING': (
            'Process %(processname)s in group %(groupname)s RUNNING (pid %(pid)s) from state %(from_state)s',
            ' %s RUNNING at %s'),
        'PROCESS_STATE_BACKOFF': (
            'Process %(processname)s in group %(groupname)s BACKOFF from state %(from_state)s',
            ' %s BACKOFF at %s'),
        'PROCESS_STATE_STOPPED': (
            'Process %(processname)s in group %(groupname)s STOPPED from state %(from_state)s',
            ' %s STOPPED at %s'),
        'PROCESS_STATE_EXITED': (
            'Process %(processname)s in group %(groupname)s EXITED unexpectedly (pid %(pid)s) from state %(from_state)s',
            ' %s EXITED at %s'),
        'PROCESS_STATE_FATAL': (
            'Process %(processname)s in group %(groupname)s FATAL from state %(from_state)s',
            ' %s FATAL at %s'),
    }

    while True:
        # self.stdin / self.stdout / self.stderr are used instead of sys.*
        # so this code can be unit tested.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        selected = templates.get(headers['eventname'])
        if selected is None:
            # Not a state we report.
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('non-exited event\n')
                self.stderr.flush()
                break
            continue

        msg_template, subject_template = selected
        msg = msg_template % pheaders
        subject = subject_template % (pheaders['processname'],
                                      childutils.get_asctime())
        if self.optionalheader:
            subject = self.optionalheader + ':' + subject

        self.stderr.write('unexpected exit, mailing\n')
        self.stderr.flush()

        self.mail(self.email, subject, msg)
        childutils.listener.ok(self.stdout)
        if test:
            break
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    For an unexpected exit ``self.add_customized_mail_list`` is called with
    the parsed headers.  The message carries the timestamp formatted from
    ``self.now``, an ``http://<local_ip>:<supervisord_port>`` link and the
    process description.
    """
    fields, _data = childutils.eventdata(payload + '\n')
    exit_was_expected = int(fields['expected'])
    if exit_was_expected:
        return None

    self.add_customized_mail_list(fields)
    description = ('Process %(groupname)s:%(processname)s (pid %(pid)s) died '
                   'unexpectedly') % fields
    timestamp = childutils.get_asctime(self.now)
    return '%s -- http://%s:%d -- %s' % (
        timestamp, self.local_ip, self.supervisord_port, description)
def runforever(self):
    """Store a message and call ``self.run()`` for each unexpected exit.

    Every event is acknowledged; the loop never returns on its own.
    """
    while True:
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        is_exit = headers['eventname'] == 'PROCESS_STATE_EXITED'
        if is_exit:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            unexpected = int(pheaders['expected']) == 0
            if unexpected:
                template = ('Process %(processname)s in group %(groupname)s exited '
                            'unexpectedly (pid %(pid)s) from state %(from_state)s')
                self.message = template % pheaders
                self.run()
        childutils.listener.ok(self.stdout)
def runforever(self, test=False):
    """Dispatch supervisor events and track openstack-nova-compute state.

    ``self.stdin``/``self.stdout`` are used instead of ``sys.*`` so the
    loop can be unit tested.  On every TICK_60 event the nova-compute
    status is read from ``openstack-status``, mapped to a PROCESS_STATE_*
    name and, when it changed, recorded and sent with
    ``send_process_state_db('vrouter_group')``.  The ``test`` argument is
    not used in this block.

    The status text is stripped once before it is mapped and compared.
    Previously an unrecognised status kept its surrounding whitespace for
    the comparison but was stored stripped, so it looked "changed" on every
    tick.
    """
    nova_key = 'openstack-nova-compute'
    status_cmd = "openstack-status | grep openstack-nova-compute | cut -d ':' -f2"
    state_by_status = {
        'active': 'PROCESS_STATE_RUNNING',
        'dead': 'PROCESS_STATE_FATAL',
        'inactive': 'PROCESS_STATE_STOPPED',
    }

    prev_current_time = int(time.time())
    while 1:
        gevent.sleep(1)
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Process state change events.
        if headers['eventname'].startswith("PROCESS_STATE"):
            self.event_process_state(pheaders, headers)
            # Processes can be added to / removed from the node on demand
            # (e.g. the Tor Agent), so refresh the current process list.
            self.update_current_process()

        # Flag value change events.
        if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # Periodic work.
        if headers['eventname'].startswith("TICK_60"):
            os_nova_comp = self.process_state_db[nova_key]
            raw_status, _unused_err = Popen(
                status_cmd, shell=True, stdout=PIPE).communicate()
            status = raw_status.strip()
            # Unrecognised statuses are kept as-is (stripped).
            new_state = state_by_status.get(status, status)

            if os_nova_comp.process_state != new_state:
                os_nova_comp.process_state = new_state
                sys.stderr.write('Openstack Nova Compute status changed to:' +
                                 os_nova_comp.process_state + "\n")
                # Timestamps are recorded in microseconds, as strings.
                if new_state == 'PROCESS_STATE_RUNNING':
                    os_nova_comp.start_time = str(int(time.time() * 1000000))
                    os_nova_comp.start_count += 1
                elif new_state == 'PROCESS_STATE_FATAL':
                    os_nova_comp.exit_time = str(int(time.time() * 1000000))
                    os_nova_comp.exit_count += 1
                elif new_state == 'PROCESS_STATE_STOPPED':
                    os_nova_comp.stop_time = str(int(time.time() * 1000000))
                    os_nova_comp.stop_count += 1
                self.process_state_db[nova_key] = os_nova_comp
                self.send_process_state_db('vrouter_group')
            else:
                sys.stderr.write('Openstack Nova Compute status unchanged at:' +
                                 os_nova_comp.process_state + "\n")
                self.process_state_db[nova_key] = os_nova_comp

            prev_current_time = self.event_tick_60(prev_current_time)

        self.listener_nodemgr.ok(self.stdout)
def get_process_state_change_msg(self, headers, payload):
    """Describe an unexpected exit, or return None for an expected one.

    The message is the timestamp formatted from ``self.now``, `` -- ``,
    then the process description.
    """
    fields, _data = childutils.eventdata(payload + "\n")
    exit_was_expected = int(fields["expected"])
    if exit_was_expected:
        return None

    description = ("Process %(groupname)s:%(processname)s (pid %(pid)s) died "
                   "unexpectedly") % fields
    timestamp = childutils.get_asctime(self.now)
    return "%s -- %s" % (timestamp, description)
def runforever(self, test=False):
    """Mail a crash report, with log tails, for unexpected process exits.

    Every event is acknowledged.  With ``test`` true the loop handles one
    event and stops, first writing a marker line to ``self.stderr`` when
    the event was ignored.
    """
    while True:
        # self.stdin / self.stdout / self.stderr are used instead of sys.*
        # so this code can be unit tested.
        headers, payload = childutils.listener.wait(
            self.stdin, self.stdout)

        skipped = None
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            # Only exit events are of interest.
            childutils.listener.ok(self.stdout)
            skipped = 'non-exited event\n'
        else:
            pheaders, pdata = childutils.eventdata(payload + '\n')
            if int(pheaders['expected']):
                childutils.listener.ok(self.stdout)
                skipped = 'expected exit\n'
            else:
                msg = ('Process %(processname)s in group %(groupname)s exited '
                       'unexpectedly (pid %(pid)s) from state %(from_state)s'
                       % pheaders)

                # Supervisor injects SUPERVISOR_SERVER_URL into each
                # supervised process, holding the address (tcp, unix
                # socket...) of the supervisor RPC server.
                if os.environ.get('SUPERVISOR_SERVER_URL'):
                    # Append the last lines from both stdout and stderr.
                    group = pheaders['groupname']
                    process = pheaders['processname']
                    stdout_tail = self.tail(group, process, 'stdout', self.logtail)
                    stderr_tail = self.tail(group, process, 'stderr', self.logtail)
                    msg = '%s\n\nLast lines from stdout:\n%s\n\nLast lines from stderr:\n%s' % (
                        msg, stdout_tail, stderr_tail)

                subject = ' %s crashed at %s' % (pheaders['processname'],
                                                 childutils.get_asctime())
                if self.optionalheader:
                    subject = self.optionalheader + ':' + subject

                self.stderr.write('unexpected exit, mailing\n')
                self.stderr.flush()

                self.mail(self.email, subject, msg)
                childutils.listener.ok(self.stdout)

        if test:
            if skipped is not None:
                self.stderr.write(skipped)
                self.stderr.flush()
            break
def get_process_state_change_msg(self, headers, payload):
    """Build the notification text for a process that failed to start.

    ``headers`` is accepted for interface compatibility but is not read.
    ``payload`` is parsed with ``childutils.eventdata`` after a trailing
    newline is appended.

    The result is ``"<asctime> -- <text>"`` where the text names the
    process and, when ``self.stderr_lines`` / ``self.stdout_lines`` are
    truthy, is followed by the output of the corresponding
    ``get_last_lines_of_process_*`` helper.
    """
    event_fields, _ = childutils.eventdata(payload + "\n")

    pieces = [
        "Process %(groupname)s:%(processname)s failed to start too many "
        "times\n" % event_fields
    ]
    if self.stderr_lines:
        pieces.append(
            get_last_lines_of_process_stderr(event_fields, self.stderr_lines)
        )
    if self.stdout_lines:
        pieces.append(
            get_last_lines_of_process_stdout(event_fields, self.stdout_lines)
        )

    body = "".join(pieces)
    return "%s -- %s" % (childutils.get_asctime(self.now), body)
def get_process_state_change_msg(self, headers, payload):
    """Build the notification text for an unexpected process exit.

    ``headers`` is accepted for interface compatibility but is not read.
    ``payload`` is parsed with ``childutils.eventdata`` after a trailing
    newline is appended.

    Returns ``None`` when the payload's ``expected`` field is non-zero.
    Otherwise returns ``"<asctime> -- <text>"``; the text names the dead
    process and, when ``self.stderr_lines`` / ``self.stdout_lines`` are
    truthy, is followed by the output of the corresponding
    ``get_last_lines_of_process_*`` helper.
    """
    event_fields, _ = childutils.eventdata(payload + '\n')

    # Expected exits are not reported.
    if int(event_fields['expected']):
        return None

    pieces = [
        'Process %(groupname)s:%(processname)s (pid %(pid)s) died '
        'unexpectedly\n' % event_fields
    ]
    if self.stderr_lines:
        pieces.append(
            get_last_lines_of_process_stderr(event_fields, self.stderr_lines))
    if self.stdout_lines:
        pieces.append(
            get_last_lines_of_process_stdout(event_fields, self.stdout_lines))

    body = ''.join(pieces)
    return '%s -- %s' % (childutils.get_asctime(self.now), body)
def runforever(self, test=False):
    """Listen for supervisor events and send a mail on unexpected exits.

    Events other than PROCESS_STATE_EXITED, and exits flagged as expected,
    are acknowledged and ignored.  For an unexpected exit a subject and
    message are built, the external ``/usr/bin/sendemail`` command is run
    through ``os.system`` and the command line is recorded in
    ``/tmp/tlog`` (the file is truncated on every mail).

    When ``test`` is true the loop stops after handling a single event and
    writes a short marker to ``self.stderr`` for the ignored cases.
    """
    while 1:
        # we explicitly use self.stdin, self.stdout, and self.stderr
        # instead of sys.* so we can unit test this code
        headers, payload = childutils.listener.wait(
            self.stdin, self.stdout)

        if not headers['eventname'] == 'PROCESS_STATE_EXITED':
            # only process-exit events are handled
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('non-exited event\n')
                self.stderr.flush()
                break
            continue

        pheaders, pdata = childutils.eventdata(payload + '\n')

        if int(pheaders['expected']):
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('expected exit\n')
                self.stderr.flush()
                break
            continue

        msg = ('Process %(processname)s in group %(groupname)s exited '
               'unexpectedly (pid %(pid)s) from state %(from_state)s'
               % pheaders)

        subject = ' %s crashed at %s' % (pheaders['processname'],
                                         childutils.get_asctime())
        if self.optionalheader:
            subject = self.optionalheader + ':' + subject

        self.stderr.write('unexpected exit, mailing\n')
        self.stderr.flush()

        # NOTE(review): the SMTP credentials are hard-coded and the command
        # is assembled as a shell string, so a double quote in subject/msg
        # would break it -- consider moving both to configuration and an
        # argument list.
        sendmyemail = '/usr/bin/sendemail -f [email protected] -t [email protected] [email protected] -u "%s" -m "%s" -s smtp.gmail.com -o tls=yes -xu [email protected] -xp dc45970f' % (
            subject, msg)

        # The context manager guarantees the debug log is closed even if
        # running the command or writing to the log raises.
        with open('/tmp/tlog', 'w') as f1:
            f1.write('YO START')
            os.system(sendmyemail)
            f1.write(sendmyemail)

        childutils.listener.ok(self.stdout)
        if test:
            break
def runforever(self, test=False):
    """Listen for supervisor events and mail an alert on unexpected exits.

    Only PROCESS_STATE_EXITED events whose ``expected`` field is zero
    trigger a mail; every event is acknowledged.  When ``test`` is true
    the received headers and payload are echoed to ``self.stderr``.
    """
    # Loop forever so that events are handled one after another.
    # An alternative would be to drop the loop, configure the listener with
    # autorestart=true and let it exit after each event so that supervisord
    # restarts it for the next one.
    while True:
        # wait() announces "READY" on stdout and then blocks until an
        # event arrives; it returns the event's header and body.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)

        if test:
            self.stderr.write(str(headers) + '\n')
            self.stderr.write(payload + '\n')
            self.stderr.flush()

        # Not an event type we care about: reply "RESULT\nOK" on stdout
        # and skip the rest of this iteration.
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            childutils.listener.ok(self.stdout)
            continue

        # Parse the payload; only the header part is used here.  The data
        # part is only present for event types such as PROCESS_LOG_STDERR
        # and PROCESS_COMMUNICATION_STDOUT.
        pheaders, _ = childutils.eventdata(payload + '\n')

        # Drop expected exits and handle only unexpected ones.  expected is
        # 1 when the program's exit code matches the configured exitcodes,
        # otherwise 0.
        if int(pheaders['expected']):
            childutils.listener.ok(self.stdout)
            continue

        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)

        # Alert body.
        details = (hostname, ip, pheaders['processname'],
                   pheaders['pid'], pheaders['from_state'])
        message = ("Host: %s(%s)\nProcess: %s\nPID: %s\n"
                   "EXITED unexpectedly from state: %s" % details)

        # Alert subject.
        subject = ' %s crashed at %s' % (pheaders['processname'],
                                         childutils.get_asctime())

        # Note on stderr that a mail is about to be sent.
        self.stderr.write('unexpected exit, mailing\n')
        self.stderr.flush()

        # Send the alert mail.
        self.mail(subject, message)

        # Reply "RESULT\nOK" on stdout and move on to the next event.
        childutils.listener.ok(self.stdout)
def runforever(self, test=False):
    """Listen for supervisor events and send a mail on unexpected exits.

    Events other than PROCESS_STATE_EXITED, and exits flagged as expected,
    are acknowledged and ignored.  For an unexpected exit a subject and
    message are built, the external ``/usr/bin/sendemail`` command is run
    through ``os.system`` and the command line is recorded in
    ``/tmp/tlog`` (the file is truncated on every mail).

    When ``test`` is true the loop stops after handling a single event and
    writes a short marker to ``self.stderr`` for the ignored cases.
    """
    while 1:
        # we explicitly use self.stdin, self.stdout, and self.stderr
        # instead of sys.* so we can unit test this code
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)

        if not headers['eventname'] == 'PROCESS_STATE_EXITED':
            # only process-exit events are handled
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('non-exited event\n')
                self.stderr.flush()
                break
            continue

        pheaders, pdata = childutils.eventdata(payload+'\n')

        if int(pheaders['expected']):
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('expected exit\n')
                self.stderr.flush()
                break
            continue

        msg = ('Process %(processname)s in group %(groupname)s exited '
               'unexpectedly (pid %(pid)s) from state %(from_state)s'
               % pheaders)

        subject = ' %s crashed at %s' % (pheaders['processname'],
                                         childutils.get_asctime())
        if self.optionalheader:
            subject = self.optionalheader + ':' + subject

        self.stderr.write('unexpected exit, mailing\n')
        self.stderr.flush()

        # NOTE(review): the SMTP credentials are hard-coded and the command
        # is assembled as a shell string, so a double quote in subject/msg
        # would break it -- consider moving both to configuration and an
        # argument list.
        sendmyemail = '/usr/bin/sendemail -f [email protected] -t [email protected] [email protected] -u "%s" -m "%s" -s smtp.gmail.com -o tls=yes -xu [email protected] -xp dc45970f' % (subject, msg)

        # The context manager guarantees the debug log is closed even if
        # running the command or writing to the log raises.
        with open('/tmp/tlog', 'w') as f1:
            f1.write('YO START')
            os.system(sendmyemail)
            f1.write(sendmyemail)

        childutils.listener.ok(self.stdout)
        if test:
            break
def get_process_state_change_msg(self, headers, payload):
    """Build the notification line for a process state change.

    ``headers`` must contain ``eventname``; its third ``_``-separated
    component (e.g. ``EXITED`` in ``PROCESS_STATE_EXITED``) is reported as
    the state.  ``payload`` is parsed with ``childutils.eventdata`` after
    a trailing newline is appended.

    Returns ``None`` when the payload carries a non-zero ``expected``
    field or when the process belongs to the ``crashmail`` group;
    otherwise returns ``"<asctime> -- Process <group>:<name> is in <state>
    state"`` with the timestamp derived from ``self.now``.
    """
    pheaders, pdata = childutils.eventdata(payload + '\n')
    pheaders['eventname'] = headers['eventname'].split('_')[2]

    try:
        if int(pheaders['expected']):
            return None
    except (KeyError, ValueError):
        # The 'expected' field is absent or not numeric for this event;
        # treat it as reportable.  Only these two errors are ignored so
        # that KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    # Events about the 'crashmail' group itself are not reported.
    if pheaders['groupname'] == 'crashmail':
        return None

    txt = ('Process %(groupname)s:%(processname)s is in '
           '%(eventname)s state' % pheaders)
    return '%s -- %s' % (childutils.get_asctime(self.now), txt)
def runforever(self, test=False):
    """Dispatch supervisor events and poll nova-compute once per TICK_60.

    Each iteration sleeps one second (``gevent.sleep``), waits for an
    event on ``self.listener_nodemgr`` and dispatches it by event-name
    prefix:

    * ``PROCESS_STATE*``          -> ``self.event_process_state``
    * ``PROCESS_COMMUNICATION*``  -> ``self.event_process_communication``
    * ``TICK_60*``                -> poll ``openstack-status`` for the
      openstack-nova-compute service, update its entry in
      ``self.process_state_db`` and call ``self.event_tick_60``.

    ``test`` is accepted for interface compatibility but is not read.
    The loop never returns.
    """
    # openstack-status wording -> state name stored in process_state_db.
    nova_state_names = {
        'active': 'PROCESS_STATE_RUNNING',
        'dead': 'PROCESS_STATE_FATAL',
        'inactive': 'PROCESS_STATE_STOPPED',
    }

    prev_current_time = int(time.time())
    while 1:
        gevent.sleep(1)
        # we explicitly use self.stdin, self.stdout, and self.stderr
        # instead of sys.* so we can unit test this code
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + '\n')

        # check for process state change events
        if headers['eventname'].startswith("PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # check for flag value change events
        if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # do periodic events
        if headers['eventname'].startswith("TICK_60"):
            os_nova_comp = self.process_state_db['openstack-nova-compute']
            raw_status, _ = Popen(
                "openstack-status | grep openstack-nova-compute | cut -d ':' -f2",
                shell=True, stdout=PIPE).communicate()

            # Strip once, before mapping and comparing.  The stored state is
            # always stripped, so comparing it with the raw command output
            # (which carries surrounding whitespace) would report a change
            # on every tick for any status that is not in the table.
            status = raw_status.strip()
            os_nova_comp_state = nova_state_names.get(status, status)

            if os_nova_comp.process_state != os_nova_comp_state:
                os_nova_comp.process_state = os_nova_comp_state
                sys.stderr.write('Openstack Nova Compute status changed to:' +
                                 os_nova_comp.process_state + "\n")
                # Timestamps are recorded in microseconds, as strings.
                if os_nova_comp.process_state == 'PROCESS_STATE_RUNNING':
                    os_nova_comp.start_time = str(int(time.time()*1000000))
                    os_nova_comp.start_count += 1
                elif os_nova_comp.process_state == 'PROCESS_STATE_FATAL':
                    os_nova_comp.exit_time = str(int(time.time()*1000000))
                    os_nova_comp.exit_count += 1
                elif os_nova_comp.process_state == 'PROCESS_STATE_STOPPED':
                    os_nova_comp.stop_time = str(int(time.time()*1000000))
                    os_nova_comp.stop_count += 1
                self.process_state_db['openstack-nova-compute'] = os_nova_comp
                self.send_process_state_db('vrouter_group')
            else:
                sys.stderr.write('Openstack Nova Compute status unchanged at:' +
                                 os_nova_comp.process_state + "\n")
            self.process_state_db['openstack-nova-compute'] = os_nova_comp
            prev_current_time = self.event_tick_60(prev_current_time)

        self.listener_nodemgr.ok(self.stdout)
def write_log(headers, payload):
    """Append a record of a PROCESS_STATE_* event to ``/tmp/log.txt``.

    ``headers`` must contain ``eventname``; events whose name does not
    start with ``PROCESS_STATE_`` are ignored and nothing is written.
    ``payload`` is parsed with ``childutils.eventdata`` after a trailing
    newline is appended; the parsed fields must provide ``processname``,
    ``groupname``, ``pid`` and ``from_state`` for the message template.

    Returns ``None``.
    """
    if not headers['eventname'].startswith('PROCESS_STATE_'):
        return

    # The context manager closes the log even if parsing or formatting
    # raises part-way through.
    with open('/tmp/log.txt', 'a') as f:
        f.write(str(headers) + '\n\n')

        pheaders, pdata = childutils.eventdata(payload + '\n')
        pheaders['dt'] = datetime.now()
        msg = ('[{dt}]Process {processname} in group {groupname} exited '
               'unexpectedly (pid {pid}) from state {from_state}\n').format(
                   **pheaders)
        f.write(msg)
def main():
    """Terminate supervisord when the watched process exits unexpectedly.

    Every event is acknowledged immediately.  When a PROCESS_STATE_EXITED
    event reports that ``PROCESS_NAME`` exited with ``expected`` equal to
    ``'0'``, a marker file ``/kill-supervisor`` is created and SIGTERM is
    sent to the parent process.
    """
    while True:
        headers, payload = childutils.listener.wait()
        # Acknowledge before inspecting the event.
        childutils.listener.ok()

        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            continue

        event_fields, _ = childutils.eventdata(payload + '\n')
        is_watched = event_fields['processname'] == PROCESS_NAME
        if not (is_watched and event_fields['expected'] == '0'):
            continue

        print('Process %s failed, killing supervisord...' % PROCESS_NAME,
              file=sys.stderr)
        # touch /kill-supervisor to tell wrapper script to exit uncleanly
        with open('/kill-supervisor', 'w'):
            pass
        os.kill(os.getppid(), signal.SIGTERM)
def runforever(self, test=False):
    """Listen for supervisor events and mail a report on unexpected exits.

    Events other than PROCESS_STATE_EXITED, and exits flagged as expected,
    are acknowledged and ignored.  For an unexpected exit the message is
    extended with the output of ``get_last_lines_of_process_stderr`` /
    ``get_last_lines_of_process_stdout`` when ``self.stderr_lines`` /
    ``self.stdout_lines`` are truthy, then mailed through ``self.mail``.

    When ``test`` is true the loop stops after handling a single event and
    writes a short marker to ``self.stderr`` for the ignored cases.
    """
    while True:
        # self.stdin / self.stdout / self.stderr are used rather than the
        # sys.* streams so the loop can be driven from unit tests.
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)

        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            # Only process-exit events are of interest here.
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('non-exited event\n')
                self.stderr.flush()
                break
            continue

        pheaders, _ = childutils.eventdata(payload + '\n')

        if int(pheaders['expected']):
            childutils.listener.ok(self.stdout)
            if test:
                self.stderr.write('expected exit\n')
                self.stderr.flush()
                break
            continue

        sections = [
            'Process %(processname)s in group %(groupname)s exited '
            'unexpectedly (pid %(pid)s) from state %(from_state)s\n\n'
            % pheaders
        ]
        if self.stderr_lines:
            sections.append(
                get_last_lines_of_process_stderr(pheaders, self.stderr_lines))
        if self.stdout_lines:
            sections.append(
                get_last_lines_of_process_stdout(pheaders, self.stdout_lines))
        msg = ''.join(sections)

        subject = ' %s crashed at %s' % (pheaders['processname'],
                                         childutils.get_asctime())
        if self.optionalheader:
            subject = self.optionalheader + ':' + subject

        self.stderr.write('unexpected exit, mailing\n')
        self.stderr.flush()

        self.mail(self.email, subject, msg)

        childutils.listener.ok(self.stdout)
        if test:
            break
def runforever(self, test=False):
    """Dispatch supervisor events to the matching handler, forever.

    Records the start time in ``self.prev_current_time`` and then, for
    each event received on ``self.listener_nodemgr``, dispatches by
    event-name prefix:

    * ``PROCESS_STATE*``          -> ``self.event_process_state``
    * ``PROCESS_COMMUNICATION*``  -> ``self.event_process_communication``
    * ``TICK_60*``                -> ``self.do_periodic_events``

    Every event is acknowledged afterwards.  ``test`` is accepted for
    interface compatibility but is not read.
    """
    self.prev_current_time = int(time.time())

    while True:
        # self.stdin / self.stdout are used rather than the sys.* streams
        # so the loop can be driven from unit tests.
        headers, payload = self.listener_nodemgr.wait(self.stdin, self.stdout)
        pheaders, pdata = childutils.eventdata(payload + "\n")
        event_name = headers["eventname"]

        # process state change events
        if event_name.startswith("PROCESS_STATE"):
            self.event_process_state(pheaders, headers)

        # flag value change events
        if event_name.startswith("PROCESS_COMMUNICATION"):
            self.event_process_communication(pdata)

        # periodic events
        if event_name.startswith("TICK_60"):
            self.do_periodic_events()

        self.listener_nodemgr.ok(self.stdout)
def run(self):
    """Log every supervisor event whose process is not excluded.

    For each event received on ``self.stdin`` the payload is parsed with
    ``childutils.eventdata`` (a trailing newline is appended when
    missing).  When the payload names a process and that name matches any
    regular expression in ``self.excludes`` (``re.match``, i.e. anchored
    at the start), the event is silently acknowledged; otherwise the
    headers and payload are logged at error level before acknowledging.
    The loop never returns.
    """
    logger.info('running')
    while True:
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        if not payload.endswith('\n'):
            payload += '\n'
        pheaders, _ = childutils.eventdata(payload)

        process_name = pheaders.get('processname')
        # any() stops at the first matching pattern instead of building a
        # throwaway list of all matches.
        excluded = bool(
            process_name
            and self.excludes
            and any(re.match(pattern, process_name)
                    for pattern in self.excludes)
        )
        if not excluded:
            logger.error('%s\n%s', headers, payload)

        childutils.listener.ok(self.stdout)
def runforever(self):
    """Listen for events emitted by supervisord and forward notifications.

    Events whose name is not in ``self.target_event_name_list`` are
    acknowledged and ignored, as are events whose payload carries a
    non-zero ``expected`` field.  Every other event is summarised in a
    dictionary and handed to ``self.notify`` before being acknowledged.
    The loop never returns.
    """
    while True:
        headers, payload = childutils.listener.wait(self.stdin, self.stdout)
        event_name = headers['eventname']

        if event_name not in self.target_event_name_list:
            # Not one of the events we were asked to watch.
            childutils.listener.ok(self.stdout)
            continue

        pheaders, pdata = childutils.eventdata(payload + '\n')

        # Only some event types (e.g. PROCESS_STATE_EXITED) carry an
        # 'expected' field: 1 means expected, 0 means unexpected.  Expected
        # exits are ignored.
        expected_flag = pheaders.get('expected')
        if expected_flag and int(expected_flag):
            childutils.listener.ok(self.stdout)
            continue

        self.stderr.write('{} happened, notification\n'.format(event_name))
        self.stderr.flush()

        # The payload headers differ between event types, so collect the
        # common ones into a general dictionary (missing ones become None).
        event = {
            field: pheaders.get(field)
            for field in ('processname', 'groupname', 'pid', 'from_state')
        }
        event['happened_at'] = childutils.get_asctime()
        event['data'] = pdata

        self.notify(event)

        # The listener's work for this event is done; acknowledge it.
        childutils.listener.ok(self.stdout)