def post(self):
    """
    Handle a POST request for the aggregated status of several services.

    Reads the ``service_id`` request argument as a comma-separated list,
    keeps the entries that are decimal integers, and asks
    ``Controller.status_holder.get_group_status`` for their status.

    Exactly one JSON object is written to the response:
    ``{"code": 0, "content": <result>}`` on success, or
    ``{"code": -1, "content": {}}`` when ``service_id`` is empty or the
    status holder returns ``None``.

    :return: None
    """
    Logger.info(json.dumps(self.request.arguments, ensure_ascii=False), self.request.uri)

    a_dict = {'code': -1, 'content': dict()}

    # Get service id
    service_id_string = self.get_argument("service_id")
    if not service_id_string:
        self.write(json.dumps(a_dict, ensure_ascii=False))
        # Stop here: falling through would append a second JSON document
        # to the same response body.
        return

    # Query status data. Each token is stripped so that "1, 2" is read as
    # {1, 2}; tokens that are not plain digits are ignored.
    service_id_set = set()
    for raw_service_id in service_id_string.split(','):
        raw_service_id = raw_service_id.strip()
        if raw_service_id.isdigit():
            service_id_set.add(int(raw_service_id))

    result = Controller.status_holder.get_group_status(service_id_set)
    if result is None:
        self.write(json.dumps(a_dict, ensure_ascii=False))
        return

    a_dict['code'] = 0
    a_dict['content'] = result
    self.write(json.dumps(a_dict, ensure_ascii=False))
def __main__():
    """
    Process entry point: set up logging, start the workers and serve HTTP.

    Steps, in order: force the default string encoding to utf-8, initialise
    ``Logger`` from the ``config.monitor_log_*`` settings, install
    ``Controller.master_signal_handler`` for SIGTERM, start the worker
    processes, then build the Tornado application (two API routes), listen
    on ``config.server_listen_port`` and run the I/O loop.

    NOTE(review): ``Defer`` is defined elsewhere; presumably it calls
    ``Logger.close`` when the ``with`` block exits -- confirm.

    :rtype: bool | None
    :return: False if the logger cannot be initialised; otherwise None once
        the I/O loop has stopped.
    """
    # Set the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Initialise logging
    logger_ready = Logger.init(
        config.monitor_log_env,
        config.monitor_log_target,
        config.monitor_log_name,
        config.monitor_log_size,
        config.monitor_log_count,
        multiprocess=True
    )
    if not logger_ready:
        print('init logger failed')
        return False

    with Defer(Logger.close):
        # Start worker processes; the SIGTERM handler is installed first
        signal.signal(signal.SIGTERM, Controller.master_signal_handler)
        Controller.start_worker_process()

        print("server is starting...")
        Logger.info("server is starting...")
        Logger.info("config.server_listen_port: %s" % config.server_listen_port)

        routes = [
            (r'/api_get_detail_status', ApiGetDetailStatus),
            (r'/api_monitor_status', ApiMonitorStatus),
        ]
        app = tornado.web.Application(routes, xsrf_cookies=False, debug=config.server_debug_mode)

        # Listen on the configured port
        app.listen(config.server_listen_port)

        # Keep a handle on the loop so the signal handler can stop it
        io_loop = tornado.ioloop.IOLoop.current()
        Controller.instance = io_loop
        io_loop.start()
def __load_jobs(self):
    """
    Reload all job definitions through ``self.cur`` and publish them.

    Joins ``services`` with ``machines``, builds one ``JobDetail`` per row,
    lets ``__load_checks`` and ``__load_relies`` attach checks and
    dependencies, and finally replaces ``self.job_list`` while holding
    ``self.lock``. ``self.job_list`` is left untouched when any step fails.

    :rtype: bool | None
    :return: True on success; None when a helper reports failure or an
        exception was raised (the traceback is logged).
    """
    try:
        sql = (
            "SELECT services.id, services.service_name, machines.ssh_user, machines.ssh_ip, machines.ssh_port,"
            "services.start_cmd, services.stop_cmd, services.is_active, services.auto_recover, services.mail_receiver "
            "FROM services,machines WHERE services.machine_id = machines.id"
        )
        Logger.info(sql)
        self.cur.execute(sql)

        a_dict = dict()
        for row in self.cur.fetchall():
            (job_id, service_name, ssh_user, ssh_ip, ssh_port, start_cmd,
             stop_cmd, is_active, auto_recover, mail_receiver) = row
            a_dict[job_id] = JobDetail(job_id, service_name, ssh_user, ssh_ip, ssh_port, start_cmd,
                                       stop_cmd, is_active, auto_recover, mail_receiver)

        if not self.__load_checks(a_dict):
            return None
        if not self.__load_relies(a_dict):
            return None

        # Swap in the freshly built list only after everything loaded.
        with self.lock:
            self.job_list = list(a_dict.values())
        return True
    except Exception:
        # Not a bare ``except:`` -- KeyboardInterrupt / SystemExit must
        # still be able to stop the process.
        Logger.error(traceback.format_exc())
        return None
def post(self):
    """
    Handle a POST request for the detailed status of a single service.

    Reads the ``service_id`` request argument and looks the service up with
    ``Controller.status_holder.get_one_full_status``.

    Exactly one JSON object is written to the response. On success it holds
    ``code`` 0, the service ``healthy_code``, the per-command
    ``command_healthy_code`` and ``monitor_time`` formatted as
    ``YYYY-MM-DD HH:MM:SS`` (local time). When ``service_id`` is empty or
    not a decimal integer, or the lookup returns ``None``, the defaults are
    written instead: ``code`` -1, ``StatusCode.WHITE_CODE``, an empty dict
    and ``1970-01-01 00:00:00``.

    :return: None
    """
    Logger.info(json.dumps(self.request.arguments, ensure_ascii=False), self.request.uri)

    # Default (failure) payload
    a_dict = {
        'code': -1,
        'healthy_code': StatusCode.WHITE_CODE,
        'command_healthy_code': dict(),
        'monitor_time': '1970-01-01 00:00:00',
    }

    # Get service id
    service_id = self.get_argument("service_id")
    if not service_id or not service_id.isdigit():
        self.write(json.dumps(a_dict, ensure_ascii=False))
        # Stop here: int() below would raise on a non-numeric id after the
        # error body has already been written.
        return

    # Query status data
    result = Controller.status_holder.get_one_full_status(int(service_id))
    if result is None:
        self.write(json.dumps(a_dict, ensure_ascii=False))
        # Stop here: unpacking None below would raise.
        return

    # Return status data
    healthy_code, cmd_status, last_t = result
    a_dict['code'] = 0
    a_dict['healthy_code'] = healthy_code
    a_dict['command_healthy_code'] = cmd_status
    a_dict['monitor_time'] = datetime.datetime.fromtimestamp(last_t).strftime('%Y-%m-%d %H:%M:%S')
    self.write(json.dumps(a_dict, ensure_ascii=False))
def send(cls, receivers, subject, content):
    """
    Send a notification mail through the HTTP mail gateway.

    When ``config.fake_mail`` is truthy nothing is sent; the message is
    only logged. Otherwise the mail is POSTed to the gateway with the
    current local timestamp prefixed to ``content``. Failures never
    propagate to the caller: they are logged with a traceback.

    :type receivers: string
    :type subject: string
    :type content: string
    :return: None
    """
    try:
        if config.fake_mail:
            Logger.info('receivers=[%s], subject=[%s], content=[%s]' % (receivers, subject, content))
            return

        url = 'http://f**k.you.com/send_mail'
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        a_dict = {
            'receiver': receivers,
            'subject': subject,
            'content': timestamp + ": " + content
        }
        # Without a timeout an unresponsive gateway would block the caller
        # forever; a timeout raises and is logged below instead.
        ret = requests.post(url, data=a_dict, timeout=10)
        Logger.info("http_code[%s], http_response[%s]" % (ret.status_code, ret.text))
    except Exception:
        # Not a bare ``except:`` -- KeyboardInterrupt / SystemExit must
        # still be able to stop the process.
        Logger.error(traceback.format_exc())
def lod_job_from_mysql(job_loader):
    """
    Reload the jobs once, then reschedule itself in a new greenlet.

    Calls ``job_loader.load_job()``, sleeps one second and spawns
    ``JobLoader.lod_job_from_mysql`` again. A failing reload is logged with
    its traceback and does NOT break the chain: the next reload is still
    scheduled, so a transient error cannot stop reloading for good.

    :type job_loader: JobLoader
    :return: None
    """
    try:
        Logger.info("into lod job from mysql")
        job_loader.load_job()
    except Exception:
        # Local import: this block cannot see whether the module already
        # imports traceback. ``e.message`` is avoided because it is empty
        # for exceptions built with several arguments.
        import traceback
        Logger.error(traceback.format_exc())

    # Scheduled outside the try so that one bad reload does not end the loop.
    gevent.sleep(1)
    gevent.spawn(JobLoader.lod_job_from_mysql, job_loader)
def start(self):
    """
    Start the service by running its start command through ``self.client``.

    Success only means the command was executed without raising; its
    stdout/stderr are logged but not inspected.

    :rtype: bool | None
    :return: True if the command was executed, None if executing it raised
        (the traceback is logged).
    """
    try:
        std_out, std_err = self.client.execute(self.__start_cmd)
        Logger.info(
            "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
            (self.__id, self.__ssh_ip, self.__start_cmd, std_out, std_err))
        return True
    except Exception:
        # Local import: this block cannot see whether the module already
        # imports traceback. ``e.message`` is avoided because it is empty
        # for multi-argument exceptions such as socket errors.
        import traceback
        Logger.error(traceback.format_exc())
        return None
def stop(self):
    """
    Stop the service by running its stop command through ``self.client``.

    Success only means the command was executed without raising; whether
    the service is really down is not verified here.

    :rtype: bool | None
    :return: True if the command was executed, None if executing it raised
        (the traceback is logged).
    """
    try:
        std_out, std_err = self.client.execute(self.__stop_cmd)
        Logger.info(
            "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
            (self.__id, self.__ssh_ip, self.__stop_cmd, std_out, std_err))
        return True
    except Exception:
        # Local import: this block cannot see whether the module already
        # imports traceback. ``e.message`` is avoided because it is empty
        # for multi-argument exceptions such as socket errors.
        import traceback
        Logger.error(traceback.format_exc())
        return None
def do_job(self):
    """
    Run one monitoring pass for ``self.a_job`` and try to recover it.

    Flow:

    1. Run all checks and record the result in ``self.status_holder``.
    2. GREEN: nothing more to do.
    3. WHITE / YELLOW (monitoring itself failed): log, mail, stop.
    4. Otherwise, if auto-recover is enabled: stop the service, make sure
       everything it relies on is healthy, start it again and mail the
       receiver on success.

    Every early exit is logged. Exceptions never propagate; they are
    logged with a traceback.

    :return: None
    """
    try:
        job_id = self.a_job.get_id()
        service_name = self.a_job.get_service_name()
        healthy_code = self.a_job.do_all_check()

        # Refresh job status
        Logger.report('job_id[%s] [%s] is healthy_code[%s]' % (job_id, service_name, healthy_code))
        self.status_holder.set_one_status(job_id, healthy_code, self.a_job.get_check_cmd_healthy_code())

        # Success. Compared with ``==``: ``is`` tests object identity and
        # only works by accident for interned values.
        if healthy_code == StatusCode.GREEN_CODE:
            return

        # Monitor operation occur error
        if healthy_code in (StatusCode.WHITE_CODE, StatusCode.YELLOW_CODE):
            content = 'job_id[%s] [%s], healthy_code[%s] cat not be monitored successfully' % (
                job_id, service_name, healthy_code)
            Logger.error(content)
            SendMail.send(self.a_job.get_mail_receiver(), service_name, content)
            return

        # Do not need to be recovered
        if not self.a_job.get_auto_recover():
            return

        # Stop the process; stop() returns None when the command raised
        if self.a_job.stop() is None:
            Logger.info('job_id[%s] [%s] stop failed' % (job_id, service_name))
            return

        # Check relies: only restart when every dependency is healthy
        relies = self.a_job.get_all_rely()
        if not self.status_holder.is_group_healthy(relies):
            Logger.info('services job_id[%s] [%s] relying is not healthy' % (job_id, service_name))
            return

        # Start process
        if not self.a_job.start():
            Logger.info('job_id[%s] [%s] start failed' % (job_id, service_name))
            return

        # Start ok
        content = 'job_id[%s] [%s] start success' % (job_id, service_name)
        Logger.info(content)
        SendMail.send(self.a_job.get_mail_receiver(), service_name, content)
    except Exception:
        # Local import: this block cannot see whether the module already
        # imports traceback. ``e.message`` is avoided because it is empty
        # for exceptions built with several arguments.
        import traceback
        Logger.error(traceback.format_exc())
def __load_relies(self, a_dict):
    """
    Attach the dependency ids stored in ``service_rely`` to the loaded jobs.

    Rows whose ``service_id`` is not a key of ``a_dict`` are ignored.

    :type a_dict: dict[int, Job]
    :param a_dict: jobs keyed by service id; updated in place through
        ``add_rely``.
    :rtype: bool | None
    :return: True on success, None if the query or the update raised (the
        traceback is logged).
    """
    try:
        sql = 'SELECT service_id,rely_id FROM service_rely'
        Logger.info(sql)
        self.cur.execute(sql)
        for service_id, rely_id in self.cur.fetchall():
            a_job = a_dict.get(service_id)
            if a_job is not None:
                a_job.add_rely(rely_id)
        return True
    except Exception:
        # Not a bare ``except:`` -- KeyboardInterrupt / SystemExit must
        # still be able to stop the process.
        Logger.error(traceback.format_exc())
        return None
def __load_checks(self, a_dict):
    """
    Attach the check commands stored in ``check_cmd`` to the loaded jobs.

    Rows whose ``service_id`` is not a key of ``a_dict`` are ignored; no
    ``CheckCmd`` is built for them.

    :type a_dict: dict[int, Job]
    :param a_dict: jobs keyed by service id; updated in place through
        ``add_check``.
    :rtype: bool | None
    :return: True on success, None if the query or the update raised (the
        traceback is logged).
    """
    try:
        sql = "SELECT id,service_id,local_check,check_shell,operator,check_value,good_match FROM check_cmd"
        Logger.info(sql)
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            a_id, service_id, local_check, check_shell, operator, check_value, good_match = row
            a_job = a_dict.get(service_id)
            if a_job is None:
                continue
            a_job.add_check(
                CheckCmd(a_id, service_id, local_check, check_shell, operator, check_value, good_match))
        return True
    except Exception:
        # Not a bare ``except:`` -- KeyboardInterrupt / SystemExit must
        # still be able to stop the process.
        Logger.error(traceback.format_exc())
        return None
def master_signal_handler(signum, stack):
    """
    Signal handler of the master process: shut down workers and the server.

    Terminates every process in ``Controller.process_list``, waits for all
    of them to exit, then stops ``Controller.instance`` if one has been set.
    Progress is logged with the master's pid.

    :param signum: signal number (unused)
    :param stack: current stack frame (unused)
    :return: None
    """
    pid = os.getpid()
    Logger.info("master[%s] received SIGTERM signal" % pid)
    Logger.info("master[%s] is stopping" % pid)

    # Ask every worker to terminate first, then wait for each of them
    workers = Controller.process_list
    for worker in workers:
        worker.terminate()
    for worker in workers:
        worker.join()

    # Stop the serving loop, if it has been created
    serving_loop = Controller.instance
    if serving_loop is not None:
        serving_loop.stop()

    Logger.info("master[%s] is stopped" % pid)
def do_all_check(self):
    """
    Execute every check command of the job, local ones first, then remote.

    Local checks run through ``commands.getstatusoutput``; remote checks
    run through ``self.client.execute``. For each check the output is
    evaluated by ``__get_health`` and the resulting code is folded into a
    ``StatusCode`` accumulator. A local non-zero exit status, or a remote
    run with empty stdout and non-empty stderr, additionally calls
    ``set_status(None)`` on the accumulator.

    Exceptions never propagate; they are logged with a traceback.

    NOTE(review): no ``return`` statement is visible in this block, so as
    written the method returns None, although ``do_job`` uses the result of
    ``do_all_check()`` as the healthy code. Confirm whether a trailing
    return of the accumulated code was lost.
    """
    status_code = StatusCode()
    try:
        # local checking
        for item in self.__local.values():
            status, output = commands.getstatusoutput(item.check_shell)
            Logger.info(
                "id[%s]: localhost[127.0.0.1] execute cmd[%s], status[%s], output[%s]" %
                (self.__id, item.check_shell, status, output))
            if status != 0:
                status_code.set_status(None)

            # check healthy
            healthy_code = self.__get_health(item, output)
            Logger.info("id[%s]: localhost[127.0.0.1] healthy_code[%s]" % (self.__id, healthy_code))
            status_code.set_code(healthy_code)

        # remote checking
        for item in self.__remote.values():
            std_out, std_err = self.client.execute(item.check_shell)
            Logger.info(
                "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" %
                (self.__id, self.__ssh_ip, item.check_shell, std_out, std_err))
            if not std_out and std_err:
                status_code.set_status(None)

            # check healthy
            healthy_code = self.__get_health(item, std_out)
            Logger.info("id[%s]: remote[%s] healthy_code[%s]" % (self.__id, self.__ssh_ip, healthy_code))
            status_code.set_code(healthy_code)
    except Exception:
        # Local import: this block cannot see whether the module already
        # imports traceback. ``e.message`` is avoided because it is empty
        # for multi-argument exceptions such as socket errors.
        import traceback
        Logger.error(traceback.format_exc())
if item.good_match: if match is True: healthy_code = StatusCode.GREEN_CODE elif match is False: healthy_code = StatusCode.RED_CODE else: if match is True: healthy_code = StatusCode.RED_CODE elif match is False: healthy_code = StatusCode.GREEN_CODE except Exception, e: Logger.error(e.message) healthy_code = StatusCode.YELLOW_CODE finally: Logger.info('check_cmd_id[%s], healthy_code[%s]' % (item.check_id, healthy_code)) item.set_healthy_code(healthy_code) return healthy_code def stop(self): """ Stop service using stop_cmd :rtype: bool | None """ try: std_out, std_err = self.client.execute(self.__stop_cmd) Logger.info( "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]" % (self.__id, self.__ssh_ip, self.__stop_cmd, std_out, std_err)) # check healthy
def slave_signal_handler(signum, stack):
    """
    Signal handler of a worker process: request a cooperative shutdown.

    Logs the event with the worker's pid and sets ``Controller.quit_flag``
    to True; nothing is stopped directly here.

    :param signum: signal number (unused)
    :param stack: current stack frame (unused)
    :return: None
    """
    pid = os.getpid()
    for template in ("slave[%s] received SIGTERM signal", "slave[%s] is stopping"):
        Logger.info(template % pid)
    Controller.quit_flag = True