示例#1
0
    def post(self):
        """Handle a POST asking for the status of a group of services.

        Reads the ``service_id`` argument as a comma-separated list, keeps the
        entries that consist only of digits, and looks them up through
        ``Controller.status_holder.get_group_status``.

        Writes exactly one JSON object to the response:

        * ``code``: 0 on success, -1 when the argument is empty or the lookup
          returns None.
        * ``content``: the lookup result on success, an empty dict otherwise.
        """
        Logger.info(json.dumps(self.request.arguments, ensure_ascii=False),
                    self.request.uri)

        a_dict = dict()
        a_dict['code'] = -1
        a_dict['content'] = dict()

        # Get service id
        service_id_string = self.get_argument("service_id")
        if not service_id_string:
            # Writing the error payload does not end the handler; return
            # explicitly so the lookup below is not run.
            self.write(json.dumps(a_dict, ensure_ascii=False))
            return

        # Query status data; entries that are not purely numeric are dropped
        service_id_set = set()
        for raw_service_id in service_id_string.strip().split(','):
            if raw_service_id.isdigit():
                service_id_set.add(int(raw_service_id))
        result = Controller.status_holder.get_group_status(service_id_set)
        if result is None:
            # Return here, otherwise a second (success) document would be
            # appended to the same response body.
            self.write(json.dumps(a_dict, ensure_ascii=False))
            return

        a_dict['code'] = 0
        a_dict['content'] = result
        self.write(json.dumps(a_dict, ensure_ascii=False))
示例#2
0
def __main__():
    """Entry point of the master process.

    Initializes logging, starts the worker processes, then serves the two
    HTTP API routes on ``config.server_listen_port`` until the IO loop stops.

    :return: False when the logger cannot be initialized, otherwise None
        once the IO loop has returned.
    """
    # Set the process-wide default string encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Initialize logging; nothing else runs without it
    logger_ready = Logger.init(config.monitor_log_env,
                               config.monitor_log_target,
                               config.monitor_log_name,
                               config.monitor_log_size,
                               config.monitor_log_count,
                               multiprocess=True)
    if not logger_ready:
        print('init logger failed')
        return False

    # presumably Defer calls Logger.close when the block exits -- confirm
    with Defer(Logger.close):
        # Install the SIGTERM handler, then start the worker processes
        signal.signal(signal.SIGTERM, Controller.master_signal_handler)
        Controller.start_worker_process()

        print("server is starting...")
        Logger.info("server is starting...")
        Logger.info("config.server_listen_port: %s" %
                    config.server_listen_port)

        routes = [
            (r'/api_get_detail_status', ApiGetDetailStatus),
            (r'/api_monitor_status', ApiMonitorStatus),
        ]
        application = tornado.web.Application(
            routes,
            xsrf_cookies=False,
            debug=config.server_debug_mode)

        # Listen on the configured port and run the IO loop; the loop is
        # stored on Controller.instance before it is started.
        application.listen(config.server_listen_port)
        Controller.instance = tornado.ioloop.IOLoop.current()
        Controller.instance.start()
示例#3
0
    def __load_jobs(self):
        """Reload all job definitions from the database into ``self.job_list``.

        Joins ``services`` with ``machines``, builds one ``JobDetail`` per
        row keyed by service id, then lets ``__load_checks`` and
        ``__load_relies`` attach checks and dependencies to those jobs.
        ``self.job_list`` is replaced only when every step succeeded.

        :return: True on success; None when a loading step fails or an
            exception is raised (the traceback is logged).
        """
        try:
            a_dict = dict()
            sql = "SELECT services.id, services.service_name, machines.ssh_user, machines.ssh_ip, machines.ssh_port," \
                  "services.start_cmd, services.stop_cmd, services.is_active, services.auto_recover, services.mail_receiver " \
                  "FROM services,machines WHERE services.machine_id = machines.id"
            Logger.info(sql)
            self.cur.execute(sql)
            for row in self.cur.fetchall():
                (job_id, service_name, ssh_user, ssh_ip, ssh_port,
                 start_cmd, stop_cmd, is_active, auto_recover, mail_receiver) = row
                a_dict[job_id] = JobDetail(job_id, service_name, ssh_user, ssh_ip, ssh_port,
                                           start_cmd, stop_cmd, is_active, auto_recover, mail_receiver)

            if not self.__load_checks(a_dict):
                return None

            if not self.__load_relies(a_dict):
                return None

            # Build the new list first and swap it in with one assignment, so
            # the attribute never refers to a half-filled list.
            new_job_list = list(a_dict.values())
            with self.lock:
                self.job_list = new_job_list
            return True
        except Exception:
            # Only Exception is caught: a bare except would also swallow
            # SystemExit, KeyboardInterrupt and gevent's GreenletExit.
            Logger.error(traceback.format_exc())
            return None
示例#4
0
    def post(self):
        """Handle a POST asking for the full status of a single service.

        Reads the ``service_id`` argument, which must consist only of digits,
        and looks it up through
        ``Controller.status_holder.get_one_full_status``.

        Writes exactly one JSON object to the response with the keys
        ``code``, ``healthy_code``, ``command_healthy_code`` and
        ``monitor_time``. On failure (bad argument or a None lookup result)
        ``code`` is -1 and the other keys keep their placeholder values;
        on success ``code`` is 0 and ``monitor_time`` is the last check time
        formatted as ``%Y-%m-%d %H:%M:%S``.
        """
        Logger.info(json.dumps(self.request.arguments, ensure_ascii=False),
                    self.request.uri)

        # Failure payload; overwritten field by field on success
        a_dict = dict()
        a_dict['code'] = -1
        a_dict['healthy_code'] = StatusCode.WHITE_CODE
        a_dict['command_healthy_code'] = dict()
        a_dict['monitor_time'] = '1970-01-01 00:00:00'

        # Get service id
        service_id = self.get_argument("service_id")
        if not service_id or not service_id.isdigit():
            # Return explicitly: falling through would hit int() with a
            # non-numeric value and raise.
            self.write(json.dumps(a_dict, ensure_ascii=False))
            return

        # Query status data
        service_id = int(service_id)
        result = Controller.status_holder.get_one_full_status(service_id)
        if result is None:
            # Return explicitly: None cannot be unpacked below.
            self.write(json.dumps(a_dict, ensure_ascii=False))
            return

        # Return status data
        (healthy_code, cmd_status, last_t) = result
        last = datetime.datetime.fromtimestamp(last_t).strftime(
            '%Y-%m-%d %H:%M:%S')
        a_dict['code'] = 0
        a_dict['healthy_code'] = healthy_code
        a_dict['command_healthy_code'] = cmd_status
        a_dict['monitor_time'] = last
        self.write(json.dumps(a_dict, ensure_ascii=False))
示例#5
0
    def send(cls, receivers, subject, content):
        """Send a notification mail through the HTTP mail gateway.

        When ``config.fake_mail`` is truthy nothing is sent; the would-be
        mail is only logged. Otherwise the mail is POSTed as form data, with
        the current local time prepended to ``content``. Failures are logged
        and never raised to the caller.

        :type receivers: string
        :type subject: string
        :type content: string
        :return: None
        """
        try:
            if config.fake_mail:
                Logger.info('receivers=[%s], subject=[%s], content=[%s]' %
                            (receivers, subject, content))
                return

            url = 'http://f**k.you.com/send_mail'
            a_dict = {
                'receiver': receivers,
                'subject': subject,
                'content':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ": " +
                content,
            }

            # Without a timeout requests may wait forever on an unresponsive
            # gateway and stall the caller; a timeout error is logged below.
            ret = requests.post(url, data=a_dict, timeout=10)
            Logger.info("http_code[%s], http_response[%s]" %
                        (ret.status_code, ret.text))
        except Exception:
            # Only Exception is caught: a bare except would also swallow
            # SystemExit, KeyboardInterrupt and gevent's GreenletExit.
            Logger.error(traceback.format_exc())
示例#6
0
 def lod_job_from_mysql(job_loader):
     """Reload the jobs once, then schedule the next reload.

     After ``job_loader.load_job()`` returns, the greenlet sleeps for one
     second and spawns a new greenlet running this same function.

     :type job_loader: JobLoader
     :return: None
     """
     try:
         Logger.info("into lod job from mysql")
         job_loader.load_job()
         gevent.sleep(1)
         # Re-arm: the follow-up run happens in a fresh greenlet
         gevent.spawn(JobLoader.lod_job_from_mysql, job_loader)
     except Exception as err:
         # The spawn above is skipped when anything before it raises, so an
         # exception here also ends the reload chain.
         Logger.error(err.message)
示例#7
0
    def start(self):
        """Run the job's start command through ``self.client``.

        The command output (stdout and stderr) is logged but not inspected.

        :rtype: bool | None
        :return: True once the command has been executed and logged; None
            when an exception was raised (its message is logged).
        """
        try:
            out, err = self.client.execute(self.__start_cmd)
            log_fields = (self.__id, self.__ssh_ip, self.__start_cmd, out, err)
            Logger.info(
                "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]"
                % log_fields)
        except Exception as exc:
            Logger.error(exc.message)
            return None
        return True
示例#8
0
    def stop(self):
        """Run the job's stop command through ``self.client``.

        The command output (stdout and stderr) is logged but not inspected,
        and the service's health is not re-checked afterwards.

        :rtype: bool | None
        :return: True once the command has been executed and logged; None
            when an exception was raised (its message is logged).
        """
        try:
            out, err = self.client.execute(self.__stop_cmd)
            log_fields = (self.__id, self.__ssh_ip, self.__stop_cmd, out, err)
            Logger.info(
                "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]"
                % log_fields)
        except Exception as exc:
            Logger.error(exc.message)
            return None
        return True
示例#9
0
    def do_job(self):
        """Check one job, publish its status and try to recover it if needed.

        Steps, each of which may end the run early:

        1. Run all checks and store the result in ``self.status_holder``.
        2. Green status: nothing more to do.
        3. White or yellow status: log an error, mail the receivers, stop.
        4. Any other status: if auto-recover is enabled, stop the service,
           verify that the services it relies on are healthy, start it
           again, and mail the receivers when the start succeeded.

        Exceptions are logged and never raised to the caller.

        :return: None
        """
        try:
            job_id = self.a_job.get_id()
            service_name = self.a_job.get_service_name()
            healthy_code = self.a_job.do_all_check()

            # Refresh job status
            Logger.report('job_id[%s] [%s] is healthy_code[%s]' % (job_id, service_name, healthy_code))
            self.status_holder.set_one_status(job_id, healthy_code, self.a_job.get_check_cmd_healthy_code())

            # Success. Compared by value, like the other status checks:
            # identity ("is") is not guaranteed to hold for equal constants.
            if healthy_code == StatusCode.GREEN_CODE:
                return

            # Monitor operation occur error
            if healthy_code in (StatusCode.WHITE_CODE, StatusCode.YELLOW_CODE):
                content = 'job_id[%s] [%s], healthy_code[%s] cat not be monitored successfully' % (job_id, service_name, healthy_code)
                Logger.error(content)
                SendMail.send(self.a_job.get_mail_receiver(), service_name, content)
                return

            # Do not need to be recovered
            if not self.a_job.get_auto_recover():
                return

            # Stop the process before attempting a restart
            stopped = self.a_job.stop()
            if stopped is None:
                content = 'job_id[%s] [%s] stop failed' % (job_id, service_name)
                Logger.info(content)
                return

            # Check relies: do not restart while a dependency is unhealthy
            relies = self.a_job.get_all_rely()
            if not self.status_holder.is_group_healthy(relies):
                content = 'services job_id[%s] [%s] relying is not healthy' % (job_id, service_name)
                Logger.info(content)
                return

            # Start process
            ok = self.a_job.start()
            if not ok:
                content = 'job_id[%s] [%s] start failed' % (job_id, service_name)
                Logger.info(content)
                return

            # Start ok
            content = 'job_id[%s] [%s] start success' % (job_id, service_name)
            Logger.info(content)
            SendMail.send(self.a_job.get_mail_receiver(), service_name, content)
        except Exception as e:
            Logger.error(e.message)
示例#10
0
 def __load_relies(self, a_dict):
     """Attach the dependencies stored in ``service_rely`` to the jobs.

     For every ``(service_id, rely_id)`` row, ``rely_id`` is added to the
     job registered under ``service_id``; rows whose service is not in
     ``a_dict`` are ignored. ``a_dict`` is modified in place.

     :type a_dict: dict[int, Job]
     :return: True on success; None when an exception is raised (the
         traceback is logged).
     """
     try:
         sql = 'SELECT service_id,rely_id FROM service_rely'
         Logger.info(sql)
         self.cur.execute(sql)
         for service_id, rely_id in self.cur.fetchall():
             a_job = a_dict.get(service_id)
             if a_job is None:
                 continue
             a_job.add_rely(rely_id)
         return True
     except Exception:
         # Only Exception is caught: a bare except would also swallow
         # SystemExit, KeyboardInterrupt and gevent's GreenletExit.
         Logger.error(traceback.format_exc())
         return None
示例#11
0
 def __load_checks(self, a_dict):
     """Attach the check commands stored in ``check_cmd`` to the jobs.

     For every row a ``CheckCmd`` is created and added to the job
     registered under the row's ``service_id``; rows whose service is not
     in ``a_dict`` are ignored. ``a_dict`` is modified in place.

     :type a_dict: dict[int, Job]
     :return: True on success; None when an exception is raised (the
         traceback is logged).
     """
     try:
         sql = "SELECT id,service_id,local_check,check_shell,operator,check_value,good_match FROM check_cmd"
         Logger.info(sql)
         self.cur.execute(sql)
         for row in self.cur.fetchall():
             a_id, service_id, local_check, check_shell, operator, check_value, good_match = row
             a_job = a_dict.get(service_id)
             if a_job is None:
                 # No job owns this check, so there is nothing to build
                 continue
             a_job.add_check(
                 CheckCmd(a_id, service_id, local_check, check_shell, operator, check_value, good_match))
         return True
     except Exception:
         # Only Exception is caught: a bare except would also swallow
         # SystemExit, KeyboardInterrupt and gevent's GreenletExit.
         Logger.error(traceback.format_exc())
         return None
示例#12
0
    def master_signal_handler(signum, stack):
        """Signal handler of the master process: shut everything down.

        Terminates every process in ``Controller.process_list``, waits for
        all of them to exit, then stops ``Controller.instance`` when one has
        been set.

        :param signum: signal number (unused)
        :param stack: current stack frame (unused)
        """
        master_pid = os.getpid()
        Logger.info("master[%s] received SIGTERM signal" % master_pid)
        Logger.info("master[%s] is stopping" % master_pid)

        # Ask every worker to terminate first, and only then wait for them,
        # so the workers shut down concurrently instead of one by one.
        workers = Controller.process_list
        for worker in workers:
            worker.terminate()
        for worker in workers:
            worker.join()

        # Stop the object stored on Controller.instance, if any
        server_loop = Controller.instance
        if server_loop is not None:
            server_loop.stop()
        Logger.info("master[%s] is stopped" % master_pid)
示例#13
0
    def do_all_check(self):
        """Run every check command of the job and record the outcome.

        Local checks (``self.__local``) are executed on this machine with
        ``commands.getstatusoutput``; remote checks (``self.__remote``) are
        executed through ``self.client``. The health code computed for each
        check is fed into a ``StatusCode`` accumulator. Exceptions are logged
        and never raised to the caller.

        NOTE(review): the body visible here ends without a ``return``
        statement, so as shown the method returns None even though the
        original docstring advertised ``(is_operate_success, is_healthy)``
        with ``:rtype: bool | None``. The listing may be truncated -- confirm
        against the full file.
        """
        # Accumulates the per-check results for the whole job
        status_code = StatusCode()
        try:
            # local checking
            for item in self.__local.values():
                status, output = commands.getstatusoutput(item.check_shell)
                Logger.info(
                    "id[%s]: localhost[127.0.0.1] execute cmd[%s], status[%s], output[%s]"
                    % (self.__id, item.check_shell, status, output))
                # A non-zero exit status is recorded on the accumulator; the
                # health evaluation below still runs on the output.
                if status != 0:
                    status_code.set_status(None)

                # check healthy
                healthy_code = self.__get_health(item, output)
                Logger.info("id[%s]: localhost[127.0.0.1] healthy_code[%s]" %
                            (self.__id, healthy_code))
                status_code.set_code(healthy_code)

            # remote checking
            for item in self.__remote.values():
                std_out, std_err = self.client.execute(item.check_shell)
                Logger.info(
                    "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]"
                    % (self.__id, self.__ssh_ip, item.check_shell, std_out,
                       std_err))
                # Nothing on stdout but something on stderr is recorded on
                # the accumulator the same way as a failed local command.
                if not std_out and std_err:
                    status_code.set_status(None)
                # check healthy
                healthy_code = self.__get_health(item, std_out)
                Logger.info("id[%s]: remote[%s] healthy_code[%s]" %
                            (self.__id, self.__ssh_ip, healthy_code))
                status_code.set_code(healthy_code)
        except Exception, e:
            # The first exception ends the run; remaining checks are skipped
            Logger.error(e.message)
示例#14
0
            if item.good_match:
                if match is True:
                    healthy_code = StatusCode.GREEN_CODE
                elif match is False:
                    healthy_code = StatusCode.RED_CODE
            else:
                if match is True:
                    healthy_code = StatusCode.RED_CODE
                elif match is False:
                    healthy_code = StatusCode.GREEN_CODE
        except Exception, e:
            Logger.error(e.message)
            healthy_code = StatusCode.YELLOW_CODE
        finally:
            Logger.info('check_cmd_id[%s], healthy_code[%s]' %
                        (item.check_id, healthy_code))
            item.set_healthy_code(healthy_code)
            return healthy_code

    def stop(self):
        """ Stop service using stop_cmd
        :rtype: bool | None
        """
        try:
            std_out, std_err = self.client.execute(self.__stop_cmd)
            Logger.info(
                "id[%s]: remote[%s] execute cmd[%s], std_out[%s], std_err[%s]"
                %
                (self.__id, self.__ssh_ip, self.__stop_cmd, std_out, std_err))

            # check healthy
示例#15
0
 def slave_signal_handler(signum, stack):
     """Signal handler of a worker process: request a shutdown.

     Logs the signal and sets ``Controller.quit_flag`` to True; nothing is
     stopped from inside the handler itself.

     :param signum: signal number (unused)
     :param stack: current stack frame (unused)
     """
     worker_pid = os.getpid()
     Logger.info("slave[%s] received SIGTERM signal" % worker_pid)
     Logger.info("slave[%s] is stopping" % worker_pid)
     Controller.quit_flag = True