示例#1
0
#!/usr/bin/python 
import zkpython
import sys
import os

if __name__ == '__main__':
    # Command-line helper: delete the ZooKeeper node whose path is given as
    # the first argument, using the server at localhost:2181.
    zkc = zkpython.ZKClient('localhost:2181')
    # sys.argv[0] is always the script name, so a node path was supplied
    # only when there are at least two entries. The previous "> 0" check was
    # always true and crashed with IndexError when no path was given.
    if len(sys.argv) > 1:
        zkc.delete(sys.argv[1])
      
    
示例#2
0
def main(args,loger):
    # try:
    #     opts, arg = getopt.getopt(args, "h", ["help"])
    # except getopt.GetoptError:
    #     usage()
    #     sys.exit(2)
    #
    # for opt, argitem in opts:
    #     if opt in ("-h", "help"):
    #         usage()
    #         sys.exit(0)
    #     else:
    #         usage()
    #         sys.exit(2)
  try :
    pid = 0
    bRun = False
    bFileExits = False
    runtm = time.time()
    zombiestatustm = 0
    if (os.path.exists(filelock)):
        bFileExits = True

    # 检证pid是不是在运行
    prepid = 0
    prepid = readpidfile(pidfile)
    if prepid != 0 and prepid != None:
        cmd = "ps %d" % prepid
        a = os.system(cmd)
        if a == 0:
            bRun = True
        else:
            bRun = False

    # 如果锁文件存在并/var/run/imonitor_master.pid中记录的pid在运行,则不运行些脚本
    if bFileExits and bRun:
        print "Service Is Running<pid:%d>!" % prepid
        return

    # 成为守护进程
    #daemonize()

    # 将此进程的pid写到/var/run/oristar_sms_server.pid
    writepidfile(pidfile)
    os.system('touch %s' % filelock)

    # 加载配置文件
    log = 'Monitor Master Start\n'
    log +='Loading config file'
    loger.info(log)
    cfg = {}
    with open('/usr/local/imonitor2/config.json', 'r') as config:
        cfg = json.load(config)
    agent_node = cfg['AgentNode']
    sms_run_host = cfg['RunHost']
    ingest_sms = cfg['Ingest_SMS']
    timeout = cfg['TimeOut']
    agent_ls = agent_node.keys()
    log =  "Load config successful,AgentNode %s"%agent_node
    loger.info(log)


    location_mgr = LocationMgr(agent_ls, sms_run_host,agent_node,loger)
    zkctl = zkpython.ZKClient(zk_server_ip)
    zkctl.async()
    localhost_name = cfg['NodeName']

    # 获取sms的重启时间点
    sms_reboot_time = cfg['sms_reboot_time']
    reboot_cfg = []
    reboot_cfg.append(sms_reboot_time['first'])
    reboot_cfg.append(sms_reboot_time['second'])
    conf_txt = '{"sms_reboot_time":%s}'%reboot_cfg
    zkctl.set('/scheduler/task',conf_txt)

    #  根据NodeName创建元服务节点
    server_node_name = '/scheduler/server/%s' %localhost_name
    loger.info( server_node_name)
    if not zkctl.exists(server_node_name):
        zkctl.create(server_node_name, "", 1)
    else:
        while zkctl.exists(server_node_name):
            time.sleep(2)
            log = '%s has exist, wait for it deleted'%server_node_name
        zkctl.create(server_node_name, "", 1)


    # 创建master管理实例,并启动/scheduler/server节点的监测
    master_node = cfg['MasterNode']
    other_master_host = ''
    for host in master_node:
        if host == localhost_name:
            continue
        else:
            other_master_host = host

    regist_timeout = timeout['slaver_regist']

    master2_stat = ''
    if localhost_name=='master1':
        if zkctl.exists('/scheduler/server/master2'):
            master2_stat = zkctl.get('/scheduler/server/master2')
            for i in range(10):
                zkctl.async('/scheduler/task')
                child_ls = zkctl.get_children('/scheduler/task')
                if not child_ls:
                    break
                time.sleep(3)
                log = 'waiting for master2 delete /scheduler/task/slaveX '
                loger.info(log)

    master_mgr = MasterMgr(localhost_name, other_master_host, zkctl, regist_timeout, ingest_sms, loger)
    master_mgr.start_monitor()

    # 创建agent管理实例,并启动/scheduler/agent节点的监控
    agent_mgr = AgentMgr(agent_ls,localhost_name,location_mgr, zkctl,
                         timeout['slaver_regist'],loger)

    # 如果master2已经接管,则恢复接管
    if len(master2_stat) != 0 and master2_stat[0]:
        cur_master_stat = json.loads(master2_stat[0])
        agent_mgr.set_stat_matrix(cur_master_stat)
        log = 'backing take over,set stat of master2 to master1 '
        loger.info(log)

    master_mgr.set_setstatmatrix_fun(agent_mgr.set_stat_matrix)#注册设置节点状态回调
    master_mgr.set_startagentmonitor_fun(agent_mgr.start_agent_monitor)#注册重启agent_monitor回调
    master_mgr.set_deltasknode_fun(agent_mgr.del_task_node)#注册删除task回调
    agent_mgr.start_monitor()



    # 开启webservice
    spyne_webservice.setMainObj(agent_mgr)
    webservice_th = Thread(target=spyne_webservice.webserverFun)
    webservice_th.start()
    try:
        wsclient = suds.client.Client('http://127.0.0.1/sms/webservice/wsnotice?wsdl')
    except:
        log = 'open http://127.0.0.1/sms/webservice/wsnotice?wsdl failed'
        loger.info(log)
    cnt = 0
    boot_ingest_sms_delay = 0
    while True:
        # 定时更新sms的状�?
        time.sleep(2)
        zkctl.async('/scheduler')

        # 获取所有sms的运行状态和位置
        all_run_sms = agent_mgr.get_all_run_sms()
        if not all_run_sms:
            continue
        is_take_over = False
        for sms_id in all_run_sms:
            if spyne_webservice.g_sms_stat[sms_id][1] != all_run_sms[sms_id][1]:
                pre_loc = spyne_webservice.g_sms_stat[sms_id][1]
                spyne_webservice.g_sms_stat[sms_id][1] = all_run_sms[sms_id][1]#1 为sms的运行位置
                log = '%s from %s to %s'%(sms_id,pre_loc,spyne_webservice.g_sms_stat[sms_id][1])
                loger.info(log)
                is_take_over = True

        # 位置发生改变则通知web
        if is_take_over:
            try:

                if wsclient:
                    wsclient.service.isSwitchHall(True)
                else:
                    wsclient = suds.client.Client('http://127.0.0.1/sms/webservice/wsnotice?wsdl')
                    wsclient.service.isSwitchHall(True)
                log = 'sms location change and notice web'
                loger.info(log)
            except:
                log = 'call webserivce isSwitchHall failed'
                loger.info(log)

        # 每隔30秒输出一次排序后的状态
        cnt += 1
        if cnt % 3 == 0:
            db_stat = check_db_sync_stat()
            spyne_webservice.setDBSyncStat(db_stat)
            deadnode = checkhdfs_stat()
            deadname=[]
            if deadnode:
                for name in deadnode:
                    deadname.append(name.split(' ')[1][:-1])
            log = 'check hdfs dead stat:%s(%s:%d)'%(deadnode,deadname,len(deadname))
            loger.info(log)
            spyne_webservice.setHDFSStat(len(deadname))
            spyne_webservice.setHDFSDeadNode(deadname)
            if (agent_mgr.getNodeHealthStat() and check_db_sync_second() == 0 ) or boot_ingest_sms_delay > 10:
                master_mgr.start_ingest_sms()
            else:
                health  = agent_mgr.getNodeHealthStat()
                second = check_db_sync_second()
                log = 'node health:%s db sync second:%d delay cnt:%d,so delay boot ingest sms'%(health,second,boot_ingest_sms_delay)
                loger.info(log)
                boot_ingest_sms_delay += 1

	    
        if cnt % 15 == 0:
            log = 'all sms run stat:%s'%[(k,all_run_sms[k]) for k in sorted(all_run_sms.keys())]
            loger.info(log)
            cnt = 0
  except:
    fp = StringIO.StringIO()
    traceback.print_exc(file=fp)
    message = fp.getvalue()
    loger.info(message)
示例#3
0
        # 如果锁文件存在并且记录的pid在运行,则不运行些脚本
        if bFileExits and bRun:
            log = "Service Is Running<pid:%d>!" % prepid
            loger.info(log)
            exit(0)

        # 装此进程的pid写到pid file中
        writepidfile(filelock)
        os.system('touch %s' % filelock)
        loger.info('Monitor Agent Start....')
        hostname = getHostName()
        name_ls = hostname.split('-')
        hostname = name_ls[0][:-1]

        zkclt = zkpython.ZKClient(zk_server_ip)
        if not init(zkclt, loger):
            exit(0)

        #开启事件处理
        MainMgr = EventMgr(hostname, 10, zkclt, loger)
        setLogAndMgr(loger, MainMgr)
        MainMgr.start()

        # 读取配置
        cfg_txt = zkclt.get('/scheduler/task', watcher)
        log = 'read confg form /scheduler/task(%s)' % cfg_txt[0]
        loger.info(log)
        if len(cfg_txt[0]) > 1:
            cfg = json.loads(cfg_txt[0])
            rt = cfg['sms_reboot_time']