Exemplo n.º 1
0
def _overloaded_hosts(influx_cli, threshold=100, minutes=10):
    """Return ``{hostname: mean_cpu_load}`` for series whose recent mean exceeds *threshold*.

    Issues a single query against the ``server_infos`` measurement, grouped
    by hostname and limited to the last *minutes* minutes. Only points whose
    ``mean_cpu_load`` is above *threshold* are kept; the value is truncated
    to ``int``.
    """
    # NOTE(review): the lower bound is taken from local time but formatted
    # with a trailing "Z" -- confirm this matches the timezone of the data.
    since = datetime.datetime.now() - datetime.timedelta(minutes=minutes)
    cmd = "select mean(*) from server_infos where time >='%s' group by hostname" % since.strftime('%Y-%m-%dT%H:%M:%SZ')
    results = influx_cli.query(cmd)
    loads = {}
    if results:
        for key in results.keys():
            # The last element of each key carries the "group by" tags.
            name = key[-1]['hostname']
            for point in results[key]:
                mean_load = point['mean_cpu_load']
                if mean_load is not None and mean_load > threshold:
                    loads[name] = int(mean_load)
    return loads


def _guess_project(ps_fields):
    """Return the restartable project named in a split ``ps`` line, or ``None``.

    *ps_fields* is the whitespace-split output line of the busiest process.
    A trailing ``*-rpc.jar`` argument is only accepted when it is on the
    explicit allow-list; otherwise the last path component of a
    ``-Dcatalina.home=`` argument is used.
    """
    if not ps_fields:
        return None
    last_arg = ps_fields[-1]
    if last_arg.endswith('-rpc.jar'):
        if last_arg in ['moji-location-rpc.jar']:
            return last_arg.split('.')[0]
        return None
    for field in ps_fields:
        if '-Dcatalina.home=' in field:
            return field.strip().split('/')[-1]
    return None


def alarm_load():
    """Detect hosts with sustained high CPU load and restart or report the culprit.

    Phase 1 selects hosts that zabbix flags with ``cpu_load > 100`` and that
    were updated today, then keeps those whose mean load over the last ten
    minutes (from InfluxDB) is also above 100. Hosts in ``172.16.19.*`` are
    skipped.

    Phase 2 logs in to each remaining host over SSH. If the login fails, a
    failure counter is bumped in Redis and a notification is sent (to the
    ops token once the counter exceeds 5). Otherwise the busiest process is
    inspected: a known tomcat project is restarted through ``supervisorctl``
    and the outcome reported, while a known jar project only triggers a
    warning. Hosts whose name starts with ``nj`` never trigger notifications,
    and hosts in the ``op_alarm_load_whitelist`` set are never touched.

    Errors that are not handled per host propagate to the caller; the DB
    sessions are always released.
    """
    try:
        loging.write("start %s ......" % alarm_load.__name__)
        dict_load = {}
        db_server = db_idc.idc_servers
        db_zabbix = db_idc.zabbix_info
        db_project = db_op.project_list
        db_project_other = db_op.project_other
        Influx_cli = InfluxDBClient(influxdb_host, influxdb_port, influxdb_user, influxdb_pw, 'zabbix_infos')
        host_infos = db_zabbix.query.with_entities(db_zabbix.ip, db_zabbix.ssh_port, db_zabbix.hostname, db_zabbix.update_time).filter(and_(db_zabbix.cpu_load > 100, db_zabbix.icmpping == 1)).all()
        whitelist_key = "op_alarm_load_whitelist"
        whitelist = RC_CLUSTER.smembers(whitelist_key) if RC_CLUSTER.exists(whitelist_key) else []

        # Phase 1: confirm the suspicious hosts against the recent averages.
        today = time.strftime('%Y-%m-%d', time.localtime())
        overloaded = None
        for host, ssh_port, hostname, update_time in host_infos:
            if today not in update_time:
                continue
            try:
                if host.startswith('172.16.19.'):
                    continue
                # The query does not depend on the host, so run it once and
                # reuse the result; a failed query is retried on the next host.
                if overloaded is None:
                    overloaded = _overloaded_hosts(Influx_cli)
                if hostname in overloaded:
                    dict_load[hostname] = (host, ssh_port, overloaded[hostname])
            except Exception as e:
                logging.error(e)
                continue

        # Phase 2: act on every confirmed host.
        for hostname, (host, ssh_port, cpu_load) in dict_load.items():
            # Reset per host so a message or token choice never leaks from a
            # previous iteration. ``ops_token`` is only read here, so it
            # resolves to the module-level value.
            text = None
            token = ops_token
            # Check that an SSH login is possible.
            try:
                Ssh = SSH.ssh(ip=host, ssh_port=ssh_port)
            except Exception as e:
                logging.error(e)
                if not hostname.startswith('nj'):
                    text = ['**线上服务器预警:%s**' % hostname, "CPU平均使用率:{0}%".format(cpu_load),
                            '**SSH登录失败,请及时进行处理!**']
                    Ssh_Key = "op_ssh_login_fail_%s" % hostname
                    RC.incr(Ssh_Key, 1)
                    RC.expire(Ssh_Key, 350)
                    if int(RC.get(Ssh_Key)) > 5:
                        tools.dingding_msg(text, token=ops_token)
                    else:
                        tools.dingding_msg(text)
                continue
            try:
                load_key = 'op_alarm_load_%s' % hostname
                # The counter grows by 5 per run and is reported as minutes;
                # presumably the job is scheduled every five minutes.
                RC_CLUSTER.incr(load_key, 5)
                RC_CLUSTER.expire(load_key, 600)
                ctime = int(RC_CLUSTER.get(load_key))
                if hostname in whitelist:
                    continue
                # Find the process using the most CPU.
                results = Ssh.Run("ps -aux | sort -k3nr |head -n 1")
                if not results['stdout']:
                    continue
                Project = _guess_project(results['stdout'][0].strip().split())
                if not Project:
                    continue
                try:
                    # Is this host registered as a tomcat project?
                    ret = db_project.query.filter(and_(db_project.ip == host, db_project.ssh_port == ssh_port)).all()
                    if ret:
                        # Restart the offending tomcat instance.
                        result = Ssh.Run("supervisorctl  restart  {0}".format(Project))
                        if result['stderr']:
                            text = ['**线上服务重启:%s**' % hostname, "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                    "相关进程:{0}".format(Project), '**服务重启失败,需手动处理!**']
                        else:
                            text = ['**线上服务重启:%s**' % hostname, "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                    "相关进程:{0}".format(Project), '**服务重启成功!**']
                            # A successful restart goes to the default channel.
                            token = None
                    else:
                        # Otherwise, is it registered as a jar project?
                        server_id = db_server.query.with_entities(db_server.id).filter(db_server.hostname == hostname).all()
                        if server_id:
                            ret = db_project_other.query.filter(db_project_other.server_id == int(server_id[0][0])).all()
                            if ret:
                                text = ['**线上服务器预警:%s**' % hostname, "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                        "相关进程:{0}".format(Project), '**请及时进行处理!**']
                    if text and not hostname.startswith('nj'):
                        tools.dingding_msg(text, token)
                except Exception as e:
                    logging.error(e)
            finally:
                Ssh.Close()
    finally:
        loging.write("%s complete!" % alarm_load.__name__)
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
Exemplo n.º 2
0
def _collect_crontabs(Ssh, server_id, update_date):
    """Replace the stored crontab entries of *server_id* with the ones found over SSH.

    Existing rows are removed once, the first time any user's crontab yields
    output, so entries of several users can coexist. Nothing is changed when
    no crontab could be read.
    """
    db_crontabs = db_idc.crontabs
    session = db_idc.DB.session
    listing = Ssh.Run("ls /var/spool/cron/")
    if not listing['stdout']:
        return
    purged = False
    for user in listing['stdout']:
        user = user.strip()
        content = Ssh.Run("cat /var/spool/cron/%s" % user)
        if not content['stdout']:
            continue
        if not purged:
            for row in db_crontabs.query.filter(db_crontabs.server_id == int(server_id)).all():
                session.delete(row)
            # Emit the DELETEs before any INSERT is queued.
            session.flush()
            purged = True
        for line in content['stdout']:
            if not line.startswith('#') and '*' in line:
                fields = line.strip().split()
                # First five fields are the schedule, the rest is the command.
                session.add(db_crontabs(cron=' '.join(fields[:5]), user=user, action=' '.join(fields[5:]),
                                        server_id=int(server_id), update_time=update_date))
    if purged:
        session.commit()


def _collect_jars(Ssh, server_id, update_date):
    """Replace the stored running-jar entries of *server_id* with the current ones."""
    db_project_other = db_op.project_other
    session = db_op.DB.session
    results = Ssh.Run("ps -ef|grep java|grep -e '.jar$'")
    if not results['stdout']:
        return
    for row in db_project_other.query.filter(db_project_other.server_id == int(server_id)).all():
        session.delete(row)
    session.flush()
    jars = set()
    for line in results['stdout']:
        if 'hadoop' in line or 'hive' in line:
            continue
        fields = line.strip().split()
        if fields:
            # Keep only the file name of the last argument.
            jars.add(fields[-1].split('/')[-1])
    for jar in jars:
        known = db_project_other.query.filter(and_(db_project_other.project == jar, db_project_other.server_id == server_id)).all()
        if known:
            continue
        # Reuse the business id already assigned to this project elsewhere.
        business_id = 0
        business = db_project_other.query.with_entities(db_project_other.business_id).filter(and_(db_project_other.project == jar, db_project_other.business_id != 0)).all()
        if business:
            business_id = business[0][0]
        session.add(db_project_other(lable='java', project=jar, server_id=server_id, business_id=business_id, update_time=update_date))
    session.commit()


def _collect_hosts(Ssh, server_id, update_date):
    """Replace the stored ``/etc/hosts`` entries of *server_id* with the current ones.

    Comment lines, lines mentioning ``127.0.0.1`` and names containing
    ``localhost`` are skipped; trailing ``#`` comments are ignored.
    """
    db_hosts = db_idc.hosts
    session = db_idc.DB.session
    results = Ssh.Run("cat /etc/hosts")
    if not results['stdout']:
        return
    for row in db_hosts.query.filter(db_hosts.server_id == int(server_id)).all():
        session.delete(row)
    session.flush()
    for line in results['stdout']:
        if line.startswith('#') or '127.0.0.1' in line:
            continue
        # Everything after "#" is a comment, not a host name.
        fields = line.split('#', 1)[0].split()
        for hostname in fields[1:]:
            if 'localhost' not in hostname:
                session.add(db_hosts(host=fields[0], hostname=hostname, server_id=server_id, update_time=update_date))
    session.commit()


def get_other_info():
    """Collect crontab, running-jar and hosts-file data from every reachable server.

    Servers whose status is '维护中' or whose comment is '跳过' are excluded.
    Each server is probed on its SSH port first; unreachable servers and
    failed logins are skipped. The three collectors run independently: a
    failure in one is logged, its session is rolled back, and the others
    still run. The SSH connection is always closed and both DB sessions are
    released at the end.
    """
    db_servers = db_idc.idc_servers
    infos = db_servers.query.with_entities(db_servers.id, db_servers.ip, db_servers.ssh_port).filter(and_(db_servers.status != '维护中', db_servers.comment != '跳过')).all()
    try:
        for server_id, ip, ssh_port in infos:
            if not tcpping(host=ip, port=ssh_port, timeout=3):
                continue
            try:
                Ssh = SSH.ssh(ip=ip, ssh_port=ssh_port)
            except Exception as e:
                logging.error(e)
                continue
            try:
                update_date = time.strftime('%Y-%m-%d', time.localtime())
                collectors = (
                    (_collect_crontabs, db_idc.DB.session),
                    (_collect_jars, db_op.DB.session),
                    (_collect_hosts, db_idc.DB.session),
                )
                for collect, session in collectors:
                    try:
                        collect(Ssh, server_id, update_date)
                    except Exception as e:
                        logging.error(e)
                        # Leave the session usable for the next collector/server.
                        session.rollback()
            finally:
                Ssh.Close()
    except Exception as e:
        logging.error(e)
    finally:
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
Exemplo n.º 3
0
        def Redis_info(info):
            """Inspect one Redis instance and synchronise its record in the database.

            *info* is an ``(ip, ssh_port, app_port)`` tuple. When the Redis port
            answers, the host is inspected over SSH to find the process, its
            config file and passwords; the instance is then queried for its
            role and connected slaves, and the matching ``db_redis`` rows are
            updated or created. When the port does not answer, the stored
            ``db_redis`` and ``db_third`` rows for it are deleted.

            Relies on names from the enclosing scope that are not visible
            here (``server_ids``, ``ssh_ports``, ``db_servers``, ``db_third``,
            ``db_redis``, ``update_date``).
            """
            ip,ssh_port,app_port = info
            # Defaults used when a detail cannot be determined.
            masterauth = None
            requirepass = None
            pid = None
            conf_dir = None
            conf_file = ""
            redis_type = {'master': '否', 'slave': '否', 'cluster': '否'}
            # Probe the Redis service port itself (not the SSH port).
            if tcpping(host=ip, port=app_port, timeout=3):
                # Pre-bind so the cleanup below is safe when the login fails.
                Ssh = None
                try:
                    Ssh = SSH.ssh(ip=ip, ssh_port=ssh_port)
                except Exception as e:
                    loging.write("ssh login fail %s  %s  %s ......" % (ip, ssh_port, e))
                else:
                    # Find the pid of the process listening on app_port.
                    cmd = "netstat -lntp|grep :%s" % app_port
                    results = Ssh.Run(cmd)
                    if results['stdout']:
                        for line in results['stdout'][0].split():
                            if '/redis' in line:
                                pid = line.split('/')[0]
                                break
                        if pid:
                            cmd = "/bin/ps -ef|grep -v grep|grep {}".format(pid)
                            results = Ssh.Run(cmd)
                            if results['stdout']:
                                result = results['stdout'][0]
                                if 'cluster' in result:
                                    redis_type['cluster'] = '是'
                                else:
                                    try:
                                        # The last argument of the command line is taken as the config file.
                                        result = results['stdout'][0].split()[-1]
                                        if '/' in result:
                                            conf_file = "/usr/local/moji/redis/etc/{}".format(result.split('/')[-1])
                                        if not conf_file.endswith('.conf'):
                                            # Fall back to searching the process working directory.
                                            cmd = "lsof -p {}|grep 'cwd'".format(pid)
                                            cwd = Ssh.Run(cmd)
                                            if cwd['stdout']:
                                                for line in cwd['stdout']:
                                                    if 'redis' in line:
                                                        conf_dir = line.split()[-1]
                                                        break
                                                if conf_dir:
                                                    cmd = "grep {0} -r {1}/|grep '.conf:port'".format(app_port, conf_dir)
                                                    results = Ssh.Run(cmd)
                                                    if results['stdout']:
                                                        for line in results['stdout']:
                                                            if ':port {}'.format(app_port) in line:
                                                                conf_file = line.split(':')[0]
                                        if conf_file.endswith('.conf'):
                                            cmd = "grep masterauth {}".format(conf_file)
                                            results = Ssh.Run(cmd)
                                            if results['stdout']:
                                                masterauth =  results['stdout'][0].split()[-1].strip()
                                            cmd = "grep requirepass {}".format(conf_file)
                                            pw_result = Ssh.Run(cmd)
                                            if pw_result['stdout']:
                                                requirepass =  pw_result['stdout'][0].split()[-1].strip()
                                        RC = redis.StrictRedis(ip, int(app_port),decode_responses=True)
                                        if requirepass:
                                            RC = redis.StrictRedis(ip,int(app_port),password=requirepass,decode_responses=True)
                                        Infos = RC.info()
                                        if Infos['role'] == 'master':
                                            redis_type['master'] = '是'
                                        if Infos['role'] == 'slave':
                                            redis_type['slave'] = '是'
                                        counts = int((Infos['connected_slaves']))
                                    except Exception as e:
                                        # Best effort: the instance could not be inspected.
                                        loging.write("redis info fail %s  %s  %s ......" % (ip, app_port, e))
                                    else:
                                        try:
                                            # Update or create the record of every online slave.
                                            if counts > 0:
                                                for i in range(counts):
                                                    Info = Infos['slave%s' % i]
                                                    if isinstance(Info,dict):
                                                        slave_ip = Info['ip']
                                                        slave_port = Info['port']
                                                        slave_status = Info['state']
                                                    else:
                                                        slave_ip, slave_port, slave_status = Info.split(',')
                                                    if slave_status == 'online' and int(slave_port) >1024:

                                                        try:
                                                            SSH_port = ssh_ports['%s:%s' % (slave_ip, slave_port)]
                                                            server_id = server_ids['%s:%s' %(slave_ip,SSH_port)]
                                                        except Exception:
                                                            # Unknown address: fall back to the raw ip, then try
                                                            # to resolve it through the servers' secondary ips.
                                                            server_id = slave_ip
                                                            servers = db_servers.query.with_entities(db_servers.ip).filter(db_servers.s_ip.like('%{0};%'.format(slave_ip))).all()
                                                            if servers:
                                                                for server in servers:
                                                                    val = db_third.query.filter(and_(db_third.ip==server[0],db_third.app_port==slave_port)).all()
                                                                    if val:
                                                                        SSH_port = ssh_ports['%s:%s' % (server[0], slave_port)]
                                                                        server_id = server_ids['%s:%s' %(server[0],SSH_port)]
                                                                        break
                                                        try:
                                                            master_id = server_ids['%s:%s' % (ip, ssh_port)]
                                                        except Exception:
                                                            master_id = ip
                                                        val = db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == slave_port)).all()
                                                        if val:
                                                            db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == slave_port)).update(
                                                                {db_redis.masterauth: masterauth, db_redis.requirepass: requirepass,
                                                                 db_redis.master: '否',db_redis.slave: '是',db_redis.cluster: '否',
                                                                 db_redis.Master_Host: master_id,db_redis.Master_Port: app_port,db_redis.update_date: update_date})
                                                            db_idc.DB.session.commit()
                                                        else:
                                                            # NOTE(review): the keyword is spelled Master_host here but the
                                                            # column is referenced as Master_Host above -- confirm the model.
                                                            c = db_redis(server_id=server_id, port=slave_port, masterauth=masterauth,
                                                                         requirepass=requirepass, master='否',
                                                                         slave='是',cluster='否', Master_host=master_id,
                                                                         Master_Port=app_port, update_date=update_date)
                                                            db_idc.DB.session.add(c)
                                                            db_idc.DB.session.commit()
                                        except Exception as e:
                                            loging.write("update redis slave fail %s  %s  %s ......" % (ip, app_port, e))
                                            db_idc.DB.session.rollback()
                                try:
                                    # Update or create the record of a master or cluster node.
                                    if redis_type['master'] == '是' or redis_type['cluster'] == '是':
                                        try:
                                            server_id = server_ids['%s:%s' % (ip, ssh_port)]
                                        except Exception:
                                            server_id = ip
                                            servers = db_servers.query.with_entities(db_servers.ip).filter(db_servers.s_ip.like('%{0};%'.format(ip))).all()
                                            if servers:
                                                for server in servers:
                                                    val = db_third.query.filter(and_(db_third.ip == server[0],db_third.app_port == app_port)).all()
                                                    if val:
                                                        server_id = server_ids['%s:%s' % (server[0], ssh_port)]
                                                        break
                                        val = db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == app_port)).all()
                                        if val:
                                            db_redis.query.filter(
                                                and_(db_redis.server_id == server_id, db_redis.port == app_port)).update(
                                                {db_redis.masterauth: masterauth, db_redis.requirepass: requirepass,
                                                 db_redis.master: redis_type['master'],
                                                 db_redis.slave: redis_type['slave'],
                                                 db_redis.cluster: redis_type['cluster'],
                                                 db_redis.Master_Host: '',
                                                 db_redis.Master_Port: '', db_redis.update_date: update_date})
                                            db_idc.DB.session.commit()
                                        else:
                                            loging.write("add new redis %s  %s  ......" % (ip, app_port))
                                            # NOTE(review): same Master_host / Master_Host spelling question as above.
                                            c = db_redis(server_id=server_id, port=app_port, masterauth=masterauth,
                                                         requirepass=requirepass, master=redis_type['master'],
                                                         slave=redis_type['slave'], cluster=redis_type['cluster'],
                                                         Master_host='', Master_Port='',
                                                         update_date=update_date)
                                            db_idc.DB.session.add(c)
                                            db_idc.DB.session.commit()
                                except Exception as e:
                                    loging.write("update redis master fail %s  %s  %s ......" % (ip, app_port, e))
                                    db_idc.DB.session.rollback()
                finally:
                    # Only close a connection that was actually opened.
                    if Ssh is not None:
                        Ssh.Close()
            else:
                # The port no longer answers: drop the stored records for it.
                # NOTE(review): unlike the branch above, a missing server_ids
                # entry is not handled here and will raise -- confirm intent.
                loging.write("delete not exist redis %s  %s  ......" %(ip,app_port))
                v = db_redis.query.filter(and_(db_redis.server_id==server_ids['%s:%s' %(ip,ssh_port)],db_redis.port==app_port)).all()
                for c in v:
                    db_idc.DB.session.delete(c)
                    db_idc.DB.session.commit()
                v= db_third.query.filter(and_(db_third.ip==ip,db_third.app_port==app_port)).all()
                for c in v:
                    db_idc.DB.session.delete(c)
                    db_idc.DB.session.commit()