def get_server_list():
    """Collect (ip, port, hostname, idc) tuples for all known assets.

    Authenticates against ``tokenUrl``, then queries the assets endpoint and
    the IDC-name endpoint; assets whose IDC resolves to 'lan' or 'j' are
    excluded.  Returns a de-duplicated list; errors are logged and whatever
    was collected so far (possibly an empty list) is returned instead of
    raising.
    """
    hosts = []
    try:
        res = requests.post(tokenUrl, data={"username": username, "password": password})
        res = res.json()
        try:
            # BUGFIX: the original built the header with
            # '%s %s' % ('Bearer ', token), yielding "Bearer  <token>"
            # (two spaces), which breaks strict bearer-token parsing.
            headers = {"Authorization": "Bearer %s" % res["token"]}
        except Exception as e:
            # Token missing from the auth response: dump the raw payload
            # for debugging before logging the error.
            loging.write(res)
            logging.error(e)
        else:
            res = requests.get(assetsUrl, headers=headers, timeout=30)
            idcs = requests.get(app.config.get('JPSURL'), headers=headers, timeout=30)
            # Map IDC id -> IDC name
            idcs = {info['id']: info['value'] for info in idcs.json()}
            for info in res.json():
                try:
                    if info['labels']:
                        idc = idcs[info['labels'][0]]
                        # 'lan' and 'j' zones are excluded from the list
                        if idc not in ('lan', 'j'):
                            hosts.append((info['ip'], info['port'], info['hostname'], idc))
                except Exception:
                    # Asset with a missing/unknown label id: skip it
                    continue
    except Exception as e:
        logging.error(e)
    finally:
        # Always return the de-duplicated result, even after an error.
        return list(set(hosts))
def Run(self):
    """Register the periodic jobs and start the background scheduler."""
    cron_jobs = (
        (Task.count_es_logs, '*'),    # every minute
        (Task.server_per, '*/5'),     # every 5 minutes
    )
    for job_func, minute_spec in cron_jobs:
        self.scheduler.add_job(
            job_func, 'cron',
            second='0', minute=minute_spec,
            id=job_func.__name__, replace_existing=True,
        )
    self.scheduler.start()
    loging.write("Scheduler backgroud start on %s ......" % HOST)
def Redis_alarm():
    """Verify master->slave replication for every known Redis pair.

    Writes a timestamped probe key into each reachable master, waits 60s,
    then (in a thread pool) checks that each slave has replicated the key;
    a master that has the probe while its slave does not triggers a
    DingTalk alarm.
    """
    loging.write("start %s ......" %Redis_alarm.__name__)
    tm = time.strftime('%Y%m%d%H%M',time.localtime())
    Key = 'yw_check_master_slave'
    redis_m = []  # (server_id, port) of every master the probe was written to
    db_servers = db_idc.idc_servers
    db_redis = db_idc.redis_info
    try:
        # Fetch server info: map server id -> ip (idc 1025 excluded)
        blacklist = ('172.16.70.34','172.16.19.104')
        server_ids = db_servers.query.with_entities(db_servers.id, db_servers.ip).filter(db_servers.idc_id != 1025).all()
        server_ids = {str(infos[0]): infos[-1] for infos in server_ids}
        # Fetch master redis instances ('是' == yes in these flag columns)
        Masters = db_redis.query.with_entities(db_redis.server_id,db_redis.port,db_redis.requirepass).filter(db_redis.master=='是').all()
        S_Masters = db_redis.query.with_entities(db_redis.Master_Host, db_redis.Master_Port).filter(and_(db_redis.slave == '是',db_redis.Master_Host !='')).all()
        S_Masters = set(['%s:%s' %info for info in S_Masters])
        # Write the probe timestamp into every reachable master (TTL 360s)
        for Master in set(Masters):
            server_id, port, requirepass = Master
            try:
                mip = server_ids[str(server_id)]
            except:
                continue
            try:
                RC = redis.StrictRedis(mip, int(port), decode_responses=True)
                if requirepass:
                    RC = redis.StrictRedis(mip, int(port), password=requirepass, decode_responses=True)
            except:
                continue
            else:
                RC.set(Key, tm)
                RC.expire(Key, 360)
                redis_m.append((int(server_id),int(port)))
        def check_slave(info):
            # Check whether the slaves of one master replicated the probe key.
            # info is a (server_id, port) tuple of a master written above.
            server_id,port = info
            if int(port) not in [10080]:  # port 10080 is deliberately skipped
                # Distinct slave ports replicating from this master
                slave_ports = db_redis.query.with_entities(distinct(db_redis.port)).filter(and_(db_redis.Master_Host==server_id,db_redis.Master_Port==port)).all()
                if slave_ports:
                    slave_ports = [int(sport[0]) for sport in slave_ports]
                    for slave_port in slave_ports:
                        # Slave instances listening on that port
                        redis_lists = db_redis.query.with_entities(db_redis.server_id,db_redis.port,db_redis.requirepass).filter(and_(db_redis.slave=='是',db_redis.port==slave_port)).all()
                        for info in redis_lists:
                            text = None
                            slave_lists = []
                            server_id,sport,requirepass = info
                            try:
                                sip = server_ids[str(server_id)]
                            except:
                                continue
                            else:
                                try:
                                    RC = redis.StrictRedis(sip, int(sport), decode_responses=True)
                                    if requirepass:
                                        RC = redis.StrictRedis(sip, int(sport), password=requirepass, decode_responses=True)
                                except:
                                    continue
                                else:
                                    # Resolve this slave's master and read the
                                    # probe key from both sides
                                    mvals = db_redis.query.with_entities(db_redis.Master_Host, db_redis.Master_Port).filter(and_(db_redis.server_id == server_id, db_redis.port == sport)).all()
                                    mip,mport = mvals[0]
                                    mip = server_ids[str(mip)]
                                    val = RC.get(Key)
                                    try:
                                        RC = redis.StrictRedis(mip, int(mport), decode_responses=True)
                                        if requirepass:
                                            RC = redis.StrictRedis(mip, int(mport), password=requirepass, decode_responses=True)
                                    except:
                                        continue
                                    else:
                                        if sip not in blacklist:
                                            mval = RC.get(Key)
                                            # Master has the probe but the slave
                                            # does not -> replication is broken
                                            if mval and not val:
                                                text = ['**线上Redis同步报警:**',
                                                        "同步Redis:%s:%s 验证数据:%s"%(mip,mport,mval),
                                                        "延时Redis:%s:%s 验证数据:%s" % (sip, sport,val),
                                                        "数据同步异常!",
                                                        '**请及时进行处理!**']
                                            if text:
                                                alarm_info = '%s:%s' % (server_id, sport)
                                                # If this slave is itself a master of
                                                # others, collect its own slaves
                                                if alarm_info in S_Masters:
                                                    vals = db_redis.query.with_entities(db_redis.server_id,db_redis.port).filter(and_(db_redis.Master_Host==server_id,db_redis.Master_Port==sport)).all()
                                                    if vals:
                                                        slave_lists.extend(['%s:%s'%val for val in vals])
                                                if alarm_info not in slave_lists:
                                                    # Alarm; some ports route to the
                                                    # dedicated redis token
                                                    token = ops_token
                                                    if int(sport) in (8379,6387,17379):
                                                        token = redis_token
                                                    tools.dingding_msg(text,token=token)
        if redis_m:
            # Give replication a minute to propagate, then verify concurrently
            time.sleep(60)
            pool = ThreadPool(5)
            pool.map(check_slave,set(redis_m))
            pool.close()
            pool.join()
    except Exception as e:
        logging.error(e)
    finally:
        db_idc.DB.session.remove()
        loging.write("%s complete !" % Redis_alarm.__name__)
def k8s_ingress_log():
    """Collect per-minute k8s ingress metrics from Elasticsearch into Redis
    (platform PV, per-domain hit counts, status codes, response times) and,
    when a domain's HTTP-200 availability drops below 99% repeatedly, alarm
    via DingTalk and auto-delete the offending pods.
    """
    td = time.strftime('%Y-%m-%d', time.localtime())
    th = time.strftime('%H:%M', time.localtime())
    Key = 'op_k8s_ingress_log'
    stat_key = 'op_k8s_ingress_stat'
    rt_key = 'op_k8s_ingress_rt'
    k8s_domains_key = 'op_k8s_domains_%s' %td
    k8s_pv_key = 'op_k8s_pv_%s' %td
    now_date = datetime.datetime.now()
    # one-minute [gte, lte] window in +08:00 local time
    lte_date = now_date.strftime('%Y-%m-%dT%H:%M:%S+08:00')
    gte_date = now_date - datetime.timedelta(minutes=1)
    gte_date = gte_date.strftime('%Y-%m-%dT%H:%M:%S+08:00')
    Domains = []
    def auto_delete_pod(pod_name,text):
        # Delete every pod whose name starts with pod_name (forces k8s to
        # reschedule it) and append the processed-pod count to the alarm text.
        # NOTE(review): reads delete_pod_key from the enclosing scope; it is
        # assigned in the alarm loop before this helper is called.
        try:
            namespace = "default"
            api_instance = client.CoreV1Api()
            ret = api_instance.list_namespaced_pod(namespace=namespace)
            for i in ret.items:
                if i.metadata.name.startswith(pod_name):
                    RC.incr(delete_pod_key, 1)
                    api_instance.delete_namespaced_pod(name=i.metadata.name, namespace=namespace, body=client.V1DeleteOptions())
                    time.sleep(30)
        except Exception as e:
            logging.error(e)
        finally:
            counts = RC.get(delete_pod_key)
            RC.delete(delete_pod_key)
            text.append('**自动处理问题pod数量:{}**'.format(counts))
            return text
    try:
        loging.write('start %s ......' % k8s_ingress_log.__name__)
        # Platform-wide concurrency / PV for the last minute
        try:
            body = {"query": {"range": {"time_iso8601": {"gte": "%s" % gte_date, "lte": "%s" % lte_date}}},
                    "aggs": {"avg_resp": {"avg": {"field": "upstream_response_time"}}}}
            res = es.search(index='k8s-ingress-log-*', body=body)
            if res['hits']['total']:
                rt = float(res['aggregations']['avg_resp']['value'])
                counts = int(res['hits']['total'])
                if rt > 1:
                    # weight the hit count by response time when slow (>1s)
                    RC.hset('%s_%s' % (Key, td), th, counts*rt)
                else:
                    RC.hset('%s_%s' % (Key, td), th, counts)
                RC.expire('%s_%s' % (Key, td), 864000)
                # accumulate the day's total k8s PV
                RC.incr(k8s_pv_key, counts)
                RC.expire(k8s_pv_key, 864000)
        except Exception as e:
            logging.error(e)
        # Per-domain status-code statistics for the last minute
        try:
            body = {'size': 0,
                    "query": {"bool": {"must": [{"range": {"time_iso8601": {"gte": gte_date, "lte": lte_date}}}]}},
                    "aggs": {"hosts": {"terms": {"field": "host.keyword", "size": 100},
                                       "aggs": {"counts": {"terms": {"field": "status", "size": 100}}}}}}
            res = es.search(index='k8s-ingress-log-*', body=body)
            for infos in res['aggregations']['hosts']['buckets']:
                try:
                    domain = infos['key']
                    Domains.append(domain)
                    counts = int(infos['doc_count'])
                    # domain set for the day
                    RC.sadd(k8s_domains_key,domain)
                    # per-domain hit count for this minute
                    RC.hset('%s_%s_%s'%(Key,domain,td),th,counts)
                    RC.expire('%s_%s_%s' % (Key, domain, td), 864000)
                    # per-domain status-code breakdown (stored as a dict repr)
                    vals = {info['key']: info['doc_count'] for info in infos['counts']['buckets']}
                    RC.hset('%s_%s_%s' % (stat_key, domain, td), th, vals)
                    RC.expire('%s_%s_%s' % (stat_key, domain, td), 864000)
                except:
                    continue
        except Exception as e:
            logging.error(e)
        try:
            # Per-domain average response time for the last minute
            body = {'size': 0,
                    "query": {"bool": {"must": [{"range": {"time_iso8601": {"gte": gte_date, "lte": lte_date}}}]}},
                    "aggs": {"hosts": {"terms": {"field": "host.keyword", "size": 100},
                                       "aggs": {"avg_resp": {"avg": {"field": "upstream_response_time"}}}}}}
            res = es.search(index='k8s-ingress-log-*', body=body)
            for infos in res['aggregations']['hosts']['buckets']:
                try:
                    domain = infos['key']
                    RC.hset('%s_%s_%s' % (rt_key, domain, td),
                            th,float('%.3f'%infos['avg_resp']['value']))
                    RC.expire('%s_%s_%s' % (rt_key, domain, td), 864000)
                except:
                    continue
        except Exception as e:
            logging.error(e)
        try:
            for domain in Domains:
                # status-code / response-time alarming per domain
                text = ['**容器平台业务报警:%s**' % domain]
                stat_vals = 0.0
                nd = now_date - datetime.timedelta(minutes=1)
                th = nd.strftime('%H:%M')
                vals = RC.hget('%s_%s_%s' % (stat_key, domain, td), th)
                if vals:
                    # stored as a dict literal above; NOTE(review): eval() of
                    # Redis-held data — assumed to be self-written content only
                    vals = eval(str(vals))
                    if 200 in vals:
                        stat_vals = vals[200]
                    if len(vals) >1:
                        total_vals = reduce(lambda x, y: x + y, vals.values())
                    else:
                        total_vals = stat_vals
                    if stat_vals >0:
                        # fraction of requests that returned HTTP 200
                        diff_vals = float(stat_vals)/float(total_vals)
                        rt_vals = RC.hget('%s_%s_%s' % (rt_key, domain, td), th)
                        if diff_vals < 0.99:
                            # count consecutive sub-99% minutes (3-min TTL)
                            Key = 'op_k8s_project_alarm'
                            RC.incr(Key, 1)
                            RC.expire(Key, 180)
                            if int(RC.get(Key)) >3:
                                db_project = db_op.project_list
                                project = db_project.query.with_entities(distinct(db_project.project)).filter(
                                    db_project.domain.like('%{}%'.format(domain))).all()
                                if project:
                                    db_k8s_deploy = db_op.k8s_deploy
                                    pod_name = db_k8s_deploy.query.with_entities(db_k8s_deploy.deployment).filter(
                                        db_k8s_deploy.project == project[0][0]).all()
                                    if pod_name:
                                        pod_name = pod_name[0][0]
                                        text.append("服务可用率:{}%".format('%.2f' % (diff_vals * 100)))
                                        if rt_vals:
                                            text.append("服务响应时间:{}ms".format(int(float(rt_vals) * 1000)))
                                        # throttle auto-healing per pod per day
                                        delete_pod_key = 'op_auto_delete_pod_%s_%s' % (pod_name, td)
                                        if not RC.exists(delete_pod_key):
                                            text = auto_delete_pod(pod_name,text)
                                            tools.dingding_msg(text)
                                # reset the consecutive-failure counter
                                RC.delete(Key)
        except Exception as e:
            logging.error(e)
    except Exception as e:
        logging.error(e)
    finally:
        db_op.DB.session.remove()
        for key in (k8s_domains_key,k8s_pv_key):
            RC.expire(key,864000)
        loging.write('complete %s !' % k8s_ingress_log.__name__)
def alarm_load():
    """Detect servers with sustained CPU load > 100 and restart the offender.

    Candidates come from the zabbix_info table; the load is confirmed against
    the last 10 minutes of InfluxDB means.  Tomcat projects are restarted via
    supervisorctl; jar projects only get a warning.  Alarms go to DingTalk.
    """
    try:
        loging.write("start %s ......" % alarm_load.__name__)
        whitelist = []
        # hostname -> (ip, ssh_port, mean cpu load); the original used a
        # factory-less defaultdict(), which behaves exactly like a dict
        dict_load = {}
        db_server = db_idc.idc_servers
        db_zabbix = db_idc.zabbix_info
        db_project = db_op.project_list
        db_project_other = db_op.project_other
        Influx_cli = InfluxDBClient(influxdb_host, influxdb_port, influxdb_user, influxdb_pw, 'zabbix_infos')
        host_infos = db_zabbix.query.with_entities(
            db_zabbix.ip, db_zabbix.ssh_port, db_zabbix.hostname, db_zabbix.update_time
        ).filter(and_(db_zabbix.cpu_load > 100, db_zabbix.icmpping == 1)).all()
        Key = "op_alarm_load_whitelist"
        if RC_CLUSTER.exists(Key):
            whitelist = RC_CLUSTER.smembers(Key)
        # Fetch the last-10-minutes means ONCE; the original re-ran the same
        # grouped query for every candidate host.
        results = None
        if host_infos:
            dt = datetime.datetime.now() - datetime.timedelta(minutes=10)
            dt = dt.strftime('%Y-%m-%dT%H:%M:%SZ')
            cmd = "select mean(*) from server_infos where time >='%s' group by hostname" % dt
            try:
                results = Influx_cli.query(cmd)
            except Exception as e:
                logging.error(e)
        # Confirm each suspect host against the Influx data
        for infos in host_infos:
            host, ssh_port, hostname, update_time = infos
            # only hosts whose zabbix record was updated today
            if time.strftime('%Y-%m-%d', time.localtime()) in update_time:
                try:
                    if not host.startswith('172.16.19.') and results:
                        for key in results.keys():
                            if hostname == key[-1]['hostname']:
                                # renamed from `infos` — the original shadowed
                                # the outer loop variable here
                                for point in results[key]:
                                    if point['mean_cpu_load'] > 100:
                                        dict_load[hostname] = (host, ssh_port, int(point['mean_cpu_load']))
                except Exception as e:
                    logging.error(e)
                    continue
        # Restart / alarm for each confirmed host
        for hostname in dict_load:
            host, ssh_port, cpu_load = dict_load[hostname]
            # Can we log in over ssh?
            try:
                Ssh = SSH.ssh(ip=host, ssh_port=ssh_port)
            except Exception as e:
                logging.error(e)
                if not hostname.startswith('nj'):
                    # BUGFIX: the original referenced an undefined `text`
                    # here (NameError); build an explicit alert instead.
                    text = ['**线上服务器预警:%s**' % hostname,
                            "CPU平均使用率:{0}%".format(cpu_load),
                            "SSH登录失败,无法自动处理!",
                            '**请及时进行处理!**']
                    Ssh_Key = "op_ssh_login_fail_%s" % hostname
                    RC.incr(Ssh_Key, 1)
                    RC.expire(Ssh_Key, 350)
                    # escalate to the ops token after 5 consecutive failures
                    if int(RC.get(Ssh_Key)) > 5:
                        tools.dingding_msg(text, token=ops_token)
                    else:
                        tools.dingding_msg(text)
            else:
                try:
                    Key = 'op_alarm_load_%s' % hostname
                    Project = None
                    # BUGFIX: initialize text; the jar branch could leave it
                    # unbound before the `if text ...` check below.
                    text = None
                    # BUGFIX: the original assigned `ops_token = None` below,
                    # turning ops_token into a function-local and breaking the
                    # ssh-failure branch above; use a separate local instead.
                    token = ops_token
                    RC_CLUSTER.incr(Key, 5)
                    RC_CLUSTER.expire(Key, 600)
                    ctime = int(RC_CLUSTER.get(Key))
                    if hostname not in whitelist:
                        # Find the most CPU-hungry process
                        results = Ssh.Run("ps -aux | sort -k3nr |head -n 1")
                        if results['stdout']:
                            fields = results['stdout'][0].strip().split()
                            try:
                                if fields[-1].endswith('-rpc.jar'):
                                    pro_jar = fields[-1]
                                    if pro_jar in ['moji-location-rpc.jar']:
                                        Project = pro_jar.split('.')[0]
                                else:
                                    for line in fields:
                                        if '-Dcatalina.home=' in line:
                                            Project = line.strip().split('/')[-1]
                                            break
                            except Exception as e:
                                logging.error(e)
                        if Project:
                            try:
                                # Is it a registered tomcat project?
                                ret = db_project.query.filter(and_(db_project.ip == host, db_project.ssh_port == ssh_port)).all()
                                if ret:
                                    # restart the problem tomcat
                                    result = Ssh.Run("supervisorctl restart {0}".format(Project))
                                    if result['stderr']:
                                        text = ['**线上服务重启:%s**' % hostname,
                                                "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                "相关进程:{0}".format(Project),
                                                '**服务重启失败,需手动处理!**']
                                    else:
                                        text = ['**线上服务重启:%s**' % hostname,
                                                "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                "相关进程:{0}".format(Project),
                                                '**服务重启成功!**']
                                    # restart notices go through the default token
                                    token = None
                                else:
                                    # jar project: warn only, never restart
                                    server_id = db_server.query.with_entities(db_server.id).filter(db_server.hostname == hostname).all()
                                    if server_id and server_id[0]:
                                        ret = db_project_other.query.filter(db_project_other.server_id == int(server_id[0][0])).all()
                                        if ret:
                                            text = ['**线上服务器预警:%s**' % hostname,
                                                    "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                    "相关进程:{0}".format(Project),
                                                    '**请及时进行处理!**']
                                if text and not hostname.startswith('nj'):
                                    tools.dingding_msg(text, token)
                            except Exception as e:
                                logging.error(e)
                finally:
                    Ssh.Close()
    finally:
        loging.write("%s complete!" % alarm_load.__name__)
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
def Redis_info(info):
    """Discover one Redis instance over ssh and sync it into the inventory DB.

    info: (ip, ssh_port, app_port) tuple.  Finds the redis process, locates
    its .conf to extract masterauth/requirepass, queries INFO for the role,
    then inserts/updates the redis_info rows for this instance and its
    slaves.  If the port is unreachable, stale rows are deleted instead.
    Relies on module-level ssh_ports/server_ids maps and update_date.
    """
    ip, ssh_port, app_port = info
    # initialize discovery state
    masterauth = None
    requirepass = None
    pid = None
    conf_dir = None
    conf_file = ""
    redis_type = {'master': '否', 'slave': '否', 'cluster': '否'}
    # Is the redis port reachable at all?
    if tcpping(host=ip, port=app_port, timeout=3):
        try:
            Ssh = SSH.ssh(ip=ip, ssh_port=ssh_port)
        except:
            pass
        else:
            # Find the redis process listening on app_port
            cmd = "netstat -lntp|grep :%s" % app_port
            results = Ssh.Run(cmd)
            if results['stdout']:
                for line in results['stdout'][0].split():
                    if '/redis' in line:
                        pid = line.split('/')[0]
                        break
            if pid:
                cmd = "/bin/ps -ef|grep -v grep|grep {}".format(pid)
                results = Ssh.Run(cmd)
                if results['stdout']:
                    result = results['stdout'][0]
                    if 'cluster' in result:
                        redis_type['cluster'] = '是'
                    else:
                        try:
                            # Last ps field is usually the config file path
                            result = results['stdout'][0].split()[-1]
                            if '/' in result:
                                conf_file = "/usr/local/moji/redis/etc/{}".format(result.split('/')[-1])
                            if not conf_file.endswith('.conf'):
                                # Fall back: find the working dir via lsof and
                                # grep for a conf declaring this port
                                cmd = "lsof -p {}|grep 'cwd'".format(pid)
                                cwd = Ssh.Run(cmd)
                                if cwd['stdout']:
                                    for line in cwd['stdout']:
                                        if 'redis' in line:
                                            conf_dir = line.split()[-1]
                                            break
                                if conf_dir:
                                    cmd = "grep {0} -r {1}/|grep '.conf:port'".format(app_port, conf_dir)
                                    results = Ssh.Run(cmd)
                                    if results['stdout']:
                                        for line in results['stdout']:
                                            if ':port {}'.format(app_port) in line:
                                                conf_file = line.split(':')[0]
                            if conf_file.endswith('.conf'):
                                # Extract auth settings from the conf file
                                cmd = "grep masterauth {}".format(conf_file)
                                results = Ssh.Run(cmd)
                                if results['stdout']:
                                    masterauth = results['stdout'][0].split()[-1].strip()
                                cmd = "grep requirepass {}".format(conf_file)
                                pw_result = Ssh.Run(cmd)
                                if pw_result['stdout']:
                                    requirepass = pw_result['stdout'][0].split()[-1].strip()
                            RC = redis.StrictRedis(ip, int(app_port), decode_responses=True)
                            if requirepass:
                                RC = redis.StrictRedis(ip, int(app_port), password=requirepass, decode_responses=True)
                            Infos = RC.info()
                            if Infos['role'] == 'master':
                                redis_type['master'] = '是'
                            if Infos['role'] == 'slave':
                                redis_type['slave'] = '是'
                            counts = int((Infos['connected_slaves']))
                        except:
                            pass
                        else:
                            try:
                                # Record/refresh the slave rows of this instance
                                if counts > 0:
                                    for i in range(counts):
                                        Info = Infos['slave%s' % i]
                                        # redis-py may return a dict or 'ip,port,state'
                                        if isinstance(Info, dict):
                                            slave_ip = Info['ip']
                                            slave_port = Info['port']
                                            slave_status = Info['state']
                                        else:
                                            slave_ip, slave_port, slave_status = Info.split(',')
                                        if slave_status == 'online' and int(slave_port) > 1024:
                                            try:
                                                SSH_port = ssh_ports['%s:%s' % (slave_ip, slave_port)]
                                                server_id = server_ids['%s:%s' % (slave_ip, SSH_port)]
                                            except:
                                                # Fall back: resolve via secondary-ip match
                                                server_id = slave_ip
                                                servers = db_servers.query.with_entities(db_servers.ip).filter(db_servers.s_ip.like('%{0};%'.format(slave_ip))).all()
                                                if servers:
                                                    for server in servers:
                                                        val = db_third.query.filter(and_(db_third.ip == server[0], db_third.app_port == slave_port)).all()
                                                        if val:
                                                            SSH_port = ssh_ports['%s:%s' % (server[0], slave_port)]
                                                            server_id = server_ids['%s:%s' % (server[0], SSH_port)]
                                                            break
                                            try:
                                                master_id = server_ids['%s:%s' % (ip, ssh_port)]
                                            except:
                                                master_id = ip
                                            val = db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == slave_port)).all()
                                            if val:
                                                db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == slave_port)).update(
                                                    {db_redis.masterauth: masterauth, db_redis.requirepass: requirepass,
                                                     db_redis.master: '否', db_redis.slave: '是', db_redis.cluster: '否',
                                                     db_redis.Master_Host: master_id, db_redis.Master_Port: app_port,
                                                     db_redis.update_date: update_date})
                                                db_idc.DB.session.commit()
                                            else:
                                                # BUGFIX: the original passed Master_host=
                                                # (wrong case); the model attribute is
                                                # Master_Host, so the insert raised
                                                # TypeError and was silently rolled back.
                                                c = db_redis(server_id=server_id, port=slave_port, masterauth=masterauth,
                                                             requirepass=requirepass, master='否', slave='是', cluster='否',
                                                             Master_Host=master_id, Master_Port=app_port, update_date=update_date)
                                                db_idc.DB.session.add(c)
                                                db_idc.DB.session.commit()
                            except:
                                db_idc.DB.session.rollback()
                            try:
                                # Record/refresh the master or cluster row
                                if redis_type['master'] == '是' or redis_type['cluster'] == '是':
                                    try:
                                        server_id = server_ids['%s:%s' % (ip, ssh_port)]
                                    except:
                                        server_id = ip
                                        servers = db_servers.query.with_entities(db_servers.ip).filter(db_servers.s_ip.like('%{0};%'.format(ip))).all()
                                        if servers:
                                            for server in servers:
                                                val = db_third.query.filter(and_(db_third.ip == server[0], db_third.app_port == app_port)).all()
                                                if val:
                                                    server_id = server_ids['%s:%s' % (server[0], ssh_port)]
                                                    break
                                    val = db_redis.query.filter(and_(db_redis.server_id == server_id, db_redis.port == app_port)).all()
                                    if val:
                                        db_redis.query.filter(
                                            and_(db_redis.server_id == server_id, db_redis.port == app_port)).update(
                                            {db_redis.masterauth: masterauth, db_redis.requirepass: requirepass,
                                             db_redis.master: redis_type['master'], db_redis.slave: redis_type['slave'],
                                             db_redis.cluster: redis_type['cluster'],
                                             db_redis.Master_Host: '', db_redis.Master_Port: '',
                                             db_redis.update_date: update_date})
                                        db_idc.DB.session.commit()
                                    else:
                                        loging.write("add new redis %s %s ......" % (ip, app_port))
                                        # BUGFIX: Master_host= -> Master_Host= (same typo as above)
                                        c = db_redis(server_id=server_id, port=app_port, masterauth=masterauth,
                                                     requirepass=requirepass, master=redis_type['master'],
                                                     slave=redis_type['slave'], cluster=redis_type['cluster'],
                                                     Master_Host='', Master_Port='', update_date=update_date)
                                        db_idc.DB.session.add(c)
                                        db_idc.DB.session.commit()
                            except:
                                db_idc.DB.session.rollback()
                            finally:
                                Ssh.Close()
    else:
        # Port unreachable: purge stale inventory rows for this instance
        loging.write("delete not exist redis %s %s ......" % (ip, app_port))
        v = db_redis.query.filter(and_(db_redis.server_id == server_ids['%s:%s' % (ip, ssh_port)], db_redis.port == app_port)).all()
        for c in v:
            db_idc.DB.session.delete(c)
            db_idc.DB.session.commit()
        v = db_third.query.filter(and_(db_third.ip == ip, db_third.app_port == app_port)).all()
        for c in v:
            db_idc.DB.session.delete(c)
            db_idc.DB.session.commit()