def get_node_list(nodes, param, tbl=DB.NODE_INFO_TBL, add_cond=''):
    """Select columns for one node, or for every node, from a table.

    Args:
        nodes: node name to match against the ``nodename`` column, or the
            literal ``'all'`` to select every row.
        param: comma separated column list placed after ``SELECT``.
        tbl: table name (defaults to ``DB.NODE_INFO_TBL``).
        add_cond: optional extra SQL condition appended to the WHERE clause.

    Returns:
        The list of rows from ``fetchall()``, or ``None`` when the query
        fails (the failure is logged).

    ``param``, ``tbl`` and ``add_cond`` are SQL fragments and cannot be bound
    as parameters, so they must only come from trusted callers.
    """
    try:
        sql = 'SELECT ' + param + ' FROM ' + tbl
        sql_args = ()

        if nodes == 'all':
            if add_cond != '':
                sql = sql + ' WHERE ' + add_cond
        else:
            # The node name can originate from a REST request, so it is bound
            # as a parameter instead of being spliced into the statement.
            # NOTE(review): uses the qmark ('?') placeholder style of sqlite3;
            # confirm DB.connection() returns a sqlite3 connection.
            sql = sql + ' WHERE nodename = ?'
            sql_args = (nodes,)
            if add_cond != '':
                sql = sql + ' and ' + add_cond

        conn = None
        try:
            with DB.connection() as conn:
                nodes_info = conn.cursor().execute(sql, sql_args).fetchall()
        finally:
            # Close the connection even when the query raised.
            if conn is not None:
                conn.close()

        return nodes_info
    except Exception:
        LOG.exception()
        return None
def net_check(node):
    """Ping ``node`` once and report whether it answered.

    Only runs when the watchdog method is configured as ``'ping'``.

    Returns:
        ``'ok'`` when ping exits with status 0, ``'nok'`` otherwise, and
        ``None`` when another method is configured or an error was logged.
    """
    try:
        if CONF.watchdog()['method'] != 'ping':
            return None

        wait = CONF.watchdog()['timeout']
        if sys.platform == 'darwin':
            # the configured value is multiplied by 1000 on macOS
            wait = wait * 1000

        ping_proc = Popen('ping -c1 -W%d -n %s' % (wait, node),
                          stdout=PIPE, stderr=PIPE, shell=True)
        ping_proc.communicate()

        if ping_proc.returncode == 0:
            return 'ok'

        LOG.error("'%s' Network Check Error(%d) ", node, ping_proc.returncode)
        return 'nok'
    except:
        LOG.exception()
def regi_url(url, auth):
    """Register an event-receiver URL and return the currently active events.

    Args:
        url: callback URL supplied by the REST client.
        auth: value of the client's ``Authorization`` header.

    Returns:
        ``{'Result': 'SUCCESS', 'Event list': [...]}`` when the URL is already
        registered or was inserted, otherwise ``{'Result': 'FAIL'}``.
    """
    try:
        # url comes straight from the request body: bind it as a parameter.
        # NOTE(review): '?' is the sqlite3 placeholder style; confirm the
        # driver behind DB.connection().
        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL + ' WHERE url = ?'
        sql_evt = 'SELECT * FROM ' + DB.EVENT_TBL

        conn = None
        try:
            with DB.connection() as conn:
                url_info = conn.cursor().execute(sql, (url,)).fetchall()
                evt_list = conn.cursor().execute(sql_evt).fetchall()
        finally:
            # release the connection even if one of the queries failed
            if conn is not None:
                conn.close()

        # Report every event whose grade is neither 'ok' nor 'normal'.
        event_list = []
        for nodename, item, grade, desc, evt_time in evt_list:
            if grade not in ['ok', 'normal']:
                event_list.append({
                    'event': 'occur',
                    'system': nodename,
                    'item': item,
                    'grade': grade,
                    'desc': desc,
                    'time': evt_time
                })

        # if already exist
        if len(url_info) == 1:
            return {'Result': 'SUCCESS', 'Event list': event_list}

        # insert db
        # DB.sql_execute() only accepts a finished statement, so embedded
        # single quotes are doubled (standard SQL escaping) before splicing.
        safe_url = url.replace("'", "''")
        safe_auth = auth.replace("'", "''")
        sql = 'INSERT INTO ' + DB.REGI_SYS_TBL + \
              ' VALUES (\'' + safe_url + '\', \'' + safe_auth + '\' )'

        if DB.sql_execute(sql) == 'SUCCESS':
            return {'Result': 'SUCCESS', 'Event list': event_list}
        return {'Result': 'FAIL'}
    except Exception:
        LOG.exception()
        return {'Result': 'FAIL'}
def parse_command(req_obj):
    """Run the handler registered for a command request and build the reply.

    Args:
        req_obj: decoded request with the keys ``'command'`` and ``'system'``
            and an optional ``'param'``.

    Returns:
        A dict with ``command``, ``system``, ``param``, ``result`` and
        ``time``; ``{'Result': 'FAIL'}`` if anything raises.
    """
    try:
        command = req_obj['command']
        system = req_obj['system']

        # 'param' is optional and defaults to an empty string.
        try:
            param = req_obj['param']
        except KeyError:
            param = ''

        res_body = dict()
        res_body['command'] = command
        res_body['system'] = system
        res_body['param'] = param

        # Pass the defaulted value: indexing req_obj['param'] again here would
        # raise KeyError for requests that omit the parameter.
        res_body['result'] = COMMAND_MAP[command](system, param)
        res_body['time'] = str(datetime.now())

        return res_body
    except Exception:
        LOG.exception()
        return {'Result': 'FAIL'}
def proc_dis_system(node, dummy):
    """Return the stored ping/app/cpu/memory/disk status keyed by node name.

    ``dummy`` is accepted for signature compatibility and is not used.
    Returns ``{'Result': 'FAIL'}`` if the lookup or unpacking raises.
    """
    try:
        status_rows = get_node_list(
            node, 'nodename, ping, app, cpu, memory, disk', DB.STATUS_TBL)

        summary = {}
        for name, ping, app, cpu, memory, disk in status_rows:
            summary[name] = dict(ping=ping, app=app, cpu=cpu,
                                 memory=memory, disk=disk)

        return summary
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def proc_dis_node(node, param):
    """Return the stored node list or port list for the given system.

    Args:
        node: node name, or ``'all'``.
        param: ``'list'`` or ``'port'``.

    Returns:
        ``{nodename: value}`` where a stored ``'none'`` becomes ``'FAIL'``;
        ``{'fail': ...}`` when no row matches; ``{'Result': 'FAIL'}`` for an
        unsupported ``param`` or any error.
    """
    try:
        columns = {
            'list': 'nodename, nodelist',
            'port': 'nodename, port'
        }

        # Reject unknown parameters explicitly instead of failing later on an
        # unbound local variable.
        if param not in columns:
            LOG.error("Unsupported parameter => %s", param)
            return {'Result': 'FAIL'}

        nodes_info = get_node_list(node, columns[param], DB.ONOS_TBL)

        # get_node_list() returns None when the query itself failed
        if nodes_info is None:
            return {'Result': 'FAIL'}

        if len(nodes_info) == 0:
            return {'fail': 'This is not a command on the target system.'}

        res_result = dict()
        for nodename, value in nodes_info:
            if value == 'none':
                res_result[nodename] = 'FAIL'
            else:
                # NOTE(review): eval() executes whatever text is stored in the
                # column; safe only while the table is written exclusively by
                # this service. Consider ast.literal_eval().
                res_result[nodename] = eval(value)

        return res_result
    except Exception:
        LOG.exception()
        return {'Result': 'FAIL'}
def get_service_list():
    """Fetch the instance names known to the XOS REST server.

    Runs ``curl`` against ``<xos_rest_server>/api/core/instances/`` and
    collects ``instance_name`` from every returned entry.

    Returns:
        A list of instance names. The list is empty when the command fails,
        and holds whatever was collected before an error when parsing fails
        (errors are logged).
    """
    service_list = []

    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/instances/'
        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            # Always hand back a list so callers can iterate and use `in`
            # without special-casing a string.
            return service_list

        for instance_info in json.loads(output):
            name = instance_info['instance_name']
            LOG.info('swarm_instance_name = ' + name)
            service_list.append(name)
    except Exception:
        LOG.exception()

    return service_list
def push_event(node_name, item, grade, pre_grade, reason, time):
    """Record an event in the history log and POST it to every registered URL.

    Args:
        node_name: system the event belongs to.
        item: monitored item name.
        grade: new grade of the item.
        pre_grade: grade before the change.
        reason: description forwarded in the notification.
        time: timestamp forwarded in the notification.

    Delivery is best effort: a receiver that does not answer is skipped.
    Any other error is logged and swallowed.
    """
    try:
        history_log.write_log('[%s][%s][%s][%s] %s', node_name, item, grade, pre_grade, reason)

        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL

        conn = None
        try:
            with DB.connection() as conn:
                url_list = conn.cursor().execute(sql).fetchall()
        finally:
            # close the connection even when the query raised
            if conn is not None:
                conn.close()

        # The payload is the same for every receiver, so encode it once.
        req_body_json = json.dumps({
            'system': node_name,
            'item': item,
            'grade': grade,
            'pre_grade': pre_grade,
            'reason': reason,
            'time': time
        })

        for url, auth in url_list:
            header = {
                'Content-Type': 'application/json',
                'Authorization': str(auth)
            }

            try:
                requests.post(str(url), headers=header, data=req_body_json, timeout=2)
            except Exception:
                # Push event does not respond; keep notifying the others.
                pass
    except Exception:
        LOG.exception()
def proc_dis_onos(node, param):
    """Return the stored ONOS application list or REST (web) list.

    Args:
        node: node name, or ``'all'``.
        param: ``'app'`` or ``'rest'``.

    Returns:
        ``{nodename: value}`` where a stored ``'fail'``/``'none'`` becomes
        ``'FAIL'``; ``{'fail': ...}`` when no row matches;
        ``{'Result': 'FAIL'}`` for an unsupported ``param`` or any error.
    """
    try:
        columns = {
            'app': 'nodename, applist',
            'rest': 'nodename, weblist'
        }

        # Reject unknown parameters explicitly instead of failing later on an
        # unbound local variable.
        if param not in columns:
            LOG.error("Unsupported parameter => %s", param)
            return {'Result': 'FAIL'}

        nodes_info = get_node_list(node, columns[param], DB.ONOS_TBL)

        # get_node_list() returns None when the query itself failed
        if nodes_info is None:
            return {'Result': 'FAIL'}

        if len(nodes_info) == 0:
            return {'fail': 'This is not a command on the target system.'}

        res_result = dict()
        for nodename, app_rest_list in nodes_info:
            if app_rest_list == 'fail' or app_rest_list == 'none':
                res_result[nodename] = 'FAIL'
            else:
                # NOTE(review): eval() executes whatever text is stored in the
                # column; safe only while the table is written exclusively by
                # this service. Consider ast.literal_eval().
                res_result[nodename] = eval(app_rest_list)

        return res_result
    except Exception:
        LOG.exception()
        return {'Result': 'FAIL'}
def proc_onos_cmd(node, cmd):
    """Execute an ONOS CLI command on the given node over SSH.

    Args:
        node: node name used to look up the address and type.
        cmd: command text passed to ``SshCommand.onos_ssh_exec``.

    Returns:
        ``{node: output}`` on success, ``{'fail': ...}`` when the node is
        unknown, is not of type ``'ONOS'`` or the command yields nothing, and
        ``{'Result': 'FAIL'}`` when an exception is raised.
    """
    try:
        nodes_info = get_node_list(node, 'ip_addr, type', DB.NODE_INFO_TBL)

        if len(nodes_info) == 0:
            return {'fail': 'This is not a command on the target system.'}

        # NOTE(review): the first matching row decides the outcome; confirm
        # this is intended when the lookup returns several rows.
        for ip, node_type in nodes_info:
            if node_type != 'ONOS':
                return {'fail': 'This is not a command on the target system.'}

            cmd_rt = SshCommand.onos_ssh_exec(ip, cmd)
            if cmd_rt is None:
                return {'fail': 'Invalid command.'}

            return {node: str(cmd_rt)}
    except Exception:
        LOG.exception()
        # Report failures the same way as the other command handlers instead
        # of implicitly returning None.
        return {'Result': 'FAIL'}
def check_resource(conn, db_log, node_name, user_name, node_ip):
    """Measure CPU, memory and disk usage of a node and store them.

    Returns:
        ``(cpu, mem, disk)`` as strings, or ``(-1, -1, -1)`` when the
        measurement itself raises. A failing DB update is only logged.
    """
    try:
        cpu = str(get_cpu_usage(user_name, node_ip, True))
        mem = str(get_mem_usage(user_name, node_ip, True))
        disk = str(get_disk_usage(user_name, node_ip, True))

        try:
            sql = ''.join([
                'UPDATE ', DB.RESOURCE_TBL,
                " SET cpu = '", cpu,
                "', memory = '", mem,
                "', disk = '", disk,
                "' WHERE nodename = '", node_name, "'"
            ])
            db_log.write_log('----- UPDATE RESOURCE INFO -----\n' + sql)

            outcome = DB.sql_execute(sql, conn)
            if outcome != 'SUCCESS':
                db_log.write_log('[FAIL] RESOURCE DB Update Fail.')
        except:
            LOG.exception()

        return cpu, mem, disk
    except:
        LOG.exception()
        return -1, -1, -1
def send_response_traffic_test_old(cond, auth):
    """Run the legacy traffic test and POST its result to the requester.

    Args:
        cond: request conditions; ``'transaction_id'`` and ``'app_rest_url'``
            are read here and the whole dict is passed to the test.
        auth: kept for interface compatibility; the POST authenticates with
            the configured ``CONF.onos()['rest_auth']`` instead.

    All failures are logged; nothing is returned.
    """
    trace_result_data = {}

    try:
        is_success, result = trace.traffic_test_old(cond)

        trace_result_data['result'] = 'SUCCESS' if is_success else 'FAIL'

        if result is not None:
            trace_result_data['traffic_test_result'] = result

        trace_result_data['transaction_id'] = cond['transaction_id']

        try:
            LOG.info('%s', json.dumps(trace_result_data, sort_keys=True, indent=4))
        except Exception:
            # pretty-printing is diagnostic only; never block the response
            pass

        req_body_json = json.dumps(trace_result_data)

        try:
            url = str(cond['app_rest_url'])

            # Pass an argument vector instead of a shell string: the body and
            # the URL derive from the request and must not reach a shell.
            cmd = [
                'curl', '-X', 'POST',
                '-u', CONF.onos()['rest_auth'],
                '-H', 'Content-Type: application/json',
                '-d', str(req_body_json),
                url
            ]
            # Log the target only; the full command contains credentials.
            LOG.info('%s', 'curl = POST ' + url)

            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            proc.communicate()

            if proc.returncode != 0:
                # Push noti does not respond
                LOG.error("Traffic test result push failed(%d) => %s", proc.returncode, url)
        except Exception:
            LOG.exception()
    except Exception:
        LOG.exception()
def get_content(self):
    """Read and decode the JSON request body.

    Returns:
        The decoded JSON value, or ``False`` after answering 400 when the
        ``content-length`` header is missing/zero or the body cannot be
        parsed.
    """
    content_length = self.headers.getheader('content-length')

    if not content_length:
        self.do_HEAD(400)
        # Serialise with json.dumps(): str() of a dict is a Python repr, not
        # a JSON document.
        self.wfile.write(json.dumps({"result": "FAIL", "fail_reason": "Bad Request, Content Length is 0\n"}))
        LOG.info('[TRACE REST-S] Received No Data from %s', self.client_address)
        return False

    try:
        receive_data = json.loads(self.rfile.read(int(content_length)))
        LOG.info('%s', '[Trace Conditions] \n' + json.dumps(receive_data, sort_keys=True, indent=4))
        return receive_data
    except Exception:
        LOG.exception()
        error_reason = 'Trace Request Json Data Parsing Error\n'
        self.do_HEAD(400)
        self.wfile.write(json.dumps({"result": "FAIL", "fail_reason": error_reason}))
        LOG.info('[TRACE] %s', error_reason)
        return False
def tperf_test_run(perf_conditions):
    """Create test VMs, run the traffic performance test and report the result.

    Args:
        perf_conditions: request dict; the keys ``'server'``, ``'client'``,
            ``'test_options'``, ``'transaction_id'`` and ``'app_rest_url'``
            are read.

    The result is POSTed to ``app_rest_url``. Instances returned by
    ``create_instance`` are always handed to ``delete_test_instance``, even
    when the test or the POST raises. All errors are logged.
    """
    tperf_result = dict()
    request_headers = {
        'Authorization': CONF.onos()['rest_auth'],
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }

    server_vm = client_vm = client_floatingip = None
    created = False

    try:
        # 1. create instance
        LOG.info("[T-perf server/client VM create] --- ")
        server_vm, client_vm, client_floatingip = traffic_test.create_instance(
            perf_conditions['server'], perf_conditions['client'])
        created = True

        # 2. run performance test
        if server_vm and client_vm:
            # first address of the first network attached to the server VM
            server_addr = list(server_vm.__dict__['addresses'].values())[0][0]['addr']
            tperf_result = traffic_test.tperf_command_exec(
                server_addr, client_floatingip.ip, perf_conditions['test_options'])
        else:
            tperf_result.update({
                'result': 'FAIL',
                'fail_reason': 'Fail to create instance.'
            })

        tperf_result.update({'transaction_id': perf_conditions['transaction_id']})

        LOG.info("[Traffic Performance Test] Return Result = %s", json.dumps(tperf_result))

        # send tperf test result to ONOS
        response = requests.post(perf_conditions['app_rest_url'],
                                 data=str(json.dumps(tperf_result)),
                                 headers=request_headers)
        LOG.info("[Tperf Result Send] Response = %s %s", response.status_code, response.reason)
    except Exception:
        LOG.exception()
    finally:
        # delete tperf test instance -- also after a failure, so a broken run
        # does not leave the test VMs behind
        if created:
            try:
                traffic_test.delete_test_instance(server_vm, client_vm, client_floatingip)
            except Exception:
                LOG.exception()
def onos_ha_check(conn, db_log):
    """Collect HA proxy statistics and store them in the HA table.

    Downloads the CSV report from the configured HA proxy with ``curl``,
    groups the server rows by proxy name and writes the stringified result
    to the row whose ``ha_key`` is ``"HA"``.

    Returns:
        ``{pxname: [row summary, ...]}``, or ``None`` when the download
        fails or an error is raised.
    """
    try:
        stats_url = CONF.ha()['ha_proxy_server']
        account = CONF.ha()['ha_proxy_account']

        cmd = "curl --user " + account + \
              " --header 'Accept: text/html, application/xhtml+xml, image/jxr, */*' \"" + \
              stats_url + "\""
        curl_proc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = curl_proc.communicate()

        if curl_proc.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return None

        # drop leading '#' and blank characters in front of the CSV header
        rows = csv.DictReader(output.lstrip('# ').splitlines())

        dic_stat = dict()
        for row in rows:
            # skip the stats page itself and the per-backend summary rows
            if row['pxname'].strip() == 'stats' or row['svname'].strip() == 'BACKEND':
                continue

            detail = {
                'name': row['svname'],
                'req_count': row['stot'],
                'succ_count': row['hrsp_2xx'],
                'node_sts': row['status']
            }
            dic_stat.setdefault(row['pxname'], list()).append(detail)

        try:
            sql = 'UPDATE ' + DB.HA_TBL + \
                  ' SET stats = "' + str(dic_stat) + '" WHERE ha_key = "HA"'
            db_log.write_log('----- UPDATE HA INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] HA DB Update Fail.')
        except:
            LOG.exception()

        return dic_stat
    except:
        LOG.exception()
        return None
def get_cpu_usage(username, node_ip, only_value=False):
    """Compute a CPU usage ratio from the remote ``/proc/stat`` summary line.

    The ratio is ``(f1 + f3) * 100 / (f1 + f3 + f4)`` over the
    whitespace-separated fields of the ``cpu`` line; it stays ``0.0`` when
    the line is missing or cannot be parsed.

    Returns:
        With ``only_value``: the ratio rounded to two decimals, or ``-1``
        when the command fails. Otherwise ``{'CPU': {'RATIO': ...,
        'Description': <raw line>}}`` or ``{'CPU': 'Command fail'}``.
        ``-1`` when an unexpected error is raised.
    """
    try:
        stat_line = SshCommand.ssh_exec(username, node_ip, "sudo grep 'cpu\\ ' /proc/stat")

        if stat_line is None:
            LOG.info("%s CPU check Fail", node_ip)
            return -1 if only_value else {'CPU': 'Command fail'}

        ratio = 0.0
        if 'cpu ' in stat_line:
            try:
                cols = stat_line.split()
                busy = float(cols[1]) + float(cols[3])
                ratio = busy * 100 / (busy + float(cols[4]))
            except:
                LOG.exception()

        rounded = float(format(ratio, '.2f'))
        LOG.info(" > CPU : %s", str(format(ratio, '.2f')))

        if only_value:
            return rounded

        return {
            'CPU': {
                'RATIO': rounded,
                'Description': stat_line
            }
        }
    except:
        LOG.exception()
        return -1
def get_mem_usage(username, node_ip, only_value=False):
    """Compute a memory usage ratio from the remote ``free -t -m`` output.

    The ratio is ``f2 * 100 / f1`` over the whitespace-separated fields of
    the ``Mem`` line; it stays ``0.0`` when the line is missing or cannot be
    parsed.

    Returns:
        With ``only_value``: the ratio rounded to two decimals, or ``-1``
        when the command fails. Otherwise ``{'MEMORY': {'RATIO': ...,
        'Description': <raw line>}}`` or ``{'MEMORY': 'Command fail'}``.
        ``-1`` when an unexpected error is raised.
    """
    try:
        mem_line = SshCommand.ssh_exec(username, node_ip, 'sudo free -t -m | grep Mem')

        if mem_line is None:
            LOG.info("%s Memory check Fail", node_ip)
            return -1 if only_value else {'MEMORY': 'Command fail'}

        ratio = 0.0
        if 'Mem' in mem_line:
            try:
                cols = mem_line.split()
                ratio = float(cols[2]) * 100 / float(cols[1])
            except:
                LOG.exception()

        rounded = float(format(ratio, '.2f'))
        LOG.info(" > MEMORY : %s", str(format(ratio, '.2f')))

        if only_value:
            return rounded

        return {
            'MEMORY': {
                'RATIO': rounded,
                'Description': mem_line
            }
        }
    except:
        LOG.exception()
        return -1
def run(self):
    """Daemon main loop: initialise the DB, start REST, then monitor.

    Exits the process with status 1 when the REST server cannot be started
    or a monitoring cycle raises. With an interval of 0 it only idles.
    """
    # DB initiation
    DB.db_initiation()

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print('Rest Server failed to start')
        LOG.exception()
        sys.exit(1)

    # Periodic monitoring disabled: keep the process alive and do nothing
    if CONF.watchdog()['interval'] == 0:
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)

    LOG.info("--- Periodic Monitoring Start ---")
    conn = DB.connection()

    while True:
        try:
            watchdog.periodic(conn)
            time.sleep(CONF.watchdog()['interval'])
        except:
            # announce the shutdown before leaving
            watchdog.push_event('sonawatcher', 'disconnect', 'critical',
                                'sonawatcher server shutdown', str(datetime.now()))
            conn.close()
            LOG.exception()
            sys.exit(1)
def exit(self):
    """Stop the daemon by sending SIGTERM to its process group.

    The PID is read from ``PIDFILE``. If the kill fails because the process
    no longer exists, the stale ``self.pidfile`` is removed. Any other error
    is logged.
    """
    try:
        # The with-block closes the PID file even when its content is not a
        # valid integer.
        with open(PIDFILE, 'r') as pf:
            pid = int(pf.read().strip())

        LOG.info("--- Daemon STOP [fail to check rest server] ---")

        try:
            LOG.info('PID = ' + str(pid))
            os.killpg(pid, SIGTERM)
        except OSError as err:
            # the process is already gone: clean up the stale pid file
            if "No such process" in str(err):
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
    except Exception:
        LOG.exception()
def find_swarm_manager():
    """Look up the swarm manager host through the XOS REST server.

    Queries ``<xos_rest_server>/api/core/controllers/`` with ``curl`` and
    takes the text before the first ``':'`` of the first controller's
    ``auth_url``.

    Returns:
        The host string, or ``''`` when the command fails, no controller is
        listed or an error is logged.
    """
    hostname = ''

    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        curl_proc = Popen(
            'curl -H "Accept: application/json; indent=4" -u ' + account +
            ' -X GET ' + url + '/api/core/controllers/',
            stdout=PIPE, stderr=PIPE, shell=True)
        output, error = curl_proc.communicate()

        if curl_proc.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return ''

        # Only the first controller entry is considered.
        # (A disabled check on controller_info['backend_status'] used to
        # filter entries here.)
        for controller_info in json.loads(output):
            auth_url = controller_info['auth_url']
            LOG.info('swarm_manager_auth_url = ' + auth_url)
            return str(auth_url).split(':')[0]
    except:
        LOG.exception()

    return hostname
def swarm_node_check(conn, db_log, node_name, username, node_ip, swarm_manager):
    # Check the swarm nodes listed by `docker node ls` on the swarm manager
    # and store the parsed list in the swarm table row of `node_name`.
    #
    # Returns (node_status, fail_reason):
    #   node_status is 'ok', 'nok' (a node is not ready/down, no leader, or
    #   parsing failed) or 'fail' (command returned nothing / unexpected error)
    #   fail_reason is a list of human readable messages.
    node_status = 'ok'
    node_list = []
    fail_reason = []

    try:
        # the command is run on node_ip, which in turn ssh-es to the manager
        cmd = 'ssh root@' + swarm_manager + ' \"sudo docker node ls\"'
        node_rt = SshCommand.ssh_exec(username, node_ip, cmd)

        if node_rt is not None:
            try:
                leader_flag = False
                for line in node_rt.splitlines():
                    line = line.decode('utf-8')

                    # drop the '*' marker and collapse runs of whitespace so
                    # the columns can be split on a single space
                    line = " ".join(line.replace('*', '').split())
                    tmp = line.split(' ')

                    # header row
                    if line.startswith('ID'):
                        continue

                    if 'Leader' in line:
                        node_json = {
                            'hostname': tmp[1],
                            'status': tmp[2],
                            'availability': tmp[3],
                            'manager': tmp[4]
                        }
                        leader_flag = True

                        if not ('Ready' in line and 'Active' in line):
                            node_status = 'nok'
                            fail_reason.append(tmp[1] + ' node is not ready.')
                    else:
                        node_json = {
                            'hostname': tmp[1],
                            'status': tmp[2],
                            'availability': tmp[3],
                            'manager': ''
                        }

                    if 'Down' in line:
                        node_status = 'nok'
                        fail_reason.append(tmp[1] + ' node is down.')

                    node_list.append(node_json)

                # a swarm without a leader is reported as not ok
                if not leader_flag:
                    node_status = 'nok'
                    fail_reason.append('swarm leader node does not exist.')
            except:
                LOG.exception()
                node_status = 'nok'
        else:
            LOG.error("\'%s\' Swarm Node Check Error", node_ip)
            node_status = 'fail'

        # persist whatever was parsed (possibly an empty list)
        try:
            sql = 'UPDATE ' + DB.SWARM_TBL + \
                  ' SET node = \"' + str(node_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SWARM NODE INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SWARM NODE DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        node_status = 'fail'

    return node_status, fail_reason
def swarm_service_check(conn, db_log, node_name, username, node_ip, swarm_manager):
    # Check the swarm services (`docker service ls` / `docker service ps`) on
    # the swarm manager against the instance list from get_service_list() and
    # store the parsed service and task lists in the swarm table.
    #
    # Returns (service_status, fail_reason):
    #   service_status is 'ok', 'nok' (replica mismatch or a missing service)
    #   or 'fail' (command returned nothing, parsing or unexpected error)
    #   fail_reason mixes service dicts (replica mismatch) and message strings
    #   (missing service).
    service_status = 'ok'
    service_list = []
    ps_list = []
    fail_reason = []

    try:
        cmd = 'ssh root@' + swarm_manager + ' \"sudo docker service ls\"'
        service_rt = SshCommand.ssh_exec(username, node_ip, cmd)

        # services that are expected to run
        instance_list = get_service_list()

        if service_rt is not None:
            try:
                # pass 1: every expected service must be listed with matching
                # replica counts
                for svc in instance_list:
                    find_flag = False
                    for line in service_rt.splitlines():
                        line = line.decode('utf-8')

                        # header row
                        if line.startswith('ID'):
                            continue

                        # a row is expected to have exactly five columns
                        id, name, mode, rep, img = line.split()

                        if svc == name:
                            find_flag = True
                            # replicas are reported as "<running>/<desired>"
                            rep_tmp = rep.split('/')

                            if not (rep_tmp[0] == rep_tmp[1]):
                                service_status = 'nok'
                                svc_json = {
                                    'name': name,
                                    'mode': mode,
                                    'replicas': rep,
                                    'image': img,
                                    'status': 'nok',
                                    'monitor_item': True
                                }
                                fail_reason.append(svc_json)
                            else:
                                svc_json = {
                                    'name': name,
                                    'mode': mode,
                                    'replicas': rep,
                                    'image': img,
                                    'status': 'ok',
                                    'monitor_item': True
                                }

                            service_list.append(svc_json)

                    if not find_flag:
                        service_status = 'nok'
                        fail_reason.append('swarm ' + svc + ' service does not exist.')
                        # stop at the first missing service
                        break

                # pass 2: record the remaining (not monitored) services
                for line in service_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    id, name, mode, rep, img = line.split()

                    # already handled in pass 1
                    if name in instance_list:
                        continue

                    rep_tmp = rep.split('/')

                    if not (rep_tmp[0] == rep_tmp[1]):
                        svc_json = {
                            'name': name,
                            'mode': mode,
                            'replicas': rep,
                            'image': img,
                            'status': 'nok',
                            'monitor_item': False
                        }
                    else:
                        svc_json = {
                            'name': name,
                            'mode': mode,
                            'replicas': rep,
                            'image': img,
                            'status': 'ok',
                            'monitor_item': False
                        }

                    service_list.append(svc_json)
            except:
                LOG.exception()
                service_status = 'fail'
        else:
            LOG.error("\'%s\' Swarm Service Check Error", node_ip)
            service_status = 'fail'

        # collect the task list of every expected service
        for app in instance_list:
            cmd = 'ssh root@' + swarm_manager + ' \"sudo docker service ps ' + app + '\"'
            ps_rt = SshCommand.ssh_exec(username, node_ip, cmd)

            if ps_rt is not None:
                for line in ps_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    # remove the ' \_ ' marker and collapse whitespace before
                    # splitting into columns
                    line = line.replace(' \_ ', '')
                    line = " ".join(line.split())
                    tmp = line.split(' ')

                    ps_json = {
                        'name': tmp[1],
                        'image': tmp[2],
                        'node': tmp[3],
                        'desired_state': tmp[4],
                        'current_state': tmp[5]
                    }
                    ps_list.append(ps_json)
            else:
                LOG.error("\'%s\' Swarm PS Check Error", node_ip)

        # persist whatever was parsed
        try:
            sql = 'UPDATE ' + DB.SWARM_TBL + \
                  ' SET service = \"' + str(service_list) + '\",' + \
                  ' ps = \"' + str(ps_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SWARM SERVICE/PS INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SWARM SERVICE/PS DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        service_status = 'fail'

    return service_status, fail_reason
def periodic(conn, pre_stat, db_log):
    """Run one monitoring cycle over every registered node.

    For each node the ping state is checked; ONOS nodes that answer are also
    checked for cluster, device, link and application state. Every result is
    passed through ``alarm_event.process_event`` and written to the status
    table. When alarms existed before the cycle and none remain, a site-wide
    'ok' event is raised. Pending alarm messages are flushed at the end.

    Args:
        conn: open DB connection used for all queries of the cycle.
        pre_stat: state object handed back unchanged to the caller.
        db_log: logger receiving the executed SQL statements.

    Returns:
        ``pre_stat`` (also on early exit, so the caller never loses it).
    """
    try:
        cur_info = {}

        try:
            node_list = cmd_proc.get_node_list('all', 'nodename, ip_addr, username, type, sub_type')

            if not node_list:
                LOG.info("Not Exist Node data ...")
                return pre_stat
        except Exception:
            LOG.exception()
            return pre_stat

        # Read cur alarm status
        sql = 'SELECT nodename, item, grade FROM ' + DB.EVENT_TBL
        db_log.write_log(sql)
        cur_grade = conn.cursor().execute(sql).fetchall()

        old_nok_count = 0
        for nodename, item, grade in cur_grade:
            cur_info.setdefault(nodename, {})[item] = grade
            if grade != 'ok':
                old_nok_count += 1

        new_nok_count = 0
        for node_name, node_ip, user_name, node_type, sub_type in node_list:
            # values stored for nodes whose ONOS checks are not run
            onos_cluster = 'fail'
            onos_device = 'fail'
            onos_link = 'fail'
            onos_app = 'fail'

            # NOTE(review): cur_info[node_name][...] raises KeyError for a
            # node without rows in the event table, which aborts the cycle;
            # confirm every node is seeded there.

            # ping check
            ping = net_check(node_ip)
            ping_reason = []
            if ping != 'ok':
                # The reason must go into ping_reason; appending to an
                # undefined name aborted the cycle whenever a ping failed.
                ping_reason.append('ping check failed on ' + node_ip)
                new_nok_count += 1
            ping = alarm_event.process_event(conn, db_log, node_name, node_type, 'PING',
                                             cur_info[node_name]['PING'], ping, ping_reason)

            if ping == 'ok':
                if node_type.upper() == 'ONOS':
                    # check connection
                    onos_cluster, onos_device, onos_link, onos_app, \
                        cluster_reason, device_reason, link_reason, app_reason = \
                        chk_onos.onos_check(conn, db_log, node_name, node_ip)

                    onos_cluster = alarm_event.process_event(
                        conn, db_log, node_name, node_type, 'ONOS_CLUSTER',
                        cur_info[node_name]['ONOS_CLUSTER'], onos_cluster, cluster_reason)
                    onos_device = alarm_event.process_event(
                        conn, db_log, node_name, node_type, 'ONOS_DEVICE',
                        cur_info[node_name]['ONOS_DEVICE'], onos_device, device_reason)
                    onos_link = alarm_event.process_event(
                        conn, db_log, node_name, node_type, 'ONOS_LINK',
                        cur_info[node_name]['ONOS_LINK'], onos_link, link_reason)
                    onos_app = alarm_event.process_event(
                        conn, db_log, node_name, node_type, 'ONOS_APP',
                        cur_info[node_name]['ONOS_APP'], onos_app, app_reason)

                    if onos_cluster != 'ok':
                        new_nok_count += 1
                    if onos_device != 'ok':
                        new_nok_count += 1
                    if onos_link != 'ok':
                        new_nok_count += 1
                    if onos_app != 'ok':
                        new_nok_count += 1

            try:
                sql = 'UPDATE ' + DB.STATUS_TBL + \
                      ' SET' + \
                      ' PING = \'' + ping + '\',' + \
                      ' ONOS_CLUSTER = \'' + onos_cluster + '\',' + \
                      ' ONOS_DEVICE = \'' + onos_device + '\',' + \
                      ' ONOS_LINK = \'' + onos_link + '\',' + \
                      ' ONOS_APP = \'' + onos_app + '\',' + \
                      ' time = \'' + str(datetime.now()) + '\'' + \
                      ' WHERE nodename = \'' + node_name + '\''
                db_log.write_log('----- UPDATE TOTAL SYSTEM INFO -----\n' + sql)

                if DB.sql_execute(sql, conn) != 'SUCCESS':
                    db_log.write_log('[FAIL] TOTAL SYSTEM INFO DB Update Fail.')
            except Exception:
                LOG.exception()

            # skip the per-node status log when nothing was in alarm before
            if old_nok_count > 0:
                LOG.info('chk_onos[%s]: ping=%s cluster=%s device=%s link=%s app=%s' %
                         (node_name, ping, onos_cluster, onos_device, onos_link, onos_app))

        # everything recovered during this cycle: raise the site-wide 'ok'
        if old_nok_count > 0 and new_nok_count == 0:
            alarm_event.process_event(conn, db_log, 'ALL', 'SITE', 'STATUS', 'none', 'ok', [])

        # send all alarm messages pending
        alarm_event.flush_event_alarm()
    except Exception:
        LOG.exception()

    return pre_stat
def do_GET(self):
    """Handle GET requests of the REST server.

    ``/alive-check`` is answered without authentication. Every other path
    requires authentication and a JSON body:

    * ``/command``    - run a monitoring command
    * ``/regi``       - register an event receiver URL
    * ``/event_list`` - list events for a receiver URL
    * ``/unregi``     - unregister a receiver URL

    Responds 401 when authentication fails, 400 for a missing or malformed
    body and 404 for unknown paths or commands.
    """
    # health check
    if self.path.startswith('/alive-check'):
        self.do_HEAD(200)
        self.wfile.write('ok\n')
        return

    if not self.authentication():
        self.do_HEAD(401)
        return

    content_length = self.headers.getheader('Content-Length')
    if not content_length:
        self.do_HEAD(400)
        self.wfile.write('Bad Request, Content Length is 0\n')
        return

    # A malformed length or body must produce a 400 answer instead of an
    # unhandled exception that leaves the client without a response.
    try:
        request_obj = json.loads(self.rfile.read(int(content_length)))
    except ValueError:
        LOG.exception()
        self.do_HEAD(400)
        self.wfile.write('Bad Request, invalid JSON body\n')
        return

    LOG.info('[REST-SERVER] CLIENT INFO = ' + str(self.client_address))
    LOG.info('[REST-SERVER] RECV BODY = \n' + json.dumps(request_obj, sort_keys=True, indent=4))

    if self.path.startswith('/command'):
        try:
            if command.exist_command(request_obj):
                res_body = command.parse_command(request_obj)

                self.do_HEAD(200)
                self.wfile.write(json.dumps(res_body))

                LOG.info('[REST-SERVER] RES BODY = \n%s',
                         json.dumps(res_body, sort_keys=True, indent=4))
            else:
                self.do_HEAD(404)
                self.wfile.write('command not found')

                LOG.info('[REST-SERVER] ' + 'command not found')
        except Exception:
            LOG.exception()

    elif self.path.startswith('/regi'):
        try:
            self.do_HEAD(200)

            url = str(request_obj['url'])
            res_body = command.regi_url(url, self.headers.getheader('Authorization'))

            self.wfile.write(json.dumps(res_body))

            LOG.info('[REST-SERVER] RES BODY = \n%s',
                     json.dumps(res_body, sort_keys=True, indent=4))
        except Exception:
            LOG.exception()

    elif self.path.startswith('/event_list'):
        try:
            self.do_HEAD(200)

            url = str(request_obj['url'])
            res_body = command.get_event_list(url, self.headers.getheader('Authorization'))

            self.wfile.write(json.dumps(res_body))

            LOG.info('[REST-SERVER] RES BODY = \n%s',
                     json.dumps(res_body, sort_keys=True, indent=4))
        except Exception:
            LOG.exception()

    elif self.path.startswith('/unregi'):
        try:
            self.do_HEAD(200)

            url = str(request_obj['url'])
            res_body = command.unregi_url(url)

            self.wfile.write(json.dumps(res_body))

            LOG.info('[REST-SERVER] RES BODY = \n%s',
                     json.dumps(res_body, sort_keys=True, indent=4))
        except Exception:
            LOG.exception()

    else:
        self.do_HEAD(404)
        self.wfile.write(self.path + ' not found\n')

        LOG.info('[REST-SERVER] ' + self.path + ' not found')
def do_HEAD(self, res_code):
    """Send the status line and JSON content-type header for ``res_code``.

    Every status other than 200 is additionally written to the log.
    """
    self.send_response(res_code)
    self.send_header('Content-type', 'application/json')
    self.end_headers()

    if res_code == 200:
        return

    LOG.info('[REST-SERVER] RESPONSE CODE = ' + str(res_code))
def rest_server_start():
    """Launch ``run`` in a daemon child process named 'rest_server'."""
    LOG.info("--- REST Server Start --- ")

    server_proc = multiprocess.Process(target=run, name='rest_server')
    # daemon flag: the child is terminated together with the parent process
    server_proc.daemon = True
    server_proc.start()
if pid > 0: # exit from second parent sys.exit(0) except OSError, e: sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror)) sys.exit(1) # redirect standard file descriptors si = file(self.stdin, 'r') so = file(self.stdout, 'a+') se = file(self.stderr, 'a+', 0) pid = str(os.getpid()) LOG.info("--- Daemon START ---") sys.stderr.write("\nstarted with pid %s\n" % pid) sys.stderr.flush() if self.pidfile: file(self.pidfile, 'w+').write("%s\n" % pid) atexit.register(self.delpid) os.dup2(si.fileno(), sys.stdin.fileno()) os.dup2(so.fileno(), sys.stdout.fileno()) os.dup2(se.fileno(), sys.stderr.fileno()) # delete pid file when parent process kill def delpid(self): try: os.remove(self.pidfile)
def run(self):
    # Daemon main loop: set up the DB log, initialise the DB, start the REST
    # server and then run the periodic monitoring. Before every monitoring
    # cycle the local REST server is probed (up to three attempts); after the
    # third failed attempt a shutdown event is pushed and the daemon stops.
    db_log = USER_LOG()
    db_log.set_log('db.log', CONF.base()['log_rotate_time'], CONF.base()['log_backup_count'])

    # state object threaded through watchdog.periodic() between cycles
    pre_stat = dict()

    # DB initiation
    DB.db_initiation(db_log)

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print 'Rest Server failed to start'
        LOG.exception()
        self.exit()

    # Periodic monitoring
    if CONF.watchdog()['interval'] == 0:
        # monitoring disabled: keep the process alive and do nothing
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)
    else:
        LOG.info("--- Periodic Monitoring Start ---")
        history_log.write_log("--- Event History Start ---")

        conn = DB.connection()

        # set once the REST server is considered dead; ends the outer loop
        exitFlag = False
        while True:
            try:
                # probe the REST server, at most three attempts per cycle
                i = 0
                while i < 3:
                    i = i + 1
                    # check rest server
                    try:
                        url = 'http://' + socket.gethostbyname(socket.gethostname()) + ':' + str(CONF.rest()['rest_server_port']) + '/alive-check'

                        cmd = 'curl -X GET \"' + url + '\"'
                        LOG.info('cmd = ' + cmd)
                        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
                        output, error = result.communicate()

                        if result.returncode != 0:
                            LOG.info('REST SERVER CHECK FAIL [' + str(i) + ']')

                            if i == 3:
                                # third failure: announce shutdown and stop
                                LOG.info('fail to check rest server.')
                                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT', 'critical', 'normal', 'sonawatcher server shutdown', str(datetime.now()))
                                conn.close()
                                exitFlag = True
                                self.exit()
                                break
                        else:
                            # REST server answered; no further attempts needed
                            break
                    except:
                        LOG.exception()

                if exitFlag:
                    break

                pre_stat = watchdog.periodic(conn, pre_stat, db_log)

                time.sleep(CONF.watchdog()['interval'])
            except:
                # NOTE(review): the loop keeps running after conn.close()
                # here, so the next cycle works on a closed connection --
                # confirm whether the daemon is meant to stop at this point.
                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT', 'critical', 'normal', 'sonawatcher server shutdown', str(datetime.now()))
                conn.close()
                LOG.exception()
def swarm_check(conn, db_log, node_name, user_name, node_ip):
    # Check docker swarm nodes and the configured applications on a node and
    # store the raw command output in the swarm table.
    #
    # Returns (ret_app, ret_node), each 'ok' or 'nok'.

    # raw output kept for the DB ('fail' when the command returned nothing)
    str_node = ''
    str_service = ''
    str_ps = ''

    ret_app = 'ok'
    ret_node = 'ok'

    node_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker node ls')

    if node_rt is not None:
        try:
            leader_flag = False
            for line in node_rt.splitlines():
                line = line.decode('utf-8')
                str_node = str_node + line + '\n'

                # header row
                if line.startswith('ID'):
                    continue

                if 'Leader' in line:
                    leader_flag = True

                    # stop scanning at the first unhealthy node; lines after
                    # it are therefore not added to str_node
                    if not ('Ready' in line and 'Active' in line):
                        ret_node = 'nok'
                        break

                if 'Down' in line:
                    ret_node = 'nok'
                    break

            # a swarm without a leader is reported as not ok
            if not leader_flag:
                ret_node = 'nok'
        except:
            LOG.exception()
            ret_node = 'nok'
    else:
        LOG.error("\'%s\' Swarm Node Check Error", node_ip)
        str_node = 'fail'

    service_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker service ls')

    if service_rt is not None:
        try:
            # every configured application must be listed with matching
            # replica counts
            for app in CONF.swarm()['app_list']:
                find_flag = False
                for line in service_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    # a row is expected to have exactly five columns
                    id, name, mode, rep, img = line.split()

                    if app == name:
                        find_flag = True
                        # replicas are reported as "<running>/<desired>"
                        rep_tmp = rep.split('/')

                        if not (rep_tmp[0] == rep_tmp[1]):
                            ret_app = 'nok'
                            break

                if not find_flag:
                    ret_app = 'nok'
                    break
        except:
            LOG.exception()
            ret_app = 'nok'

        # keep the complete listing for the DB
        for line in service_rt.splitlines():
            line = line.decode('utf-8')
            str_service = str_service + line + '\n'
    else:
        LOG.error("\'%s\' Swarm Service Check Error", node_ip)
        str_service = 'fail'
        ret_app = 'nok'

    # collect `docker service ps` output per configured application
    try:
        for app in CONF.swarm()['app_list']:
            ps_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker service ps ' + app)

            str_ps = str_ps + ' * ' + app + '\n\n'

            if ps_rt is not None:
                for line in ps_rt.splitlines():
                    line = line.decode('utf-8')
                    str_ps = str_ps + line + '\n'
            else:
                LOG.error("\'%s\' Swarm PS Check Error", node_ip)
                str_ps = str_ps + 'Command failure(' + app + ')\n'

            str_ps = str_ps + '\n'
    except:
        LOG.exception()

    try:
        sql = 'UPDATE ' + DB.SWARM_TBL + \
              ' SET node = \'' + str_node + '\',' + \
              ' service = \'' + str_service + '\',' + \
              ' ps = \'' + str_ps + '\'' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE SWARM INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] SWARN DB Update Fail.')
    except:
        LOG.exception()

    return ret_app, ret_node
def vrouter_check(conn, db_log, node_name, user_name, node_ip):
    # Check the docker containers of a gateway node, the applications active
    # in its ONOS container and the ONOS routing table, then store all three
    # lists in the openstack table.
    #
    # Returns (ret_docker, fail_list):
    #   ret_docker is 'ok', 'nok' (a container is not up or an expected ONOS
    #   application is not active) or 'fail' (docker ps returned nothing or no
    #   ONOS container was found)
    #   fail_list holds the dicts of the failing containers/applications.
    ret_docker = 'ok'

    docker_list = []
    fail_list = []

    # container id of the ONOS container, filled while scanning `docker ps`
    onos_id = ''

    docker_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker ps')

    if docker_rt is not None:
        try:
            for docker in CONF.openstack()['docker_list']:
                for line in docker_rt.splitlines():
                    # header row
                    if line.startswith('CONTAINER'):
                        continue

                    tmp_line = line.split()

                    # match the configured name preceded by a blank
                    if ' ' + docker in line:
                        if not 'Up' in line:
                            docker_json = {'name': docker, 'status': 'nok', 'type': 'docker'}
                            fail_list.append(docker_json)
                            ret_docker = 'nok'
                        else:
                            docker_json = {'name': docker, 'status': 'ok', 'type': 'docker'}

                        docker_list.append(docker_json)

                    # second column contains 'onos': remember the id from the
                    # first column
                    if 'onos' in tmp_line[1]:
                        onos_id = tmp_line[0]
        except:
            LOG.exception()
    else:
        LOG.error("\'%s\' Vrouter Node Check Error", node_ip)
        ret_docker = 'fail'

    onos_app_list = []
    route_list = []

    if not onos_id == '':
        try:
            # get onos container ip
            onos_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker inspect ' + onos_id + ' | grep IPAddress')

            if onos_rt is not None:
                # NOTE(review): onos_ip stays unbound when no line starts
                # with "IPAddress; the resulting error is caught below.
                for line in onos_rt.splitlines():
                    line = line.strip()
                    if line.startswith('\"IPAddress'):
                        tmp = line.split(':')
                        onos_ip = tmp[1].strip().replace('\"', '').replace(',', '')
                        break

                # active ONOS applications
                app_list = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'apps -a -s')

                app_active_list = list()
                for line in app_list.splitlines():
                    if line.startswith('fail'):
                        continue

                    # third dot-separated part of the line, up to the next blank
                    app_active_list.append(line.split(".")[2].split()[0])

                for app in CONF.openstack()['onos_vrouter_app_list']:
                    if app in app_active_list:
                        app_json = {'name': app, 'status': 'ok', 'type': 'onos_app'}
                    else:
                        app_json = {'name': app, 'status': 'nok', 'type': 'onos_app'}
                        fail_list.append(app_json)
                        ret_docker = 'nok'

                    onos_app_list.append(app_json)

                # routing table
                str_route = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'routes')

                for line in str_route.splitlines():
                    line = line.strip()

                    # skip caption, header and summary lines
                    if (line.startswith('Table') or line.startswith('Network') or line.startswith('Total')):
                        continue

                    new_line = " ".join(line.split())

                    if new_line.startswith('fail'):
                        continue

                    tmp = new_line.split(' ')
                    route_json = {'network': tmp[0], 'next_hop': tmp[1]}
                    route_list.append(route_json)
        except:
            LOG.exception()
    else:
        LOG.info('can not find onos_id.')
        ret_docker = 'fail'

    # persist whatever was collected
    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET docker = \"' + str(docker_list) + '\",' + \
              ' onosApp = \"' + str(onos_app_list) + '\",' + \
              ' routingTable = \"' + str(route_list) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE GATEWAY INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] GATEWAY DB Update Fail.')
    except:
        LOG.exception()

    return ret_docker, fail_list