def health_display_report(self):
    """Render the monitor summary as one OK/CRITICAL row per service.

    ``self.summaryDict`` is read with keys of the form
    ``"<host>##<description>##<status>"`` and values accepted by ``int()``.
    A key whose status is ``OK`` is paired with the matching ``CRITICAL``
    key; a key with any other status is paired with the matching ``OK``
    key. Each host/description pair produces a single table row.
    """
    # The (misspelled) title is kept unchanged: it is the name the report
    # table is created under.
    table_name = "Nagios Montitor Summary Repory"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Description", "OK(secs)",
                             "CRITICAL(secs)"])

    summary = self.summaryDict
    already_reported = set()
    for key in summary:
        if key in already_reported:
            continue
        parts = key.split("##")
        host, description, status = parts[0], parts[1], parts[2]
        is_ok = status == "OK"
        counterpart = host + "##" + description + \
            ("##CRITICAL" if is_ok else "##OK")

        ok_sec = 0
        crit_sec = 0
        if is_ok:
            ok_sec = int(summary[key])
        else:
            crit_sec = int(summary[key])

        if counterpart in summary:
            already_reported.add(counterpart)
            # Decide by status, not by "ok_sec > 0": an OK entry of zero
            # seconds must not have the CRITICAL value written into the
            # OK column.
            if is_ok:
                crit_sec = int(summary[counterpart])
            else:
                ok_sec = int(summary[counterpart])

        infra.add_table_rows(self, table_name,
                             [[host, description, ok_sec, crit_sec]])
def node_disruption(self, sync=None, finish_execution=None):
    """Force-reboot the first node matching the requested role.

    The role is read from the plugin input arguments, the node details
    (``ip``, ``user``, ``password``) from ``infra.get_openstack_config()``.
    After issuing ``reboot -f`` over SSH the node is pinged until it
    answers, and a row is added to the "Node Disruption" report table.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: stored on ``self``; not otherwise used here.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Node Disruption plugin")
    table_name = "Node Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Node", "IP", "TimeStamp",
                             "Status of Disruption"])

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    # Fall back to an empty dict so a missing argument block cannot leave
    # ``role`` unbound.
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    role = input_args.get('role')

    nodes_to_be_disrupted = []
    if role:
        for node in host_config:
            # ``or ()`` keeps the membership test valid when a node has
            # no role configured.
            if role in (host_config[node].get('role') or ()):
                infra.display_on_terminal(self, node, " will be disrupted ")
                nodes_to_be_disrupted.append(node)

    node_reboot_command = "reboot -f "
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    if not nodes_to_be_disrupted:
        infra.display_on_terminal(self, "No node found for role ",
                                  str(role), "color=red")
        infra.display_on_terminal(self, "Finishing Node Disruption")
        return

    ha_interval = self.get_ha_interval()

    # Only the first matching node is disrupted.
    node = nodes_to_be_disrupted[0]
    node_config = host_config.get(node) or {}
    ip = node_config.get('ip')
    user = node_config.get('user')
    password = node_config.get('password')

    # The password is deliberately not echoed to the terminal.
    infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
    infra.display_on_terminal(self, "Executing ", node_reboot_command)
    _code, _out, error = infra.ssh_and_execute_command(
        ip, user, password, node_reboot_command)
    if error:
        infra.display_on_terminal(self, "Error ", error, "color=red")

    infra.display_on_terminal(self, "waiting for ", ip, " to come online")
    if infra.wait_for_ping(ip, 240, 10):
        infra.display_on_terminal(self, "Node ", ip, " is online",
                                  "color=green")
    infra.display_on_terminal(self, "Will sleep for interval ",
                              str(ha_interval))
    infra.add_table_rows(self, table_name,
                         [[node, ip, utils.get_timestamp(),
                           HAConstants.OKGREEN + 'Rebooted' +
                           HAConstants.ENDC]])

    # bring it back to stable state
    infra.display_on_terminal(self, "Waiting for the node to become stable")
    if infra.wait_for_ping(ip, 240, 10):
        infra.display_on_terminal(self, "Node ", ip, " is in stable state",
                                  "color=green")
    infra.display_on_terminal(self, "Finishing Node Disruption")
def vm_disruption(self, sync=None, finish_execution=None):
    """Repeatedly stop and start the configured VMs through the nova API.

    The jump host is the node in ``infra.get_openstack_config()`` whose
    ``role`` equals the ``role`` input argument; its ``openrc`` and
    ``password`` entries are used to build the nova credentials. The VMs
    to disrupt come from the ``name`` input argument. Each cycle adds one
    row per VM to the "VM Disruption" report table.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop cycling.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering VM Disruption plugin")
    table_name = "VM Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    role = input_args.get('role')
    # host_config is no longer dumped to stdout: it carries passwords.

    jump_host = None
    for node in host_config:
        if role == host_config[node].get('role', None):
            jump_host = node
    nodes_to_be_disrupted = input_args.get('name') or []

    if jump_host is None or not nodes_to_be_disrupted:
        # Without a jump host there are no credentials; without VMs the
        # loop below would spin without doing any work.
        infra.display_on_terminal(
            self, "No jump host or no VM configured for VM disruption",
            "color=red")
        return

    node_reboot_command = "reboot"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    jump_config = host_config.get(jump_host) or {}
    openrc = jump_config.get('openrc')
    password = jump_config.get('password')
    ha_interval = self.get_ha_interval()

    while infra.is_execution_completed(self.finish_execution) is False:
        for node in nodes_to_be_disrupted:
            ip = node
            infra.display_on_terminal(self, "IP: ", ip, " openrc: ", openrc)
            infra.display_on_terminal(self, "Executing ",
                                      node_reboot_command)
            # Using nova api performing the vm stop operation
            cred = credentials.Credentials(openrc, password, 'no_env')
            error = None
            try:
                nova = nova_api.NovaHealth(cred.get_nova_credentials_v2())
                nova.nova_stop_server(ip)
                time.sleep(ha_interval)
                infra.display_on_terminal(self, "Rebooting ", ip)
                nova.nova_start_server(ip)
            except Exception as exc:
                # Best effort: keep cycling, but record the failure.
                error = exc

            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
            infra.display_on_terminal(self, "waiting for ", ip,
                                      " to come online")
            if infra.wait_for_ping(ip, 240, 10):
                infra.display_on_terminal(self, "Node ", ip, " is online",
                                          "color=green")
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))

            if error:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = HAConstants.OKGREEN + 'Rebooted' + HAConstants.ENDC
            infra.add_table_rows(self, table_name,
                                 [[node, ip, utils.get_timestamp(),
                                   status]])
def display_asible_process_report(self, hist_cnt=5):
    """Display the error report for processes.

    Scans ``self.ansiresults`` for ``process_check`` entries whose
    ``ansi_result`` status is ``FAIL`` and builds one table column per
    failed process and one row per host from
    ``self.inventory.get_host_ip_list()``. Each ``process_check`` entry is
    stamped in place with the ``ts`` value seen earlier in the same batch.

    :param hist_cnt: accepted for interface compatibility; not used.
    """
    # Failed process names in first-seen order, so the column order is
    # deterministic.
    failed_processes = []
    for ts_results in self.ansiresults:
        ts = None
        for results in ts_results:
            name = results.get('name', None)
            if name == "ts":
                ts = results.get('ts', None)
            elif name == "process_check":
                procname = results['process']
                # Stamped before build_condensed_results() is called below.
                results['ts'] = ts
                if (results['ansi_result']['status'] == 'FAIL'
                        and procname not in failed_processes):
                    failed_processes.append(procname)

    if not failed_processes:
        print("***************Ansible Failed Processes***************")
        print(" NONE")
        return

    # Create a new table and set the header.
    table_name = "Ansible Failed Processes"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Timestamp"] + failed_processes)

    condensed_results = self.build_condensed_results()
    host_list = self.inventory.get_host_ip_list()
    non_process_checks = ("ssh_ping_check", "rabbitmq_check",
                          "mariadb_check")

    rows = []
    for host in host_list:
        single_row = [host]
        for fproc in failed_processes:
            if fproc in non_process_checks or \
                    fproc not in condensed_results:
                # Always emit a cell so the row stays aligned with the
                # header.
                single_row.append("NA")
                continue
            reslist = condensed_results[fproc]['reslist']
            if host not in reslist[0]['host_list']:
                print("%s not in %s" % (fproc, host))
                single_row.append("NA")
                continue
            single_row.append(self._get_timestring(
                reslist, failed_only=True, host=host))
        rows.append(single_row)

    infra.add_table_rows(self, table_name, rows)
def display_ansible_summary_report(self, hist_cnt=5):
    """Display the Ansible Summary Report.

    Emits one row per host from ``self.inventory.get_host_ip_list()``: a
    cell for each of the ssh/ping, RabbitMQ and MariaDB checks present in
    the condensed results, followed by a consolidated process state that
    reads "FAIL" when the host is listed in ``failed_hosts`` of any
    multi-entry ``process_check`` history.
    """
    report_title = "Ansible Monitoring Summary"
    infra.create_report_table(self, report_title)
    infra.add_table_headers(self, report_title,
                            ["Host", "SSH & Ping", "RabbitMQ", "MariaDB",
                             "Consolidated Process State"])

    condensed = self.build_condensed_results()
    hosts = self.inventory.get_host_ip_list()

    # BRD.
    print("Condensed result")
    pprint.PrettyPrinter(indent=4).pprint(condensed)

    def check_cell(history, host):
        # A single-entry history is shown as healthy.
        if len(history) == 1:
            return ":-)"
        return self._get_timestring(history, host=host)

    table_rows = []
    for host in hosts:
        cells = [host]
        for check in ("ssh_ping_check", "rabbitmq_check", "mariadb_check"):
            if check in condensed.keys():
                cells.append(check_cell(condensed[check]['reslist'], host))

        host_failed = False
        for key in condensed.keys():
            history = condensed[key]['reslist']
            head = history[0]
            kind = head['name']
            print("BRD: failed hosts keys:  %s" % (head.keys(),))
            if kind != "process_check":
                continue
            # Check if all processes are ok.
            if len(history) > 1:
                # BRD.
                for entry in history:
                    if host in entry['failed_hosts']:
                        host_failed = True
            else:
                # Result intentionally unused; the call is kept as is.
                self._get_timestring(history, host=host)

        cells.append("FAIL" if host_failed else ":-)")
        table_rows.append(cells)

    infra.add_table_rows(self, report_title, table_rows)
def container_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a container service on one controller node.

    ``container_name`` and ``disruption`` are read from the plugin input
    arguments. The first node in ``infra.get_openstack_config()`` whose
    role contains ``'controller'`` is disrupted with
    ``systemctl stop/start <container_name>`` over SSH. When ``disruption``
    is ``'infinite'`` the container is stopped once and left down.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Container Disruption plugin")
    table_name = "Container Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Container Process", "Timestamp",
                             "Status of Disruption"])

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    container_name = input_args.get('container_name')
    disruption_type = input_args.get('disruption')

    if not container_name:
        # Without a name the systemctl commands cannot be built.
        infra.display_on_terminal(self, "No container_name configured",
                                  "color=red")
        infra.display_on_terminal(self, "Finishing Container Disruption")
        return
    infra.display_on_terminal(self, "Container ", container_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # ``or ()`` keeps the test valid for nodes without a role.
        if 'controller' in (host_config[node].get('role') or ()):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            # For now disrupt on only one node
            break

    def node_login(node):
        # ip / user / password of a node from the openstack config.
        node_config = host_config.get(node) or {}
        return (node_config.get('ip'), node_config.get('user'),
                node_config.get('password'))

    # Deprecate process disruptor and converge on this for both cases later
    container_stop_command = "systemctl stop " + container_name
    container_start_command = "systemctl start " + container_name

    ha_start_delay = self.get_ha_start_delay()
    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
    # Delay the actual disruption by the configured start delay.
    time.sleep(ha_start_delay)

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    if disruption_type == 'infinite':
        # Override the disruption count in executor.yaml
        disruption_count = 1

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            infra.display_on_terminal(self, "Stopping ", container_name)
            infra.display_on_terminal(self, "Executing ",
                                      container_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_stop_command)
            infra.add_table_rows(self, table_name,
                                 [[ip, container_name,
                                   utils.get_timestamp(),
                                   HAConstants.WARNING + 'Stopped' +
                                   HAConstants.ENDC]])
            if disruption_type == 'infinite':
                infra.display_on_terminal(
                    self,
                    "Infinite disruption chosen bring up container manually")
                break
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)
            infra.display_on_terminal(self, "Starting ", container_name)
            infra.display_on_terminal(self, "Executing ",
                                      container_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name,
                                 [[ip, container_name,
                                   utils.get_timestamp(),
                                   HAConstants.OKGREEN + 'Started' +
                                   HAConstants.ENDC]])

    # bring it back to stable state
    if disruption_type != 'infinite':
        # Looks the credentials up again instead of relying on loop
        # leftovers, which are unbound when the loop above never ran.
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            infra.display_on_terminal(
                self, "Bringing the container to stable state")
            infra.display_on_terminal(self, "Executing ",
                                      container_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_start_command)
    infra.display_on_terminal(self, "Finishing Container Disruption")
def process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a service on every controller node.

    ``process_name`` is read from the plugin input arguments. Every node
    in ``infra.get_openstack_config()`` whose role contains
    ``'controller'`` is disrupted with
    ``systemctl stop/start <process_name>`` over SSH, and
    ``"<node>::<process_name>"`` is registered as an expected failure.
    After the loop the service is started once more on every disrupted
    node.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    table_name = "Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    process_name = input_args.get('process_name')

    if not process_name:
        # Without a name the systemctl commands cannot be built.
        infra.display_on_terminal(self, "No process_name configured",
                                  "color=red")
        infra.display_on_terminal(self, "Finishing Process Disruption")
        return
    infra.display_on_terminal(self, "Process ", process_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # ``or ()`` keeps the test valid for nodes without a role.
        if 'controller' in (host_config[node].get('role') or ()):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            self.expected_failures.append(node + "::" + process_name)
            self.set_expected_failures(self.expected_failures)

    def node_login(node):
        # ip / user / password of a node from the openstack config.
        node_config = host_config.get(node) or {}
        return (node_config.get('ip'), node_config.get('user'),
                node_config.get('password'))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    infra.display_on_terminal(self, "Process will be disrupted ",
                              str(disruption_count))

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            # The password is deliberately not echoed to the terminal.
            infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
            infra.display_on_terminal(self, "Stopping ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_stop_command)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.WARNING + 'Stopped' +
                                   HAConstants.ENDC]])
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)
            infra.display_on_terminal(self, "Starting ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.OKGREEN + 'Started' +
                                   HAConstants.ENDC]])

    # bring it back to stable state
    # Every disrupted node is restored, not only the last one iterated.
    for node in nodes_to_be_disrupted:
        ip, user, password = node_login(node)
        infra.display_on_terminal(self,
                                  "Bringing the process to stable state")
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        infra.ssh_and_execute_command(ip, user, password,
                                      rhel_start_command)
    infra.display_on_terminal(self, "Finishing Process Disruption")
def generate_downtime_table(self, downtime_dict, table_name):
    """Build a host-by-agent report table of downtime windows.

    ``downtime_dict`` is iterated as ``{agent: [entry, ...]}`` where each
    entry is ``None`` or ``{host: {timestamp: status}}``. Timestamps are
    concatenated into the cell text, so they must be strings. Samples are
    processed in the iteration order of the per-host mapping.

    A cell lists every ``"<start> to <stop>"`` window in which the status
    was ``'FAIL'``, or a green ``":)"`` when there was none; agents with
    no data for a host are shown as ``'NA'``.

    :param downtime_dict: per-agent status history as described above.
    :param table_name: name of the report table to create and fill.
    """
    def downtime_windows(samples):
        # Return the "start to stop" text of each FAIL run in samples.
        windows = []
        start = None
        prev = None
        last_index = len(samples) - 1
        for index, (ts, status) in enumerate(samples.items()):
            if status == 'FAIL' and prev != 'FAIL':
                start = ts
            recovered = status == 'OK' and prev == 'FAIL'
            # "Last sample" is judged per host; a counter shared between
            # hosts misses a trailing FAIL run on every host but the first.
            still_down_at_end = status == 'FAIL' and index == last_index
            if recovered or still_down_at_end:
                windows.append(start + " to " + ts)
            prev = status
        return windows

    all_agent_list = list(downtime_dict)
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host Names "] + all_agent_list)

    # host -> {agent: cell text}, hosts kept in first-seen order.
    host_agent_status_dict = collections.OrderedDict()
    for agent in all_agent_list:
        for agent_dict in downtime_dict[agent]:
            if agent_dict is None:
                print("No agent.")
                continue
            for host in agent_dict:
                windows = downtime_windows(agent_dict[host])
                if windows:
                    cell = HAConstants.WARNING + ", ".join(windows) + \
                        HAConstants.ENDC
                else:
                    cell = HAConstants.OKGREEN + ":)" + HAConstants.ENDC
                host_agent_status_dict.setdefault(host, {})[agent] = cell

    for host, cells_by_agent in host_agent_status_dict.items():
        agents = [cells_by_agent.get(agent, 'NA')
                  for agent in all_agent_list]
        infra.add_table_rows(self, table_name, [[host] + agents])
def jump_host_disruption(self, sync=None, finish_execution=None):
    """Reboot the configured nodes through a jump host and report results.

    The jump host is the last node in ``infra.get_openstack_config()``
    whose role contains the ``role`` input argument. The target IPs (the
    ``name`` input argument) are written to ``/tmp/remote_ips``, then
    ``AnsibleRunner(...).execute_on_remote()`` is run repeatedly and its
    result file ``/tmp/hainfra/output`` is parsed into one report row per
    contacted host.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Jump Host Disruption plugin")
    table_name = "Jump host Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    # An empty dict keeps the lookups below safe without input arguments.
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('name', [])
    role = input_args.get('role')

    jump_host = None
    if role:
        for node in host_config:
            if role in (host_config[node].get('role') or ()):
                jump_host = node

    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        infra.display_on_terminal(self, "Finishing Node Disruption")
        return

    node_reboot_command = "reboot -f"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()

    # jump host details -- read from the matched jump host, not from the
    # leftover loop variable of the search above.
    jump_config = host_config.get(jump_host) or {}
    jump_host_ip = jump_config.get('ip')
    user = jump_config.get('user')
    password = jump_config.get('password')

    # TODO - if its more than one jump host
    # Write into txt file to pass via ansible playbook
    with open('/tmp/remote_ips', 'w+') as remote_ips:
        for ip in nodes_to_be_disrupted:
            remote_ips.write(ip + '\n')

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", node_reboot_command)
        # Credentials are deliberately not printed.
        ret = AnsibleRunner(jump_host_ip, user,
                            password).execute_on_remote()
        print(ret)

        # parse the output for report
        with open('/tmp/hainfra/output', 'r') as output_file:
            raw_output = output_file.read()
        # NOTE(review): eval() executes whatever the file contains. Switch
        # to ast.literal_eval once the output is confirmed to hold only
        # Python literals.
        output_objs = eval(raw_output)
        print(output_objs)

        for results in output_objs:
            for hostname, result in results['contacted'].items():
                # Failure state is tracked per host so one failed host
                # does not mark the following hosts as failed too.
                failed = 'failed' in result
                error = result.get('msg', '') if failed else ''
                if failed:
                    print("%s >>> %s" % (hostname, error))
                    infra.display_on_terminal(self, "Error ", error,
                                              "color=red")
                infra.display_on_terminal(self, "waiting for ", hostname,
                                          " to come online")
                if infra.wait_for_ping(hostname, 240, 5):
                    infra.display_on_terminal(self, "Node ", hostname,
                                              " is online", "color=green")
                infra.display_on_terminal(self, "Will sleep for interval ",
                                          str(ha_interval))
                if failed:
                    status = HAConstants.FAIL + str(error) + \
                        HAConstants.ENDC
                else:
                    status = HAConstants.OKGREEN + 'Rebooted' + \
                        HAConstants.ENDC
                infra.add_table_rows(self, table_name,
                                     [[jump_host_ip, hostname,
                                       utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Node Disruption")
def vm_disruption(self, sync=None, finish_execution=None):
    """Repeatedly stop and start the configured VMs through the nova API.

    The jump host is the node in ``infra.get_openstack_config()`` whose
    ``role`` equals the ``role`` input argument; its ``openrc`` and
    ``password`` entries are used to build the nova credentials. The VMs
    to disrupt come from the ``name`` input argument. Each cycle adds one
    row per VM to the "VM Disruption" report table.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop cycling.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering VM Disruption plugin")
    table_name = "VM Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    role = input_args.get('role')
    # host_config is no longer dumped to stdout: it carries passwords.

    jump_host = None
    for node in host_config:
        if role == host_config[node].get('role', None):
            jump_host = node
    nodes_to_be_disrupted = input_args.get('name') or []

    if jump_host is None or not nodes_to_be_disrupted:
        # Without a jump host there are no credentials; without VMs the
        # loop below would spin without doing any work.
        infra.display_on_terminal(
            self, "No jump host or no VM configured for VM disruption",
            "color=red")
        return

    node_reboot_command = "reboot"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    jump_config = host_config.get(jump_host) or {}
    openrc = jump_config.get('openrc')
    password = jump_config.get('password')
    ha_interval = self.get_ha_interval()

    while infra.is_execution_completed(self.finish_execution) is False:
        for node in nodes_to_be_disrupted:
            ip = node
            infra.display_on_terminal(self, "IP: ", ip, " openrc: ", openrc)
            infra.display_on_terminal(self, "Executing ",
                                      node_reboot_command)
            # Using nova api performing the vm stop operation
            cred = credentials.Credentials(openrc, password, 'no_env')
            error = None
            try:
                nova = nova_api.NovaHealth(cred.get_nova_credentials_v2())
                nova.nova_stop_server(ip)
                time.sleep(ha_interval)
                infra.display_on_terminal(self, "Rebooting ", ip)
                nova.nova_start_server(ip)
            except Exception as exc:
                # Best effort: keep cycling, but record the failure.
                error = exc

            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
            infra.display_on_terminal(self, "waiting for ", ip,
                                      " to come online")
            if infra.wait_for_ping(ip, 240, 10):
                infra.display_on_terminal(self, "Node ", ip, " is online",
                                          "color=green")
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))

            if error:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = HAConstants.OKGREEN + 'Rebooted' + HAConstants.ENDC
            infra.add_table_rows(self, table_name,
                                 [[node, ip, utils.get_timestamp(),
                                   status]])
def generate_downtime_table(self, downtime_dict, table_name):
    """Build a host-by-agent report table of downtime windows.

    ``downtime_dict`` is iterated as ``{agent: [entry, ...]}`` where each
    entry is ``None`` or ``{host: {timestamp: status}}``. Timestamps are
    concatenated into the cell text, so they must be strings. Samples are
    processed in the iteration order of the per-host mapping.

    A cell lists every ``"<start> to <stop>"`` window in which the status
    was ``'FAIL'``, or a green ``":)"`` when there was none; agents with
    no data for a host are shown as ``'NA'``.

    :param downtime_dict: per-agent status history as described above.
    :param table_name: name of the report table to create and fill.
    """
    def downtime_windows(samples):
        # Return the "start to stop" text of each FAIL run in samples.
        windows = []
        start = None
        prev = None
        last_index = len(samples) - 1
        for index, (ts, status) in enumerate(samples.items()):
            if status == 'FAIL' and prev != 'FAIL':
                start = ts
            recovered = status == 'OK' and prev == 'FAIL'
            # "Last sample" is judged per host; a counter shared between
            # hosts misses a trailing FAIL run on every host but the first.
            still_down_at_end = status == 'FAIL' and index == last_index
            if recovered or still_down_at_end:
                windows.append(start + " to " + ts)
            prev = status
        return windows

    all_agent_list = list(downtime_dict)
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host Names "] + all_agent_list)

    # host -> {agent: cell text}, hosts kept in first-seen order.
    host_agent_status_dict = collections.OrderedDict()
    for agent in all_agent_list:
        for agent_dict in downtime_dict[agent]:
            if agent_dict is None:
                print("No agent.")
                continue
            for host in agent_dict:
                windows = downtime_windows(agent_dict[host])
                if windows:
                    cell = HAConstants.WARNING + ", ".join(windows) + \
                        HAConstants.ENDC
                else:
                    cell = HAConstants.OKGREEN + ":)" + HAConstants.ENDC
                host_agent_status_dict.setdefault(host, {})[agent] = cell

    for host, cells_by_agent in host_agent_status_dict.items():
        agents = [cells_by_agent.get(agent, 'NA')
                  for agent in all_agent_list]
        infra.add_table_rows(self, table_name, [[host] + agents])
def process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a service on every controller node.

    ``process_name`` is read from the plugin input arguments. Every node
    in ``infra.get_openstack_config()`` whose role contains
    ``'controller'`` is disrupted with
    ``systemctl stop/start <process_name>`` over SSH, and
    ``"<node>::<process_name>"`` is registered as an expected failure.
    After the loop the service is started once more on every disrupted
    node.

    :param sync: passed to ``infra.wait_for_notification`` before the
        disruption starts; skipped when falsy.
    :param finish_execution: passed to ``infra.is_execution_completed``
        to decide when to stop.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    table_name = "Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments() or {}
    node_name = next(iter(input_args_dict), None)
    input_args = input_args_dict.get(node_name) or {}
    host_config = infra.get_openstack_config()
    if input_args:
        print("Inpt " + str(input_args))
    process_name = input_args.get('process_name')

    if not process_name:
        # Without a name the systemctl commands cannot be built.
        infra.display_on_terminal(self, "No process_name configured",
                                  "color=red")
        infra.display_on_terminal(self, "Finishing Process Disruption")
        return
    infra.display_on_terminal(self, "Process ", process_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # ``or ()`` keeps the test valid for nodes without a role.
        if 'controller' in (host_config[node].get('role') or ()):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            self.expected_failures.append(node + "::" + process_name)
            self.set_expected_failures(self.expected_failures)

    def node_login(node):
        # ip / user / password of a node from the openstack config.
        node_config = host_config.get(node) or {}
        return (node_config.get('ip'), node_config.get('user'),
                node_config.get('password'))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    infra.display_on_terminal(self, "Process will be disrupted ",
                              str(disruption_count))

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            # The password is deliberately not echoed to the terminal.
            infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
            infra.display_on_terminal(self, "Stopping ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_stop_command)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.WARNING + 'Stopped' +
                                   HAConstants.ENDC]])
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)
            infra.display_on_terminal(self, "Starting ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.OKGREEN + 'Started' +
                                   HAConstants.ENDC]])

    # bring it back to stable state
    # Every disrupted node is restored, not only the last one iterated.
    for node in nodes_to_be_disrupted:
        ip, user, password = node_login(node)
        infra.display_on_terminal(self,
                                  "Bringing the process to stable state")
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        infra.ssh_and_execute_command(ip, user, password,
                                      rhel_start_command)
    infra.display_on_terminal(self, "Finishing Process Disruption")
def jump_host_process_disruption(self, sync=None, finish_execution=None):
    """Stop and start a service on target nodes through a jump host.

    NOTE(review): a second method with this exact name is defined later in
    the same file and replaces this one at class-creation time; one of the
    two should be deleted.

    The jump host is the host in ``infra.get_openstack_config()`` whose
    ``role`` contains the ``role`` input argument (last match wins).
    ``scripts/jump_host_executor.py`` is copied to the jump host's ``/tmp/``
    and then, until ``infra.is_execution_completed`` stops returning
    ``False``, it is invoked with the target node list and a
    ``systemctl stop`` / ``systemctl start`` command.  The script output is
    fetched to ``/tmp/hainfra/output``, parsed, and one row per reported
    host is added to the "Jump host Process Disruption" table.

    Args:
        sync: optional notification handle; when ``self.sync`` is truthy
            the plugin waits in ``infra.wait_for_notification(sync)``.
        finish_execution: handle passed to ``infra.is_execution_completed``.

    Raises:
        ValueError: if the input arguments are empty, ``process_name`` is
            missing, or no host matches the requested role.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self,
                              "Entering Jump Host Process Disruption plugin")

    table_name = "Jump host Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    # list(d)[0] is the same key as d.keys()[0] and also works on Python 3.
    node_name = list(input_args_dict)[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Validate before dereferencing: input_args.get() used to be called
    # ahead of the emptiness check.
    if not input_args:
        raise ValueError("no input arguments supplied for %r" % (node_name,))
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('node', [])
    process_name = input_args.get('process_name', None)
    role = input_args.get('role', None)
    if not process_name:
        raise ValueError("'process_name' input argument is required")

    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None) or ''
        if role is not None and role in node_role:
            jump_host = node
    if jump_host is None:
        raise ValueError("no host with role %r in the openstack config"
                         % (role,))
    print("############### %s" % (process_name,))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    # Use the matched jump host, not the loop variable left over from the
    # search above (which was simply the last host iterated).
    jump_cfg = host_config[jump_host]
    jump_host_ip = jump_cfg.get('ip', None)
    user = jump_cfg.get('user', None)
    password = jump_cfg.get('password', None)

    # Copy the executor script to the jump host.
    runner = AnsibleRunner(jump_host_ip, user, password)
    infra.display_on_terminal(self, "Copying to ", jump_host_ip)
    runner.copy('jump_host_executor.py', 'scripts/', '/tmp/')
    infra.display_on_terminal(self, "Copied to ", jump_host_ip)

    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    # TODO - if its more than one jump host

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", rhel_stop_command)
        infra.display_on_terminal(self, "Stopping ", process_name)

        # Run the script on the jump host, pull its output file back and
        # remove the remote copy so the next run starts clean.
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, rhel_stop_command))
        print(ret)
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')

        with open('/tmp/hainfra/output', 'r') as output_file:
            # SECURITY(review): eval() executes whatever the remote host
            # wrote to this file; consider ast.literal_eval or JSON.
            output_objs = eval(output_file.read())
        print(output_objs)

        for results in output_objs:
            # Report each contacted host with its own status; previously
            # only the last host was reported and an error from one host
            # could be attributed to another.
            for hostname, result in results['contacted'].items():
                error = None
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
                if error:
                    infra.display_on_terminal(self, "Error ", error,
                                              "color=red")
                    status = (HAConstants.FAIL + str(error)
                              + HAConstants.ENDC)
                else:
                    status = (HAConstants.OKGREEN + 'Stopped'
                              + HAConstants.ENDC)
                infra.add_table_rows(self, table_name, [[
                    hostname, process_name, utils.get_timestamp(), status]])

        infra.display_on_terminal(self, "Will sleep for interval ",
                                  str(ha_interval))
        time.sleep(ha_interval)

        infra.display_on_terminal(self, "Starting ", process_name)
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, rhel_start_command))
        print(ret)
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')

        with open('/tmp/hainfra/output', 'r') as output_file:
            # SECURITY(review): see the eval() note above.
            output_objs = eval(output_file.read())

        # NOTE(review): the start phase expects entries shaped like
        # {'hostname': ..., 'error': ...} while the stop phase expects
        # {'contacted': {...}}; confirm which one jump_host_executor.py
        # really emits.
        for results in output_objs:
            hostname = results['hostname']
            error = results['error']
            if error:
                infra.display_on_terminal(self, "Error ", error, "color=red")
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = HAConstants.OKGREEN + 'Started' + HAConstants.ENDC
            infra.add_table_rows(self, table_name, [[
                hostname, process_name, utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Process Disruption")
def jump_host_disruption(self, sync=None, finish_execution=None):
    """Reboot target nodes through a jump host and report the outcome.

    The jump host is the host in ``infra.get_openstack_config()`` whose
    ``role`` contains the ``role`` input argument (last match wins).  The
    targets (input argument ``name``) are written one per line to
    ``/tmp/remote_ips``.  Until ``infra.is_execution_completed`` stops
    returning ``False``, ``AnsibleRunner(...).execute_on_remote()`` is
    invoked, the result file ``/tmp/hainfra/output`` is parsed, each
    contacted host is pinged via ``infra.wait_for_ping`` and a row is added
    to the "Jump host Disruption" table.

    Args:
        sync: optional notification handle; when ``self.sync`` is truthy
            the plugin waits in ``infra.wait_for_notification(sync)``.
        finish_execution: handle passed to ``infra.is_execution_completed``.

    Raises:
        ValueError: if the input arguments are empty or no host matches the
            requested role.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Jump Host Disruption plugin")

    table_name = "Jump host Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    # list(d)[0] is the same key as d.keys()[0] and also works on Python 3.
    node_name = list(input_args_dict)[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Validate before dereferencing: input_args.get() used to be called
    # ahead of the emptiness check.
    if not input_args:
        raise ValueError("no input arguments supplied for %r" % (node_name,))
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('name', [])
    role = input_args.get('role', None)

    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None) or ''
        if role is not None and role in node_role:
            jump_host = node
    if jump_host is None:
        raise ValueError("no host with role %r in the openstack config"
                         % (role,))
    print("############### %s" % (jump_host,))

    node_reboot_command = "reboot -f"

    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()

    # Use the matched jump host, not the loop variable left over from the
    # search above (which was simply the last host iterated).
    jump_cfg = host_config[jump_host]
    jump_host_ip = jump_cfg.get('ip', None)
    user = jump_cfg.get('user', None)
    password = jump_cfg.get('password', None)

    # TODO - if its more than one jump host
    # Write the targets to a text file that is handed to the playbook.
    with open('/tmp/remote_ips', 'w+') as remote_ips:
        for ip in nodes_to_be_disrupted:
            remote_ips.write(ip + '\n')

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", node_reboot_command)
        print("*" * 50)
        # The password is deliberately masked instead of being echoed.
        print("user : %s password : %s jump_host_ip : %s"
              % (user, "******", jump_host_ip))

        ret = AnsibleRunner(jump_host_ip, user, password).execute_on_remote()
        print(ret)

        with open('/tmp/hainfra/output', 'r') as output_file:
            # SECURITY(review): eval() executes whatever ended up in this
            # file; consider ast.literal_eval or JSON.
            output_objs = eval(output_file.read())
        print(output_objs)

        for results in output_objs:
            # Handle each contacted host on its own; previously only the
            # last host was pinged/reported (and the name was unbound when
            # 'contacted' was empty).
            for hostname, result in results['contacted'].items():
                error = None
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
                if error:
                    infra.display_on_terminal(self, "Error ", error,
                                              "color=red")

                infra.display_on_terminal(self, "waiting for ", hostname,
                                          " to " "come " "online")
                if infra.wait_for_ping(hostname, 240, 5):
                    infra.display_on_terminal(self, "Node ", hostname,
                                              " is online", "color=green")
                # The message is kept for output compatibility; the actual
                # sleep is disabled in the original code as well.
                infra.display_on_terminal(self, "Will sleep for interval ",
                                          str(ha_interval))

                if error:
                    status = (HAConstants.FAIL + str(error)
                              + HAConstants.ENDC)
                else:
                    status = (HAConstants.OKGREEN + 'Rebooted'
                              + HAConstants.ENDC)
                # Row layout is [jump host IP, contacted host, time, status].
                infra.add_table_rows(self, table_name, [[
                    jump_host_ip, hostname, utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Node Disruption")
def jump_host_process_disruption(self, sync=None, finish_execution=None):
    """Stop and start a service on target nodes through a jump host.

    NOTE(review): an earlier method with this exact name exists in the same
    file; this later definition is the one that takes effect, and the
    earlier copy should be deleted.

    The jump host is the host in ``infra.get_openstack_config()`` whose
    ``role`` contains the ``role`` input argument (last match wins).
    ``scripts/jump_host_executor.py`` is copied to the jump host's ``/tmp/``
    and then, until ``infra.is_execution_completed`` stops returning
    ``False``, it is invoked with the target node list and a
    ``systemctl stop`` / ``systemctl start`` command.  The script output is
    fetched to ``/tmp/hainfra/output``, parsed, and one row per reported
    host is added to the "Jump host Process Disruption" table.

    Args:
        sync: optional notification handle; when ``self.sync`` is truthy
            the plugin waits in ``infra.wait_for_notification(sync)``.
        finish_execution: handle passed to ``infra.is_execution_completed``.

    Raises:
        ValueError: if the input arguments are empty, ``process_name`` is
            missing, or no host matches the requested role.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(
        self, "Entering Jump Host Process Disruption plugin")

    table_name = "Jump host Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(
        self, table_name,
        ["VM", "Process", "TimeStamp", "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    # list(d)[0] is the same key as d.keys()[0] and also works on Python 3.
    node_name = list(input_args_dict)[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Validate before dereferencing: input_args.get() used to be called
    # ahead of the emptiness check.
    if not input_args:
        raise ValueError("no input arguments supplied for %r" % (node_name,))
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('node', [])
    process_name = input_args.get('process_name', None)
    role = input_args.get('role', None)
    if not process_name:
        raise ValueError("'process_name' input argument is required")

    jump_host = None
    for candidate in host_config:
        candidate_role = host_config[candidate].get('role', None) or ''
        if role is not None and role in candidate_role:
            jump_host = candidate
    if jump_host is None:
        raise ValueError("no host with role %r in the openstack config"
                         % (role,))
    print("############### %s" % (process_name,))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    # Use the matched jump host rather than whichever host happened to be
    # iterated last during the search above.
    jump_cfg = host_config[jump_host]
    jump_host_ip = jump_cfg.get('ip', None)
    user = jump_cfg.get('user', None)
    password = jump_cfg.get('password', None)

    # Copy the executor script to the jump host.
    runner = AnsibleRunner(jump_host_ip, user, password)
    infra.display_on_terminal(self, "Copying to ", jump_host_ip)
    runner.copy('jump_host_executor.py', 'scripts/', '/tmp/')
    infra.display_on_terminal(self, "Copied to ", jump_host_ip)

    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    # TODO - if its more than one jump host

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", rhel_stop_command)
        infra.display_on_terminal(self, "Stopping ", process_name)

        # Run the script on the jump host, pull its output file back and
        # remove the remote copy so the next run starts clean.
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, rhel_stop_command))
        print(ret)
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')

        with open('/tmp/hainfra/output', 'r') as output_file:
            # SECURITY(review): eval() executes whatever the remote host
            # wrote to this file; consider ast.literal_eval or JSON.
            output_objs = eval(output_file.read())
        print(output_objs)

        for results in output_objs:
            # Report each contacted host with its own status; previously
            # only the last host was reported and an error from one host
            # could be attributed to another.
            for hostname, result in results['contacted'].items():
                error = None
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
                if error:
                    infra.display_on_terminal(self, "Error ", error,
                                              "color=red")
                    status = (HAConstants.FAIL + str(error)
                              + HAConstants.ENDC)
                else:
                    status = (HAConstants.OKGREEN + 'Stopped'
                              + HAConstants.ENDC)
                infra.add_table_rows(self, table_name, [[
                    hostname, process_name, utils.get_timestamp(), status]])

        infra.display_on_terminal(self, "Will sleep for interval ",
                                  str(ha_interval))
        time.sleep(ha_interval)

        infra.display_on_terminal(self, "Starting ", process_name)
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, rhel_start_command))
        print(ret)
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')

        with open('/tmp/hainfra/output', 'r') as output_file:
            # SECURITY(review): see the eval() note above.
            output_objs = eval(output_file.read())

        # NOTE(review): the start phase expects entries shaped like
        # {'hostname': ..., 'error': ...} while the stop phase expects
        # {'contacted': {...}}; confirm which one jump_host_executor.py
        # really emits.
        for results in output_objs:
            hostname = results['hostname']
            error = results['error']
            if error:
                infra.display_on_terminal(self, "Error ", error, "color=red")
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = HAConstants.OKGREEN + 'Started' + HAConstants.ENDC
            infra.add_table_rows(self, table_name, [[
                hostname, process_name, utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Process Disruption")