def process_and_report(self, data, report_dict, host_filter, filter_type):
    """Build the filtered service results and print them as a table.

    For every host in ``data`` and every service listed under its
    ``"services"`` key, a result dict is created with
    ``self.create_result_dict`` when the host passes the filter.  Its
    ``status`` entry is re-coloured, the service state is handed to
    ``NagiosMonitor.collect_flapping_service_data`` together with
    ``report_dict``, and finally all collected results are printed in a
    three-column table.

    :param data: mapping of host name to a dict holding ``"services"``;
        each service entry is read for ``"current_state"`` and
        ``"plugin_output"``.
    :param report_dict: passed through to collect_flapping_service_data.
    :param host_filter: when truthy, only hosts contained in it are kept.
    :param filter_type: when ``host_filter`` is empty and this equals
        ``"openstackvm"``, only hosts starting with ``"AppVm-"`` are kept.
    """
    row_format = "%s | %s | %s | "

    def is_selected(host):
        # Same precedence as the filter chain used for both loops below.
        if host_filter:
            return host in host_filter
        if filter_type == "openstackvm":
            return host.startswith("AppVm-")
        return True

    rows = []
    for host in data:
        services = data[host]["services"]
        for service in services:
            if not is_selected(host):
                continue
            entry = self.create_result_dict(data, host, service)
            if len(entry) <= 0:
                continue
            entry['status'] = NagiosMonitor.update_status_color(
                entry['status'], entry["status"])
            rows.append(entry)
            service_info = data[host]["services"][service]
            state_text = NagiosMonitor.get_status_string(
                service_info["current_state"])
            NagiosMonitor.collect_flapping_service_data(
                host, service, report_dict, state_text,
                data[host]["services"][service]["plugin_output"])

    separator = '-' * 62
    header = row_format % ('HostName'.ljust(15),
                           "Service Description".ljust(25),
                           "Status".ljust(9))
    infra.display_on_terminal(self, separator)
    infra.display_on_terminal(
        self, NagiosMonitor.get_severity_color('INFO', header))
    infra.display_on_terminal(self, separator)

    for row in rows:
        host = row.get("ip")
        # Rows whose ip and description are both a single blank are
        # printed unconditionally.
        is_blank_row = host == " " and row.get("description") == " "
        if is_blank_row or is_selected(host):
            self.print_data(row_format, row)
    infra.display_on_terminal(self, separator)
def display_msg_on_term(self, msg, status, host_list=None):
    """Print one padded status line for a check.

    The message is left-justified to 50 columns and the status to 10.
    A non-empty ``host_list`` is appended as ``str(host_list)``.  The line
    is shown in green when ``status`` is exactly ``'PASS'`` and in red for
    every other status.
    """
    line = msg.ljust(50) + status.ljust(10)
    if host_list is not None and len(host_list) > 0:
        line += str(host_list)
    colour = "color=green" if status == 'PASS' else "color=red"
    infra.display_on_terminal(self, line, colour)
def start(self, sync=None, finish_execution=None, mode="basic"):
    """Run the endpoint health check and build the downtime tables.

    Reads the ``openstack_api`` section of the input arguments, creates
    the credentials object, optionally waits for the runner notification,
    brackets ``self.health_check_start()`` with start/end timestamps in
    the graph file and finally generates the endpoint and agent downtime
    tables.

    :param sync: when truthy, passed to ``infra.wait_for_notification``
        before monitoring starts.
    :param finish_execution: stored on ``self.finish_execution``.
    :param mode: accepted but not read by this method.
    """
    infra.display_on_terminal(self, 'Starting Endpoint Health Check')
    input_args = self.get_input_arguments()
    self.finish_execution = finish_execution

    api_args = input_args['openstack_api']
    openrc = api_args['openrc_file'] if 'openrc_file' in api_args else None
    password = api_args['password'] if 'password' in api_args else None
    # noenv is only set when both the openrc file and the password exist.
    noenv = bool(openrc and password)

    self.cred = openstack_api.credentials.Credentials(
        openrc, password, noenv)
    self.frequency = api_args['frequency']
    # Not used below; the lookup is kept so a missing key still raises.
    max_entries = api_args['max_entries']

    if sync:
        infra.display_on_terminal(self, "Waiting for Runner Notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self,
                                  "Received notification from Runner")

    started = utils.utils.get_timestamp(complete_timestamp=True)
    self.add_to_graph_file("starttime##" + started)
    self.health_check_start()
    ended = utils.utils.get_timestamp(complete_timestamp=True)
    self.add_to_graph_file("endtime##" + ended)
    infra.display_on_terminal(self, "Finished Monitoring")

    # One downtime-range table for the endpoints, one for the agents.
    for downtime, title in ((self.endpoint_downtime_dict,
                             "Endpoints Downtime"),
                            (self.agents_downtime_dict,
                             "Agent Downtime")):
        self.generate_downtime_table(downtime, title)

    # Names of the tables that make up the final report.
    tables_list = ['Endpoints Downtime', 'Agent Downtime']
def start(self, sync=None, finish_execution=None, mode="basic"):
    """Run the endpoint health check and build the downtime tables.

    Reads the ``openstack_api`` section of the input arguments, creates
    the credentials object, optionally waits for the runner notification,
    brackets ``self.health_check_start()`` with start/end timestamps in
    the graph file and finally generates the endpoint and agent downtime
    tables.

    :param sync: when truthy, passed to ``infra.wait_for_notification``
        before monitoring starts.
    :param finish_execution: stored on ``self.finish_execution``.
    :param mode: accepted but not read by this method.
    """
    infra.display_on_terminal(self, 'Starting Endpoint Health Check')
    input_args = self.get_input_arguments()
    self.finish_execution = finish_execution

    api_args = input_args['openstack_api']
    openrc = api_args['openrc_file'] if 'openrc_file' in api_args else None
    password = api_args['password'] if 'password' in api_args else None
    # noenv is only set when both the openrc file and the password exist.
    noenv = bool(openrc and password)

    self.cred = openstack_api.credentials.Credentials(
        openrc, password, noenv)
    self.frequency = api_args['frequency']
    # Not used below; the lookup is kept so a missing key still raises.
    max_entries = api_args['max_entries']

    if sync:
        infra.display_on_terminal(self, "Waiting for Runner Notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self,
                                  "Received notification from Runner")

    started = utils.utils.get_timestamp(complete_timestamp=True)
    self.add_to_graph_file("starttime##" + started)
    self.health_check_start()
    ended = utils.utils.get_timestamp(complete_timestamp=True)
    self.add_to_graph_file("endtime##" + ended)
    infra.display_on_terminal(self, "Finished Monitoring")

    # One downtime-range table for the endpoints, one for the agents.
    for downtime, title in ((self.endpoint_downtime_dict,
                             "Endpoints Downtime"),
                            (self.agents_downtime_dict,
                             "Agent Downtime")):
        self.generate_downtime_table(downtime, title)

    # Names of the tables that make up the final report.
    tables_list = ['Endpoints Downtime', 'Agent Downtime']
def start(self, sync=None, finish_execution=None):
    """Poll Nagios every 20 seconds until the run is marked complete.

    Each round fetches the Nagios data for the configured ``nagios_ip``
    and passes it to ``self.process_and_report``.  After the loop ends the
    collected report is written to file, the summary is calculated and the
    health report is displayed.

    :param sync: when truthy, passed to ``infra.wait_for_notification``
        before polling starts.
    :param finish_execution: stored on ``self.finish_execution`` and
        checked through ``infra.is_execution_completed``.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    self.finish_execution = finish_execution
    ip_address = self.get_config('nagios_ip')
    input_args = self.get_input_arguments()
    filter_type = str(input_args['nagios']['type'])
    host_config = infra.get_openstack_config()

    # A host filter list is only generated for the "node" filter type.
    if filter_type == "node":
        host_filter = self.generate_filter_list(host_config, filter_type)
    else:
        host_filter = []

    if sync:
        infra.display_on_terminal(self, "Waiting for Runner Notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self,
                                  "Received notification from Runner")

    started_at = datetime.datetime.fromtimestamp(
        time.time()).strftime(timestamp_format)

    while True:
        if infra.is_execution_completed(self.finish_execution) is not False:
            break
        nagios_data = self.get_nagios_data(self.url, ip_address)
        self.process_and_report(nagios_data, self.reportDict,
                                host_filter, filter_type)
        time.sleep(20)

    finished_at = datetime.datetime.fromtimestamp(
        time.time()).strftime(timestamp_format)
    end_seconds = time.time()

    NagiosMonitor.write_to_file(self.reportDict, started_at, finished_at)
    NagiosMonitor.calSummaryReport(self.summaryDict, self.reportDict,
                                   end_seconds)
    self.health_display_report()
def jump_host_disruption(self, sync=None, finish_execution=None):
    """Repeatedly reboot the configured nodes through the jump host.

    The node names come from the ``name`` entry of the plugin's input
    arguments and the jump host is the host in the openstack config whose
    ``role`` contains the ``role`` input argument.  The node names are
    written to ``/tmp/remote_ips``; on every round the remote playbook is
    run through ``AnsibleRunner`` and one row per result object is added
    to the "Jump host Disruption" report table.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Jump Host Disruption plugin")
    table_name = "Jump host Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Without input arguments neither the nodes nor the role are known.
    if not input_args:
        infra.display_on_terminal(self, "No input arguments found for ",
                                  str(node_name), "color=red")
        return
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('name', [])
    role = input_args.get('role', None)

    # The last host whose role contains the requested role is the jump host.
    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None)
        if role and node_role and role in node_role:
            jump_host = node
    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        return
    print("############### %s" % (jump_host,))

    node_reboot_command = "reboot -f"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
    ha_interval = self.get_ha_interval()

    # Jump host details: read from the matched jump host entry, not from
    # whichever key the search loop happened to visit last.
    jump_host_details = host_config.get(jump_host, None)
    jump_host_ip = jump_host_details.get('ip', None)
    user = jump_host_details.get('user', None)
    password = jump_host_details.get('password', None)

    # TODO - if its more than one jump host
    # Write into txt file to pass via ansible playbook
    with open('/tmp/remote_ips', 'w+') as remote_ips:
        for ip in nodes_to_be_disrupted:
            remote_ips.write(ip + '\n')

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", node_reboot_command)
        print("*" * 50)
        # The password is deliberately not echoed to stdout.
        print("user : %s" % (user,))
        print("jump_host_ip : %s" % (jump_host_ip,))
        ret = AnsibleRunner(jump_host_ip, user,
                            password).execute_on_remote()
        print(ret)

        # parse the output for report
        # NOTE(review): eval() executes whatever the fetched file contains;
        # consider a literal-only parser if the output is plain literals.
        with open('/tmp/hainfra/output', 'r') as output_file:
            output_objs = eval(output_file.read())
        print(output_objs)

        for results in output_objs:
            error = []
            hostname = None
            for (hostname, result) in results['contacted'].items():
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
            if hostname is None:
                # Nothing was contacted, so there is no host to wait for.
                infra.display_on_terminal(self, "No hosts were contacted",
                                          "color=red")
                continue
            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
            infra.display_on_terminal(self, "waiting for ", hostname,
                                      " to come online")
            if infra.wait_for_ping(hostname, 240, 5):
                infra.display_on_terminal(self, "Node ", hostname,
                                          " is online", "color=green")
            # NOTE: the original has the actual sleep disabled here; only
            # the message is shown.
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))
            if not error:
                status = (HAConstants.OKGREEN + 'Rebooted' +
                          HAConstants.ENDC)
            else:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            infra.add_table_rows(self, table_name,
                                 [[jump_host_ip, hostname,
                                   utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Node Disruption")
def vm_disruption(self, sync=None, finish_execution=None):
    """Repeatedly stop and start the configured VMs through the nova API.

    The VM names come from the ``name`` entry of the plugin's input
    arguments.  The openrc file and password are read from the host in the
    openstack config whose ``role`` equals the ``role`` input argument.
    Every stop/start attempt adds one row to the "VM Disruption" report
    table: green 'Rebooted' on success, the error text on failure.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering VM Disruption plugin")
    table_name = "VM Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    print("======== %s" % (input_args_dict,))
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    # Without input arguments neither the VMs nor the role are known.
    if not input_args:
        infra.display_on_terminal(self, "No input arguments found for ",
                                  str(node_name), "color=red")
        return
    print("Inpt " + str(input_args))
    role = input_args.get('role', None)
    print('*' * 20)
    print("host_config == %s" % (host_config,))
    print("input_args == %s" % (input_args,))
    print('*' * 20)

    # The last host whose role equals the requested role is the jump host.
    jump_host = None
    for node in host_config:
        if role == host_config[node].get('role', None):
            jump_host = node
            print(jump_host)
    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        return

    nodes_to_be_disrupted = input_args.get('name') or []
    print(nodes_to_be_disrupted)
    if not nodes_to_be_disrupted:
        infra.display_on_terminal(self, "No VMs configured for disruption",
                                  "color=red")
        return

    node_reboot_command = "reboot"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    jump_host_details = host_config.get(jump_host, None)
    openrc = jump_host_details.get('openrc', None)
    password = jump_host_details.get('password', None)
    print(openrc)
    ha_interval = self.get_ha_interval()

    while infra.is_execution_completed(self.finish_execution) is False:
        for node in nodes_to_be_disrupted:
            ip = node
            infra.display_on_terminal(self, "IP: ", ip,
                                      " openrc: ", openrc)
            infra.display_on_terminal(self, "Executing ",
                                      node_reboot_command)

            # Using nova api performing the vm stop operation
            cred = credentials.Credentials(openrc, password, 'no_env')
            error = None
            try:
                nova = nova_api.NovaHealth(cred.get_nova_credentials_v2())
                nova.nova_stop_server(ip)
                time.sleep(ha_interval)
                infra.display_on_terminal(self, "Rebooting ", ip)
                nova.nova_start_server(ip)
            except Exception as exc:
                # Keep disrupting the remaining VMs, but remember the
                # failure so the report does not claim a reboot.
                error = exc

            if error is not None:
                infra.display_on_terminal(self, "Error ", str(error),
                                          "color=red")
            infra.display_on_terminal(self, "waiting for ", ip,
                                      " to come online")
            if infra.wait_for_ping(ip, 240, 10):
                infra.display_on_terminal(self, "Node ", ip, " is online",
                                          "color=green")
            # NOTE: the original has the actual sleep disabled here; only
            # the message is shown.
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))

            if error is None:
                status = (HAConstants.OKGREEN + 'Rebooted' +
                          HAConstants.ENDC)
            else:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            infra.add_table_rows(self, table_name,
                                 [[node, ip, utils.get_timestamp(),
                                   status]])
def process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a systemd service on every controller node.

    The service name is the ``process_name`` input argument.  Every host
    in the openstack config whose ``role`` contains ``'controller'`` is
    disrupted ``self.get_disruption_count()`` times: the service is
    stopped over ssh, left down for the HA interval and started again.
    Each stop and start adds a row to the "Process Disruption" table.
    When the run completes the service is started once more on every
    disrupted node.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    table_name = "Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    process_name = None
    if input_args:
        print("Inpt " + str(input_args))
        process_name = input_args.get('process_name', None)
    if not process_name:
        # The systemctl commands cannot be built without a service name.
        infra.display_on_terminal(self, "No process_name configured for ",
                                  str(node_name), "color=red")
        return
    infra.display_on_terminal(self, "Process ", process_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # A host without a role is never treated as a controller.
        if 'controller' in (host_config[node].get('role', None) or ''):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            self.expected_failures.append(node + "::" + process_name)
    self.set_expected_failures(self.expected_failures)

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    def node_login(node):
        # ssh target and credentials of one configured host
        details = host_config.get(node, None)
        return (details.get('ip', None), details.get('user', None),
                details.get('password', None))

    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    infra.display_on_terminal(self, "Process will be disrupted ",
                              str(disruption_count))

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            # All disruptions done; poll for completion without spinning.
            time.sleep(1)
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            # The password is deliberately not shown on the terminal.
            infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
            infra.display_on_terminal(self, "Stopping ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_stop_command)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.WARNING + 'Stopped' +
                                   HAConstants.ENDC]])
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)

            infra.display_on_terminal(self, "Starting ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.OKGREEN + 'Started' +
                                   HAConstants.ENDC]])

    # bring it back to stable state: start the service on every disrupted
    # node instead of relying on variables left over from the loop above.
    for node in nodes_to_be_disrupted:
        ip, user, password = node_login(node)
        infra.display_on_terminal(self,
                                  "Bringing the process to stable state")
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        infra.ssh_and_execute_command(ip, user, password,
                                      rhel_start_command)
    infra.display_on_terminal(self, "Finishing Process Disruption")
def execute(self, sync=None, finish_execution=None):
    """Run a rally task, notify the waiters and publish the results.

    The rally binary and scenario file come from the ``rally_path`` and
    ``scenario_file`` input arguments.  The task output is echoed line by
    line; when the benchmarking banner appears all waiters on ``sync`` are
    notified.  Afterwards ``rally task results`` is run, its standard
    output becomes a user report table and the execution is marked
    complete.

    :param sync: notification object handed to infra.notify_all_waiters.
    :param finish_execution: handed to infra.set_execution_completed.
    """
    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)

    infra.display_on_terminal(self, "Executing Rally Runner Plugin ")
    infra.display_on_terminal(self, "Rally preparing...")
    rally_path = input_args['rally_path']
    scenario_file = input_args['scenario_file']
    rally_command = rally_path + " -v task start " + scenario_file
    infra.display_on_terminal(self, "Rally Path -> ", rally_path)
    infra.display_on_terminal(self, "Scenario file -> ", scenario_file)

    pattern = "Benchmarking... This can take a while..."
    infra.display_on_terminal(self, "Executing ", rally_command)
    # stderr is merged into stdout: a separate stderr pipe that nobody
    # reads can fill up and block the child process.
    proc = subprocess.Popen(rally_command, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    line = ''
    # Keep reading until the process has exited AND its output is drained.
    while not (line == '' and proc.poll() is not None):
        line = proc.stdout.readline()
        if pattern in line:
            infra.display_on_terminal(self, "Notifying all waiters")
            time.sleep(5)
            infra.notify_all_waiters(sync)
        infra.display_on_terminal(self, line)

    results_command = rally_path + " task results "
    infra.display_on_terminal(self, "Collecting results ", results_command)
    proc = subprocess.Popen(results_command, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # communicate() drains both pipes, so neither can block the child;
    # only stdout goes into the report table, as before.
    rally_output_table, _ = proc.communicate()
    for line in rally_output_table.splitlines(True):
        infra.display_on_terminal(self, line)

    # Let the infra know to complete
    infra.display_on_terminal(self, "Rally finished executing.....")
    infra.set_execution_completed(finish_execution)
    infra.create_report_table(self, rally_output_table, user_table=True)
def stop(self):
    """Show the stop message on the terminal; nothing else is done."""
    message = "Stopping the Keystone..."
    infra.display_on_terminal(self, message)
def process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a systemd service on every controller node.

    The service name is the ``process_name`` input argument.  Every host
    in the openstack config whose ``role`` contains ``'controller'`` is
    disrupted ``self.get_disruption_count()`` times: the service is
    stopped over ssh, left down for the HA interval and started again.
    Each stop and start adds a row to the "Process Disruption" table.
    When the run completes the service is started once more on every
    disrupted node.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    table_name = "Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()

    process_name = None
    if input_args:
        print("Inpt " + str(input_args))
        process_name = input_args.get('process_name', None)
    if not process_name:
        # The systemctl commands cannot be built without a service name.
        infra.display_on_terminal(self, "No process_name configured for ",
                                  str(node_name), "color=red")
        return
    infra.display_on_terminal(self, "Process ", process_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # A host without a role is never treated as a controller.
        if 'controller' in (host_config[node].get('role', None) or ''):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            self.expected_failures.append(node + "::" + process_name)
    self.set_expected_failures(self.expected_failures)

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    def node_login(node):
        # ssh target and credentials of one configured host
        details = host_config.get(node, None)
        return (details.get('ip', None), details.get('user', None),
                details.get('password', None))

    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    infra.display_on_terminal(self, "Process will be disrupted ",
                              str(disruption_count))

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            # All disruptions done; poll for completion without spinning.
            time.sleep(1)
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = node_login(node)
            # The password is deliberately not shown on the terminal.
            infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
            infra.display_on_terminal(self, "Stopping ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_stop_command)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.WARNING + 'Stopped' +
                                   HAConstants.ENDC]])
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)

            infra.display_on_terminal(self, "Starting ", process_name)
            infra.display_on_terminal(self, "Executing ",
                                      rhel_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          rhel_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name,
                                 [[ip, process_name, utils.get_timestamp(),
                                   HAConstants.OKGREEN + 'Started' +
                                   HAConstants.ENDC]])

    # bring it back to stable state: start the service on every disrupted
    # node instead of relying on variables left over from the loop above.
    for node in nodes_to_be_disrupted:
        ip, user, password = node_login(node)
        infra.display_on_terminal(self,
                                  "Bringing the process to stable state")
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        infra.ssh_and_execute_command(ip, user, password,
                                      rhel_start_command)
    infra.display_on_terminal(self, "Finishing Process Disruption")
def jump_host_process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a service on remote nodes through the jump host.

    ``jump_host_executor.py`` is copied to the jump host (the host in the
    openstack config whose ``role`` contains the ``role`` input argument).
    On every round it is run there once with the ``systemctl stop``
    command and, after the HA interval, once with ``systemctl start``.
    Its output file is fetched and parsed, and one row per result is added
    to the "Jump host Process Disruption" table.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(
        self, "Entering Jump Host Process Disruption plugin")
    table_name = "Jump host Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Without input arguments nodes, process and role are all unknown.
    if not input_args:
        infra.display_on_terminal(self, "No input arguments found for ",
                                  str(node_name), "color=red")
        return
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('node', [])
    process_name = input_args.get('process_name', None)
    role = input_args.get('role', None)
    if not process_name:
        # The systemctl commands cannot be built without a service name.
        infra.display_on_terminal(self, "No process_name configured for ",
                                  str(node_name), "color=red")
        return

    # The last host whose role contains the requested role is the jump host.
    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None)
        if role and node_role and role in node_role:
            jump_host = node
    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        return
    print("############### %s" % (process_name,))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    # Jump host details: read from the matched jump host entry, not from
    # whichever key the search loop happened to visit last.
    jump_host_details = host_config.get(jump_host, None)
    jump_host_ip = jump_host_details.get('ip', None)
    user = jump_host_details.get('user', None)
    password = jump_host_details.get('password', None)

    # copy necessary file to jump host
    runner = AnsibleRunner(jump_host_ip, user, password)
    infra.display_on_terminal(self, "Copying to ", jump_host_ip)
    runner.copy('jump_host_executor.py', 'scripts/', '/tmp/')
    infra.display_on_terminal(self, "Copied to ", jump_host_ip)

    def summarize(results):
        # (hostname, error) of one result, in either of the two layouts
        # the parsing code has been written against.
        if 'contacted' in results:
            hostname, error = None, []
            for hostname, result in results['contacted'].items():
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
            return hostname, error
        return results['hostname'], results['error']

    def run_and_report(command, success_label):
        # Run one command on the nodes via the jump host and add a row
        # per parsed result to the report table.
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, command))
        print(ret)
        # Fetch the result file locally, then delete it on the jump host.
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')
        # NOTE(review): eval() executes whatever the fetched file contains;
        # consider a literal-only parser if the output is plain literals.
        with open('/tmp/hainfra/output', 'r') as output_file:
            output_objs = eval(output_file.read())
        print(output_objs)
        for results in output_objs:
            hostname, error = summarize(results)
            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = (HAConstants.OKGREEN + success_label +
                          HAConstants.ENDC)
            infra.add_table_rows(self, table_name,
                                 [[hostname, process_name,
                                   utils.get_timestamp(), status]])

    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
    ha_interval = self.get_ha_interval()

    # TODO - if its more than one jump host
    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", rhel_stop_command)
        infra.display_on_terminal(self, "Stopping ", process_name)
        run_and_report(rhel_stop_command, 'Stopped')

        infra.display_on_terminal(self, "Will sleep for interval ",
                                  str(ha_interval))
        time.sleep(ha_interval)

        infra.display_on_terminal(self, "Starting ", process_name)
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        run_and_report(rhel_start_command, 'Started')

    infra.display_on_terminal(self, "Finishing Process Disruption")
def jump_host_disruption(self, sync=None, finish_execution=None):
    """Repeatedly reboot the configured nodes through the jump host.

    The node names come from the ``name`` entry of the plugin's input
    arguments and the jump host is the host in the openstack config whose
    ``role`` contains the ``role`` input argument.  The node names are
    written to ``/tmp/remote_ips``; on every round the remote playbook is
    run through ``AnsibleRunner`` and one row per result object is added
    to the "Jump host Disruption" report table.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Jump Host Disruption plugin")
    table_name = "Jump host Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "IP", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Without input arguments neither the nodes nor the role are known.
    if not input_args:
        infra.display_on_terminal(self, "No input arguments found for ",
                                  str(node_name), "color=red")
        return
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('name', [])
    role = input_args.get('role', None)

    # The last host whose role contains the requested role is the jump host.
    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None)
        if role and node_role and role in node_role:
            jump_host = node
    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        return
    print("############### %s" % (jump_host,))

    node_reboot_command = "reboot -f"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
    ha_interval = self.get_ha_interval()

    # Jump host details: read from the matched jump host entry, not from
    # whichever key the search loop happened to visit last.
    jump_host_details = host_config.get(jump_host, None)
    jump_host_ip = jump_host_details.get('ip', None)
    user = jump_host_details.get('user', None)
    password = jump_host_details.get('password', None)

    # TODO - if its more than one jump host
    # Write into txt file to pass via ansible playbook
    with open('/tmp/remote_ips', 'w+') as remote_ips:
        for ip in nodes_to_be_disrupted:
            remote_ips.write(ip + '\n')

    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", node_reboot_command)
        print("*" * 50)
        # The password is deliberately not echoed to stdout.
        print("user : %s" % (user,))
        print("jump_host_ip : %s" % (jump_host_ip,))
        ret = AnsibleRunner(jump_host_ip, user,
                            password).execute_on_remote()
        print(ret)

        # parse the output for report
        # NOTE(review): eval() executes whatever the fetched file contains;
        # consider a literal-only parser if the output is plain literals.
        with open('/tmp/hainfra/output', 'r') as output_file:
            output_objs = eval(output_file.read())
        print(output_objs)

        for results in output_objs:
            error = []
            hostname = None
            for (hostname, result) in results['contacted'].items():
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
            if hostname is None:
                # Nothing was contacted, so there is no host to wait for.
                infra.display_on_terminal(self, "No hosts were contacted",
                                          "color=red")
                continue
            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
            infra.display_on_terminal(self, "waiting for ", hostname,
                                      " to come online")
            if infra.wait_for_ping(hostname, 240, 5):
                infra.display_on_terminal(self, "Node ", hostname,
                                          " is online", "color=green")
            # NOTE: the original has the actual sleep disabled here; only
            # the message is shown.
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))
            if not error:
                status = (HAConstants.OKGREEN + 'Rebooted' +
                          HAConstants.ENDC)
            else:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            infra.add_table_rows(self, table_name,
                                 [[jump_host_ip, hostname,
                                   utils.get_timestamp(), status]])

    infra.display_on_terminal(self, "Finishing Node Disruption")
def jump_host_process_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a service on remote nodes through the jump host.

    ``jump_host_executor.py`` is copied to the jump host (the host in the
    openstack config whose ``role`` contains the ``role`` input argument).
    On every round it is run there once with the ``systemctl stop``
    command and, after the HA interval, once with ``systemctl start``.
    Its output file is fetched and parsed, and one row per result is added
    to the "Jump host Process Disruption" table.

    :param sync: when truthy, wait for this notification before starting.
    :param finish_execution: stored on ``self.finish_execution``; the loop
        runs until ``infra.is_execution_completed`` stops returning False.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(
        self, "Entering Jump Host Process Disruption plugin")
    table_name = "Jump host Process Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["VM", "Process", "TimeStamp",
                             "Status of Disruption"])
    infra.display_on_terminal(self, "Entering Process Disruption plugin")

    input_args_dict = self.get_input_arguments()
    node_name = list(input_args_dict.keys())[0]
    input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    print("*" * 20)
    print(input_args_dict)
    print("input_args ==> %s" % (input_args,))
    print("host_config ==> %s" % (host_config,))

    # Without input arguments nodes, process and role are all unknown.
    if not input_args:
        infra.display_on_terminal(self, "No input arguments found for ",
                                  str(node_name), "color=red")
        return
    print("Inpt " + str(input_args))
    nodes_to_be_disrupted = input_args.get('node', [])
    process_name = input_args.get('process_name', None)
    role = input_args.get('role', None)
    if not process_name:
        # The systemctl commands cannot be built without a service name.
        infra.display_on_terminal(self, "No process_name configured for ",
                                  str(node_name), "color=red")
        return

    # The last host whose role contains the requested role is the jump host.
    jump_host = None
    for node in host_config:
        node_role = host_config[node].get('role', None)
        if role and node_role and role in node_role:
            jump_host = node
    if jump_host is None:
        infra.display_on_terminal(self, "No jump host found for role ",
                                  str(role), "color=red")
        return
    print("############### %s" % (process_name,))

    rhel_stop_command = "systemctl stop " + process_name
    rhel_start_command = "systemctl start " + process_name

    # Jump host details: read from the matched jump host entry, not from
    # whichever key the search loop happened to visit last.
    jump_host_details = host_config.get(jump_host, None)
    jump_host_ip = jump_host_details.get('ip', None)
    user = jump_host_details.get('user', None)
    password = jump_host_details.get('password', None)

    # copy necessary file to jump host
    runner = AnsibleRunner(jump_host_ip, user, password)
    infra.display_on_terminal(self, "Copying to ", jump_host_ip)
    runner.copy('jump_host_executor.py', 'scripts/', '/tmp/')
    infra.display_on_terminal(self, "Copied to ", jump_host_ip)

    def summarize(results):
        # (hostname, error) of one result, in either of the two layouts
        # the parsing code has been written against.
        if 'contacted' in results:
            hostname, error = None, []
            for hostname, result in results['contacted'].items():
                if 'failed' in result:
                    print("%s >>> %s" % (hostname, result['msg']))
                    error = result['msg']
            return hostname, error
        return results['hostname'], results['error']

    def run_and_report(command, success_label):
        # Run one command on the nodes via the jump host and add a row
        # per parsed result to the report table.
        ret = runner.shell(
            'python /tmp/jump_host_executor.py "%s" "%s" >>/tmp/output'
            % (nodes_to_be_disrupted, command))
        print(ret)
        # Fetch the result file locally, then delete it on the jump host.
        runner.fetch('output', '/tmp/', '/tmp/hainfra/output')
        runner.shell('rm /tmp/output')
        # NOTE(review): eval() executes whatever the fetched file contains;
        # consider a literal-only parser if the output is plain literals.
        with open('/tmp/hainfra/output', 'r') as output_file:
            output_objs = eval(output_file.read())
        print(output_objs)
        for results in output_objs:
            hostname, error = summarize(results)
            if error:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            else:
                status = (HAConstants.OKGREEN + success_label +
                          HAConstants.ENDC)
            infra.add_table_rows(self, table_name,
                                 [[hostname, process_name,
                                   utils.get_timestamp(), status]])

    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
    ha_interval = self.get_ha_interval()

    # TODO - if its more than one jump host
    while infra.is_execution_completed(self.finish_execution) is False:
        infra.display_on_terminal(self, "Nodes to be disrupted: ",
                                  str(nodes_to_be_disrupted),
                                  " Jump host: ", jump_host_ip)
        infra.display_on_terminal(self, "Executing ", rhel_stop_command)
        infra.display_on_terminal(self, "Stopping ", process_name)
        run_and_report(rhel_stop_command, 'Stopped')

        infra.display_on_terminal(self, "Will sleep for interval ",
                                  str(ha_interval))
        time.sleep(ha_interval)

        infra.display_on_terminal(self, "Starting ", process_name)
        infra.display_on_terminal(self, "Executing ", rhel_start_command)
        run_and_report(rhel_start_command, 'Started')

    infra.display_on_terminal(self, "Finishing Process Disruption")
def setup(self):
    """Announce on the terminal that the runner is being set up."""
    banner = "Setting up the runner"
    infra.display_on_terminal(self, banner)
def teardown(self):
    """Announce on the terminal that the runner is being torn down."""
    banner = "Tearing down the runner"
    infra.display_on_terminal(self, banner)
def notify(self, *args, **kwargs):
    """Echo a received notification on the terminal.

    One line is printed for the positional arguments, followed by one
    line per keyword argument.

    :param args: positional notification payload, shown via ``str(args)``.
    :param kwargs: keyword notification payload, shown as key/value pairs.
    """
    # Every other call in this file hands display_on_terminal ready-made
    # message fragments rather than printf-style arguments, so the
    # placeholder has to be filled in before the call; otherwise a
    # literal "%s" would be shown.
    infra.display_on_terminal(self, 'Args is %s' % str(args))
    # items() behaves the same as iteritems() here and also exists on
    # Python 3 dictionaries.
    for key, value in kwargs.items():
        infra.display_on_terminal(
            self, 'Got Notification with key %s, value %s' % (key, value))
def vm_disruption(self, sync=None, finish_execution=None):
    """Repeatedly stop and restart the configured VMs through the Nova API.

    For every VM named under ``name`` in the plugin input arguments the
    method stops the server, sleeps for the HA interval, starts it again,
    waits for it to answer ping and records the outcome in the
    "VM Disruption" report table. The cycle repeats until
    ``infra.is_execution_completed(finish_execution)`` stops returning
    ``False``.

    :param sync: when truthy, handed to ``infra.wait_for_notification``
        before the disruption starts.
    :param finish_execution: handed to ``infra.is_execution_completed``
        to decide when to stop looping.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering VM Disruption plugin")

    table_name = "VM Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(
        self, table_name,
        ["VM", "IP", "TimeStamp", "Status of Disruption"])

    input_args_dict = self.get_input_arguments()
    print("======== %s" % (input_args_dict,))

    # The plugin arguments live under the first (only) key of the dict.
    input_args = None
    if input_args_dict:
        node_name = next(iter(input_args_dict))
        input_args = input_args_dict.get(node_name, None)
    if not input_args:
        # Without arguments neither the role nor the VM list is known.
        infra.display_on_terminal(
            self, "No input arguments supplied for VM disruption",
            "color=red")
        return

    host_config = infra.get_openstack_config()
    print("Inpt " + str(input_args))
    role = input_args.get('role', None)
    print('*' * 20)
    print("host_config == %s" % (host_config,))
    print("input_args == %s" % (input_args,))
    print('*' * 20)

    # The host whose role matches supplies the openrc/password used for
    # the Nova calls; when several hosts match, the last one wins.
    jump_host = None
    for node in host_config:
        if role == host_config[node].get('role', None):
            jump_host = node
            print(jump_host)
    if jump_host is None:
        infra.display_on_terminal(
            self, "No host with role ", str(role),
            " found in the openstack config", "color=red")
        return

    nodes_to_be_disrupted = input_args.get('name') or []
    print(nodes_to_be_disrupted)
    if not nodes_to_be_disrupted:
        # An empty list would make the loop below spin without doing work.
        infra.display_on_terminal(
            self, "No VMs listed under 'name'; nothing to disrupt",
            "color=red")
        return

    node_reboot_command = "reboot"
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    jump_host_config = host_config[jump_host]
    openrc = jump_host_config.get('openrc', None)
    password = jump_host_config.get('password', None)
    print(openrc)
    ha_interval = self.get_ha_interval()

    while infra.is_execution_completed(self.finish_execution) is False:
        for node in nodes_to_be_disrupted:
            # The configured VM name is used directly as the target.
            ip = node
            infra.display_on_terminal(self, "IP: ", ip, " openrc: ", openrc)
            infra.display_on_terminal(self, "Executing ",
                                      node_reboot_command)

            # Stop and restart the VM through the Nova API.
            cred = credentials.Credentials(openrc, password, 'no_env')
            error = None
            try:
                nova = nova_api.NovaHealth(cred.get_nova_credentials_v2())
                nova.nova_stop_server(ip)
                time.sleep(ha_interval)
                infra.display_on_terminal(self, "Rebooting ", ip)
                nova.nova_start_server(ip)
            except Exception as exc:
                # Keep the failure so it can be shown and recorded in the
                # report instead of being swallowed.
                error = exc

            if error is not None:
                infra.display_on_terminal(self, "Error ", error,
                                          "color=red")

            infra.display_on_terminal(self, "waiting for ", ip,
                                      " to come online")
            if infra.wait_for_ping(ip, 240, 10):
                infra.display_on_terminal(self, "Node ", ip, " is online",
                                          "color=green")
            infra.display_on_terminal(self, "Will sleep for interval ",
                                      str(ha_interval))

            # Record the real outcome: a failed Nova call must not be
            # reported as a successful reboot.
            if error is None:
                status = (HAConstants.OKGREEN + 'Rebooted' +
                          HAConstants.ENDC)
            else:
                status = HAConstants.FAIL + str(error) + HAConstants.ENDC
            infra.add_table_rows(self, table_name, [[
                node, ip, utils.get_timestamp(), status
            ]])
def notify(self, *args, **kwargs):
    """Echo a received notification on the terminal.

    One line is printed for the positional arguments, followed by one
    line per keyword argument.

    :param args: positional notification payload, shown via ``str(args)``.
    :param kwargs: keyword notification payload, shown as key/value pairs.
    """
    # Every other call in this file hands display_on_terminal ready-made
    # message fragments rather than printf-style arguments, so the
    # placeholder has to be filled in before the call; otherwise a
    # literal "%s" would be shown.
    infra.display_on_terminal(self, 'Args is %s' % str(args))
    # items() behaves the same as iteritems() here and also exists on
    # Python 3 dictionaries.
    for key, value in kwargs.items():
        infra.display_on_terminal(
            self, 'Got Notification with key %s, value %s' % (key, value))
def container_disruption(self, sync=None, finish_execution=None):
    """Stop and restart a container service on one controller node.

    The container named by ``container_name`` in the plugin input
    arguments is stopped with ``systemctl stop`` over SSH on the first
    host whose role contains ``controller``, then started again after
    the HA interval. Each action is recorded in the
    "Container Disruption" report table. When the ``disruption``
    argument is ``'infinite'`` the container is stopped once and left
    down.

    :param sync: when truthy, handed to ``infra.wait_for_notification``
        before the disruption starts.
    :param finish_execution: handed to ``infra.is_execution_completed``
        to decide when to stop looping.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Container Disruption plugin")

    table_name = "Container Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Host", "Container Process", "Timestamp",
                             "Status of Disruption"])

    # The plugin arguments live under the first (only) key of the dict.
    input_args_dict = self.get_input_arguments()
    input_args = None
    if input_args_dict:
        node_name = next(iter(input_args_dict))
        input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    if not input_args:
        infra.display_on_terminal(
            self, "No input arguments supplied for container disruption",
            "color=red")
        return

    print("Inpt " + str(input_args))
    container_name = input_args.get('container_name', None)
    disruption_type = input_args.get('disruption', None)
    if not container_name:
        # The systemctl commands below cannot be built without a name.
        infra.display_on_terminal(
            self, "No container_name supplied; nothing to disrupt",
            "color=red")
        return
    infra.display_on_terminal(self, "Container ", container_name,
                              " will be disrupted")

    nodes_to_be_disrupted = []
    for node in host_config:
        # A host without a role is treated as "not a controller".
        if 'controller' in (host_config[node].get('role', None) or ''):
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
            # For now disrupt on only one node
            break
    if not nodes_to_be_disrupted:
        infra.display_on_terminal(
            self, "No controller node found in the openstack config",
            "color=red")
        return

    # Deprecate process disruptor and converge on this for both cases later
    container_stop_command = "systemctl stop " + container_name
    container_start_command = "systemctl start " + container_name

    def ssh_credentials(node):
        """Return (ip, user, password) for a node of the host config."""
        entry = host_config.get(node, None)
        return (entry.get('ip', None), entry.get('user', None),
                entry.get('password', None))

    ha_start_delay = self.get_ha_start_delay()
    if sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")
        # Hold off the actual disruption for the configured start delay.
        # NOTE(review): the flattened source does not show whether this
        # sleep was inside the `if sync` branch -- confirm.
        time.sleep(ha_start_delay)

    ha_interval = self.get_ha_interval()
    disruption_count = self.get_disruption_count()
    if disruption_type == 'infinite':
        # Override the disruption count in executor.yaml
        disruption_count = 1

    while infra.is_execution_completed(self.finish_execution) is False:
        if not disruption_count:
            # All disruptions are done; idle until the run completes
            # instead of spinning in a tight loop.
            time.sleep(1)
            continue
        disruption_count = disruption_count - 1
        for node in nodes_to_be_disrupted:
            ip, user, password = ssh_credentials(node)
            infra.display_on_terminal(self, "Stopping ", container_name)
            infra.display_on_terminal(self, "Executing ",
                                      container_stop_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_stop_command)
            infra.add_table_rows(self, table_name, [[
                ip, container_name, utils.get_timestamp(),
                HAConstants.WARNING + 'Stopped' + HAConstants.ENDC]])
            if disruption_type == 'infinite':
                infra.display_on_terminal(
                    self,
                    "Infinite disruption chosen bring up container manually")
                break
            infra.display_on_terminal(self, "Sleeping for interval ",
                                      str(ha_interval), " seconds")
            time.sleep(ha_interval)
            infra.display_on_terminal(self, "Starting ", container_name)
            infra.display_on_terminal(self, "Executing ",
                                      container_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_start_command)
            time.sleep(ha_interval)
            infra.add_table_rows(self, table_name, [[
                ip, container_name, utils.get_timestamp(),
                HAConstants.OKGREEN + 'Started' + HAConstants.ENDC]])

    # bring it back to stable state
    if disruption_type != 'infinite':
        infra.display_on_terminal(self,
                                  "Bringing the container to stable state")
        # Credentials are looked up again here so the restore does not
        # depend on the disruption loop having run at least once.
        for node in nodes_to_be_disrupted:
            ip, user, password = ssh_credentials(node)
            infra.display_on_terminal(self, "Executing ",
                                      container_start_command)
            infra.ssh_and_execute_command(ip, user, password,
                                          container_start_command)

    infra.display_on_terminal(self, "Finishing Container Disruption")
def start(self, sync=None, finish_execution=None, args=None):
    """Required start method: run the Ansible monitoring loop.

    Reads the ``ansible`` section of the plugin input arguments, builds
    the host inventory from the openstack config file and then, until
    ``infra.is_execution_completed(finish_execution)`` stops returning
    ``False``, runs the ping/SSH, process, RabbitMQ and MariaDB checks
    once per ``frequency`` seconds. The results of every pass are kept
    in ``self.ansiresults`` (bounded by ``max_hist``). Summary reports
    are generated after the loop ends.

    :param sync: when truthy, handed to ``infra.wait_for_notification``
        before monitoring starts.
    :param finish_execution: handed to ``infra.is_execution_completed``
        to decide when to stop monitoring.
    :param args: accepted for interface compatibility; not used here.
    """
    global LOG

    # Parse user data and Initialize.
    self.finish_execution = finish_execution
    data = self.get_input_arguments()
    ansible_args = data['ansible']
    self.loglevel = ansible_args.get("loglevel", "DEBUG")
    self.frequency = ansible_args.get('frequency', 5)
    self.max_hist_size = ansible_args.get('max_hist', 25)
    self.dockerized = ansible_args.get('dockerized', False)

    LOG = infra.ha_logging(__name__, level=self.loglevel)
    print("ANSIBLE LOG LEVEL:  %s" % (self.loglevel,))
    LOG.debug("User data: %s", data)

    # Get MariaDB Username/pass
    self.mariadb_user = None
    self.mariadb_password = None
    mariadb_info = ansible_args.get('mariadb', None)
    if mariadb_info is not None:
        self.mariadb_user = mariadb_info.get('user', None)
        self.mariadb_password = mariadb_info.get('password', None)

    self.ansirunner = None
    setup_file = "../../configs/openstack_config.yaml"
    # Bounded history: the oldest pass is dropped once max_hist is reached.
    self.ansiresults = collections.deque(maxlen=self.max_hist_size)
    self.inventory = ConfigHelper(host_file=setup_file)
    # The message needs a placeholder; without one the logging module
    # reports a formatting error instead of printing the data.
    LOG.debug("parsed data: %s", self.inventory.parsed_data)

    host_list = self.inventory.get_host_list()
    host_ip_list = self.inventory.get_host_ip_list()
    control_ip_list = self.inventory.get_host_ip_list(role='controller')
    compute_ip_list = self.inventory.get_host_ip_list(role='compute')
    remote_user = self.inventory.get_host_username(host_list[0])
    LOG.debug("Inventory: [all: %s], [control: %s] [compute: %s]",
              host_ip_list, control_ip_list, compute_ip_list)
    # NOTE(review): the text between this debug call and the
    # "Waiting for Runner Notification" message was masked in the copy of
    # the file this was reviewed from; the statements below are the
    # minimal reconstruction -- confirm against the original.
    LOG.debug("Remote user: %s", remote_user)

    if sync:
        infra.display_on_terminal(self, "Waiting for Runner Notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification from Runner")

    while infra.is_execution_completed(self.finish_execution) is False:
        ####################################################
        # Ansible Monitoring Loop.
        ####################################################
        ts_results = []
        ts = utils.get_timestamp(complete_timestamp=True)
        ts_results.append({'name': 'ts', 'ts': ts})
        msg = "=" * 50 + "\n" + "Timestamp: " + ts
        infra.display_on_terminal(self, msg)

        # Ping and SSH Check.
        host_ip_list = self.inventory.get_host_ip_list()
        ansi_results = self.ansible_ssh_ping_check(host_ip_list,
                                                   remote_user)
        ts_results.append(ansi_results)

        # Process check.
        for service in SERVICE_LIST:
            host_ip_list = self.inventory.get_host_ip_list(
                role=service['role'])
            ansi_results = self.ansible_check_process(host_ip_list,
                                                      remote_user,
                                                      service['service'])
            ts_results.append(ansi_results)

        # RabbitMQ Check.
        host_ip_list = self.inventory.get_host_ip_list(role='controller')
        ansi_results = self.ansible_check_rabbitmq(host_ip_list,
                                                   remote_user)
        ts_results.append(ansi_results)

        # MariaDB Check (runs against the same controller list).
        ansi_results = self.ansible_check_mariadb(host_ip_list,
                                                  remote_user)
        ts_results.append(ansi_results)

        # Add the ts results to main result list.
        self.ansiresults.append(ts_results)
        time.sleep(self.frequency)

    # Generate Summary Reports
    self.display_ansible_summary_report()
    self.display_asible_process_report()
    infra.display_infra_report()
    self.generate_graphs_output()
def execute(self, sync=None, finish_execution=None):
    """Run a Rally task, notify waiters when it starts, and report results.

    The Rally command is built from ``rally_path`` and ``scenario_file``
    in the plugin input arguments. Its stdout is echoed to the terminal
    line by line; when a line contains the "Benchmarking..." marker all
    waiters on ``sync`` are notified after a 5 second pause. Afterwards
    the output of ``<rally_path> task show`` is collected, execution is
    marked complete and the collected text is registered as a user
    report table.

    :param sync: handed to ``infra.notify_all_waiters`` once the marker
        line is seen.
    :param finish_execution: handed to ``infra.set_execution_completed``
        when Rally is done.
    """
    # Function-scope import so the block does not rely on the module
    # already importing os.
    import os

    def stream_stdout(command):
        """Run command through the shell and yield every stdout line read."""
        # stderr was never read by this method; leaving it attached to an
        # unread pipe lets the child block once the pipe buffer fills, so
        # it is sent to the null device instead.
        with open(os.devnull, 'w') as devnull:
            proc = subprocess.Popen(command, shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=devnull)
            line = ''
            # Keep reading until an empty read coincides with process exit.
            while not (line == '' and proc.poll() is not None):
                line = proc.stdout.readline()
                yield line

    # The plugin arguments live under the first (only) key of the dict.
    input_args_dict = self.get_input_arguments()
    node_name = next(iter(input_args_dict))
    input_args = input_args_dict.get(node_name, None)

    infra.display_on_terminal(self, "Executing Rally Runner Plugin ")
    infra.display_on_terminal(self, "Rally preparing...")
    rally_path = input_args['rally_path']
    scenario_file = input_args['scenario_file']
    rally_command = rally_path + " -v task start " + scenario_file
    infra.display_on_terminal(self, "Rally Path -> ", rally_path)
    infra.display_on_terminal(self, "Scenario file -> ", scenario_file)

    pattern = "Benchmarking... This can take a while..."
    infra.display_on_terminal(self, "Executing ", rally_command)
    for line in stream_stdout(rally_command):
        if pattern in line:
            infra.display_on_terminal(self, "Notifying all waiters")
            time.sleep(5)
            infra.notify_all_waiters(sync)
        infra.display_on_terminal(self, line)

    results_command = rally_path + " task show "
    infra.display_on_terminal(self, "Collecting results ", results_command)
    table_lines = []
    for line in stream_stdout(results_command):
        infra.display_on_terminal(self, line)
        table_lines.append(line)
    rally_ouput_table = "".join(table_lines)

    # Let the infra know to complete
    infra.display_on_terminal(self, "Rally finished executing.....")
    infra.set_execution_completed(finish_execution)
    infra.create_report_table(self, rally_ouput_table, user_table=True)
def node_disruption(self, sync=None, finish_execution=None):
    """Force-reboot one node of the configured role and wait for it to return.

    Every host whose ``role`` contains the role given in the plugin
    input arguments is announced, but only the first one is rebooted
    (``reboot -f`` over SSH). The method then waits for the node to
    answer ping, records a row in the "Node Disruption" report table and
    waits once more for the node to be reachable before finishing.

    :param sync: when truthy, handed to ``infra.wait_for_notification``
        before the reboot is issued.
    :param finish_execution: stored on the instance; not consulted by
        this method.
    """
    self.sync = sync
    self.finish_execution = finish_execution
    infra.display_on_terminal(self, "Entering Node Disruption plugin")

    table_name = "Node Disruption"
    infra.create_report_table(self, table_name)
    infra.add_table_headers(self, table_name,
                            ["Node", "IP", "TimeStamp",
                             "Status of Disruption"])

    # The plugin arguments live under the first (only) key of the dict.
    input_args_dict = self.get_input_arguments()
    input_args = None
    if input_args_dict:
        node_name = next(iter(input_args_dict))
        input_args = input_args_dict.get(node_name, None)
    host_config = infra.get_openstack_config()
    if not input_args:
        infra.display_on_terminal(
            self, "No input arguments supplied for node disruption",
            "color=red")
        return

    print("Inpt " + str(input_args))
    role = input_args.get('role', None)
    if role is None:
        infra.display_on_terminal(
            self, "No role supplied; cannot select a node to disrupt",
            "color=red")
        return

    nodes_to_be_disrupted = []
    for node in host_config:
        host_role = host_config[node].get('role', None)
        # Hosts without a role can never match.
        if host_role is not None and role in host_role:
            infra.display_on_terminal(self, node, " will be disrupted ")
            nodes_to_be_disrupted.append(node)
    if not nodes_to_be_disrupted:
        infra.display_on_terminal(
            self, "No node with role ", str(role),
            " found in the openstack config", "color=red")
        return

    node_reboot_command = "reboot -f "
    if self.sync:
        infra.display_on_terminal(self, "Waiting for notification")
        infra.wait_for_notification(sync)
        infra.display_on_terminal(self, "Received notification, Starting")

    ha_interval = self.get_ha_interval()

    # Only the first matching node is rebooted, and only once.
    node = nodes_to_be_disrupted[0]
    node_config = host_config.get(node, None)
    ip = node_config.get('ip', None)
    user = node_config.get('user', None)
    password = node_config.get('password', None)
    # The password is deliberately not echoed to the terminal.
    infra.display_on_terminal(self, "IP: ", ip, " User: ", user)
    infra.display_on_terminal(self, "Executing ", node_reboot_command)
    code, out, error = infra.ssh_and_execute_command(ip, user, password,
                                                     node_reboot_command)
    if error:
        infra.display_on_terminal(self, "Error ", error, "color=red")

    infra.display_on_terminal(self, "waiting for ", ip, " to come online")
    if infra.wait_for_ping(ip, 240, 10):
        infra.display_on_terminal(self, "Node ", ip, " is online",
                                  "color=green")
    infra.display_on_terminal(self, "Will sleep for interval ",
                              str(ha_interval))
    infra.add_table_rows(self, table_name, [[
        node, ip, utils.get_timestamp(),
        HAConstants.OKGREEN + 'Rebooted' + HAConstants.ENDC]])

    # bring it back to stable state
    infra.display_on_terminal(self, "Waiting for the node to become stable")
    if infra.wait_for_ping(ip, 240, 10):
        infra.display_on_terminal(self, "Node ", ip, " is in stable state",
                                  "color=green")

    infra.display_on_terminal(self, "Finishing Node Disruption")
def print_data(self, format_string, item):
    """Format one result row with format_string and show it on the terminal.

    The row is built from the item's ``ip`` value, the value stored under
    ``NagiosMonitor.headers[1]`` (cut to 40 characters) and its ``status``
    value, each left-justified to its column width.
    """
    host_column = item.get("ip").ljust(15)
    service_column = item.get(NagiosMonitor.headers[1])[:40].ljust(25)
    status_column = item.get("status").ljust(9)
    row = format_string % (host_column, service_column, status_column)
    infra.display_on_terminal(self, row)