def reset_all_app_stats(self, name):
    """
    Reset all the stats for an application
    """
    l.info("Attempting to reset client stats for %s...", name)
    assert(name in self.apps)
    for task_id, info in self.apps[name]['ip_port_map'].items():
        port = info[0]
        ip = info[1]
        ha_sub = HAnalyser(ip, port, task_id)
        # Signal it to reset all client stats
        ha_sub.reset_stats()
        ha_sub.stop()  # closes the Analyser socket, cannot be used anymore
def fetch_app_stats(self, name, group_name=""): """ Fetch stats from all the instances of the app and store it locally. The stats collection is done while looking at "msg_cnt" so it's mandatory that all the stats are required to have a field msg_cnt while collecting the msg_cnt is monitored, and stats collection is completed when the msg_cnt stops increasing between two successive reads. @args: name: Name of the app group_name: Group name if only group singal required (optional) """ assert(name in self.apps) task_list = self.all_task_ids[name] if group_name: assert(group_name in self.app_group) task_list = self.app_group[group_name] l.info("Attempting to fetch client group stats for app[%s], group[%s]...", name, group_name) else: l.info("Attempting to fetch client stats for app[%s]...", name) self.apps[name]['stats'] = {} first_itr = True no_delay_needed_count = 0 for task_id in task_list: info = self.apps[name]['ip_port_map'][task_id] port = info[0] ip = info[1] ha_sub = HAnalyser(ip, port, task_id) # Signal it to start sending data, blocks until PUB responsds with "DONE" after sending all data stats = ha_sub.get_stats() while first_itr: time.sleep(.1) stats2 = ha_sub.get_stats() # if it's the first read make sure that the sub has stopped receiving data if (stats['msg_cnt'] == stats2['msg_cnt']): # first_itr = False no_delay_needed_count += 1 if (no_delay_needed_count > 40): # No more delays if 100 successive read's where # stable on msg_cnt first_itr = False break no_delay_needed_count = 0 stats = stats2 ha_sub.stop() # closes the ANalyser socket, can not be used anymore stats['task_id'] = task_id self.apps[name]['stats'][str(ip) + ':' + str(port)] = stats # copy.deepcopy(stats)
def reset_all_app_stats(self, name, group_name=""): """ Reset all the stats for an application @args: name: Name of the app group_name: Group name if only group singal required (optional) """ assert (name in self.apps) task_list = self.all_task_ids[name] if group_name: assert (group_name in self.app_group) task_list = self.app_group[group_name].tasks_list l.info( "Attempting to reset client group stats for app[%s], group[%s]...", name, group_name) else: l.info("Attempting to reset client stats for app[%s]...", name) for task_id in task_list: info = self.apps[name]['ip_port_map'][task_id] port = info[0] ip = info[1] ha_sub = HAnalyser(ip, port, task_id) # Signal it to reset all client stats ha_sub.reset_stats() ha_sub.stop( ) # closes the ANalyser socket, can not be used anymore
def rerun_test(self, options):
    self.options = options
    self.reset_all_app_stats(self.stress_client)
    # Signal message sending
    l.info("Sending signal to Cassandra Stress client to start sending all messages..")
    # Force start-time for ALL clients +60 seconds from current time
    start_time = datetime.now() + timedelta(seconds=60)
    l.debug("Current Time: %s, Start Time: %s" % (datetime.now(), start_time))
    task_list = self.all_task_ids[self.stress_client]
    ha_list = []
    for task_id in task_list:
        info = self.apps[self.stress_client]['ip_port_map'][task_id]
        port = info[0]
        ip = info[1]
        ha_stress = HAnalyser(ip, port, task_id)
        # Signal ALL clients to start sending data, blocks until clients respond with "DONE" after sending all data
        ha_stress.start_test(start_time=start_time)
        ha_list.append(ha_stress)
    l.info('Waiting for test(s) to end...')
    if self.options.sim_failure:
        l.debug("Simulate Cassandra Node Failure. Init.")
        # Thread Event to indicate tests have been completed
        tests_completed = threading.Event()
        # Launch parallel Thread to simulate cassandra node failure.
        l.debug("Launch separate thread to simulate node failure and rejoin.")
        failure_thread = threading.Thread(target=simulate_node_failure,
                                          args=(self.options.cluster_ips.split(','),
                                                self.options.test_duration, tests_completed))
        failure_thread.start()
    for idx, ha_stress in enumerate(ha_list):
        l.debug('Waiting for task [%s] in [%s:%s] test to END. Iteration: %s' %
                (ha_stress.task_id, ha_stress.server_ip, ha_stress.port, idx))
        ha_stress.wait_for_testend()
    if self.options.sim_failure:
        l.debug("ALL tests are COMPLETED.")
        tests_completed.set()
    l.info('Fetch App Stats')
    self.fetch_app_stats(self.stress_client)
    return self.result_parser()
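# Hedged sketch (assumption, not the project's implementation): rerun_test()
# above only requires that simulate_node_failure() accepts the cluster IP list,
# the test duration and a threading.Event, and that it returns once the event
# is set or the simulated outage is over. A minimal stand-in honouring that
# contract could look like this; the actual stop/start of Cassandra is left as
# a placeholder.
import random


def simulate_node_failure(cluster_ips, test_duration, tests_completed):
    """ Hypothetical stand-in: 'fail' one node, wait, then 'rejoin' it,
        exiting early if the tests complete first. """
    victim = random.choice(cluster_ips)
    l.info("Simulating failure of node %s", victim)
    # placeholder: stop the Cassandra service on 'victim' here
    tests_completed.wait(timeout=test_duration / 2.0)
    l.info("Rejoining node %s", victim)
    # placeholder: restart the service on 'victim' and wait for it to rejoin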
def reset_all_app_stats(self, name, group_name=""): """ Reset all the stats for an application @args: name: Name of the app group_name: Group name if only group singal required (optional) """ assert(name in self.apps) task_list = self.all_task_ids[name] if group_name: assert(group_name in self.app_group) task_list = self.app_group[group_name] l.info("Attempting to reset client group stats for app[%s], group[%s]...", name, group_name) else: l.info("Attempting to reset client stats for app[%s]...", name) for task_id in task_list: info = self.apps[name]['ip_port_map'][task_id] port = info[0] ip = info[1] ha_sub = HAnalyser(ip, port, task_id) # Signal it to reset all client stats ha_sub.reset_stats() ha_sub.stop() # closes the ANalyser socket, can not be used anymore
def fetch_app_stats(self, name):
    """
    Fetch stats from all the instances of the app and store them locally.
    Stats collection keys off "msg_cnt", so every instance's stats are
    required to have a msg_cnt field. While collecting, msg_cnt is
    monitored and collection is complete once msg_cnt stops increasing
    between two successive reads.
    """
    assert(name in self.apps)
    ipm = self.apps[name]['ip_port_map']
    self.apps[name]['stats'] = {}
    first_itr = True
    no_delay_needed_count = 0
    for task_id, info in ipm.items():
        port = info[0]
        ip = info[1]
        ha_sub = HAnalyser(ip, port, task_id)
        # Fetch the analyser's current stats snapshot
        stats = ha_sub.get_stats()
        while first_itr:
            time.sleep(.1)
            stats2 = ha_sub.get_stats()
            # if it's the first pass, make sure that the sub has stopped receiving data
            if (stats['msg_cnt'] == stats2['msg_cnt']):
                # first_itr = False
                no_delay_needed_count += 1
                if (no_delay_needed_count > 40):
                    # No more delays once 40 successive reads were
                    # stable on msg_cnt
                    first_itr = False
                break
            no_delay_needed_count = 0
            stats = stats2
        ha_sub.stop()  # closes the Analyser socket, cannot be used anymore
        stats['task_id'] = task_id
        self.apps[name]['stats'][str(ip) + ':' + str(port)] = stats  # copy.deepcopy(stats)
def remove_unresponsive_tasks(self, name, group_name=""):
    """
    Ping all the application tasks and remove any that do not respond
    to the ping from the active task list.
    @args:
        name:       Name of the app
        group_name: Group name if only that group should be checked (optional)
    """
    assert(name in self.apps)
    task_list = self.all_task_ids[name]
    if group_name:
        assert(group_name in self.app_group)
        task_list = self.app_group[group_name].tasks_list
        l.debug('Pinging group instances of app[%s], group[%s] to make sure they are started....', name, group_name)
    else:
        l.debug('Pinging instances of app[%s] to make sure they are started....', name)
    cnt = 0
    remove_list = []
    for task_id in task_list:
        info = self.apps[name]['ip_port_map'][task_id]
        port = info[0]
        ip = info[1]
        ha = HAnalyser(ip, port, task_id)
        # Ping the task to verify it is up and responding
        res = ha.do_ping()
        if not res:
            l.info("Ping failed to [%s] %s:%s. removing from client list" % (task_id, ip, port))
            remove_list.append(task_id)
            ha.stop()
        cnt += res
        ha.stop()  # closes the Analyser socket, cannot be used anymore
    l.info('Done pinging all the clients. Got pong response from %d out of %d' %
           (cnt, len(self.apps[name]['ip_port_map'])))
    temp_dict = {}
    for g_name in self.app_group.keys():
        temp_dict[g_name] = []
    for item in remove_list:
        l.info("Removing client [%s]" % (item))
        del self.apps[name]['ip_port_map'][item]
        self.all_task_ids[name].remove(item)
        for g_name, g_obj in self.app_group.items():
            g_list = g_obj.tasks_list
            l.debug("Checking if bad client[%s] is in group[%s]", item, g_name)
            l.debug(g_list)
            if item in g_list:
                l.info("Appending [%s] in group [%s]", item, g_name)
                temp_dict[g_name].append(item)
    l.info(temp_dict)
    for g_name, bad_list in temp_dict.items():
        for bad_client in bad_list:
            l.info("Removing client [%s] from group [%s]", bad_client, g_name)
            self.app_group[g_name].tasks_list.remove(bad_client)
def fetch_app_stats(self, name, group_name=""): """ Fetch stats from all the instances of the app and store it locally. The stats collection is done while looking at "msg_cnt" so it's mandatory that all the stats are required to have a field msg_cnt while collecting the msg_cnt is monitored, and stats collection is completed when the msg_cnt stops increasing between two successive reads. @args: name: Name of the app group_name: Group name if only group singal required (optional) """ assert (name in self.apps) task_list = self.all_task_ids[name] if group_name: assert (group_name in self.app_group) task_list = self.app_group[group_name].tasks_list l.info( "Attempting to fetch client group stats for app[%s], group[%s]...", name, group_name) else: l.info("Attempting to fetch client stats for app[%s]...", name) self.apps[name]['stats'] = {} first_itr = True no_delay_needed_count = 0 for task_id in task_list: info = self.apps[name]['ip_port_map'][task_id] port = info[0] ip = info[1] ha_sub = HAnalyser(ip, port, task_id) # Signal it to start sending data, blocks until PUB responsds with "DONE" after sending all data stats = ha_sub.get_stats() while first_itr: time.sleep(.1) stats2 = ha_sub.get_stats() # if it's the first read make sure that the sub has stopped receiving data if (stats['msg_cnt'] == stats2['msg_cnt']): # first_itr = False no_delay_needed_count += 1 if (no_delay_needed_count > 40): # No more delays if 100 successive read's where # stable on msg_cnt first_itr = False break no_delay_needed_count = 0 stats = stats2 ha_sub.stop( ) # closes the ANalyser socket, can not be used anymore stats['task_id'] = task_id self.apps[name]['stats'][str(ip) + ':' + str(port)] = stats # copy.deepcopy(stats)
def ping_all_app_inst(self, name):
    """
    Ping all the application tasks and remove any that do not respond
    to the ping from the active task list.
    """
    l.info('Pinging all the instances of %s to make sure they are started....', name)
    cnt = 0
    remove_list = []
    for task_id, info in self.apps[name]['ip_port_map'].items():
        port = info[0]
        ip = info[1]
        ha = HAnalyser(ip, port, task_id)
        # Ping the task to verify it is up and responding
        res = ha.do_ping()
        if not res:
            l.info("Ping failed to [%s] %s:%s. removing from client list" % (task_id, ip, port))
            remove_list.append(task_id)
            ha.stop()
        cnt += res
        ha.stop()  # closes the Analyser socket, cannot be used anymore
    l.info('Done pinging all the clients. Got pong response from %d out of %d' %
           (cnt, len(self.apps[name]['ip_port_map'])))
    for item in remove_list:
        del self.apps[name]['ip_port_map'][item]
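# Optional sketch (assumption): do_ping() above is treated as a single yes/no
# probe before a client is dropped. If transient network drops are a concern,
# a small retry wrapper such as this hypothetical helper could replace the
# bare ha.do_ping() call.
import time


def ping_with_retries(ha, retries=3, delay=0.5):
    """ Hypothetical helper: retry HAnalyser.do_ping() a few times
        before declaring the task unresponsive. """
    for _ in range(retries):
        if ha.do_ping():
            return True
        time.sleep(delay)
    return False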
def rerun_test(self, options):
    self.set_options(options)
    self.boundary_setup(self.options, 'msg_rate', self.boundary_resultfn)
    # self.test_duration = options.test_duration
    # self.msg_batch = options.msg_batch
    # self.msg_rate = options.msg_rate
    l.info("Updating test metrics: test_duration=%s, msg_batch=%s, msg_rate=%s",
           self.options.test_duration, self.options.msg_batch, self.options.msg_rate)
    # Update the PUB server with new metrics
    self.ha_pub.update_config(test_duration=self.options.test_duration,
                              msg_batch=self.options.msg_batch,
                              msg_requested_rate=self.options.msg_rate)
    l.info("PUB server updated")
    self.reset_all_app_stats(self.zstsub)

    # Select which subs are going to be slow and which will reconnect,
    # send them their rates, and record these properties on the sub app
    # data structure.
    acnt = self.get_app_instcnt(self.zstsub)
    slow_num = int(acnt * options.slow_clients_percent / 100)
    update_sub_config = False
    if slow_num:
        slow_clients = self.get_app_property(self.zstsub, 'slow_clients')
        if not slow_clients or int(slow_num) != len(slow_clients):
            # reset all the clients
            self.set_app_property(self.zstsub, 'slow_clients',
                                  Set(self.random_select_instances(self.zstsub, slow_num)))
            update_sub_config = True
    rec_num = int(acnt * options.rec_clients_percent / 100)
    if rec_num:
        rec_clients = self.get_app_property(self.zstsub, 'reconnecting_clients')
        if not rec_clients or rec_num != len(rec_clients):
            self.set_app_property(self.zstsub, 'reconnecting_clients',
                                  Set(self.random_select_instances(self.zstsub, rec_num)))
            update_sub_config = True
    if update_sub_config:
        # Now update all the slow and reconnecting clients
        ipm = self.get_app_ipport_map(self.zstsub)
        slow_set = self.get_app_property(self.zstsub, 'slow_clients')
        rec_set = self.get_app_property(self.zstsub, 'reconnecting_clients')
        for key in ipm.keys():
            ip = ipm[key][1]
            port = ipm[key][0]
            ha = HAnalyser(ip, port, key)
            recv_rate = 0
            reconnect_rate = 0
            if slow_set and key in slow_set:
                print("Task ID " + key + " is going to be slow")
                recv_rate = options.slow_clients_rate
            if rec_set and key in rec_set:
                print("Task ID " + key + " is going to be reconnecting")
                reconnect_rate = options.rec_clients_rate
            ha.update_config(recv_rate=recv_rate, reconnect_rate=reconnect_rate)
            ha.stop()

    # Signal message sending
    l.info("Sending signal to PUB to start sending all messages..")
    self.ha_pub.start_test()
    self.ha_pub.wait_for_testend()
    self.fetch_app_stats(self.zstpub)
    assert(len(self.apps[self.zstpub]['stats']) == 1)
    pub_data = list(self.apps[self.zstpub]['stats'].values())[0]
    l.info("Publisher sent %d packets at the rate of %d pps" % (pub_data['msg_cnt'], pub_data['rate']))
    # Fetch all sub client data
    self.fetch_app_stats(self.zstsub)
    return self.result_parser()
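# Hedged usage sketch (assumption): the boundary_setup(..., 'msg_rate', ...)
# call above suggests rerun_test() is meant to be driven repeatedly while the
# requested message rate is varied. The hypothetical driver below steps
# msg_rate upward and stops at the first run whose parsed result looks failed;
# treating a falsy return value as failure is an assumption.
def find_max_stable_rate(test, options, start_rate=10000, step=10000, max_rate=200000):
    """ Hypothetical driver: raise msg_rate until a run no longer keeps up. """
    best_rate = 0
    rate = start_rate
    while rate <= max_rate:
        options.msg_rate = rate
        result = test.rerun_test(options)
        if not result:  # assumption: falsy result means the subscribers fell behind
            break
        best_rate = rate
        rate += step
    return best_rate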
def __init__(self, server_ip, server_port):
    HAnalyser.__init__(self, server_ip, server_port)
def __init__(self, server_ip, server_port, task_id):
    HAnalyser.__init__(self, server_ip, server_port, task_id)