def __init__(self, *args, **kwargs):
    self._test_index = kwargs.get('test_index', None)
    self._test_id = kwargs.get('test_id', None)
    self._es_doc_type = "test_stats"
    self.elasticsearch = ES()
    self._stats = {}
    if not self._test_id:
        super(Stats, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.prometheus_stats = None
    self.num_of_partitions = 200000000
    self.backgroud_task = None
    self.class_users = {}
    self.connection_cql = None
    self._comparison_results = {}
    self._es = ES()
def get_by_params(cls, es_index=es_index, **params):
    es_query = cls._get_es_query_from_instance_data(params)
    filter_path = cls._get_es_filters()
    es_data = ES().search(index=es_index, q=es_query, filter_path=filter_path, size=10000)
    if not es_data:
        return []
    es_data = es_data.get('hits', {}).get('hits', {})
    if not es_data:
        return []
    return [cls(es_data=es_test_data) for es_test_data in es_data]
def elasticsearch(self) -> Optional[ES]:
    try:
        return ES()
    except Exception as exc:
        LOGGER.exception("Failed to create ES connection (doc_id=%s)", self._test_id)
        ElasticsearchEvent(doc_id=self._test_id, error=str(exc)).publish()
def get_prior_tests(self, filter_path=None) -> typing.List['TestResultClass']:
    output = []
    es_query = self.get_same_tests_query()
    es_result = ES().search(index=self._es_data['_index'], q=es_query,
                            filter_path=filter_path, size=10000)  # pylint: disable=unexpected-keyword-arg
    es_result = es_result.get('hits', {}).get('hits', None) if es_result else None
    if not es_result:
        return output
    for es_data in es_result:  # pylint: disable=not-an-iterable
        test = TestResultClass(es_data)
        output.append(test)
    return output
def get_by_params(cls, es_index=es_index, **params):
    es_query = cls._get_es_query_from_instance_data(params)
    filter_path = cls._get_es_filters()
    try:
        es_data = ES().search(index=es_index, q=es_query, filter_path=filter_path, size=10000)
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.warning("Unable to find ES data: %s", exc)
        es_data = None
    if not es_data:
        return []
    es_data = es_data.get('hits', {}).get('hits', {})
    if not es_data:
        return []
    return [cls(es_data=es_test_data) for es_test_data in es_data]
def __init__(self, es_index, es_doc_type, send_email=False, email_recipients=(),  # pylint: disable=too-many-arguments
             email_template_fp="", query_limit=1000, logger=None):
    self._es = ES()
    self._conf = self._es._conf  # pylint: disable=protected-access
    self._es_index = es_index
    self._es_doc_type = es_doc_type
    self._limit = query_limit
    self._send_email = send_email
    self._email_recipients = email_recipients
    self._email_template_fp = email_template_fp
    self.log = logger if logger else LOGGER
class Stats:
    """
    This class is responsible for creating and updating a database entry
    (a document in the Elasticsearch DB).

    There are two usage options:
    1. Without arguments - as a base class of TestStatsMixin - for saving test statistics.
    2. With arguments - as a separate object to update an existing document.
    """

    def __init__(self, *args, **kwargs):
        self._test_index = kwargs.get('test_index', None)
        self._test_id = kwargs.get('test_id', None)
        self._es_doc_type = "test_stats"
        self.elasticsearch = ES()
        self._stats = {}
        if not self._test_id:
            super(Stats, self).__init__(*args, **kwargs)

    def get_doc_id(self):
        return self._test_id

    def create(self):
        self.elasticsearch.create_doc(index=self._test_index, doc_type=self._es_doc_type,
                                      doc_id=self._test_id, body=self._stats)

    def update(self, data):
        """
        Update the document.

        :param data: data dictionary
        """
        try:
            self.elasticsearch.update_doc(index=self._test_index, doc_type=self._es_doc_type,
                                          doc_id=self._test_id, body=data)
        except Exception as ex:  # pylint: disable=broad-except
            LOGGER.error('Failed to update test stats: test_id: %s, error: %s', self._test_id, ex)

    def exists(self):
        return self.elasticsearch.exists(index=self._test_index, doc_type=self._es_doc_type,
                                         id=self._test_id)
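# Usage sketch (an assumption, not from the source): when Stats is instantiated standalone
# with a `test_index` and a `test_id` that point at an existing Elasticsearch document, it
# acts as a thin updater for that document. "some_index" and "some-test-id" below are
# hypothetical placeholders.
stats = Stats(test_index="some_index", test_id="some-test-id")
if stats.exists():
    stats.update({"status": "finished"})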
def get_prior_tests(self, filter_path=None) -> typing.List['TestResultClass']:
    output = []
    try:
        es_query = self.get_same_tests_query()
        es_result = ES().search(  # pylint: disable=unexpected-keyword-arg; pylint doesn't understand Elasticsearch code
            index=self._es_data['_index'],
            q=es_query,
            size=10000,
            filter_path=filter_path,
        )
        es_result = es_result.get('hits', {}).get('hits', None) if es_result else None
    except Exception as exc:  # pylint: disable=broad-except
        LOGGER.warning("Unable to find ES data: %s", exc)
        es_result = None
    if not es_result:
        return output
    for es_data in es_result:  # pylint: disable=not-an-iterable
        test = TestResultClass(es_data)
        output.append(test)
    return output
class SlaPerUserTest(LongevityTest):
    """
    Test the SLA per user feature using cassandra-stress.
    """
    STRESS_WRITE_CMD = 'cassandra-stress write cl=QUORUM n={n} -schema \'replication(factor=3)\' ' \
                       '-mode cql3 native user={user} password={password} -rate threads={threads}'
    STRESS_WRITE_DURATION_CMD = 'cassandra-stress write cl=ALL duration={duration} -schema \'replication(factor=3)\' ' \
                                '-mode cql3 native user={user} password={password} -rate threads={threads} ' \
                                'throttle=10000/s -pop seq={pop}'
    STRESS_READ_CMD = 'cassandra-stress read cl=ALL duration={duration} -mode cql3 native user={user} ' \
                      'password={password} -rate threads={threads} -pop {pop}'
    STRESS_MIXED_CMD = r"cassandra-stress mixed ratio\(write={write_ratio},read={read_ratio}\) cl=QUORUM " \
                       "duration={duration} " \
                       "-mode cql3 native user={user} password={password} -rate threads={threads} -pop {pop} "
    DEFAULT_USER = '******'
    DEFAULT_USER_PASSWORD = '******'
    DEFAULT_USER_SLA = 'sla_cassandra'
    DEFAULT_SHARES = 1000
    VALID_DEVIATION_PRC = 10
    MIN_CPU_UTILIZATION = 97
    WORKLOAD_LATENCY = 'latency'
    WORKLOAD_THROUGHPUT = 'throughput'
    CACHE_ONLY_LOAD = 'cache_only'
    DISK_ONLY_LOAD = 'disk_only'
    MIXED_LOAD = 'mixed'
    WORKLOAD_TYPES_INDEX = "workload_tests"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prometheus_stats = None
        self.num_of_partitions = 200000000
        self.backgroud_task = None
        self.class_users = {}
        self.connection_cql = None
        self._comparison_results = {}
        self._es = ES()

    def prepare_schema(self):
        self.prometheus_stats = PrometheusDBStats(host=self.monitors.nodes[0].public_ip_address)
        self.connection_cql = self.db_cluster.cql_connection_patient(
            node=self.db_cluster.nodes[0], user=self.DEFAULT_USER, password=self.DEFAULT_USER_PASSWORD)
        session = self.connection_cql.session
        return session

    def create_test_data(self, rows_amount=None):
        # Prefill data before tests
        if rows_amount is not None:
            self.num_of_partitions = rows_amount
        write_cmd = self.STRESS_WRITE_CMD.format(n=self.num_of_partitions, user=self.DEFAULT_USER,
                                                 password=self.DEFAULT_USER_PASSWORD, threads=250)
        self.run_stress_and_verify_threads(params={'stress_cmd': write_cmd, 'prefix': 'preload-',
                                                   'stats_aggregate_cmds': False})

    @staticmethod
    def user_to_scheduler_group(test_users, scheduler_shares):
        for user, shares in test_users.items():
            for scheduler_group, sg_shares in scheduler_shares.items():
                if shares[0] in sg_shares:
                    test_users[user].append(scheduler_group)
                    break
        return test_users

    def validate_scheduler_runtime(self, start_time, end_time, read_users, expected_ratio):
        users_with_shares = {user['user'].name: [user['service_level'].service_shares]
                             for user in read_users}
        for node_ip in self.db_cluster.get_node_private_ips():
            # Temporary solution
            scheduler_shares = self.prometheus_stats.get_scylla_scheduler_shares_per_sla(
                start_time, end_time, node_ip)
            self.log.debug('SCHEDULERS SHARES FROM PROMETHEUS: {}'.format(scheduler_shares))
            if 'service_level_sg_0' in scheduler_shares:
                scheduler_shares.pop('service_level_sg_0')
            test_users_to_sg = self.user_to_scheduler_group(test_users=users_with_shares,
                                                            scheduler_shares=scheduler_shares)
            self.log.debug('USER - SERVICE LEVEL - SCHEDULER: {}'.format(test_users_to_sg))
            # End Temporary solution
            shards_time_per_sla = self.prometheus_stats.get_scylla_scheduler_runtime_ms(
                start_time, end_time, node_ip)
            if not (shards_time_per_sla and scheduler_shares):
                continue
            runtime_per_user = {}
            for username, val in test_users_to_sg.items():
                if val[1] in shards_time_per_sla[node_ip]:
                    runtime_per_user[username] = (sum(shards_time_per_sla[node_ip][val[1]]) /
                                                  len(shards_time_per_sla[node_ip][val[1]]))
                else:
                    runtime_per_user[username] = 0
            self.log.debug('RUN TIME PER USER: {}'.format(runtime_per_user))
            actual_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users,
                                                                        metrics=runtime_per_user)
            self.validate_deviation(expected_ratio=expected_ratio, actual_ratio=actual_shares_ratio,
                                    msg='Validate scheduler CPU runtime on the node %s' % node_ip)

    @staticmethod
    def create_auths(entities_list_of_dict):
        """
        :param entities_list_of_dict: Expected structure:
            [{'user': User(), 'role': Role(), 'service_level': ServiceLevel()},
             OR
             {'user': User(), 'service_level': ServiceLevel()},
             OR
             {'role': Role(), 'service_level': ServiceLevel()}]
        """
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if service_level:
                service_level.create()
            if role:
                role.create()
                role.attach_service_level(service_level=service_level)
            if user:
                user.create()
                if role:
                    role.grant_me_to(grant_to=user)
                else:
                    user.attach_service_level(service_level=service_level)

    def validate_deviation(self, expected_ratio, actual_ratio, msg):
        dev = self.calculate_deviation(expected_ratio, actual_ratio)
        self.assertIsNotNone(dev, 'Can\'t compare expected and actual shares ratio. Expected: '
                                  '{expected_ratio}. Actual: {actual_ratio}'.format(
                                      expected_ratio=expected_ratio, actual_ratio=actual_ratio))
        # TODO: formulate error message
        self.assertTrue(dev <= self.VALID_DEVIATION_PRC,
                        '{msg}. Actual shares ratio ({actual_ratio}) is not '
                        'as expected ({expected_ratio})'.format(msg=msg, actual_ratio=actual_ratio,
                                                                expected_ratio=expected_ratio))

    @staticmethod
    def calculate_deviation(first, second):
        if first and second:
            _first, _second = (first, second) if first > second else (second, first)
            dev = float(abs(_first - _second) * 100 / _second)
            return dev
        return None

    @staticmethod
    def calculate_metrics_ratio_per_user(two_users_list, metrics=None):  # pylint: disable=invalid-name
        """
        :param metrics: calculate the ratio for specific Scylla or cassandra-stress metrics
                        (ops, scheduler_runtime etc.). If no metrics name is defined, the ratio
                        will be calculated for service_shares
        """
        if two_users_list[0]['service_level'].service_shares > two_users_list[1]['service_level'].service_shares:
            high_shares_user = two_users_list[0]
            low_shares_user = two_users_list[1]
        else:
            high_shares_user = two_users_list[1]
            low_shares_user = two_users_list[0]
        if metrics:
            high_shares_metrics = metrics[high_shares_user['user'].name]
            low_shares_metrics = metrics[low_shares_user['user'].name]
        else:
            high_shares_metrics = high_shares_user['service_level'].service_shares
            low_shares_metrics = low_shares_user['service_level'].service_shares
        if not high_shares_metrics or not low_shares_metrics:
            return None
        return float(high_shares_metrics) / float(low_shares_metrics)

    def run_stress_and_verify_threads(self, params=None):
        read_queue = []
        self._run_all_stress_cmds(read_queue, params=params)
        for queue in read_queue:
            self.verify_stress_thread(cs_thread_pool=queue)
        return read_queue

    def get_c_s_stats(self, read_queue, users, statistic_name):
        users_names = [user['user'].name for user in users]
        results = {}
        for i, read in enumerate(read_queue):
            res = self.get_stress_results(queue=read, store_results=False)
            stat_rate, username = None, None
            if res:
                stat_rate = res[0].get(statistic_name)
                username = res[0].get('username')
            if not (stat_rate and username):
                self.log.error('Stress statistics are not received for user {}. '
                               'Can\'t complete the test'.format(users_names[i]))
                return None
            self.assertEqual(username, users_names[i],
                             msg='Expected that stress was run with user "{}" but it was "{}"'
                                 .format(users_names[i], username))
            results[username] = float(stat_rate)
        return results

    def validate_if_scylla_load_high_enough(self, start_time, wait_cpu_utilization):  # pylint: disable=invalid-name
        end_time = int(time.time())
        scylla_load = self.prometheus_stats.get_scylla_reactor_utilization(start_time=start_time,
                                                                           end_time=end_time)
        self.assertTrue(scylla_load >= wait_cpu_utilization,
                        msg='Load isn\'t high enough. The test results may be not correct')

    def clean_auth(self, entities_list_of_dict):
        for entity in entities_list_of_dict:
            service_level = entity.get('service_level')
            role = entity.get('role')
            user = entity.get('user')
            if user:
                user.drop()
            if role:
                role.drop()
            if service_level:
                service_level.drop()
        self.backgroud_task = None
        self.connection_cql.cluster.shutdown()

    def warm_up_cache_before_test(self, max_key_for_read, stress_duration):
        read_cmds = [self.STRESS_READ_CMD.format(n=self.num_of_partitions,
                                                 user=self.DEFAULT_USER,
                                                 password=self.DEFAULT_USER_PASSWORD,
                                                 pop="seq=1..%d" % max_key_for_read,
                                                 duration='%dm' % stress_duration,
                                                 threads=200)]
        self.run_stress_and_verify_threads(params={'stress_cmd': read_cmds})

    # pylint: disable=too-many-arguments, too-many-locals
    def define_read_cassandra_stress_command(self, role: Role, load_type: str, c_s_workload_type: str,
                                             threads: int, stress_duration_min: int,
                                             max_rows_for_read: int = None,
                                             stress_command: str = STRESS_READ_CMD,
                                             throttle: int = 20000, **kwargs):
        """
        :param role: Role object
        :param load_type: cache_only/disk_only/mixed
        :param c_s_workload_type: latency: with ops restriction (using throttle),
                                  or throughput: no restriction
        """
        def latency():
            return '%d throttle=%d/s' % (threads, throttle)

        def throughput():  # pylint: disable=unused-variable
            return threads

        def cache_only(max_rows_for_read):  # pylint: disable=unused-variable
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=1..%d' % max_rows_for_read

        # Read from cache and disk
        def mixed(max_rows_for_read):  # pylint: disable=unused-variable
            if not max_rows_for_read:
                max_rows_for_read = self.num_of_partitions
            return "'dist=gauss(1..%d, %d, %d)'" % (max_rows_for_read,
                                                    int(max_rows_for_read / 2),
                                                    int(max_rows_for_read * 0.05))

        def disk_only(max_rows_for_read):  # pylint: disable=unused-variable
            if not max_rows_for_read:
                max_rows_for_read = int(self.num_of_partitions * 0.3)
            return 'seq=%d..%d' % (max_rows_for_read,
                                   max_rows_for_read + int(self.num_of_partitions * 0.25))

        rate = locals()[c_s_workload_type]()  # define -rate for the c-s command depending on the workload type
        pop = locals()[load_type](max_rows_for_read)  # define -pop for the c-s command depending on the load type

        params = {'n': self.num_of_partitions, 'user': role.name, 'password': role.password,
                  'pop': pop, 'duration': '%dm' % stress_duration_min, 'threads': rate}
        if kwargs:
            params.update(kwargs['kwargs'])
        c_s_cmd = stress_command.format(**params)
        self.log.info("Created cassandra-stress command: %s", c_s_cmd)
        return c_s_cmd

    def test_read_throughput_1to5_ratio(self):
        """
        Basic test
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)
        Load from both cache and disk
        """
        self._two_users_load_througput_workload(shares=[190, 950], load=self.MIXED_LOAD)

    def _two_users_load_througput_workload(self, shares, load):
        session = self.prepare_schema()
        self.create_test_data()
        # Define Service Levels/Roles/Users
        read_users = []
        for share in shares:
            read_users.append({'user': User(session=session, name='user%d' % share,
                                            password='******' % share),
                               'role': Role(session=session, name='role%d' % share),
                               'service_level': ServiceLevel(session=session, name='sla%d' % share,
                                                             shares=share)})
        expected_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users)

        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        stress_duration = 10  # minutes
        read_cmds = [self.define_read_cassandra_stress_command(role=read_users[0]["role"],
                                                               load_type=load,
                                                               c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                                                               threads=250,
                                                               stress_duration_min=stress_duration),
                     self.define_read_cassandra_stress_command(role=read_users[1]["role"],
                                                               load_type=load,
                                                               c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                                                               threads=250,
                                                               stress_duration_min=stress_duration)]
        try:
            start_time = time.time()
            read_queue = self.run_stress_and_verify_threads(params={'stress_cmd': read_cmds,
                                                                    'round_robin': True})
            results = self.get_c_s_stats(read_queue=read_queue, users=read_users,
                                         statistic_name='op rate')
            self.validate_if_scylla_load_high_enough(start_time=start_time,
                                                     wait_cpu_utilization=self.MIN_CPU_UTILIZATION)
            end_time = time.time()
            self.validate_scheduler_runtime(start_time=start_time, end_time=end_time,
                                            read_users=read_users,
                                            expected_ratio=expected_shares_ratio)
            self.assertTrue(results, msg='Not received cassandra-stress results')
            self.log.debug('Validate cassandra-stress ops deviation')
            actual_shares_ratio = self.calculate_metrics_ratio_per_user(two_users_list=read_users,
                                                                        metrics=results)
            self.validate_deviation(expected_ratio=expected_shares_ratio,
                                    actual_ratio=actual_shares_ratio,
                                    msg='Validate cassandra-stress ops.')
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def test_read_throughput_vs_latency_cache_and_disk(self):  # pylint: disable=invalid-name
        """
        Test when one user runs a load with high latency and another with high throughput.
        The load is run on the full data set (that is read from both the cache and the disk).

        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
          - user950 runs load with throttle
          - user190 runs load with high throughput

        Expected results: the 99th percentile latency of the user950 workload, when it runs in
        parallel with the user190 workload, is not significantly increased relative to the
        latency of the user950 workload run alone.
        """
        stress_duration = 10  # minutes
        shares = [190, 950]
        read_users = []
        session = self.prepare_schema()
        self.create_test_data()
        # Define Service Levels/Roles/Users
        for share in shares:
            read_users.append({'user': User(session=session, name='user%d' % share,
                                            password='******' % share),
                               'role': Role(session=session, name='role%d' % share),
                               'service_level': ServiceLevel(session=session, name='sla%d' % share,
                                                             shares=share)})
        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        # Define stress commands
        read_cmds = {'troughput': self.define_read_cassandra_stress_command(
                         role=read_users[0]["role"],
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=200,
                         stress_duration_min=stress_duration),
                     'latency': self.define_read_cassandra_stress_command(
                         role=read_users[1]["role"],
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_LATENCY,
                         threads=250,
                         stress_duration_min=stress_duration)}

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_throughput_vs_latency_cache_only(self):  # pylint: disable=invalid-name
        """
        Test when one user runs a load with high latency and another with high throughput.
        The load is run on a data set that fully exists in the cache.

        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
          - user950 runs load with throttle
          - user190 runs load with high throughput

        Expected results: the 99th percentile latency of the user950 workload, when it runs in
        parallel with the user190 workload, is not significantly increased relative to the
        latency of the user950 workload run alone.
        """
        stress_duration = 5  # minutes
        shares = [190, 950]
        # Select part of the records to warm the cache (all this data will be in the cache).
        # This amount of data will be read during the test from the cache
        max_key_for_read = int(self.num_of_partitions * 0.5)
        read_users = []
        session = self.prepare_schema()
        self.create_test_data()

        # Warm up the cache to guarantee the reads will be served from the cache
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_read, stress_duration=30)

        # Define Service Levels/Roles/Users
        for share in shares:
            read_users.append({'user': User(session=session, name='user%d' % share,
                                            password='******' % share),
                               'role': Role(session=session, name='role%d' % share),
                               'service_level': ServiceLevel(session=session, name='sla%d' % share,
                                                             shares=share)})
        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {'troughput': self.define_read_cassandra_stress_command(
                         role=read_users[0]["role"],
                         load_type=self.CACHE_ONLY_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=200,
                         stress_duration_min=stress_duration,
                         max_rows_for_read=max_key_for_read),
                     'latency': self.define_read_cassandra_stress_command(
                         role=read_users[1]["role"],
                         load_type=self.CACHE_ONLY_LOAD,
                         c_s_workload_type=self.WORKLOAD_LATENCY,
                         threads=250,
                         stress_duration_min=stress_duration,
                         max_rows_for_read=max_key_for_read)}

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_throughput_vs_latency_disk_only(self):  # pylint: disable=invalid-name
        """
        Test when one user runs a load with high latency and another with high throughput.
        The load is run on a data set that is not in the cache (read from the disk).

        Throughput - latency test:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin):
          - user950 runs load with throttle
          - user190 runs load with high throughput

        Expected results: the 99th percentile latency of the user950 workload, when it runs in
        parallel with the user190 workload, is not significantly increased relative to the
        latency of the user950 workload run alone.
        """
        stress_duration = 5  # minutes
        session = self.prepare_schema()
        self.create_test_data()
        for node in self.db_cluster.nodes:
            node.stop_scylla_server(verify_up=False, verify_down=True)
            node.start_scylla_server(verify_up=True, verify_down=False)

        # Select part of the records to warm the cache (all this data will be in the cache).
        # The cassandra-stress "-pop" parameter will start from a row number higher than
        # "max_key_for_cache" (so the test reads come from the disk)
        max_key_for_cache = int(self.num_of_partitions * 0.25)
        # Warm up the cache to guarantee the test reads will be from the disk
        self.warm_up_cache_before_test(max_key_for_read=max_key_for_cache, stress_duration=30)

        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = []
        for share in shares:
            read_users.append({'user': User(session=session, name='user%d' % share,
                                            password='******' % share),
                               'role': Role(session=session, name='role%d' % share),
                               'service_level': ServiceLevel(session=session, name='sla%d' % share,
                                                             shares=share)})
        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {'troughput': self.define_read_cassandra_stress_command(
                         role=read_users[0]["role"],
                         load_type=self.DISK_ONLY_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=200,
                         stress_duration_min=stress_duration,
                         max_rows_for_read=max_key_for_cache * 2),
                     'latency': self.define_read_cassandra_stress_command(
                         role=read_users[1]["role"],
                         load_type=self.DISK_ONLY_LOAD,
                         c_s_workload_type=self.WORKLOAD_LATENCY,
                         threads=250,
                         stress_duration_min=stress_duration,
                         max_rows_for_read=max_key_for_cache * 3),
                     'latency_only': self.define_read_cassandra_stress_command(
                         role=read_users[1]["role"],
                         load_type=self.DISK_ONLY_LOAD,
                         c_s_workload_type=self.WORKLOAD_LATENCY,
                         threads=250,
                         stress_duration_min=stress_duration,
                         max_rows_for_read=max_key_for_cache)}

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_read_50perc_write_50perc_load(self):
        """
        Test scenario:
        - Add SLA and grant to user (before any load)
        - user190 with 190 shares
        - user950 with 950 shares
        - Each user runs load from own loader (round robin)
        - Expect OPS ratio between two loads is 1:5 (e.g. 190:950)
        - Expect scheduler run time between two loads is 1:5 (e.g. 190:950)
        """
        session = self.prepare_schema()
        self.create_test_data()
        stress_duration_min = 10
        # Define Service Levels/Roles/Users
        shares = [190, 950]
        read_users = []
        for share in shares:
            read_users.append({'user': User(session=session, name='user%d' % share,
                                            password='******' % share),
                               'role': Role(session=session, name='role%d' % share),
                               'service_level': ServiceLevel(session=session, name='sla%d' % share,
                                                             shares=share)})
        # Create Service Levels/Roles/Users
        self.create_auths(entities_list_of_dict=read_users)

        read_cmds = {'troughput': self.define_read_cassandra_stress_command(
                         role=read_users[0]["role"],
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=120,
                         stress_duration_min=stress_duration_min,
                         stress_command=self.STRESS_MIXED_CMD,
                         kwargs={'write_ratio': 1, 'read_ratio': 1}),
                     'latency': self.define_read_cassandra_stress_command(
                         role=read_users[1]["role"],
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_LATENCY,
                         threads=120,
                         stress_duration_min=stress_duration_min,
                         stress_command=self.STRESS_MIXED_CMD,
                         kwargs={'write_ratio': 1, 'read_ratio': 1})}

        self._throughput_latency_tests_run(read_users=read_users, read_cmds=read_cmds,
                                           latency_user=read_users[1])

    def test_workload_types(self):
        """
        Test scenario: run 2 workload types (batch, interactive) using Roles with the relevant
        ServiceLevel objects attached to them. Validate that the metrics differ and that the
        difference is within the expected margins.
        """
        session = self.prepare_schema()
        self.create_test_data(rows_amount=100_000)
        stress_duration_min = 180
        # Define Service Levels/Roles/Users
        interactive_role = Role(session=session, name="interactive", password="******",
                                login=True, verbose=True).create()
        batch_role = Role(session=session, name="batch", password="******",
                          login=True, verbose=True).create()
        interactive_sla = ServiceLevel(session=session, name="interactive", shares=None,
                                       workload_type="interactive").create()
        batch_sla = ServiceLevel(session=session, name="batch", shares=None,
                                 workload_type="batch").create()
        interactive_role.attach_service_level(interactive_sla)
        batch_role.attach_service_level(batch_sla)

        read_cmds = {'throughput_interactive': self.define_read_cassandra_stress_command(
                         role=interactive_role,
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=120,
                         stress_duration_min=stress_duration_min,
                         stress_command=self.STRESS_MIXED_CMD,
                         kwargs={'write_ratio': 1, 'read_ratio': 1}),
                     'throughput_batch': self.define_read_cassandra_stress_command(
                         role=batch_role,
                         load_type=self.MIXED_LOAD,
                         c_s_workload_type=self.WORKLOAD_THROUGHPUT,
                         threads=120,
                         stress_duration_min=stress_duration_min,
                         stress_command=self.STRESS_MIXED_CMD,
                         kwargs={'write_ratio': 1, 'read_ratio': 1})}

        try:
            self.log.debug('Running interactive and batch workloads in sequence...')
            workloads_queue = self.run_stress_and_verify_threads(params={
                'stress_cmd': [read_cmds['throughput_interactive'], read_cmds["throughput_batch"]],
                'round_robin': True})
            self._comparison_results = self._compare_workloads_c_s_metrics(workloads_queue)
            self.log.info("C-S comparison results:\n%s", self._comparison_results)
            self.upload_c_s_comparison_to_es()
        finally:
            pass

    def _throughput_latency_tests_run(self, read_cmds, read_users, latency_user):  # pylint: disable=too-many-locals
        try:
            # Run latency workload
            test_start_time = time.time()
            self.log.debug('Start latency only workload')
            read_queue = self.run_stress_and_verify_threads(params={
                'stress_cmd': [read_cmds.get('latency_only') or read_cmds['latency']],
                'round_robin': True})
            latency_99_for_latency_workload = self.get_c_s_stats(
                read_queue=read_queue, users=[latency_user],
                statistic_name='latency 99th percentile')
            self.assertTrue(latency_99_for_latency_workload,
                            msg='Not received cassandra-stress results for latency workload')

            # Run throughput and latency workloads
            self.log.debug('Start latency workload in parallel with throughput workload')
            read_queue = self.run_stress_and_verify_threads(params={
                'stress_cmd': [read_cmds['troughput'], read_cmds['latency']],
                'round_robin': True})
            latency_99_for_mixed_workload = self.get_c_s_stats(
                read_queue=read_queue, users=read_users,
                statistic_name='latency 99th percentile')
            self.assertTrue(latency_99_for_mixed_workload,
                            msg='Not received cassandra-stress for latency workload')

            grafana_dataset = self.monitors.get_grafana_screenshot_and_snapshot(
                test_start_time=test_start_time)
            grafana_screenshots = grafana_dataset.get('screenshots', [])
            grafana_snapshots = grafana_dataset.get('snapshots', [])
            self.log.debug('GRAFANA SCREENSHOTS: {}'.format(grafana_screenshots))
            self.log.debug('GRAFANA SNAPSHOTS: {}'.format(grafana_snapshots))

            # Compare latency of two runs
            self.log.debug('Test results:\n---------------------\n')
            latency_99_latency_workload = latency_99_for_latency_workload[latency_user['user'].name]
            latency_99_mixed_workload = latency_99_for_mixed_workload[latency_user['user'].name]
            deviation = self.calculate_deviation(latency_99_latency_workload,
                                                 latency_99_mixed_workload)
            latency_change = 'increased' if latency_99_mixed_workload > latency_99_latency_workload \
                else 'decreased'
            result_print_str = '\nTest results:\n---------------------\n'
            result_print_str += '\nWorkload | Latency 99%'
            result_print_str += '\n========================= | ================='
            result_print_str += '\nLatency only | {}'.format(latency_99_latency_workload)
            result_print_str += '\nLatency and throughput | {}'.format(latency_99_mixed_workload)
            result_print_str += '\n------------------------- | -----------------'
            result_print_str += '\nLatency 99 is {} in {}%'.format(latency_change, deviation)
            self.log.info(result_print_str)
        finally:
            self.clean_auth(entities_list_of_dict=read_users)

    def _compare_workloads_c_s_metrics(self, workloads_queue: list) -> dict:
        comparison_axis = {"latency 95th percentile": 2.0,
                           "latency 99th percentile": 2.0,
                           "op rate": 2.0}
        workloads_results = {}
        for workload in workloads_queue:
            result = self.get_stress_results(queue=workload, store_results=False)
            workloads_results.update({result[0].get("username"): result[0]})
        assert len(workloads_results) == 2, \
            "Expected workload_results length to be 2, got: %s. workload_results: %s" % (
                len(workloads_results), workloads_results)

        comparison_results = {}
        try:
            for item, target_margin in comparison_axis.items():
                interactive = float(workloads_results["interactive"][item])
                batch = float(workloads_results["batch"][item])
                ratio = interactive / batch if item == "op rate" else batch / interactive
                comparison_results.update({item: {"interactive": interactive,
                                                  "batch": batch,
                                                  "diff": batch - interactive,
                                                  "ratio": ratio,
                                                  "within_margin": ratio >= target_margin}})
            return comparison_results
        except Exception:
            self.log.info("Failed to compare c-s results for batch and interactive workloads.")
            raise

    def upload_c_s_comparison_to_es(self) -> None:
        self.log.info("Uploading c-s comparison to ES...")
        es_body = {self.db_cluster.get_node().db_node_instance_type: {
            "test_id": self.test_id,
            "backend": self.db_cluster.params.get("cluster_backend"),
            "scylla_version": self.get_scylla_versions(),
            **self._comparison_results}}
        self._es.create_doc(index="workload_types", doc_type="test_stats",
                            doc_id=self.test_id, body=es_body)
        self.log.info("C-s comparison uploaded to ES.")

    def get_email_data(self):
        self.log.info("Prepare data for email")
        email_data = {}
        grafana_dataset = {}
        try:
            email_data = self._get_common_email_data()
        except Exception as error:  # pylint: disable=broad-except
            self.log.error("Error in gathering common email data: Error:\n%s", error)
        try:
            grafana_dataset = self.monitors.get_grafana_screenshot_and_snapshot(
                self.start_time) if self.monitors else {}
        except Exception as error:  # pylint: disable=broad-except
            self.log.error("Error in gathering Grafana screenshots and snapshots. Error:\n%s",
                           error)
        email_data.update({"grafana_screenshots": grafana_dataset.get("screenshots", []),
                           "grafana_snapshots": grafana_dataset.get("snapshots", []),
                           "scylla_ami_id": self.params.get("ami_id_db_scylla") or "-",
                           "region": self.params.get("region_name") or "-",
                           "workload_comparison": self._comparison_results
                           if self._comparison_results else {}})
        return email_data

    # pylint: disable=inconsistent-return-statements
    def get_test_status(self) -> str:
        if self._comparison_results:
            try:
                if all((item["within_margin"] for item in self._comparison_results.values())):
                    return "SUCCESS"
                else:
                    return "FAILED"
            except KeyError as exc:
                self.log.error("Exception on attempting to check workload comparison results:\n%s",
                               exc)
        return super().get_test_status()
def gen_kibana_dashboard_url(
        dashboard_path="app/kibana#/dashboard/03414b70-0e89-11e9-a976-2fe0f5890cd0?_g=()"):
    return "%s/%s" % (ES()._conf.get('kibana_url'), dashboard_path)  # pylint: disable=protected-access
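# Example of the produced URL (assuming 'kibana_url' is set in the ES configuration):
# <kibana_url>/app/kibana#/dashboard/03414b70-0e89-11e9-a976-2fe0f5890cd0?_g=()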
def _create_es_connection():
    return ES()
class BaseResultsAnalyzer:  # pylint: disable=too-many-instance-attributes
    def __init__(self, es_index, es_doc_type, send_email=False, email_recipients=(),  # pylint: disable=too-many-arguments
                 email_template_fp="", query_limit=1000, logger=None):
        self._es = ES()
        self._conf = self._es._conf  # pylint: disable=protected-access
        self._es_index = es_index
        self._es_doc_type = es_doc_type
        self._limit = query_limit
        self._send_email = send_email
        self._email_recipients = email_recipients
        self._email_template_fp = email_template_fp
        self.log = logger if logger else LOGGER

    def get_all(self):
        """
        Get all the test results in json format.
        """
        return self._es.search(index=self._es_index, size=self._limit)  # pylint: disable=unexpected-keyword-arg

    def get_test_by_id(self, test_id):
        """
        Get test results by test id.

        :param test_id: test id created by the performance test
        :return: test results in json format
        """
        if not self._es.exists(index=self._es_index, doc_type=self._es_doc_type, id=test_id):
            self.log.error('Test results not found: {}'.format(test_id))
            return None
        return self._es.get(index=self._es_index, doc_type=self._es_doc_type, id=test_id)

    def _test_version(self, test_doc):
        if test_doc['_source'].get('versions'):
            for value in ('scylla-server', 'scylla-enterprise-server'):
                key = test_doc['_source']['versions'].get(value)
                if key:
                    return key
        self.log.error('Scylla version is not found for test %s', test_doc['_id'])
        return None

    def render_to_html(self, results, html_file_path=""):
        """
        Render analysis results to the html template.

        :param results: results dictionary
        :param html_file_path: path to save the HTML report to (not saved when empty)
        :return: html string
        """
        self.log.info("Rendering results to html using '%s' template...", self._email_template_fp)
        loader = jinja2.FileSystemLoader(os.path.dirname(os.path.abspath(__file__)))
        print(os.path.dirname(os.path.abspath(__file__)))
        env = jinja2.Environment(loader=loader, autoescape=True,
                                 extensions=['jinja2.ext.loopcontrols'])
        template = env.get_template(self._email_template_fp)
        html = template.render(results)
        self.log.info("Results have been rendered to html")
        if html_file_path:
            with open(html_file_path, "w") as html_file:
                html_file.write(html)
            self.log.info("HTML report saved to '%s'.", html_file_path)
        return html

    def send_email(self, subject, content, html=True, files=()):
        if self._send_email and self._email_recipients:
            self.log.debug('Send email to {}'.format(self._email_recipients))
            email = Email()
            email.send(subject, content, html=html, recipients=self._email_recipients, files=files)
        else:
            self.log.warning("Won't send email (send_email: %s, recipients: %s)",
                             self._send_email, self._email_recipients)

    def gen_kibana_dashboard_url(self, dashboard_path=""):
        return "%s/%s" % (self._conf.get('kibana_url'), dashboard_path)
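# Usage sketch (an assumption, not from the source): a concrete analyzer is built with an index,
# a doc type and an email template, then queried by test id. The index name, template file,
# recipient, test id and dashboard path below are hypothetical placeholders.
analyzer = BaseResultsAnalyzer(es_index="some_results_index",
                               es_doc_type="test_stats",
                               send_email=True,
                               email_recipients=("qa@example.com",),
                               email_template_fp="results_template.html")
test_doc = analyzer.get_test_by_id("some-test-id")
dashboard_url = analyzer.gen_kibana_dashboard_url("app/kibana#/dashboard/some-dashboard-id")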
def __init__(self):
    self._nodes: list["BaseNode"] = []
    self._benchmark_runners: list[ScyllaNodeBenchmarkRunner] = []
    self._es = ES()
    self._comparison = {}
class ScyllaClusterBenchmarkManager(metaclass=Singleton):
    """
    ScyllaClusterBenchmarkManager gathers the benchmark results of all the relevant
    db nodes in the cluster and presents them in a unified fashion.
    Elasticsearch is used to store the results.
    """

    def __init__(self):
        self._nodes: list["BaseNode"] = []
        self._benchmark_runners: list[ScyllaNodeBenchmarkRunner] = []
        self._es = ES()
        self._comparison = {}

    @property
    def comparison(self):
        return self._comparison

    def add_node(self, new_node: "BaseNode"):
        if new_node.distro.is_debian_like:
            self._benchmark_runners.append(ScyllaNodeBenchmarkRunner(new_node))
        else:
            LOGGER.debug("Skipped installing benchmarking tools on a non-debian-like distro.")

    def add_nodes(self, nodes: list["BaseNode"]):
        for node in nodes:
            self.add_node(node)

    def install_benchmark_tools(self):
        try:
            parallel = ParallelObject(self._benchmark_runners, timeout=300)
            parallel.run(lambda x: x.install_benchmark_tools(), ignore_exceptions=True)
        except TimeoutError as exc:
            LOGGER.warning("Ran into TimeoutError while installing benchmark tools: Exception:\n%s", exc)

    def run_benchmarks(self):
        try:
            parallel = ParallelObject(self._benchmark_runners, timeout=300)
            parallel.run(lambda x: x.run_benchmarks(), ignore_exceptions=True)
        except TimeoutError as exc:
            LOGGER.warning("Ran into TimeoutError while running benchmarks. Exception:\n%s", exc)
        self._collect_benchmark_output()
        self._compare_results()

    def _collect_benchmark_output(self):
        """
        Collect the results from the ScyllaNodeBenchmarkRunner instances and post them
        to Elasticsearch.
        """
        test_id = TestConfig().test_id()
        for runner in self._benchmark_runners:
            if runner.benchmark_results:
                results = {"test_id": test_id,
                           "node_instance_type": runner.node_instance_type,
                           "node_name": runner.node_name,
                           **runner.benchmark_results}
                doc_id = f"{test_id}-{runner.node_name.split('-')[-1]}"
                self._es.create_doc(index=ES_INDEX, doc_type=None, doc_id=doc_id, body=results)
            else:
                LOGGER.info("No benchmark results for node: %s", runner.node_name)

    def _get_all_benchmark_results(self) -> dict:
        return self._es.get_all("node_benchmarks")

    def _compare_results(self):
        for runner in self._benchmark_runners:
            if not runner.benchmark_results:
                continue
            try:
                result = ComparableResult(
                    sysbench_eps=runner.benchmark_results["sysbench_events_per_second"],
                    cassandra_fio_read_bw=runner.benchmark_results["cassandra_fio_lcs_64k_read"]["read"]["bw"],
                    cassandra_fio_write_bw=runner.benchmark_results["cassandra_fio_lcs_64k_write"]["write"]["bw"])
                averages = self._get_average_results(es_docs=self._get_all_benchmark_results(),
                                                     instance_type=runner.node_instance_type,
                                                     test_id=TestConfig().test_id())
                self._comparison.update(self._check_results(
                    node_name=runner.node_name, averages=averages, result=result,
                    margins=Margins(sysbench_eps=0.03, cassandra_fio_read_bw=0.01,
                                    cassandra_fio_write_bw=0.01)))
            except Exception as exc:  # pylint: disable=broad-except
                LOGGER.warning("Failed to generate comparable result for the following item:\n%s"
                               "\nException:%s", runner.benchmark_results, exc)
                continue

    @staticmethod
    def _check_results(node_name: str, averages: Averages, result: ComparableResult,
                       margins: Margins) -> dict:
        results = {node_name: {}}
        for item in result._fields:
            avg_ratio = result[item] / averages[item] if averages[item] > 0 else 1.0
            results[node_name][item] = {"value": result[item],
                                        "average": averages[item],
                                        "average_ratio": avg_ratio,
                                        "is_within_margin": avg_ratio > (1 - margins[item])}
        return results

    @staticmethod
    def _get_average_results(es_docs: dict, instance_type: str, test_id: str):
        sources = [item["_source"] for item in es_docs["hits"]["hits"]]
        docs = [doc for doc in sources
                if doc["node_instance_type"] == instance_type and doc["test_id"] != test_id]
        results = []
        if not docs:
            return Averages()
        for item in docs:
            try:
                results.append(ComparableResult(
                    sysbench_eps=item["sysbench_events_per_second"],
                    cassandra_fio_read_bw=item["cassandra_fio_lcs_64k_read"]["read"]["bw"],
                    cassandra_fio_write_bw=item["cassandra_fio_lcs_64k_write"]["write"]["bw"]))
            except Exception as exc:  # pylint: disable=broad-except
                LOGGER.warning("Failed to generate comparable result for the following item:\n%s"
                               "\nException:%s", item, exc)
        eps = [item.sysbench_eps for item in results]
        read_bw = [item.cassandra_fio_read_bw for item in results]
        write_bw = [item.cassandra_fio_write_bw for item in results]
        return Averages(sysbench_eps=sum(eps) / len(eps),
                        cassandra_fio_read_bw=sum(read_bw) / len(read_bw),
                        cassandra_fio_write_bw=sum(write_bw) / len(write_bw))
class ScyllaClusterBenchmarkManager(metaclass=Singleton):
    """
    ScyllaClusterBenchmarkManager gathers the benchmark results of all the relevant
    db nodes in the cluster and presents them in a unified fashion.
    Elasticsearch is used to store the results.
    """

    def __init__(self):
        self._nodes: list["BaseNode"] = []
        self._benchmark_runners: list[ScyllaNodeBenchmarkRunner] = []
        self._es = ES()

    def add_node(self, new_node: "BaseNode"):
        if new_node.distro.is_debian_like:
            self._benchmark_runners.append(ScyllaNodeBenchmarkRunner(new_node))
        else:
            LOGGER.debug("Skipped installing benchmarking tools on a non-debian-like distro.")

    def add_nodes(self, nodes: list["BaseNode"]):
        for node in nodes:
            self.add_node(node)

    def install_benchmark_tools(self):
        try:
            parallel = ParallelObject(self._benchmark_runners, timeout=300)
            parallel.run(lambda x: x.install_benchmark_tools(), ignore_exceptions=True)
        except TimeoutError as exc:
            LOGGER.warning("Ran into TimeoutError while installing benchmark tools: Exception:\n%s", exc)

    def run_benchmarks(self):
        try:
            parallel = ParallelObject(self._benchmark_runners, timeout=300)
            parallel.run(lambda x: x.run_benchmarks(), ignore_exceptions=True)
        except TimeoutError as exc:
            LOGGER.warning("Ran into TimeoutError while running benchmarks. Exception:\n%s", exc)
        self._collect_benchmark_output()

    def _collect_benchmark_output(self):
        """
        Collect the results from the ScyllaNodeBenchmarkRunner instances and post them
        to Elasticsearch.
        """
        test_id = TestConfig().test_id()
        for runner in self._benchmark_runners:
            if runner.benchmark_results:
                results = {"test_id": test_id,
                           "node_instance_type": runner.node_instance_type,
                           "node_name": runner.node_name,
                           **runner.benchmark_results}
                doc_id = f"{test_id}-{runner.node_name.split('-')[-1]}"
                self._es.create_doc(index=ES_INDEX, doc_type=None, doc_id=doc_id, body=results)
            else:
                LOGGER.info("No benchmark results for node: %s", runner.node_name)