def query(self, query, start, end, scrap_metrics_step=None):
    """
    :param query: Prometheus query expression
    :param start: time=<rfc3339 | unix_timestamp>: Start timestamp.
    :param end: time=<rfc3339 | unix_timestamp>: End timestamp.
    :param scrap_metrics_step: granularity of the data requested from the Prometheus DB
    :return: [{
                 metric: { },
                 values: [[unix_timestamp1, value1], [unix_timestamp2, value2]...[unix_timestampN, valueN]]
             }]
    """
    url = "http://{}:{}/api/v1/query_range?query=".format(normalize_ipv6_url(self.host), self.port)
    if not scrap_metrics_step:
        scrap_metrics_step = self.scylla_scrape_interval
    _query = "{url}{query}&start={start}&end={end}&step={scrap_metrics_step}".format(
        url=url, query=query, start=start, end=end, scrap_metrics_step=scrap_metrics_step)
    LOGGER.debug("Query to PrometheusDB: %s", _query)
    result = self.request(url=_query)
    if result:
        return result["data"]["result"]
    LOGGER.error("Prometheus query unsuccessful!")
    return []
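# Hedged usage sketch (not part of the original code): how query() above might be
# called. The class name PrometheusDBStats, the host, the metric name, and the time
# window are assumptions made up for illustration.
import time

prom = PrometheusDBStats(host="172.17.0.2")  # hypothetical monitor host
now = int(time.time())
series = prom.query(query="scylla_reactor_utilization", start=now - 3600, end=now)
for entry in series:
    # each entry is {"metric": {...labels...}, "values": [[ts, "val"], ...]}
    print(entry["metric"], entry["values"][:3])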
def __init__(self, host, port=9090, alternator=None):
    self.host = host
    self.port = port
    self.range_query_url = "http://{}:{}/api/v1/query_range?query=".format(normalize_ipv6_url(host), port)
    self.config = self.get_configuration()
    self.alternator = alternator
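# Minimal sketch (an assumption, not the project's actual implementation) of what
# normalize_ipv6_url is expected to do throughout these snippets: wrap bare IPv6
# literals in brackets so they can be embedded in "http://host:port" URLs, and
# leave hostnames and IPv4 addresses unchanged.
def normalize_ipv6_url_sketch(host: str) -> str:
    if ":" in host and not host.startswith("["):  # looks like a bare IPv6 literal
        return "[{}]".format(host)
    return host

assert normalize_ipv6_url_sketch("fe80::1") == "[fe80::1]"
assert normalize_ipv6_url_sketch("10.0.0.1") == "10.0.0.1"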
def get_configuration(self):
    result = self.request(url="http://{}:{}/api/v1/status/config".format(normalize_ipv6_url(self.host), self.port))
    configs = yaml.safe_load(result["data"]["yaml"])
    LOGGER.debug("Parsed Prometheus configs: %s", configs)
    new_scrape_configs = {}
    for conf in configs["scrape_configs"]:
        new_scrape_configs[conf["job_name"]] = conf
    configs["scrape_configs"] = new_scrape_configs
    return configs
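# Hedged usage sketch: after get_configuration(), scrape_configs is re-keyed by
# job_name, so one job's settings can be looked up directly instead of scanning
# the original list. The "scylla" job name and the host are assumptions.
prom = PrometheusDBStats(host="172.17.0.2")
scylla_job = prom.config["scrape_configs"].get("scylla", {})
print(scylla_job.get("scrape_interval"))  # e.g. "20s", if the job defines one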
def web_driver_docker_client(self) -> Optional[DockerClient]:
    if not self.ssh_login_info:
        return None
    SSHAgent.add_keys((self.ssh_login_info["key_file"], ))
    # Because of a bug in the docker package (https://github.com/docker-library/python/issues/517),
    # the port must be passed explicitly to support IPv6.
    user = self.ssh_login_info['user']
    hostname = normalize_ipv6_url(self.ssh_login_info['hostname'])
    try:
        return DockerClient(base_url=f"ssh://{user}@{hostname}:22", timeout=DOCKER_API_CALL_TIMEOUT)
    except paramiko.ssh_exception.BadHostKeyException as exc:
        # A stale entry in ~/.ssh/known_hosts can fail the SSH handshake;
        # drop the offending key and retry once.
        system_host_keys_path = os.path.expanduser("~/.ssh/known_hosts")
        system_host_keys = paramiko.hostkeys.HostKeys(system_host_keys_path)
        if system_host_keys.pop(exc.hostname, None):
            system_host_keys.save(system_host_keys_path)
        return DockerClient(base_url=f"ssh://{user}@{hostname}:22", timeout=DOCKER_API_CALL_TIMEOUT)
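# Hedged usage sketch: obtaining a Docker client over SSH from a node object.
# `node` is a hypothetical instance of the class defining web_driver_docker_client().
client = node.web_driver_docker_client()
if client is not None:
    print(client.version())  # standard docker SDK call on a DockerClient instance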
def check_timeout(self):  # pylint: disable=too-many-locals
    assert self.monitors.nodes, 'Monitor node should be set, we will try to get metrics from Prometheus server'
    base_url = "http://%s:9090/api/v1/query_range" % normalize_ipv6_url(self.monitors.nodes[0].external_address)
    range_str = "&start={0.start}&end={0.end}".format(self)
    cmd = ["curl", "{}?query=scylla_storage_proxy_coordinator_read_timeouts{}&step=60s".format(base_url, range_str)]
    self.log.debug('Get read timeout per minute by Prometheus API, cmd: %s', cmd)
    # Run curl directly, without a shell: shell=True combined with a list
    # argument would execute only "curl" and silently drop the URL.
    result = subprocess.check_output(cmd)
    orig_data = json.loads(result)
    read_timeout_msg = 'Read timeout of whole datacenter per minute should be less than 5000'
    self.log.debug('Check if we have significant read timeout, %s', read_timeout_msg)

    # Parse the Prometheus response into a result matrix:
    # one row per shard, one column per sample.
    matrix = []
    for i in orig_data['data']['result']:
        shard_unit = []
        for j in i['values']:
            shard_unit.append(int(j[1]))
        matrix.append(shard_unit)

    # Walk through the matrix and check the timeout delta per minute.
    prev = None
    significant = []
    for time_idx in range(len(matrix[0])):
        all_timeout = 0
        for shard_unit in matrix:
            all_timeout += shard_unit[time_idx]
        if prev:
            timeout_per_min = all_timeout - prev
            self.log.debug('timeout_per_min: %s', timeout_per_min)
            if timeout_per_min > 5000:
                significant.append(timeout_per_min)
        prev = all_timeout
    self.log.debug(significant)
    assert not significant, read_timeout_msg
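# Standalone sketch of the per-minute delta check used in check_timeout() above,
# runnable against fabricated sample data. Because the Prometheus counter is
# cumulative, consecutive per-sample totals are subtracted to get timeouts per step.
def significant_timeouts(matrix, threshold=5000):
    prev, significant = None, []
    for time_idx in range(len(matrix[0])):
        total = sum(shard[time_idx] for shard in matrix)  # sum across shards
        if prev:
            delta = total - prev
            if delta > threshold:
                significant.append(delta)
        prev = total
    return significant

# Two shards, three samples of a cumulative counter: deltas are 3000 and 7000,
# and only the second one crosses the 5000 threshold.
assert significant_timeouts([[100, 2100, 6100], [200, 1200, 4200]]) == [7000]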
def test_custom_time(self):
    """
    Run cassandra-stress with params defined in data_dir/scylla.yaml
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    self.db_cluster.add_nemesis(nemesis=self.get_nemesis_class(), tester_obj=self)
    stress_queue = list()
    write_queue = list()
    verify_queue = list()

    # prepare write workload
    prepare_write_cmd = self.params.get('prepare_write_cmd', default=None)
    keyspace_num = self.params.get('keyspace_num', default=1)
    pre_create_schema = self.params.get('pre_create_schema', default=False)

    alternator_port = self.params.get('alternator_port', default=None)
    if alternator_port:
        endpoint_url = 'http://{}:{}'.format(normalize_ipv6_url(self.db_cluster.nodes[0].external_address),
                                             alternator_port)
        alternator_create_table(endpoint_url, test_params=self.params)

    if prepare_write_cmd:
        # In some cases (like many keyspaces), we want to create the schema (all keyspaces & tables) before
        # the load starts - under heavy load, schema propagation can take a long time and c-s fails.
        if pre_create_schema:
            self._pre_create_schema(keyspace_num,
                                    scylla_encryption_options=self.params.get('scylla_encryption_options', None))
        # When the load is too heavy for one loader when using MULTI-KEYSPACES, the load is spread evenly
        # across the loaders (round_robin).
        if keyspace_num > 1 and self.params.get('round_robin'):
            self.log.debug("Using round_robin for multiple Keyspaces...")
            for i in range(1, keyspace_num + 1):
                keyspace_name = self._get_keyspace_name(i)
                self._run_all_stress_cmds(write_queue, params={'stress_cmd': prepare_write_cmd,
                                                               'keyspace_name': keyspace_name,
                                                               'round_robin': True})
        # Not using round_robin and all keyspaces will run on all loaders
        else:
            self._run_all_stress_cmds(write_queue, params={'stress_cmd': prepare_write_cmd,
                                                           'keyspace_num': keyspace_num,
                                                           'round_robin': self.params.get('round_robin')})

        # In some cases we don't want the nemesis to run during the "prepare" stage, in order to be 100% sure
        # that all keys were written successfully.
        if self.params.get('nemesis_during_prepare'):
            # Wait for some data (according to the param in the yaml) to be populated; for multi-keyspace,
            # note that it checks only keyspace1.
            self.db_cluster.wait_total_space_used_per_node(keyspace=None)
            self.db_cluster.start_nemesis()

        # Wait on the queue till all threads come back.
        # TODO: improve this part for cases where threads are killed and we don't catch it.
        for stress in write_queue:
            self.verify_stress_thread(cs_thread_pool=stress)

        # Run nodetool flush on all nodes to make sure nothing is left in memory.
        # Commented out for now: when the data corruption bug was found we wanted to be on the safe side,
        # but this approach should not continue. If it is added back, it needs a try-except wrapper because
        # it can run in parallel to nemesis and fail on one of the nodes.
        # self._flush_all_nodes()

        # In case we would like to verify all keys were written successfully before we start other stress / nemesis
        prepare_verify_cmd = self.params.get('prepare_verify_cmd', default=None)
        if prepare_verify_cmd:
            self._run_all_stress_cmds(verify_queue, params={'stress_cmd': prepare_verify_cmd,
                                                            'keyspace_num': keyspace_num})
            for stress in verify_queue:
                self.verify_stress_thread(cs_thread_pool=stress)

    # Collect data about partitions and their rows amount
    validate_partitions = self.params.get('validate_partitions', default=None)
    table_name, primary_key_column, partitions_dict_before = '', '', {}
    if validate_partitions:
        table_name = self.params.get('table_name', default=None)
        primary_key_column = self.params.get('primary_key_column', default=None)
        self.log.debug('Save partitions info before reads')
        partitions_dict_before = self.collect_partitions_info(table_name=table_name,
                                                              primary_key_column=primary_key_column,
                                                              save_into_file_name='partitions_rows_before.log')

    stress_cmd = self.params.get('stress_cmd', default=None)
    if stress_cmd:
        # Stress: Same as in prepare_write - allow the load to be spread across all loaders when using multi ks
        if keyspace_num > 1 and self.params.get('round_robin'):
            self.log.debug("Using round_robin for multiple Keyspaces...")
            for i in range(1, keyspace_num + 1):
                keyspace_name = self._get_keyspace_name(i)
                params = {'keyspace_name': keyspace_name, 'round_robin': True, 'stress_cmd': stress_cmd}
                self._run_all_stress_cmds(stress_queue, params)
        # The old method, when we run all stress_cmds for all keyspaces on the same loader
        else:
            params = {'keyspace_num': keyspace_num, 'stress_cmd': stress_cmd}
            self._run_all_stress_cmds(stress_queue, params)

    customer_profiles = self.params.get('cs_user_profiles', default=[])
    if customer_profiles:
        cs_duration = self.params.get('cs_duration', default='50m')
        for cs_profile in customer_profiles:
            assert os.path.exists(cs_profile), 'File not found: {}'.format(cs_profile)
            self.log.debug('Run stress test with user profile {}, duration {}'.format(cs_profile, cs_duration))
            profile_dst = os.path.join('/tmp', os.path.basename(cs_profile))
            with open(cs_profile) as pconf:
                cont = pconf.readlines()
                user_profile_table_count = self.params.get(  # pylint: disable=invalid-name
                    'user_profile_table_count', default=1)
                for i in range(user_profile_table_count):
                    for cmd in [line.lstrip('#').strip() for line in cont if line.find('cassandra-stress') > 0]:
                        stress_cmd = (cmd.format(profile_dst, cs_duration))
                        params = {'stress_cmd': stress_cmd, 'profile': cs_profile}
                        self.log.debug('Stress cmd: {}'.format(stress_cmd))
                        self._run_all_stress_cmds(stress_queue, params)

    fullscan = self._get_fullscan_params()
    if fullscan:
        self.log.info('Fullscan target: {} Fullscan interval: {}'.format(fullscan['ks.cf'], fullscan['interval']))
        self.run_fullscan_thread(ks_cf=fullscan['ks.cf'], interval=fullscan['interval'])

    # Check if we shall wait for total_used_space or if nemesis wasn't started
    if not prepare_write_cmd or not self.params.get('nemesis_during_prepare'):
        self.db_cluster.wait_total_space_used_per_node(keyspace=None)
        self.db_cluster.start_nemesis()

    stress_read_cmd = self.params.get('stress_read_cmd', default=None)
    if stress_read_cmd:
        params = {'keyspace_num': keyspace_num, 'stress_cmd': stress_read_cmd}
        self._run_all_stress_cmds(stress_queue, params)

    for stress in stress_queue:
        self.verify_stress_thread(cs_thread_pool=stress)

    if (stress_read_cmd or stress_cmd) and validate_partitions:
        self.log.debug('Save partitions info after reads')
        partitions_dict_after = self.collect_partitions_info(table_name=table_name,
                                                             primary_key_column=primary_key_column,
                                                             save_into_file_name='partitions_rows_after.log')
        self.assertEqual(partitions_dict_before, partitions_dict_after,
                         msg='Row amount in partitions is not the same before and after running the nemesis')
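# Hedged sketch of the user-profile parsing done in test_custom_time() above:
# commented-out cassandra-stress lines inside a profile file are extracted,
# uncommented, and formatted with the profile path and duration. Note that
# `.find('cassandra-stress') > 0` only matches lines where the command does not
# start at column 0, i.e. the commented ones. The profile content is fabricated.
cont = [
    "keyspace: keyspace1\n",
    "#cassandra-stress user profile={} duration={} ops'(insert=1)'\n",
]
profile_dst, cs_duration = "/tmp/profile.yaml", "50m"
for cmd in [line.lstrip('#').strip() for line in cont if line.find('cassandra-stress') > 0]:
    print(cmd.format(profile_dst, cs_duration))
# -> cassandra-stress user profile=/tmp/profile.yaml duration=50m ops'(insert=1)'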
def create_snapshot(self):
    url = "http://{}:{}/api/v1/admin/tsdb/snapshot".format(normalize_ipv6_url(self.host), self.port)
    result = self.request(url, True)
    LOGGER.debug('Request result: {}'.format(result))
    return result
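# Hedged usage sketch for create_snapshot(). Per the public Prometheus admin API
# (which must be enabled with --web.enable-admin-api), a successful POST to
# /api/v1/admin/tsdb/snapshot returns the snapshot directory name; the exact
# shape of self.request()'s return value is an assumption here.
result = prom.create_snapshot()
# Expected along the lines of: {"status": "success", "data": {"name": "20230101T000000Z-..."}}
if result and result.get("status") == "success":
    print("snapshot dir:", result["data"]["name"])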
def create_endpoint_url(self, node):
    return 'http://{}:{}'.format(normalize_ipv6_url(node.external_address), self.params.get("alternator_port"))
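# Hedged sketch: an Alternator endpoint URL like the one built above is typically
# consumed by a DynamoDB client; the boto3 usage here is an assumption, not shown
# in the original code, and the credentials are dummies.
import boto3

endpoint_url = "http://10.0.0.1:8080"  # hypothetical value from create_endpoint_url()
dynamodb = boto3.resource("dynamodb", endpoint_url=endpoint_url,
                          region_name="us-east-1",
                          aws_access_key_id="alternator",
                          aws_secret_access_key="secret")
print(list(dynamodb.tables.limit(10)))  # list up to 10 tables via the endpoint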