def stress_seq_range(self, row_count, command_part1, command_part2):
    """Run cassandra-stress on all load generators, splitting the key
    sequence 1..row_count into disjoint per-generator ranges.

    Parameters
    ----------
    row_count: int
        Total number of rows to populate across all load generators.
    command_part1: str
        Part of the stress command before the population argument.
    command_part2: str
        Part of the stress command after the population argument.
    """
    load_ip_count = len(self.load_ips)
    row_count_per_ip = row_count // load_ip_count
    # Exclusive upper bounds: generator i covers [bounds[i], bounds[i+1]).
    bounds = [1]
    for _ in range(load_ip_count):
        bounds.append(bounds[-1] + row_count_per_ip)
    # Any remainder from the integer division goes to the last generator.
    bounds[-1] = row_count + 1
    population_commands = []
    for i in range(len(bounds) - 1):
        start = bounds[i]
        # BUG FIX: ranges used to overlap on the boundary key (seq=1..51,
        # seq=51..100) and n over-counted by one; ranges are now disjoint.
        end = bounds[i + 1] - 1
        population_commands.append(f' n={end - start + 1} -pop seq={start}..{end} ')
    log(population_commands)
    log_important("Cassandra-Stress: started")
    run_parallel(self.__stress,
                 [(ip, 10 if i > 0 else 0, command_part1 + pop_command + command_part2)
                  for i, (ip, pop_command) in enumerate(zip(self.load_ips, population_commands))])
    log_important("Cassandra-Stress: done")
def exec(self, cql):
    """
    Executes a CQL command via cqlsh on the remote node.

    The CQL is written to a uniquely named remote script file, executed
    with 'cqlsh -f', and the script is removed afterwards — even when
    the cqlsh invocation fails.

    Parameters
    ----------
    cql: str
        The CQL command
    """
    if not self.started:
        self.wait_for_cql_start()
        self.started = True
    # Unique name so concurrent calls do not clobber each other's script.
    script_name = str(uuid.uuid4()) + ".cql"
    log_important(f"cqlsh exec: [{cql}]")
    ssh = self.__new_ssh(self.ip)
    ssh.exec(f"touch {script_name}")
    ssh.exec(f"echo \"{cql}\" > {script_name}")
    try:
        cmd = "cqlsh "
        if self.username:
            cmd += f"-u {self.username} "
        if self.password:
            cmd += f"-p {self.password} "
        cmd += f"-f {script_name}"
        ssh.exec(cmd)
    finally:
        # BUG FIX: the script used to be leaked when cqlsh failed.
        ssh.exec(f"rm {script_name}")
    log_important("cqlsh done")
def wait_for_cql_start(self, timeout=7200, connect_timeout=10, max_tries_per_second=2):
    """Block until the node accepts CQL connections.

    Delegates to the module-level wait_for_cql_start helper.

    Parameters
    ----------
    timeout: int
        Maximum number of seconds to wait for CQL to become available.
    connect_timeout: int
        Timeout in seconds for a single connection attempt.
    max_tries_per_second: int
        Upper bound on connection attempts per second.
    """
    # f-prefixes removed from placeholder-less string literals (F541).
    log_important("cql: wait for start")
    wait_for_cql_start(self.ip, timeout, connect_timeout, max_tries_per_second)
    log_important("cqlsh: running")
def run(self, command):
    """Run diskplorer with the given command on every node in parallel."""
    start_stamp = datetime.now().strftime("%H:%M:%S")
    log_important(f'Disk Explorer run: started [{start_stamp}]')
    log(f"python3 diskplorer.py {command}")
    run_args = [(ip, command) for ip in self.ips]
    run_parallel(self.__run, run_args)
    end_stamp = datetime.now().strftime("%H:%M:%S")
    log_important(f'Disk Explorer run: done [{end_stamp}]')
def start(self):
    """Start every Cassandra node and its exporter, one node at a time."""
    log_important(f"Starting Cassandra nodes {self.cluster_public_ips}")
    for node_ip in self.cluster_public_ips:
        self.__start(node_ip)
        # Wait for the node to serve CQL before moving on to the next one.
        wait_for_cql_start(node_ip)
        log_machine(node_ip, """Node finished bootstrapping""")
        self.__start_exporter(node_ip)
    log_important(f"Starting Cassandra nodes {self.cluster_public_ips}: done")
def stress(self, command, load_index=None):
    """Run scylla-bench on all load generators (load_index None) or on a
    single generator selected by load_index."""
    if load_index is not None:
        log("using load_index " + str(load_index))
        self.__stress(self.load_ips[load_index], command)
        return
    log_important("scylla-bench: started")
    run_parallel(self.__stress, [(ip, command) for ip in self.load_ips])
    log_important("scylla-bench: done")
def stress(self, command, load_index=None):
    """Run cassandra-stress on all load generators (load_index None) or
    on a single generator selected by load_index."""
    if load_index is not None:
        self.__stress(self.load_ips[load_index], 0, command)
        return
    log_important("Cassandra-Stress: started")
    # Generators after the first are delayed by 10 seconds — presumably to
    # let the first one create the schema; confirm against __stress.
    tasks = [(ip, 10 if idx > 0 else 0, command)
             for idx, ip in enumerate(self.load_ips)]
    run_parallel(self.__stress, tasks)
    log_important("Cassandra-Stress: done")
def restart_cluster(cluster_public_ips, cluster_user, ssh_options, duration_seconds=90):
    """Drain and restart scylla-server on every node, then wait for the
    cluster to settle for duration_seconds."""
    log_important("Restart cluster ")
    nodes = PSSH(cluster_public_ips, cluster_user, ssh_options)
    for cmd in ("nodetool drain", "sudo systemctl restart scylla-server"):
        log(cmd)
        nodes.exec(cmd)
    log(f"Waiting {duration_seconds} seconds")
    sleep(duration_seconds)
    log_important("Cluster restarted")
def start(self):
    """Start the scylla-monitoring stack on the monitoring machine."""
    log_important("Prometheus start: started")
    monitor = SSH(self.ip, self.user, self.ssh_options)
    # Locate the scylla-monitoring checkout and launch it with ../data as
    # the Prometheus data directory.
    monitor.exec(f"""
        mkdir -p data
        dir=$(find . -maxdepth 1 -type d -name "scylla-monitoring*" -print -quit)
        cd $dir
        ./start-all.sh -v {self.scylla_version} -d ../data
        """)
    log_important("Prometheus start: done")
def stop(self):
    """Stop the scylla-monitoring stack on the monitoring machine."""
    log_important("Prometheus stop: started")
    monitor = SSH(self.ip, self.user, self.ssh_options)
    # Find the scylla-monitoring checkout and run its kill script.
    monitor.exec(f"""
        dir=$(find . -maxdepth 1 -type d -name "scylla-monitoring*" -print -quit)
        echo "directory [$dir]"
        cd $dir
        ./kill-all.sh
        """)
    log_important("Prometheus stop: done")
def __install_perf(self):
    """Install the perf tool on all machines, updating packages first if
    that has not happened yet."""
    log_important("Perf install: started")
    nodes = self.__pssh()
    if not self.updated:
        nodes.update()
        self.updated = True
    # HACK: hard-codes one kernel-specific linux-tools package; this should
    # not depend on a particular version.
    nodes.install_one("perf", "linux-tools-5.4.0-1035-aws")
    log_important("Perf install: done")
def exec(self, command):
    """
    Runs the given perf command on every remote machine (from /tmp).

    The command needs to be the full command like 'sudo perf record ...'
    """
    log_important("Perf: started")
    log(command)
    remotes = PSSH(self.ip_list, self.user, self.ssh_options)
    remotes.exec(f"""
        cd /tmp
        {command}
        """)
    log_important("Perf: done")
def __install_scylla_debuginfo(self):
    """
    Install scylla debug info
    """
    nodes = self.__pssh()
    if not self.updated:
        nodes.update()
        self.updated = True
    log_important("Installing debuginfo: started")
    # The package name differs between distros/packagings; try both.
    for package in ("scylla_debuginfo", "scylla-server-dbg"):
        nodes.try_install(package)
    log_important("Installing debuginfo: done")
def cli():
    """Restart Scylla Monitoring against an existing Prometheus data dir.

    Takes one positional argument (the Prometheus data directory), kills
    any running monitoring stack in the checkout pointed to by the
    SCYLLA_MONITORING environment variable, and starts it again against
    the given data directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("dir", help="The directory containing the prometheus data", nargs=1)
    args = parser.parse_args()

    old_wd = os.getcwd()
    data_dir = args.dir[0]
    if not os.path.isabs(data_dir):
        data_dir = os.path.join(old_wd, data_dir)
    if not os.path.exists(data_dir):
        print(f"[{data_dir}] does not exist!")
        exit(1)
    if not os.path.isdir(data_dir):
        print(f"[{data_dir}] is not a directory!")
        exit(1)

    # BUG FIX: a missing SCYLLA_MONITORING used to raise a raw KeyError;
    # fail with a clear message instead. (Also removed a duplicated
    # 'old_wd = os.getcwd()' assignment.)
    scylla_monitoring_path = os.environ.get('SCYLLA_MONITORING')
    if scylla_monitoring_path is None:
        print("SCYLLA_MONITORING environment variable is not set!")
        exit(1)

    os.chdir(scylla_monitoring_path)
    try:
        util.log_important("Killing Scylla Monitoring: starting")
        os.system("./kill-all.sh")
        util.log_important("Killing Scylla Monitoring: done")
        util.log_important("Starting Scylla Monitoring: started")
        os.system(f"./start-all.sh -d {data_dir} -s prometheus/scylla_servers.example.yml")
        util.log_important("Starting Scylla Monitoring: done")
    finally:
        # Restore the caller's working directory even if a step fails.
        os.chdir(old_wd)
def stop(self, load_index=None, erase_data=False):
    """Stop Cassandra nodes.

    Parameters
    ----------
    load_index: int | None
        When None, all nodes in cluster_public_ips are stopped in parallel.
        Otherwise only the node at this index is stopped.
    erase_data: bool
        When True and load_index is given, the stopped node's data
        directory is removed afterwards.
        NOTE(review): erase_data is ignored when load_index is None —
        confirm whether stop-all should also erase.
    """
    if load_index is None:
        log_important("Stop Cassandra: started")
        run_parallel(self.__stop, [(ip, ) for ip in self.cluster_public_ips])
        log_important("Stop Cassandra: done")
    else:
        self.__stop(self.cluster_public_ips[load_index])
        if erase_data:
            ssh = self.__new_ssh(self.cluster_public_ips[load_index])
            # Data lives on the RAID mount when setup_raid was used.
            path_prefix = 'cassandra-raid/' if self.setup_raid else './'
            ssh.exec(
                f"rm -rf {path_prefix}apache-cassandra-{self.cassandra_version}/data"
            )
def clear_cluster(cluster_public_ips, cluster_user, ssh_options, duration_seconds=90):
    """Stop scylla on all nodes, wipe the data and commitlog directories,
    start scylla again and wait duration_seconds for it to settle."""
    log_important("Shutting down cluster and removing all data")
    nodes = PSSH(cluster_public_ips, cluster_user, ssh_options)
    # pssh.exec("nodetool flush")
    steps = (
        ("Stopping scylla", "sudo systemctl stop scylla-server"),
        ("Removing data dir", "sudo rm -fr /var/lib/scylla/data/*"),
        ("Removing commit log", "sudo rm -fr /var/lib/scylla/commitlog/*"),
        ("Starting scylla", "sudo systemctl start scylla-server"),
    )
    for message, cmd in steps:
        log(message)
        nodes.exec(cmd)
    log(f"Waiting {duration_seconds} seconds")
    sleep(duration_seconds)
    log_important("Cluster cleared and restarted")
def __trim_recursivly(self, dir):
    """Trim warmup/cooldown from every .hdr file one directory level below
    dir, skipping files already prefixed with 'trimmed_' so re-runs do not
    trim twice. (Method name keeps its historical misspelling because
    callers use it.)
    """
    # BUG FIX: the second condition used to re-test warmup_seconds, so a
    # cooldown-only configuration skipped trimming entirely.
    if self.warmup_seconds is None and self.cooldown_seconds is None:
        return
    log_important("HdrLogProcessor.trim_recursively")
    for hdr_file in glob.iglob(dir + '/*/*.hdr', recursive=True):
        filename = os.path.basename(hdr_file)
        if filename.startswith("trimmed_"):
            continue
        log(hdr_file)
        self.__trim(hdr_file)
    log_important("HdrLogProcessor.trim_recursively")
def collect_flamegraph(self, dir, data_file="perf.data", flamegraph_file="flamegraph.svg"):
    """
    Collect the remotely created flame-graphs
    """
    log_important("Perf collecting flamegraph: started")
    remotes = PSSH(self.ip_list, self.user, self.ssh_options)
    # --no-online
    remotes.exec(f"""
        cd /tmp
        sudo perf script -i {data_file} | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl --hash > {flamegraph_file}
        """)
    # Download the rendered SVGs, then remove them from the machines.
    remotes.scp_from_remote(f"/tmp/{flamegraph_file}", dir)
    remotes.exec(f"rm /tmp/{flamegraph_file}")
    log_important("Perf collecting flamegraph: done")
def collect_results(self, dir, warmup_seconds=None, cooldown_seconds=None):
    """
    Downloads results from all load generators and post-processes the
    HDR histograms.

    Parameters
    ----------
    dir: str
        The download directory.
    warmup_seconds : str
        The warmup period in seconds. If the value is set, additional
        files will be created where the warmup period is trimmed.
    cooldown_seconds : str
        The cooldown period in seconds. If the value is set, additional
        files will be created where the cooldown period is trimmed.
    """
    log_important(f"Collecting results: started")
    run_parallel(self.__collect, [(ip, dir) for ip in self.load_ips])
    p = HdrLogProcessor(self.properties, warmup_seconds=warmup_seconds, cooldown_seconds=cooldown_seconds)
    p.process(dir)
    log_important(f"Collecting results: done")
    log(f"Results can be found in [{dir}]")
def insert(self, partition_count, nodes, partition_offset=0, concurrency=64, clustering_row_count=1, extra_args=""):
    """Insert partition_count partitions using all load generators.

    Parameters
    ----------
    partition_count: int
        Total number of partitions to insert, split over the generators.
    nodes: str
        The nodes argument passed to scylla-bench.
    partition_offset: int
        Offset of the first partition to insert.
    concurrency: int
        scylla-bench concurrency per load generator.
    clustering_row_count: int
        Number of clustering rows per partition.
    extra_args: str
        Extra arguments appended to the scylla-bench command.
    """
    log_important(f"Inserting {partition_count} partitions")
    start_seconds = time.time()
    generator_count = len(self.load_ips)
    # BUG FIX: the remainder of the integer division used to be silently
    # dropped; spread it over the first generators so every partition is
    # inserted.
    base_count, remainder = divmod(partition_count, generator_count)
    cmd_list = []
    for i in range(generator_count):
        pc_per_lg = base_count + (1 if i < remainder else 0)
        cmd = f"""-workload sequential \
                  -clustering-row-count {clustering_row_count} \
                  -mode write \
                  -partition-count {pc_per_lg} \
                  -partition-offset {partition_offset} \
                  -nodes {nodes} \
                  -concurrency {concurrency} \
                  {extra_args}"""
        # Collapse all whitespace runs to single spaces.
        cmd = " ".join(cmd.split())
        cmd_list.append(cmd)
        partition_offset = partition_offset + pc_per_lg
    futures = []
    for i in range(generator_count):
        f = self.async_stress(cmd_list[i], load_index=i)
        futures.append(f)
        if i == 0:
            # first one is given some extra time to set up the tables and all that.
            time.sleep(10)
    for f in futures:
        f.join()
    duration_seconds = time.time() - start_seconds
    log(f"Duration : {duration_seconds} seconds")
    log(f"Insertion rate: {partition_count // duration_seconds} items/second")
    log_important(f"Inserting {partition_count} partitions: done")
def install(self):
    """Install Cassandra on every node, optionally assembling a RAID
    volume for the data directory first."""
    log_important("Installing Cassandra: started")
    if self.setup_raid:
        log_important("Installing Cassandra: setting up RAID")
        raid = RAID(self.cluster_public_ips, self.ssh_user,
                    '/dev/nvme*n1', 'cassandra-raid', 0, self.properties)
        raid.install()
        log_important("Installing Cassandra: finished setting up RAID")
    install_args = [(ip,) for ip in self.cluster_public_ips]
    run_parallel(self.__install, install_args)
    log_important("Installing Cassandra: done")
def __install_flamegraph(self):
    """Clone the FlameGraph scripts into /tmp on all machines (idempotent)."""
    log_important("Perf install flamegraph: started")
    nodes = self.__pssh()
    if not self.updated:
        nodes.update()
        self.updated = True
    nodes.install("git")
    # needed for addr2line
    nodes.install("binutils")
    nodes.exec(f"""
        cd /tmp
        if [ ! -d FlameGraph ]; then
            echo "cloning flamegraph"
            git clone https://github.com/brendangregg/FlameGraph
        fi
        """)
    log_important("Perf install flamegraph: done")
def collect_results(self, dir, warmup_seconds=None, cooldown_seconds=None):
    """
    Downloads the benchmark results from every load generator and
    post-processes the HDR histograms.

    Parameters
    ----------
    dir: str
        The download directory.
    warmup_seconds : str
        The warmup period in seconds. If the value is set, additional
        files will be created where the warmup period is trimmed.
    cooldown_seconds : str
        The cooldown period in seconds. If the value is set, additional
        files will be created where the cooldown period is trimmed.
    """
    log_important("Collecting results: started")
    download_args = [(ip, dir) for ip in self.load_ips]
    run_parallel(self.__collect, download_args)
    processor = HdrLogProcessor(self.properties,
                                warmup_seconds=warmup_seconds,
                                cooldown_seconds=cooldown_seconds)
    processor.process(dir)
    log_important("Collecting results: done")
    log(f"Results can be found in [{dir}]")
def insert(self, profile, item_count, nodes, mode="native cql3", rate="threads=100", sequence_start=None):
    """Insert item_count items with cassandra-stress, dividing the key
    sequence evenly over all load generators."""
    log_important(f"Inserting {item_count} items")
    start_seconds = time.time()
    per_load_generator = item_count // len(self.load_ips)
    # Default the sequence to start at key 1.
    first = 1 if sequence_start is None else sequence_start
    last = first + per_load_generator - 1
    cmd_list = []
    for ip in self.load_ips:
        cmd = f'user profile={profile} "ops(insert=1)" n={per_load_generator} no-warmup -pop seq={first}..{last} -mode {mode} -rate {rate} -node {nodes}'
        log(ip + " " + cmd)
        cmd_list.append(cmd)
        first = last + 1
        last = last + per_load_generator
    futures = []
    for index, cmd in enumerate(cmd_list):
        futures.append(self.async_stress(cmd, load_index=index))
        if index == 0:
            # The first generator gets a head start — presumably to create
            # the schema before the others begin; confirm against async_stress.
            time.sleep(10)
    for f in futures:
        f.join()
    duration_seconds = time.time() - start_seconds
    log(f"Duration : {duration_seconds} seconds")
    log(f"Insertion rate: {item_count // duration_seconds} items/second")
    log_important(f"Inserting {item_count} items: done")
def __merge_recursivly(self, dir):
    """Merge per-generator .hdr files that share a base name into a single
    union histogram written directly into dir. (Method name keeps its
    historical misspelling because callers use it.)
    """
    log_important("HdrLogProcessor.merge_recursively")
    log(dir)
    # todo be careful with merging the merge file.
    # Group the .hdr files one directory level down by base file name.
    files_map = {}
    for hdr_file in glob.iglob(dir + '/*/*.hdr', recursive=True):
        log(hdr_file)
        base = os.path.splitext(os.path.basename(hdr_file))[0]
        files_map.setdefault(base, []).append(hdr_file)
    for name, files in files_map.items():
        # Renamed from 'input', which shadowed the builtin.
        input_flags = "".join(" -ifp " + file for file in files)
        cmd = f'{self.java_path} -cp {self.lib_dir}/processor.jar CommandDispatcherMain union {input_flags} -of {dir}/{name}.hdr'
        log(cmd)
        os.system(cmd)
    log_important("HdrLogProcessor.merge_recursively")
def nodes_start(cluster_user, ssh_options, *public_ips):
    """Start scylla-server on each of the given nodes."""
    log_important(f"Starting nodes {public_ips}")
    cluster = PSSH(public_ips, cluster_user, ssh_options)
    cluster.exec("sudo systemctl start scylla-server")
    log_important(f"Starting nodes {public_ips}: done")
def install(self):
    """Install Cassandra-Stress on every load generator in parallel."""
    log_important("Installing Cassandra-Stress: started")
    targets = [(ip,) for ip in self.load_ips]
    run_parallel(self.__install, targets)
    log_important("Installing Cassandra-Stress: done")
def prepare(self, kill_java=True):
    """Prepare every load generator in parallel, optionally killing any
    leftover Java processes first."""
    log_important("Preparing load generator: started")
    prepare_args = [(ip, kill_java) for ip in self.load_ips]
    run_parallel(self.__prepare, prepare_args)
    log_important("Preparing load generator: done")
def upload(self, file):
    """Upload the given file to every load generator in parallel."""
    log_important("Upload: started")
    upload_args = [(ip, file) for ip in self.load_ips]
    run_parallel(self.__upload, upload_args)
    log_important("Upload: done")
def install(self):
    """Install Scylla on every node of the cluster in parallel."""
    log_important("Installing Scylla: started")
    install_args = [(ip,) for ip in self.cluster_public_ips]
    run_parallel(self.__install, install_args)
    log_important("Installing Scylla: done")