Example No. 1
    def stress_seq_range(self, row_count, command_part1, command_part2):
        load_ip_count = len(self.load_ips)
        row_count_per_ip = row_count // load_ip_count

        # split [1..row_count] into contiguous, non-overlapping ranges, one per
        # load generator; the last range absorbs the remainder.
        population_commands = []
        start = 1
        for i in range(load_ip_count):
            end = row_count if i == load_ip_count - 1 else start + row_count_per_ip - 1
            population_commands.append(f' n={end - start + 1} -pop seq={start}..{end} ')
            start = end + 1

        log(population_commands)

        log_important("Cassandra-Stress: started")
        run_parallel(
            self.__stress,
            [(ip, 10 if i > 0 else 0,
              command_part1 + pop_command + command_part2)
             for i, (ip, pop_command
                     ) in enumerate(zip(self.load_ips, population_commands))])
        log_important("Cassandra-Stress: done")
Example No. 2
    def exec(self, cql):
        """
        Executes a CQL command.

        Parameters
        ----------
        cql: str
            The CQL command
        """

        if not self.started:
            self.wait_for_cql_start()
            self.started = True

        script_name = str(uuid.uuid4()) + ".cql"
        log_important(f"cqlsh exec: [{cql}]")
        ssh = self.__new_ssh(self.ip)
        ssh.exec(f"touch {script_name}")
        ssh.exec(f"echo \"{cql}\" > {script_name}")
        cmd = "cqlsh "
        if self.username:
            cmd += f"-u {self.username} "
        if self.password:
            cmd += f"-p {self.password} "
        cmd += f"-f {script_name}"
        ssh.exec(cmd)
        ssh.exec(f"rm {script_name}")
        log_important(f"cqlsh done")
Example No. 3
 def wait_for_cql_start(self,
                        timeout=7200,
                        connect_timeout=10,
                        max_tries_per_second=2):
     log_important(f"cql: wait for start")
     wait_for_cql_start(self.ip, timeout, connect_timeout,
                        max_tries_per_second)
     log_important(f"cqlsh: running")
Example No. 4
 def run(self, command):
     log_important(
         f'Disk Explorer run: started [{datetime.now().strftime("%H:%M:%S")}]'
     )
     log(f"python3 diskplorer.py {command}")
     run_parallel(self.__run, [(ip, command) for ip in self.ips])
     log_important(
         f'Disk Explorer run: done [{datetime.now().strftime("%H:%M:%S")}]')
Example No. 5
 def start(self):
     log_important(f"Starting Cassandra nodes {self.cluster_public_ips}")
     for public_ip in self.cluster_public_ips:
         self.__start(public_ip)
         wait_for_cql_start(public_ip)
          log_machine(public_ip, "Node finished bootstrapping")
         self.__start_exporter(public_ip)
     log_important(
         f"Starting Cassandra nodes {self.cluster_public_ips}: done")
Example No. 6
 def stress(self, command, load_index=None):
     if load_index is None:
         log_important("scylla-bench: started")
         run_parallel(self.__stress,
                      [(ip, command) for ip in self.load_ips])
         log_important("scylla-bench: done")
     else:
         log("using load_index " + str(load_index))
         self.__stress(self.load_ips[load_index], command)
Example No. 7
 def stress(self, command, load_index=None):
     if load_index is None:
         log_important("Cassandra-Stress: started")
         run_parallel(self.__stress,
                      [(ip, 10 if i > 0 else 0, command)
                       for i, ip in enumerate(self.load_ips)])
         log_important("Cassandra-Stress: done")
     else:
         self.__stress(self.load_ips[load_index], 0, command)
Example No. 8
def restart_cluster(cluster_public_ips, cluster_user, ssh_options, duration_seconds=90):
    log_important("Restart cluster ")
    pssh = PSSH(cluster_public_ips, cluster_user, ssh_options)
    log("nodetool drain")
    pssh.exec("nodetool drain")
    log("sudo systemctl restart scylla-server")
    pssh.exec("sudo systemctl restart scylla-server")
    log(f"Waiting {duration_seconds} seconds")
    sleep(duration_seconds)
    log_important("Cluster restarted")
Example No. 9
 def start(self):
     log_important("Prometheus start: started")
     ssh = SSH(self.ip, self.user, self.ssh_options)
      ssh.exec(
          f"""
          mkdir -p data
          dir=$(find . -maxdepth 1 -type d -name "scylla-monitoring*" -print -quit)
          cd "$dir"
          ./start-all.sh -v {self.scylla_version} -d ../data
          """)
     log_important("Prometheus start: done")
Example No. 10
 def stop(self):
     log_important("Prometheus stop: started")
     ssh = SSH(self.ip, self.user, self.ssh_options)
      ssh.exec(
          """
          dir=$(find . -maxdepth 1 -type d -name "scylla-monitoring*" -print -quit)
          echo "directory [$dir]"
          cd "$dir"
          ./kill-all.sh
          """)
     log_important("Prometheus stop: done")
Example No. 11
    def __install_perf(self):
        log_important("Perf install: started")
        pssh = self.__pssh()

        if not self.updated:
            pssh.update()
            self.updated = True

        # Brittle: this should not depend on a particular kernel version.
        pssh.install_one("perf", "linux-tools-5.4.0-1035-aws")
        log_important("Perf install: done")
Example No. 12
 def exec(self, command):
     """
     Returns the perf command on the remote machine.
     The command needs to be the full command like 'sudo perf record ...'
     """
     log_important(f"Perf: started")
     log(command)
     pssh = PSSH(self.ip_list, self.user, self.ssh_options)
     pssh.exec(f"""
             cd /tmp
             {command}
             """)
     log_important(f"Perf: done")
Example No. 13
    def __install_scylla_debuginfo(self):
        """
        Install scylla debug info
        """
        pssh = self.__pssh()

        if not self.updated:
            pssh.update()
            self.updated = True

        log_important("Installing debuginfo: started")
        pssh.try_install("scylla_debuginfo")
        pssh.try_install("scylla-server-dbg")
        log_important("Installing debuginfo: done")
Example No. 14
def cli():
    parser = argparse.ArgumentParser()
    parser.add_argument("dir", help="The directory containing the prometheus data", nargs=1)
    args = parser.parse_args()

    old_wd = os.getcwd()
    data_dir = args.dir[0]

    if not os.path.isabs(data_dir):
        data_dir = os.path.join(old_wd, data_dir)

    if not os.path.exists(data_dir):
        print(f"[{data_dir}] does not exist!")
        exit(1)

    if not os.path.isdir(data_dir):
        print(f"[{data_dir}] is not a directory!")
        exit(1)

    scylla_monitoring_path = os.environ.get('SCYLLA_MONITORING')
    if not scylla_monitoring_path:
        print("The SCYLLA_MONITORING environment variable is not set!")
        exit(1)

    os.chdir(scylla_monitoring_path)

    util.log_important("Killing Scylla Monitoring: starting")
    os.system("./kill-all.sh")
    util.log_important("Killing Scylla Monitoring: done")

    util.log_important("Starting Scylla Monitoring: started")
    os.system(f"./start-all.sh -d {data_dir} -s prometheus/scylla_servers.example.yml")
    util.log_important("Starting Scylla Monitoring: done")

    os.chdir(old_wd)
Example No. 15
    def stop(self, load_index=None, erase_data=False):
        if load_index is None:
            log_important("Stop Cassandra: started")
            run_parallel(self.__stop,
                         [(ip, ) for ip in self.cluster_public_ips])
            log_important("Stop Cassandra: done")
        else:
            self.__stop(self.cluster_public_ips[load_index])

            if erase_data:
                ssh = self.__new_ssh(self.cluster_public_ips[load_index])
                path_prefix = 'cassandra-raid/' if self.setup_raid else './'
                ssh.exec(
                    f"rm -rf {path_prefix}apache-cassandra-{self.cassandra_version}/data"
                )
Example No. 16
def clear_cluster(cluster_public_ips, cluster_user, ssh_options, duration_seconds=90):
    log_important("Shutting down cluster and removing all data")
    pssh = PSSH(cluster_public_ips, cluster_user, ssh_options)
    # pssh.exec("nodetool flush")
    log("Stopping scylla")
    pssh.exec("sudo systemctl stop scylla-server")
    log("Removing data dir")
    pssh.exec("sudo rm -fr /var/lib/scylla/data/*")
    log("Removing commit log")
    pssh.exec("sudo rm -fr /var/lib/scylla/commitlog/*")
    log("Starting scylla")
    pssh.exec("sudo systemctl start scylla-server")
    log(f"Waiting {duration_seconds} seconds")
    sleep(duration_seconds)
    log_important("Cluster cleared and restarted")
Example No. 17
    def __trim_recursively(self, dir):
        if self.warmup_seconds is None and self.cooldown_seconds is None:
            return

        log_important("HdrLogProcessor.trim_recursively: started")

        for hdr_file in glob.iglob(dir + '/*/*.hdr', recursive=True):
            filename = os.path.basename(hdr_file)
            if filename.startswith("trimmed_"):
                continue

            log(hdr_file)
            self.__trim(hdr_file)

        log_important("HdrLogProcessor.trim_recursively")
Example No. 18
 def collect_flamegraph(self,
                        dir,
                        data_file="perf.data",
                        flamegraph_file="flamegraph.svg"):
     """
     Collect the remotely created flame-graphs
     """
     log_important(f"Perf collecting flamegraph: started")
     pssh = PSSH(self.ip_list, self.user, self.ssh_options)
     # --no-online
     pssh.exec(f"""
             cd /tmp
             sudo perf script -i {data_file} | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl --hash > {flamegraph_file}
             """)
     pssh.scp_from_remote(f"/tmp/{flamegraph_file}", dir)
     pssh.exec(f"rm /tmp/{flamegraph_file}")
     log_important(f"Perf collecting flamegraph: done")
Example No. 19
    def collect_results(self, dir, warmup_seconds=None, cooldown_seconds=None):
        """
        Parameters
        ----------
        dir: str
            The download directory.
        warmup_seconds : str
            The warmup period in seconds. If set, additional files are
            created with the warmup period trimmed.
        cooldown_seconds : str
            The cooldown period in seconds. If set, additional files are
            created with the cooldown period trimmed.
        """

        log_important(f"Collecting results: started")
        run_parallel(self.__collect, [(ip, dir) for ip in self.load_ips])

        p = HdrLogProcessor(self.properties,
                            warmup_seconds=warmup_seconds,
                            cooldown_seconds=cooldown_seconds)
        p.process(dir)

        log_important(f"Collecting results: done")
        log(f"Results can be found in [{dir}]")
Example No. 20
    def insert(self,
               partition_count,
               nodes,
               partition_offset=0,
               concurrency=64,
               clustering_row_count=1,
               extra_args=""):
        log_important(f"Inserting {partition_count} partitions")
        start_seconds = time.time()

        # split the partitions across the load generators; the last one also
        # gets the remainder so that no partitions are lost.
        pc_per_lg = partition_count // len(self.load_ips)
        remainder = partition_count % len(self.load_ips)

        cmd_list = []
        for i in range(0, len(self.load_ips)):
            pc = pc_per_lg + (remainder if i == len(self.load_ips) - 1 else 0)
            cmd = f"""-workload sequential
                      -clustering-row-count {clustering_row_count}
                      -mode write
                      -partition-count {pc}
                      -partition-offset {partition_offset}
                      -nodes {nodes}
                      -concurrency {concurrency}
                      {extra_args}"""
            # collapse the multi-line string into a single line
            cmd = " ".join(cmd.split())
            cmd_list.append(cmd)
            partition_offset = partition_offset + pc

        futures = []
        for i in range(0, len(self.load_ips)):
            f = self.async_stress(cmd_list[i], load_index=i)
            futures.append(f)
            if i == 0:
                # first one is given some extra time to set up the tables and all that.
                time.sleep(10)

        for f in futures:
            f.join()

        duration_seconds = time.time() - start_seconds
        log(f"Duration : {duration_seconds} seconds")
        log(f"Insertion rate: {partition_count // duration_seconds} items/second"
            )
        log_important(f"Inserting {partition_count} partitions: done")
Example No. 21
 def install(self):
     log_important("Installing Cassandra: started")
     if self.setup_raid:
         log_important("Installing Cassandra: setting up RAID")
         raid = RAID(self.cluster_public_ips, self.ssh_user, '/dev/nvme*n1',
                     'cassandra-raid', 0, self.properties)
         raid.install()
         log_important("Installing Cassandra: finished setting up RAID")
     run_parallel(self.__install,
                  [(ip, ) for ip in self.cluster_public_ips])
     log_important("Installing Cassandra: done")
Example No. 22
    def __install_flamegraph(self):
        log_important("Perf install flamegraph: started")
        pssh = self.__pssh()

        if not self.updated:
            pssh.update()
            self.updated = True

        pssh.install("git")
        # needed for addr2line
        pssh.install("binutils")
        pssh.exec(f"""
                cd /tmp
                if [ ! -d FlameGraph ]; then
                    echo "cloning flamegraph"
                    git clone https://github.com/brendangregg/FlameGraph
                fi
                """)
        log_important("Perf install flamegraph: done")
Example No. 23
    def collect_results(self, dir, warmup_seconds=None, cooldown_seconds=None):
        """
        Parameters
        ----------
        dir: str
            The download directory.
        warmup_seconds : str
            The warmup period in seconds. If set, additional files are
            created with the warmup period trimmed.
        cooldown_seconds : str
            The cooldown period in seconds. If set, additional files are
            created with the cooldown period trimmed.
        """

        log_important(f"Collecting results: started")
        run_parallel(self.__collect, [(ip, dir) for ip in self.load_ips])
        p = HdrLogProcessor(self.properties,
                            warmup_seconds=warmup_seconds,
                            cooldown_seconds=cooldown_seconds)
        p.process(dir)
        log_important(f"Collecting results: done")
        log(f"Results can be found in [{dir}]")
Example No. 24
    def insert(self,
               profile,
               item_count,
               nodes,
               mode="native cql3",
               rate="threads=100",
               sequence_start=None):
        log_important(f"Inserting {item_count} items")
        start_seconds = time.time()

        # split the items across the load generators; the last one also gets
        # the remainder so that no items are lost.
        per_load_generator = item_count // len(self.load_ips)
        remainder = item_count % len(self.load_ips)
        start = 1 if sequence_start is None else sequence_start

        cmd_list = []
        for i in range(0, len(self.load_ips)):
            n = per_load_generator + (remainder if i == len(self.load_ips) - 1 else 0)
            end = start + n - 1
            cmd = f'user profile={profile} "ops(insert=1)" n={n} no-warmup -pop seq={start}..{end} -mode {mode} -rate {rate} -node {nodes}'
            log(self.load_ips[i] + " " + cmd)
            cmd_list.append(cmd)
            start = end + 1

        futures = []
        for i in range(0, len(self.load_ips)):
            f = self.async_stress(cmd_list[i], load_index=i)
            futures.append(f)
            if i == 0:
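                # give the first generator time to create the schema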
                time.sleep(10)

        for f in futures:
            f.join()

        duration_seconds = time.time() - start_seconds
        log(f"Duration : {duration_seconds} seconds")
        log(f"Insertion rate: {item_count // duration_seconds} items/second")
        log_important(f"Inserting {item_count} items: done")
Example No. 25
    def __merge_recursively(self, dir):
        log_important("HdrLogProcessor.merge_recursively: started")
        log(dir)
        # TODO: be careful not to merge an already-merged file.
        # group the hdr files by their base file name
        files_map = {}
        for hdr_file in glob.iglob(dir + '/*/*.hdr', recursive=True):
            log(hdr_file)
            base = os.path.splitext(os.path.basename(hdr_file))[0]
            files_map.setdefault(base, []).append(hdr_file)

        for name, files in files_map.items():
            input_args = " ".join("-ifp " + file for file in files)
            cmd = f'{self.java_path} -cp {self.lib_dir}/processor.jar CommandDispatcherMain union {input_args} -of {dir}/{name}.hdr'
            log(cmd)
            os.system(cmd)

        log_important("HdrLogProcessor.merge_recursively")
Example No. 26
def nodes_start(cluster_user, ssh_options, *public_ips):
    log_important(f"Starting nodes {public_ips}")
    pssh = PSSH(public_ips, cluster_user, ssh_options)
    pssh.exec("sudo systemctl start scylla-server")
    log_important(f"Starting nodes {public_ips}: done")
Example No. 27
 def install(self):
     log_important("Installing Cassandra-Stress: started")
     run_parallel(self.__install, [(ip, ) for ip in self.load_ips])
     log_important("Installing Cassandra-Stress: done")
Example No. 28
 def prepare(self, kill_java=True):
     log_important(f"Preparing load generator: started")
     run_parallel(self.__prepare, [(ip, kill_java) for ip in self.load_ips])
     log_important(f"Preparing load generator: done")
Example No. 29
 def upload(self, file):
     log_important(f"Upload: started")
     run_parallel(self.__upload, [(ip, file) for ip in self.load_ips])
     log_important(f"Upload: done")
Example No. 30
 def install(self):
     log_important("Installing Scylla: started")
     run_parallel(self.__install, [(ip,) for ip in self.cluster_public_ips])
     log_important("Installing Scylla: done")