    def test_info_event(self):
        event = InfoEvent(message="m1")
        event.event_id = "aff29bce-d75c-4f86-9890-c6d9c1c25d3e"
        self.assertEqual(
            str(event), "(InfoEvent Severity.NORMAL) period_type=not-set "
            "event_id=aff29bce-d75c-4f86-9890-c6d9c1c25d3e: message=m1")
        self.assertEqual(event, pickle.loads(pickle.dumps(event)))
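A minimal, self-contained sketch (not one of the examples on this page) of creating and publishing an InfoEvent, assuming the sdcm.sct_events.system module path used by recent scylla-cluster-tests:

from sdcm.sct_events.system import InfoEvent  # assumed import path

event = InfoEvent(message="m1")  # defaults to Severity.NORMAL, as the assertion above shows
event.publish()                  # emit the event through the SCT events pipeline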
Example #2
    def _run_stress(self, loader, loader_idx, cpu_idx):
        KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
        try:
            options_str = self.stress_cmd.replace('table_compare', '').strip()
            options = dict(item.strip().split("=") for item in options_str.split(";"))
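            # (illustrative note, not in the original code: for a stress_cmd such as
            #  "table_compare interval=30; src_table=ks.t1; dst_table=ks.t1_copy" the two lines
            #  above yield {'interval': '30', 'src_table': 'ks.t1', 'dst_table': 'ks.t1_copy'})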
            interval = int(options.get('interval', 20))
            src_table = options.get('src_table')
            dst_table = options.get('dst_table')

            while not self._stop_event.is_set():
                node: BaseNode = self.db_node_to_query(loader)
                node.run_nodetool('flush')

                src_size = node.get_cfstats(src_table)['Number of partitions (estimate)']
                dst_size = node.get_cfstats(dst_table)['Number of partitions (estimate)']
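                # (illustrative note, not in the original code: both values are
                #  "nodetool cfstats" partition-count estimates, so the comparison is approximate)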

                status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
                LOGGER.info(status)
                InfoEvent(status)

                if src_size == 0:
                    continue
                if dst_size >= src_size:
                    InfoEvent("== CompareTablesSizesThread: Done ==")
                    break
                time.sleep(interval)
            return None

        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            KclStressEvent.failure(node=loader, stress_cmd=self.stress_cmd, errors=[errors_str, ]).publish()
            raise
        finally:
            KclStressEvent.finish(node=loader).publish()
    def _create_repair_and_alter_it_with_repair_control(self):
        keyspace_to_be_repaired = "keyspace2"
        if not self.is_cred_file_configured:
            self.update_config_file()
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(name=self.CLUSTER_NAME + '_repair_control',
                                               db_cluster=self.db_cluster,
                                               auth_token=self.monitors.mgmt_auth_token)
        # Writing 292968720 rows, equal to the amount of data written in the prepare step (around 100 GB per node),
        # to create a large data fault and therefore a longer-running repair
        self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=True,
                                            keyspace_to_be_repaired=keyspace_to_be_repaired,
                                            total_num_of_rows=292968720)
        arg_list = [{"intensity": .0001},
                    {"intensity": 0},
                    {"parallel": 1},
                    {"intensity": 2, "parallel": 1}]

        InfoEvent(message="Repair started")
        repair_task = mgr_cluster.create_repair_task(keyspace="keyspace2")
        next_percentage_block = 20
        repair_task.wait_for_percentage(next_percentage_block)
        for args in arg_list:
            next_percentage_block += 20
            InfoEvent(message=f"Changing repair args to: {args}")
            mgr_cluster.control_repair(**args)
            repair_task.wait_for_percentage(next_percentage_block)
        repair_task.wait_and_get_final_status(step=30)
        InfoEvent(message="Repair ended")
Example #4
    def _run_stress(self, loader, loader_idx, cpu_idx):
        KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
        try:
            options_str = self.stress_cmd.replace('table_compare', '').strip()
            options = dict(item.strip().split("=")
                           for item in options_str.split(";"))
            interval = int(options.get('interval', 20))
            timeout = int(options.get('timeout', 28800))
            src_table = options.get('src_table')
            dst_table = options.get('dst_table')
            start_time = time.time()

            while not self._stop_event.is_set():
                node: BaseNode = self.db_node_to_query(loader)
                node.running_nemesis = "Compare tables size by cf-stats"
                node.run_nodetool('flush')

                dst_size = node.get_cfstats(
                    dst_table)['Number of partitions (estimate)']
                src_size = node.get_cfstats(
                    src_table)['Number of partitions (estimate)']

                node.running_nemesis = None
                elapsed_time = time.time() - start_time
                status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
                LOGGER.info(status)
                status_msg = f'[{elapsed_time}/{timeout}] {status}'
                InfoEvent(status_msg).publish()

                if src_size == 0:
                    continue
                if elapsed_time > timeout:
                    InfoEvent(
                        f"== CompareTablesSizesThread: exiting on timeout of {timeout}"
                    ).publish()
                    break
                time.sleep(interval)
            return None

        except Exception as exc:  # pylint: disable=broad-except
            errors_str = format_stress_cmd_error(exc)
            KclStressEvent.failure(node=loader,
                                   stress_cmd=self.stress_cmd,
                                   errors=[
                                       errors_str,
                                   ]).publish()
            raise
        finally:
            KclStressEvent.finish(node=loader).publish()
    def test_repair_control(self):
        InfoEvent(message="Starting C-S write load")
        self.run_prepare_write_cmd()
        InfoEvent(message="Flushing")
        for node in self.db_cluster.nodes:
            node.run_nodetool("flush")
        InfoEvent(message="Waiting for compactions to end")
        self.wait_no_compactions_running(n=90, sleep_time=30)
        InfoEvent(message="Starting C-S read load")
        stress_read_thread = self.generate_background_read_load()
        time.sleep(600)  # So we will see the base load of the cluster
        InfoEvent(message="Sleep ended - Starting tests")
        self._create_repair_and_alter_it_with_repair_control()
        load_results = stress_read_thread.get_results()
        self.log.info(f'load={load_results}')
Example #6
    def test_events_analyzer(self):
        start_events_analyzer(_registry=self.events_processes_registry)
        events_analyzer = get_events_process(
            name=EVENTS_ANALYZER_ID, _registry=self.events_processes_registry)

        time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

        try:
            self.assertIsInstance(events_analyzer, EventsAnalyzer)
            self.assertTrue(events_analyzer.is_alive())
            self.assertEqual(events_analyzer._registry,
                             self.events_main_device._registry)
            self.assertEqual(events_analyzer._registry,
                             self.events_processes_registry)

            event1 = InfoEvent(message="m1")
            event2 = SpotTerminationEvent(node="n1", message="m2")

            with unittest.mock.patch(
                    "sdcm.sct_events.events_analyzer.EventsAnalyzer.kill_test"
            ) as mock:
                with self.wait_for_n_events(events_analyzer,
                                            count=2,
                                            timeout=1):
                    self.events_main_device.publish_event(event1)
                    self.events_main_device.publish_event(event2)

            self.assertEqual(self.events_main_device.events_counter,
                             events_analyzer.events_counter)

            mock.assert_called_once()
        finally:
            events_analyzer.stop(timeout=1)
    def _repair_intensity_feature(self, fault_multiple_nodes):
        InfoEvent(message="Starting C-S write load").publish()
        self.run_prepare_write_cmd()
        InfoEvent(message="Flushing").publish()
        for node in self.db_cluster.nodes:
            node.run_nodetool("flush")
        InfoEvent(message="Waiting for compactions to end").publish()
        self.wait_no_compactions_running(n=30, sleep_time=30)
        InfoEvent(message="Starting C-S read load").publish()
        stress_read_thread = self.generate_background_read_load()
        time.sleep(600)  # So we will see the base load of the cluster
        InfoEvent(message="Sleep ended - Starting tests").publish()
        with self.subTest('test_intensity_and_parallel'):
            self.test_intensity_and_parallel(fault_multiple_nodes=fault_multiple_nodes)
        load_results = stress_read_thread.get_results()
        self.log.info(f'load={load_results}')
    def measure_nodes_space_amplification_after_write(self, dict_nodes_initial_capacity, written_data_size_gb,
                                                      start_time):
        self.log.info(f"Space amplification results after a write of: {written_data_size_gb} are:")
        dict_nodes_space_amplification = self._get_nodes_space_ampl_over_time_gb(
            dict_nodes_initial_capacity=dict_nodes_initial_capacity,
            written_data_size_gb=written_data_size_gb, start_time=start_time)
        InfoEvent(message=f"Space amplification results after a write of: {written_data_size_gb} are: "
                          f"{dict_nodes_space_amplification}").publish()
    def test_intensity_and_parallel(self, fault_multiple_nodes):
        keyspace_to_be_repaired = "keyspace2"
        InfoEvent(message='starting test_intensity_and_parallel').publish()
        if not self.is_cred_file_configured:
            self.update_config_file()
        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = manager_tool.add_cluster(
            name=self.CLUSTER_NAME + '_intensity_and_parallel',
            db_cluster=self.db_cluster,
            auth_token=self.monitors.mgmt_auth_token,
        )

        InfoEvent(message="Starting faulty load (to be repaired)").publish()
        self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                            keyspace_to_be_repaired=keyspace_to_be_repaired,
                                            total_num_of_rows=29296872)

        InfoEvent(message="Starting a repair with no intensity").publish()
        base_repair_task = mgr_cluster.create_repair_task(keyspace="keyspace*")
        base_repair_task.wait_and_get_final_status(step=30)
        assert base_repair_task.status == TaskStatus.DONE, "The base repair task did not end in the expected time"
        InfoEvent(message=f"The base repair, with no intensity argument, took {base_repair_task.duration}").publish()

        with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
            session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")

        arg_list = [{"intensity": .5},
                    {"intensity": .25},
                    {"intensity": .0001},
                    {"intensity": 2},
                    {"intensity": 4},
                    {"parallel": 1},
                    {"parallel": 2},
                    {"intensity": 2, "parallel": 1},
                    {"intensity": 100},
                    {"intensity": 0}]

        for arg_dict in arg_list:
            InfoEvent(message="Starting faulty load (to be repaired)").publish()
            self.create_missing_rows_in_cluster(create_missing_rows_in_multiple_nodes=fault_multiple_nodes,
                                                keyspace_to_be_repaired=keyspace_to_be_repaired,
                                                total_num_of_rows=29296872)

            InfoEvent(message=f"Starting a repair with {arg_dict}").publish()
            repair_task = mgr_cluster.create_repair_task(**arg_dict, keyspace="keyspace*")
            repair_task.wait_and_get_final_status(step=30)
            InfoEvent(message=f"repair with {arg_dict} took {repair_task.duration}").publish()

            with self.db_cluster.cql_connection_patient(self.db_cluster.nodes[0]) as session:
                session.execute(f"DROP KEYSPACE IF EXISTS {keyspace_to_be_repaired}")
        InfoEvent(message='finishing test_intensity_and_parallel').publish()
Example #10
    def _run_stress(self, loader, loader_idx, cpu_idx):
        KclStressEvent.start(node=loader, stress_cmd=self.stress_cmd).publish()
        try:
            src_table = self._options.get('src_table')
            dst_table = self._options.get('dst_table')
            end_time = time.time() + self._timeout

            while not self._stop_event.is_set():
                node: BaseNode = self.db_node_to_query(loader)
                node.running_nemesis = "Compare tables size by cf-stats"
                node.run_nodetool('flush')

                dst_size = node.get_cfstats(
                    dst_table)['Number of partitions (estimate)']
                src_size = node.get_cfstats(
                    src_table)['Number of partitions (estimate)']

                node.running_nemesis = None
                status = f"== CompareTablesSizesThread: dst table/src table number of partitions: {dst_size}/{src_size} =="
                LOGGER.info(status)
                InfoEvent(f'[{time.time()}/{end_time}] {status}').publish()

                if src_size == 0:
                    continue
                if time.time() > end_time:
                    InfoEvent(
                        f"== CompareTablesSizesThread: exiting on timeout of {self._timeout}"
                    ).publish()
                    break
                time.sleep(self._interval)
            return None

        except Exception as exc:  # pylint: disable=broad-except
            KclStressEvent.failure(node=loader,
                                   stress_cmd=self.stress_cmd,
                                   errors=[
                                       format_stress_cmd_error(exc),
                                   ]).publish()
            raise
        finally:
            KclStressEvent.finish(node=loader).publish()
    def test_ics_space_amplification_goal(self):  # pylint: disable=too-many-locals
        """
        (1) writing new data. wait for compactions to finish.
        (2) over-writing existing data.
        (3) measure space amplification after over-writing with SAG=None,1.5,1.2,1.5,None
        """

        self._set_enforce_min_threshold_true()
        # (1) writing new data.
        prepare_write_cmd = self.params.get('prepare_write_cmd')
        InfoEvent(message=f"Starting C-S prepare load: {prepare_write_cmd}").publish()
        self.run_prepare_write_cmd()
        InfoEvent(message="Wait for compactions to finish after write is done.").publish()
        self.wait_no_compactions_running()

        stress_cmd = self.params.get('stress_cmd')
        sag_testing_values = [None, '1.5', '1.2', '1.5', None]
        column_size = 205
        num_of_columns = 5
        # the number below is 1 TB (the yaml stress cmd's total write) in bytes / 205 (column_size) / 5 (num_of_columns)
        overwrite_ops_num = 1072694271
        total_data_to_overwrite_gb = round(overwrite_ops_num * column_size * num_of_columns / (1024 ** 3), 2)
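        # (illustrative check, not in the original code: 1 TiB = 1024**4 = 1099511627776 bytes;
        #  1099511627776 / 205 / 5 ~= 1072694271 ops, and 1072694271 * 205 * 5 / 1024**3 ~= 1024.0,
        #  so total_data_to_overwrite_gb comes out to roughly 1 TiB expressed in GB)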
        min_threshold = '4'

        # (2) over-writing existing data.
        for sag in sag_testing_values:
            dict_nodes_capacity_before_overwrite_data = self._get_nodes_used_capacity()
            InfoEvent(
                message=f"Nodes used capacity before start overwriting data:"
                        f" {dict_nodes_capacity_before_overwrite_data}").publish()
            additional_compaction_params = {'min_threshold': min_threshold}
            if sag:
                additional_compaction_params.update({'space_amplification_goal': sag})
            # (3) Altering compaction with SAG=None,1.5,1.2,1.5,None
            self._alter_table_compaction(additional_compaction_params=additional_compaction_params)
            stress_queue = list()
            InfoEvent(message=f"Starting C-S over-write load: {stress_cmd}").publish()

            start_time = time.time()
            params = {'keyspace_num': 1, 'stress_cmd': stress_cmd,
                      'round_robin': self.params.get('round_robin')}
            self._run_all_stress_cmds(stress_queue, params)

            for stress in stress_queue:
                self.verify_stress_thread(cs_thread_pool=stress)

            InfoEvent(message="Wait for compactions to finish after over-write is done.").publish()
            self.wait_no_compactions_running()
            # (3) measure space amplification for the re-written data
            self.measure_nodes_space_amplification_after_write(
                dict_nodes_initial_capacity=dict_nodes_capacity_before_overwrite_data,
                written_data_size_gb=total_data_to_overwrite_gb, start_time=start_time)

        InfoEvent(message=f"Space-amplification-goal testing cycles are done.").publish()
    def _alter_table_compaction(self, compaction_strategy=CompactionStrategy.INCREMENTAL, table_name='standard1',
                                keyspace_name='keyspace1',
                                additional_compaction_params: dict = None):
        """
         Alters table compaction like: ALTER TABLE mykeyspace.mytable WITH
                                        compaction = {'class' : 'IncrementalCompactionStrategy'}
        """

        base_query = f"ALTER TABLE {keyspace_name}.{table_name} WITH compaction = "
        dict_requested_compaction = {'class': compaction_strategy.value}
        if additional_compaction_params:
            dict_requested_compaction.update(additional_compaction_params)

        full_alter_query = base_query + str(dict_requested_compaction)
        LOGGER.debug(f"Alter table query is: {full_alter_query}")
        node1: BaseNode = self.db_cluster.nodes[0]
        node1.run_cqlsh(cmd=full_alter_query)
        InfoEvent(message=f"Altered table by: {full_alter_query}").publish()
    def test_latency(self):
        """
        Test steps:

        1. Prepare the cluster with data (reach a steady state of compactions and ~x10 the RAM capacity).
           With round_robin and a list of stress_cmd, the data will load several times faster.
        2. Run WRITE workload with gauss population.
        """
        self.run_pre_create_keyspace()
        self.run_fstrim_on_all_db_nodes()
        self.preload_data()

        for workload in self.ycsb_workloads:
            self.wait_no_compactions_running()
            self.run_fstrim_on_all_db_nodes()
            InfoEvent(message="Starting YCSB %s (%s)" %
                      (workload.name, workload.detailed_name)).publish()
            self.run_workload(stress_cmd=self._create_stress_cmd(workload),
                              sub_type=workload.sub_type)
    def info_event(self) -> Generator[InfoEvent, None, None]:
        yield InfoEvent(message="This is a mock InfoEvent")
    def test_info_event(self):
        event = InfoEvent(message="m1")
        self.assertEqual(str(event), "(InfoEvent Severity.NORMAL): message=m1")
        self.assertEqual(event, pickle.loads(pickle.dumps(event)))