예제 #1
0
    def inner(self, *args, **kwargs):
        """Wrap an upgrade step with truncate-entries validation.

        Before running the wrapped ``func``, when truncate-entry checking is
        enabled, truncate the helper tables and validate the truncation
        entries against the base version.  After the wrapped call, query the
        node's new Scylla version and re-validate / re-populate the tables.

        :param args: positional arguments of the wrapped function; the first
            one must be the target db node.
        :returns: whatever the wrapped ``func`` returns.
        """
        node = args[0]
        if self.truncate_entries_flag:
            # Perform validation of truncate entries in case the new version
            # is 3.1 or more.  Only open-source Scylla >= 3.1 records the
            # truncation entries in a system table; enterprise is excluded.
            base_version = self.params.get('scylla_version', default='')
            system_truncated = bool(
                parse_version(base_version) >= parse_version('3.1')
                and not is_enterprise(base_version))
            with self.cql_connection_patient(
                    node, keyspace='truncate_ks') as session:
                self.cql_truncate_simple_tables(session=session,
                                                rows=self.insert_rows)
                self.validate_truncated_entries_for_table(
                    session=session, system_truncated=system_truncated)

        func_result = func(self, *args, **kwargs)

        result = node.remoter.run('scylla --version')
        # Strip the output: 'scylla --version' ends with a newline, which
        # would otherwise be fed into parse_version() and can break parsing.
        new_version = result.stdout.strip()
        if new_version and parse_version(new_version) >= parse_version('3.1'):
            # re-new connection: the old session may have been invalidated by
            # the node restart during the upgrade
            with self.cql_connection_patient(
                    node, keyspace='truncate_ks') as session:
                self.validate_truncated_entries_for_table(
                    session=session, system_truncated=True)
                self.read_data_from_truncated_tables(session=session)
                self.cql_insert_data_to_simple_tables(session=session,
                                                      rows=self.insert_rows)
        return func_result
    def get_version_list(self):
        """
        Return all supported release versions, and the supported base versions
        of the upgrade test.

        :returns: tuple of (supported_versions, version_list) — the filtered
            release version prefixes, and the upgrade base versions derived
            from them.
        """
        supported_versions = []

        # Filter out the unsupported versions
        for version_prefix in self.repo_maps:
            # can't find the major version from the version_prefix string
            # (the dot is escaped: the previous r'\d+.\d+' pattern let '.'
            # match any character, so e.g. '4x5' would have slipped through)
            if not re.match(r'\d+\.\d+', version_prefix):
                continue
            # OSS: the major version is smaller than the start support version
            if self.oss_start_support_version and not is_enterprise(version_prefix) and \
                    parse_version(version_prefix) < parse_version(self.oss_start_support_version):
                continue
            # Enterprise: the major version is smaller than the start support version
            if self.ent_start_support_version and is_enterprise(version_prefix) and \
                    parse_version(version_prefix) < parse_version(self.ent_start_support_version):
                continue
            supported_versions.append(version_prefix)
        version_list = self.get_supported_scylla_base_versions(
            supported_versions)
        return supported_versions, version_list
 def get_product_and_version(self, scylla_version: str = None):
     """
     Return the scylla product name and major version.

     When *scylla_version* is not given, both values are derived from the
     ``unstable/`` part of ``self.scylla_repo``.
     """
     if scylla_version is not None:
         # Explicit version: derive the product name from the version string.
         product = 'scylla-enterprise' if is_enterprise(scylla_version) else 'scylla'
         return product, scylla_version

     # No version given: parse '<product>/<branch>' out of the repo url.
     assert 'unstable/' in self.scylla_repo
     unstable_part = self.scylla_repo.split('unstable/')[1]
     product, version = unstable_part.split('/')[0:2]
     # Drop the branch prefixes to keep only the major version.
     for prefix in ('branch-', 'enterprise-'):
         version = version.replace(prefix, '')
     return product, version
예제 #4
0
 def get_product_and_version(self, scylla_version: str = None) -> tuple[str, str]:
     """
     Return scylla product name and major version. if scylla_version isn't assigned,
     we will try to get the major version from the scylla_repo url.

     :param scylla_version: optional explicit version; when omitted, the
         version is parsed from ``self.scylla_repo``.
     :returns: tuple of (product, scylla_version).
     """
     LOGGER.info("Getting scylla product and major version for upgrade versions listing...")
     if scylla_version is None:
         assert 'unstable/' in self.scylla_repo, "Did not find 'unstable/' in scylla_repo. " \
                                                 "Scylla repo: %s" % self.scylla_repo
         product, version = self.scylla_repo.split('unstable/')[1].split('/')[0:2]
         scylla_version = version.replace('branch-', '').replace('enterprise-', '')
     else:
         product = 'scylla-enterprise' if is_enterprise(scylla_version) else 'scylla'
     # Log scylla_version, not the raw 'version' token: 'version' is only
     # bound in the repo-parsing branch, so logging it raised
     # UnboundLocalError whenever scylla_version was passed in explicitly.
     LOGGER.info("Scylla product and major version used for upgrade versions listing: %s, %s",
                 product, scylla_version)
     return product, scylla_version
예제 #5
0
    def test_rolling_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
        """
        Upgrade half of nodes in the cluster, and start special read workload
        during the stage. Checksum method is changed to xxhash from Scylla 2.2,
        we want to use this case to verify the read (cl=ALL) workload works
        well, upgrade all nodes to new version in the end.
        """

        # In case the target version >= 3.1 we need to perform test for truncate entries
        target_upgrade_version = self.params.get('target_upgrade_version',
                                                 default='')
        self.truncate_entries_flag = False
        # Truncate-entry validation applies only to open-source Scylla >= 3.1.
        if target_upgrade_version and parse_version(target_upgrade_version) >= parse_version('3.1') and \
                not is_enterprise(target_upgrade_version):
            self.truncate_entries_flag = True

        with self.subTest('pre-test - prepare test keyspaces and tables'):
            # prepare test keyspaces and tables before upgrade to avoid schema change during mixed cluster.
            self.prepare_keyspaces_and_tables()
            self.fill_and_verify_db_data('BEFORE UPGRADE', pre_fill=True)

            # write workload during entire test
            self.log.info('Starting c-s write workload during entire test')
            write_stress_during_entire_test = self.params.get(
                'write_stress_during_entire_test')
            entire_write_cs_thread_pool = self.run_stress_thread(
                stress_cmd=write_stress_during_entire_test)

            # Prepare keyspace and tables for truncate test
            if self.truncate_entries_flag:
                self.insert_rows = 10
                self.fill_db_data_for_truncate_test(
                    insert_rows=self.insert_rows)

        with self.subTest('pre-test - Run stress workload before upgrade'):
            # complex workload: prepare write
            self.log.info('Starting c-s complex workload (5M) to prepare data')
            stress_cmd_complex_prepare = self.params.get(
                'stress_cmd_complex_prepare')
            complex_cs_thread_pool = self.run_stress_thread(
                stress_cmd=stress_cmd_complex_prepare,
                profile='data_dir/complex_schema.yaml')

            # wait for the complex workload to finish
            self.verify_stress_thread(complex_cs_thread_pool)

            # generate random order to upgrade
            nodes_num = len(self.db_cluster.nodes)
            # prepare an array containing the indexes
            indexes = list(range(nodes_num))
            # shuffle it so we will upgrade the nodes in a
            # random order
            random.shuffle(indexes)

            # prepare write workload
            self.log.info('Starting c-s prepare write workload (n=10000000)')
            prepare_write_stress = self.params.get('prepare_write_stress')
            prepare_write_cs_thread_pool = self.run_stress_thread(
                stress_cmd=prepare_write_stress)
            self.log.info(
                'Sleeping for 60s to let cassandra-stress start before the upgrade...'
            )
            time.sleep(60)

        # Schema-propagation errors are expected while the cluster runs mixed
        # versions, so they are filtered for the duration of the upgrade steps.
        with DbEventsFilter(type='DATABASE_ERROR', line='Failed to load schema'), \
                DbEventsFilter(type='SCHEMA_FAILURE', line='Failed to load schema'), \
                DbEventsFilter(type='DATABASE_ERROR', line='Failed to pull schema'), \
                DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'):

            step = 'Step1 - Upgrade First Node '
            with self.subTest(step):
                # upgrade first node
                self.db_cluster.node_to_upgrade = self.db_cluster.nodes[
                    indexes[0]]
                self.log.info('Upgrade Node %s begin',
                              self.db_cluster.node_to_upgrade.name)
                self.upgrade_node(self.db_cluster.node_to_upgrade)
                self.log.info('Upgrade Node %s ended',
                              self.db_cluster.node_to_upgrade.name)
                self.db_cluster.node_to_upgrade.check_node_health()

                # wait for the prepare write workload to finish
                self.verify_stress_thread(prepare_write_cs_thread_pool)

                # read workload (cl=QUORUM)
                self.log.info(
                    'Starting c-s read workload (cl=QUORUM n=10000000)')
                stress_cmd_read_cl_quorum = self.params.get(
                    'stress_cmd_read_cl_quorum')
                read_stress_queue = self.run_stress_thread(
                    stress_cmd=stress_cmd_read_cl_quorum)
                # wait for the read workload to finish
                self.verify_stress_thread(read_stress_queue)
                self.fill_and_verify_db_data('after upgraded one node')
                self.search_for_idx_token_error_after_upgrade(
                    node=self.db_cluster.node_to_upgrade,
                    step=step + ' - after upgraded one node')

                # read workload
                self.log.info('Starting c-s read workload for 10m')
                stress_cmd_read_10m = self.params.get('stress_cmd_read_10m')
                read_10m_cs_thread_pool = self.run_stress_thread(
                    stress_cmd=stress_cmd_read_10m)

                self.log.info(
                    'Sleeping for 60s to let cassandra-stress start before the upgrade...'
                )
                time.sleep(60)

            step = 'Step2 - Upgrade Second Node '
            with self.subTest(step):
                # upgrade second node
                self.db_cluster.node_to_upgrade = self.db_cluster.nodes[
                    indexes[1]]
                self.log.info('Upgrade Node %s begin',
                              self.db_cluster.node_to_upgrade.name)
                self.upgrade_node(self.db_cluster.node_to_upgrade)
                self.log.info('Upgrade Node %s ended',
                              self.db_cluster.node_to_upgrade.name)
                self.db_cluster.node_to_upgrade.check_node_health()

                # wait for the 10m read workload to finish
                self.verify_stress_thread(read_10m_cs_thread_pool)
                self.fill_and_verify_db_data('after upgraded two nodes')
                self.search_for_idx_token_error_after_upgrade(
                    node=self.db_cluster.node_to_upgrade,
                    step=step + ' - after upgraded two nodes')

                # read workload (60m)
                self.log.info('Starting c-s read workload for 60m')
                stress_cmd_read_60m = self.params.get('stress_cmd_read_60m')
                read_60m_cs_thread_pool = self.run_stress_thread(
                    stress_cmd=stress_cmd_read_60m)
                self.log.info(
                    'Sleeping for 60s to let cassandra-stress start before the rollback...'
                )
                time.sleep(60)

            with self.subTest('Step3 - Rollback Second Node '):
                # rollback second node
                self.log.info('Rollback Node %s begin',
                              self.db_cluster.nodes[indexes[1]].name)
                self.rollback_node(self.db_cluster.nodes[indexes[1]])
                self.log.info('Rollback Node %s ended',
                              self.db_cluster.nodes[indexes[1]].name)
                self.db_cluster.nodes[indexes[1]].check_node_health()

        step = 'Step4 - Verify data during mixed cluster mode '
        with self.subTest(step):
            self.fill_and_verify_db_data('after rollback the second node')
            self.log.info('Repair the first upgraded Node')
            self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair')
            self.search_for_idx_token_error_after_upgrade(
                node=self.db_cluster.node_to_upgrade, step=step)

        # Same schema-propagation filters as above for the remaining upgrades.
        with DbEventsFilter(type='DATABASE_ERROR', line='Failed to load schema'), \
                DbEventsFilter(type='SCHEMA_FAILURE', line='Failed to load schema'), \
                DbEventsFilter(type='DATABASE_ERROR', line='Failed to pull schema'), \
                DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'):

            step = 'Step5 - Upgrade rest of the Nodes '
            with self.subTest(step):
                # indexes[0] was already upgraded in Step1; indexes[1] was
                # rolled back in Step3, so it gets upgraded again here.
                for i in indexes[1:]:
                    self.db_cluster.node_to_upgrade = self.db_cluster.nodes[i]
                    self.log.info('Upgrade Node %s begin',
                                  self.db_cluster.node_to_upgrade.name)
                    self.upgrade_node(self.db_cluster.node_to_upgrade)
                    self.log.info('Upgrade Node %s ended',
                                  self.db_cluster.node_to_upgrade.name)
                    self.db_cluster.node_to_upgrade.check_node_health()
                    self.fill_and_verify_db_data(
                        'after upgraded %s' %
                        self.db_cluster.node_to_upgrade.name)
                    self.search_for_idx_token_error_after_upgrade(
                        node=self.db_cluster.node_to_upgrade, step=step)

        with self.subTest('Step6 - Verify stress results after upgrade '):
            self.log.info(
                'Waiting for stress threads to complete after upgrade')
            # wait for the 60m read workload to finish
            self.verify_stress_thread(read_60m_cs_thread_pool)

            self.verify_stress_thread(entire_write_cs_thread_pool)

        with self.subTest(
                'Step7 - Upgrade sstables to latest supported version '):
            # figure out what is the last supported sstable version
            self.expected_sstable_format_version = self.get_highest_supported_sstable_version(
            )

            # run 'nodetool upgradesstables' on all nodes and check/wait for all file to be upgraded
            upgradesstables = self.db_cluster.run_func_parallel(
                func=self.upgradesstables_if_command_available)

            # only check sstable format version if all nodes had 'nodetool upgradesstables' available
            if all(upgradesstables):
                self.log.info('Upgrading sstables if new version is available')
                tables_upgraded = self.db_cluster.run_func_parallel(
                    func=self.wait_for_sstable_upgrade)
                assert all(tables_upgraded
                           ), "Failed to upgrade the sstable format {}".format(
                               tables_upgraded)

            # Verify sstabledump
            self.log.info(
                'Starting sstabledump to verify correctness of sstables')
            # Dump only the first data file found (head -n 1); a failure makes
            # the remote command exit non-zero.
            self.db_cluster.nodes[0].remoter.run(
                'for i in `sudo find /var/lib/scylla/data/keyspace_complex/ -type f |grep -v manifest.json |'
                'grep -v snapshots |head -n 1`; do echo $i; sudo sstabledump $i 1>/tmp/sstabledump.output || '
                'exit 1; done',
                verbose=True)

        with self.subTest(
                'Step8 - Run stress and verify after upgrading entire cluster '
        ):
            self.log.info('Starting verify_stress_after_cluster_upgrade')
            verify_stress_after_cluster_upgrade = self.params.get(  # pylint: disable=invalid-name
                'verify_stress_after_cluster_upgrade')
            verify_stress_cs_thread_pool = self.run_stress_thread(
                stress_cmd=verify_stress_after_cluster_upgrade)
            self.verify_stress_thread(verify_stress_cs_thread_pool)

            # complex workload: verify data by simple read cl=ALL
            self.log.info(
                'Starting c-s complex workload to verify data by simple read')
            stress_cmd_complex_verify_read = self.params.get(
                'stress_cmd_complex_verify_read')
            complex_cs_thread_pool = self.run_stress_thread(
                stress_cmd=stress_cmd_complex_verify_read,
                profile='data_dir/complex_schema.yaml')
            # wait for the read complex workload to finish
            self.verify_stress_thread(complex_cs_thread_pool)

            # After adjusted the workloads, there is a entire write workload, and it uses a fixed duration for catching
            # the data lose.
            # But the execute time of workloads are not exact, so let only use basic prepare write & read verify for
            # complex workloads,and comment two complex workloads.
            #
            # TODO: retest commented workloads and decide to enable or delete them.
            #
            # complex workload: verify data by multiple ops
            # self.log.info('Starting c-s complex workload to verify data by multiple ops')
            # stress_cmd_complex_verify_more = self.params.get('stress_cmd_complex_verify_more')
            # complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_verify_more,
            #                                                profile='data_dir/complex_schema.yaml')

            # wait for the complex workload to finish
            # self.verify_stress_thread(complex_cs_thread_pool)

            # complex workload: verify data by delete 1/10 data
            # self.log.info('Starting c-s complex workload to verify data by delete')
            # stress_cmd_complex_verify_delete = self.params.get('stress_cmd_complex_verify_delete')
            # complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_verify_delete,
            #                                                profile='data_dir/complex_schema.yaml')
            # wait for the complex workload to finish
            # self.verify_stress_thread(complex_cs_thread_pool)

        # During the test we filter and ignore some specific errors, but we want to allow only certain amount of them
        step = 'Step9 - Search for errors that we filter during the test '
        with self.subTest(step):
            # NOTE(review): the log message below has a typo
            # ('failed_to_load_schem'); kept as-is since it is runtime text.
            self.log.info(
                'Checking how many failed_to_load_schem errors happened during the test'
            )
            error_factor = 3
            schema_load_error_num = 0

            for node in self.db_cluster.nodes:
                errors = node.search_database_log(
                    search_pattern='Failed to load schema version',
                    start_from_beginning=True,
                    publish_events=False)
                schema_load_error_num += len(errors)
                self.search_for_idx_token_error_after_upgrade(node=node,
                                                              step=step)

            self.log.info('schema_load_error_num: %d', schema_load_error_num)
            # NOTE(review): the hard-coded 8 presumably stands for an assumed
            # shard count per host (see the 'shards_num' wording below) —
            # confirm against the cluster configuration.
            assert schema_load_error_num <= error_factor * 8 * \
                len(self.db_cluster.nodes), 'Only allowing shards_num * %d schema load errors per host during the ' \
                                            'entire test, actual: %d' % (
                    error_factor, schema_load_error_num)

            self.log.info(
                'all nodes were upgraded, and last workaround is verified.')
예제 #6
0
 def test_05_is_enterprise(self):
     """Verify is_enterprise() classifies year-style versions as enterprise."""
     expected = {
         '2019.1.1': True,
         '2018': True,
         '3.1': False,
         '2.2': False,
         '666.development': False,
     }
     # dicts preserve insertion order, so the checks run in the same
     # sequence as the original assertions
     for version, is_ent in expected.items():
         self.assertEqual(is_enterprise(version), is_ent)
    def get_supported_scylla_base_versions(self, supported_versions):  # pylint: disable=too-many-branches
        """
        We have special base versions list for each release, and we don't support to upgraded from enterprise
        to opensource. This function is used to get the base versions list which will be used in the upgrade test.

        @supported_version: all scylla release versions, the base versions will be filtered out from the supported_version

        Returns the concatenation of the open-source base versions followed by
        the enterprise base versions, each filtered down to entries that are
        present in ``supported_versions``.
        """
        oss_base_version = []
        ent_base_version = []

        # Split the supported releases into open-source and enterprise lists;
        # both keep the ordering of supported_versions.
        oss_release_list = [
            v for v in supported_versions if not is_enterprise(v)
        ]
        ent_release_list = [v for v in supported_versions if is_enterprise(v)]

        # The major version of unstable scylla, eg: 4.6.dev, 4.5, 2021.2.dev, 2021.1
        version = self.scylla_version
        product = self.product

        if product == 'scylla-enterprise':
            if version in supported_versions:
                # The dest version is a released enterprise version
                idx = ent_release_list.index(version)
                # supported_src_oss maps an enterprise release to an OSS base;
                # a missing mapping yields None, which is dropped by the
                # supported_versions filter at the bottom of this function.
                oss_base_version.append(supported_src_oss.get(version))
                if idx == 0:
                    # The dest version is a minor release of latest enterprise release
                    ent_base_version.append(version)
                else:
                    # Choose the last two releases as upgrade base
                    ent_base_version += ent_release_list[idx - 1:][:2]
            elif version == 'enterprise' or parse_version(
                    version) > parse_version(ent_release_list[0]):
                # NOTE(review): an *enterprise* release is appended to the
                # oss base list here — it looks like it was meant to go into
                # ent_base_version; confirm before relying on this branch.
                oss_base_version.append(ent_release_list[-1])
            elif re.match(r'\d+.\d+',
                          version) and parse_version(version) >= parse_version(
                              ent_release_list[0]):
                # NOTE(review): the dot in the pattern above is unescaped, so
                # it matches any character — confirm whether r'\d+\.\d+' was
                # intended.
                oss_base_version.append(oss_release_list[-1])
                ent_base_version += ent_release_list[-2:]
        elif product == 'scylla':
            if version in supported_versions:
                # The dest version is a released opensource version
                idx = oss_release_list.index(version)
                if idx == 0:
                    # The dest version is a minor release of latest enterprise release
                    oss_base_version.append(version)
                else:
                    # Choose the last two releases as upgrade base
                    oss_base_version += oss_release_list[idx - 1:][:2]
            elif version == 'master' or parse_version(version) > parse_version(
                    oss_release_list[0]):
                oss_base_version.append(oss_release_list[-1])
            elif re.match(r'\d+.\d+',
                          version) and parse_version(version) < parse_version(
                              oss_release_list[0]):
                # If dest version is smaller than the first supported opensource release, it might be an invalid dest version
                oss_base_version.append(oss_release_list[-1])
        else:
            raise ValueError("Unsupported product %s" % product)

        # Filter out unsupported version
        oss_base_version = [
            v for v in oss_base_version if v in supported_versions
        ]
        ent_base_version = [
            v for v in ent_base_version if v in supported_versions
        ]

        return oss_base_version + ent_base_version